In [1]:
from package import process_dataset
from package import encoder
from package import models

import numpy as np

from sklearn.model_selection import train_test_split
import pandas as pd

Loaded  7 encoders.




In [2]:
# Read the data dictionary and the train/test CSVs; all three are Latin-1 encoded.
description = pd.read_csv("data/dicionario.csv", encoding='latin-1')
train = pd.read_csv("data/dados_treino_hackaton.csv", encoding='latin-1', index_col=False)
test = pd.read_csv("data/dados_teste_x_hackaton.csv", encoding='latin-1', index_col=False)

# Remove the 'Unnamed: 0' column from both frames in place
# (presumably a leftover index column written with the CSV — confirm).
for frame in (train, test):
    del frame['Unnamed: 0']

In [3]:
# Pass the training frame through the project's encoder and then through its
# column-processing step (both helpers live in `package` and are not visible here).
# `train` is rebound to its own transformed value, so re-running this cell feeds
# already-processed data back into the helpers.
# NOTE(review): `test` is loaded earlier but is not passed through these steps
# anywhere in this view.
train = encoder.encode_DataFrame(train)
train = process_dataset.processColumns(train)

In [4]:
# generate_labels returns two values; they are used afterwards to index `train`
# for the feature columns (`train_cols`) and for the target (`target`).
train_cols, target = process_dataset.generate_labels(train)

In [5]:
# Feature matrix: the selected columns after the project's scaling helper.
# NOTE(review): scaling is applied before the train/test split — confirm the
# scaler was not fitted on rows that end up in the hold-out set.
X = process_dataset.scaleData(train[train_cols])
# Target values, taken from the same frame without scaling.
y = train[target]

Using saved scaler.


In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Linear regression

In [7]:
from sklearn.linear_model import LinearRegression

In [8]:
# Fit an ordinary least-squares model on the training split.
lin = LinearRegression()
lin.fit(X_train, y_train)

In [9]:
# Raw regression outputs for the hold-out rows.
pred = lin.predict(X_test)

In [10]:
# Convert the raw regression outputs with the project helper before scoring
# (presumably it thresholds them into class labels — confirm in models.convertPredicted).
ypred = models.convertPredicted(pred)
# Print the project's evaluation report for the hold-out split.
models.run_analysis(ypred, y_test)

              precision    recall  f1-score   support

           0       0.73      0.75      0.74        63
           1       0.73      0.72      0.72        60

    accuracy                           0.73       123
   macro avg       0.73      0.73      0.73       123
weighted avg       0.73      0.73      0.73       123

Accuracy score:  0.7317073170731707
Mean Squared Error:  0.2682926829268293
roc_auc score:  0.7313492063492064





## Decision tree regressor

In [237]:
from sklearn.tree import DecisionTreeRegressor

In [238]:
# Decision-tree regressor with default settings, fitted on the training split.
# No random_state is given, so tie-breaking between splits may vary across runs.
tree = DecisionTreeRegressor()
tree.fit(X_train, y_train)

In [239]:
# Convert the regressor output with the same project helper the other regressors
# in this notebook use before scoring, so the report is computed on converted
# predictions rather than on raw regression values.
ypred = models.convertPredicted(tree.predict(X_test))
models.run_analysis(ypred, y_test)

              precision    recall  f1-score   support

           0       0.60      0.59      0.59        63
           1       0.57      0.58      0.58        60

    accuracy                           0.59       123
   macro avg       0.59      0.59      0.59       123
weighted avg       0.59      0.59      0.59       123

Acc score:  0.5853658536585366
Mean Squared Error:  0.4146341463414634





## Random Forest Regressor

In [240]:
from sklearn.ensemble import RandomForestRegressor

In [241]:
# Random-forest regressor with default settings, fitted on the training split.
# NOTE(review): no random_state is set, so the reported metrics will vary between runs.
forest = RandomForestRegressor()
forest.fit(X_train, y_train)

In [242]:
# Convert the forest's raw regression outputs with the project helper.
ypred = models.convertPredicted(forest.predict(X_test))

In [243]:
# Print the project's evaluation report for the random-forest predictions.
models.run_analysis(ypred, y_test)

              precision    recall  f1-score   support

           0       0.69      0.78      0.73        63
           1       0.73      0.63      0.68        60

    accuracy                           0.71       123
   macro avg       0.71      0.71      0.70       123
weighted avg       0.71      0.71      0.71       123

Acc score:  0.7073170731707317
Mean Squared Error:  0.2926829268292683





## Testing polynomial regressor

In [244]:
from sklearn.preprocessing import PolynomialFeatures

In [245]:
# Degree-3 polynomial feature expansion without the constant (bias) column.
p = PolynomialFeatures(degree = 3, include_bias=False)

In [246]:
# Fit the expansion on the training split only and transform that split.
pX_train = p.fit_transform(X_train)

In [247]:
# Rebinds `lin` (previously the plain linear model) to a new linear regression
# trained on the polynomial features.
lin = LinearRegression()
lin.fit(pX_train, y_train)

In [248]:
# Expand the hold-out rows with the already-fitted transformer, then predict.
pred = lin.predict(p.transform(X_test))
# Convert the raw outputs with the project helper and print the evaluation report.
ypred = models.convertPredicted(pred)
models.run_analysis(ypred, y_test)

              precision    recall  f1-score   support

           0       0.61      0.60      0.61        63
           1       0.59      0.60      0.60        60

    accuracy                           0.60       123
   macro avg       0.60      0.60      0.60       123
weighted avg       0.60      0.60      0.60       123

Acc score:  0.6016260162601627
Mean Squared Error:  0.3983739837398374





## SGD Regression

In [249]:
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error

from sklearn.base import clone

In [250]:
# One epoch per fit() call: max_iter=1 combined with warm_start=True makes each
# fit() continue from the previous coefficients instead of restarting.
# tol=None switches off the tolerance-based stopping criterion. It replaces the
# former `-np.infty`: that alias was removed in NumPy 2.0, and recent
# scikit-learn releases reject a negative tol during parameter validation.
sgd = SGDRegressor(max_iter=1, tol=None, warm_start=True, penalty=None, learning_rate="constant", eta0=0.0005)

In [251]:
from copy import deepcopy

# Lowest hold-out MSE seen so far, the epoch that produced it, and a snapshot
# of the estimator at that epoch.
minimal_error = float("inf")
best_epoch = None
best_model = None

# Manual early stopping: `sgd` is configured with max_iter=1 and warm_start=True,
# so every fit() call advances training by one epoch.
# NOTE(review): the hold-out split (X_test/y_test) doubles as the validation set
# here, so the selected epoch is tuned on the same rows used for the final report.
for epoch in range(6000):
    sgd.fit(X_train, y_train)
    y_val_pred = sgd.predict(X_test)
    val_error = mean_squared_error(y_test, y_val_pred)
    if val_error < minimal_error:
        minimal_error = val_error
        best_epoch = epoch
        # deepcopy keeps the fitted coefficients. sklearn.base.clone would return
        # an *unfitted* estimator carrying only the hyper-parameters, so the
        # snapshot could not be used for prediction.
        best_model = deepcopy(sgd)


In [252]:
# NOTE(review): this scores `sgd` as it stands after its final training epoch;
# the `best_model` tracked by the early-stopping loop is not used here.
ypred = models.convertPredicted(sgd.predict(X_test))
models.run_analysis(ypred, y_test)

              precision    recall  f1-score   support

           0       0.72      0.75      0.73        63
           1       0.72      0.70      0.71        60

    accuracy                           0.72       123
   macro avg       0.72      0.72      0.72       123
weighted avg       0.72      0.72      0.72       123

Acc score:  0.7235772357723578
Mean Squared Error:  0.2764227642276423





## Logistic Regression

In [253]:
from sklearn.linear_model import LogisticRegression

In [254]:
# Logistic-regression classifier with default settings.
log = LogisticRegression()

In [255]:
# Train the classifier on the training split.
log.fit(X_train, y_train)

In [256]:
# The classifier's predictions are passed to the report as-is, without the
# convertPredicted step applied to the regressors.
ypred = log.predict(X_test)
models.run_analysis(ypred, y_test)

              precision    recall  f1-score   support

           0       0.70      0.73      0.71        63
           1       0.70      0.67      0.68        60

    accuracy                           0.70       123
   macro avg       0.70      0.70      0.70       123
weighted avg       0.70      0.70      0.70       123

Acc score:  0.6991869918699187
Mean Squared Error:  0.3008130081300813





## Support vector machines

In [257]:
from sklearn.svm import LinearSVC

In [258]:
# Linear support-vector classifier with default settings, fitted on the training split.
svm = LinearSVC()
svm.fit(X_train, y_train)

In [259]:
# Predict the hold-out rows with the linear SVM and print the evaluation report.
ypred = svm.predict(X_test)
models.run_analysis(ypred, y_test)

              precision    recall  f1-score   support

           0       0.71      0.73      0.72        63
           1       0.71      0.68      0.69        60

    accuracy                           0.71       123
   macro avg       0.71      0.71      0.71       123
weighted avg       0.71      0.71      0.71       123

Acc score:  0.7073170731707317
Mean Squared Error:  0.2926829268292683





In [260]:
from sklearn.svm import SVC

In [261]:
# Rebinds `svm` (previously the LinearSVC) to a polynomial-kernel SVC with C=10.
svm = SVC(kernel='poly', C=10)
svm.fit(X_train, y_train)

In [262]:
# Predict the hold-out rows with the polynomial-kernel SVC and print the evaluation report.
ypred = svm.predict(X_test)
models.run_analysis(ypred, y_test)

              precision    recall  f1-score   support

           0       0.67      0.79      0.72        63
           1       0.73      0.58      0.65        60

    accuracy                           0.69       123
   macro avg       0.70      0.69      0.69       123
weighted avg       0.70      0.69      0.69       123

Acc score:  0.6910569105691057
Mean Squared Error:  0.3089430894308943





## Decision tree classifier

In [263]:
from sklearn.tree import DecisionTreeClassifier

In [264]:
# Rebinds `tree` (previously the DecisionTreeRegressor) to a classifier limited
# to depth 2, fitted on the training split.
tree = DecisionTreeClassifier(max_depth=2)
tree.fit(X_train, y_train)

In [265]:
# Predict the hold-out rows with the depth-2 tree and print the evaluation report.
ypred = tree.predict(X_test)
models.run_analysis(ypred, y_test)

              precision    recall  f1-score   support

           0       0.64      0.78      0.71        63
           1       0.70      0.55      0.62        60

    accuracy                           0.67       123
   macro avg       0.67      0.66      0.66       123
weighted avg       0.67      0.67      0.66       123

Acc score:  0.6666666666666666
Mean Squared Error:  0.3333333333333333





In [266]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier

In [267]:
# Three default-configured base classifiers for the voting ensemble.
# NOTE(review): RandomForestClassifier has no random_state, so results vary between runs.
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

In [268]:
# Named (label, estimator) pairs that make up the ensemble.
ensemble_members = [('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)]

# Hard voting: the ensemble predicts the majority class label of its members.
voting = VotingClassifier(estimators=ensemble_members, voting='hard')
voting.fit(X_train, y_train)

In [269]:
# Predict the hold-out rows with the voting ensemble and print the evaluation report.
ypred = voting.predict(X_test)
models.run_analysis(ypred, y_test)

              precision    recall  f1-score   support

           0       0.70      0.73      0.71        63
           1       0.70      0.67      0.68        60

    accuracy                           0.70       123
   macro avg       0.70      0.70      0.70       123
weighted avg       0.70      0.70      0.70       123

Acc score:  0.6991869918699187
Mean Squared Error:  0.3008130081300813





In [270]:
import xgboost

In [271]:
# Gradient-boosted classifier with XGBoost's default hyper-parameters.
xgb = xgboost.XGBClassifier()
xgb.fit(X_train, y_train)

In [272]:
# Predict the hold-out rows with the default XGBoost model and print the evaluation report.
ypred = xgb.predict(X_test)
models.run_analysis(ypred, y_test)

              precision    recall  f1-score   support

           0       0.69      0.75      0.72        63
           1       0.71      0.65      0.68        60

    accuracy                           0.70       123
   macro avg       0.70      0.70      0.70       123
weighted avg       0.70      0.70      0.70       123

Acc score:  0.6991869918699187
Mean Squared Error:  0.3008130081300813





In [273]:
# Rebinds `xgb` to a hand-tuned configuration: shallow trees (depth 2) with
# explicit split-gain (gamma) and L1/L2 regularisation settings.
# NOTE(review): eta=1.5 lies outside the [0, 1] range the XGBoost documentation
# gives for the learning rate — confirm this value is intentional.
# NOTE(review): these values were chosen while scoring on X_test, so the hold-out
# score reported for this model is likely optimistic.
xgb = xgboost.XGBClassifier(
        max_depth=2,
        gamma=0.1,
        eta=1.5,
        reg_alpha=1.1,
        reg_lambda=0.7)
xgb.fit(X_train, y_train)

In [274]:
# Predict the hold-out rows with the tuned XGBoost model and print the evaluation report.
ypred = xgb.predict(X_test)
models.run_analysis(ypred, y_test)

              precision    recall  f1-score   support

           0       0.83      0.83      0.83        63
           1       0.82      0.82      0.82        60

    accuracy                           0.82       123
   macro avg       0.82      0.82      0.82       123
weighted avg       0.82      0.82      0.82       123

Acc score:  0.8211382113821138
Mean Squared Error:  0.17886178861788618





In [275]:
# Scratch expression with no effect on state: the values match max_depth, gamma,
# eta, reg_alpha and reg_lambda of the tuned XGBClassifier in this notebook.
2, 0.1, 1.5, 1.1, 0.7

(2, 0.1, 1.5, 1.1, 0.7)

# Fine Tuning

In [276]:
from sklearn.model_selection import GridSearchCV

In [277]:
# Two search spaces for the random forest:
#   1. default bootstrap setting, varying tree count and features per split;
#   2. bootstrap disabled, with its own tree-count / feature-count ranges.
param_grid = [
    dict(n_estimators=[3, 7, 10, 20, 30], max_features=[2, 4, 6, 8, 10]),
    dict(bootstrap=[False], n_estimators=[3, 7, 10, 12], max_features=[2, 3, 4, 8, 12, 15]),
]

In [225]:
# Fresh, unfitted random-forest regressor to serve as the grid-search base
# estimator; rebinds the `forest` name used for the earlier fitted model.
forest = RandomForestRegressor()

In [226]:
# 5-fold grid search over `param_grid`, ranked by negated MSE (higher is better);
# refit=True retrains the best configuration on all data passed to fit().
grid = GridSearchCV(forest, param_grid, cv=5, scoring="neg_mean_squared_error", return_train_score=True, refit=True)

In [227]:
# Run the search on the training split only.
grid.fit(X_train, y_train)

In [228]:
# Show the hyper-parameter combination with the best cross-validated score.
grid.best_params_

{'max_features': 8, 'n_estimators': 20}

In [229]:
# Predict with the refitted best estimator and convert the raw regression
# outputs with the project helper.
ypred = models.convertPredicted(grid.predict(X_test))

In [230]:
# Print the project's evaluation report for the grid-searched forest.
models.run_analysis(ypred, y_test)

              precision    recall  f1-score   support

           0       0.68      0.72      0.70        32
           1       0.68      0.63      0.66        30

    accuracy                           0.68        62
   macro avg       0.68      0.68      0.68        62
weighted avg       0.68      0.68      0.68        62

Acc score:  0.6774193548387096
Mean Squared Error:  0.3225806451612903





## Importância das colunas

In [231]:
# Pair each feature importance with its column name and list the pairs in
# ascending order of importance (least important first).
sorted(zip(grid.best_estimator_.feature_importances_, train_cols))

[(0.0, 'Comissão sobre Parceiros'),
 (0.0, 'Cybersecurity'),
 (0.0, 'Gestão da Saúde'),
 (0.0009061432151626896, 'Treinamentos'),
 (0.0011057800669506863, 'Equilíbrio fiscal'),
 (0.0012688623046126304, 'Concorrentes'),
 (0.0018842817524956305, 'Gestão da Receita'),
 (0.0022585654289072415, 'Gestão da Educação'),
 (0.0023442465423815077, 'Gestão da Segurança Viária'),
 (0.002437032610642893, 'ESG'),
 (0.004718118740241302, 'Gestão de operações projetizadas'),
 (0.005046645878543388, 'Software'),
 (0.00629021696878313, 'Gestão Estratégica'),
 (0.006333070634008842, 'Skill_dev'),
 (0.007648293107398466, 'Gestão de pessoas'),
 (0.009087164528959957, 'Gestão de Gastos'),
 (0.010133155882246064, 'Produtos digitais'),
 (0.012340944077548936, 'n_solucoes'),
 (0.013077637011706963, 'Desdobramento de metas'),
 (0.014260963069052657, 'Processes Excellence'),
 (0.014614156711792215, 'Gestão da Receita_per_Gestão de Gastos'),
 (0.017525654034329513, 'numero_relacionamentos_convertidos'),
 (0.029176

## Using cross validation

In [25]:
from sklearn.model_selection import cross_val_score, cross_val_predict

In [20]:
# 10-fold cross-validation over the full X/y, scored by negated MSE.
# NOTE(review): `tree` is whichever estimator was last bound to that name — both
# a DecisionTreeRegressor and a DecisionTreeClassifier are assigned to it in
# this notebook, so the result depends on execution order.
scores = cross_val_score(tree, X, y, scoring="neg_mean_squared_error", cv=10)

In [32]:
# The scores are negated MSE values, so sqrt(-scores) gives the per-fold RMSE.
# NOTE(review): display_scores is defined in a later cell of this file; on a
# fresh top-to-bottom run this line raises NameError.
display_scores(np.sqrt(-scores))

Scores:  [0.62217102 0.5956834  0.65991202 0.52790958 0.70128687 0.74657689
 0.6401844  0.6140433  0.62725005 0.65286255]
Mean:  0.6387880074544369
Std:  0.05598606940195762


In [22]:
def display_scores(scores):
    """Print a collection of scores followed by their mean and standard deviation.

    Args:
        scores: Numeric scores. Objects that already provide ``mean()``
            (such as NumPy arrays) are used as-is; plain sequences such as
            lists or tuples are converted to a NumPy array first.

    Returns:
        None. Three lines ("Scores", "Mean", "Std") are written to stdout.
    """
    # Only coerce inputs that lack the array API, so objects that already
    # implement mean()/std() keep their own printing and statistics.
    if not hasattr(scores, "mean"):
        scores = np.asarray(scores)
    print("Scores: ", scores)
    print("Mean: ", scores.mean())
    print("Std: ", scores.std())

In [29]:
# Out-of-fold predictions for every row (10 folds), scored against the full target.
# NOTE(review): the predictions are passed without convertPredicted, so this
# assumes `tree` yields class labels directly — confirm which estimator `tree`
# refers to at this point.
models.run_analysis(cross_val_predict(tree, X, y, cv=10), y)

              precision    recall  f1-score   support

           0       0.62      0.59      0.60       326
           1       0.56      0.59      0.57       287

    accuracy                           0.59       613
   macro avg       0.59      0.59      0.59       613
weighted avg       0.59      0.59      0.59       613

Acc score:  0.5889070146818923
Mean Squared Error:  0.4110929853181077



