In [1]:
# Importando os pacotes
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from joblib import dump, load

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier

from hpsklearn import HyperoptEstimator, knn, random_forest, svc_sigmoid, sgd, gaussian_nb
from hyperopt import tpe

import functions as f
from DataProcesser import DataProcesser

WARN: OMP_NUM_THREADS=None =>
... If you are using openblas if you are using openblas set OMP_NUM_THREADS=1 or risk subprocess calls hanging indefinitely


In [2]:
# Importando o dataset de treinamento
dataset = pd.read_csv('../datasets/treino.csv')

In [3]:
dataset_x = dataset.drop(['id', 'classe'], axis=1)
dataset_y = dataset['classe']

In [4]:
dp_dataset = DataProcesser(X=dataset_x, y=dataset_y)

In [5]:
dataset_processado = dp_dataset.process_train_data(with_target_column=False)

In [6]:
dataset_processado.head()

Unnamed: 0,num_gestacoes,glicose,pressao_sanguinea,grossura_pele,insulina,bmi,indice_historico,idade,num_gestacoes_miss,glicose_miss,pressao_sanguinea_miss,grossura_pele_miss,insulina_miss,bmi_miss,indice_historico_miss,idade_miss,missing_total
0,6.0,148.0,72.0,35.0,207.514563,33.6,0.627,50,0,0,0,0,1,0,0,0,1
1,1.0,85.0,66.0,29.0,126.649038,26.6,0.351,31,0,0,0,0,1,0,0,0,1
2,8.0,183.0,64.0,33.014286,207.514563,23.3,0.672,32,0,0,0,1,1,0,0,0,2
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21,0,0,0,0,0,0,0,0,0
4,5.68,137.0,40.0,35.0,168.0,43.1,2.288,33,1,0,0,0,0,0,0,0,1


In [7]:
x_treino, x_validacao, y_treino, y_validacao = train_test_split(dataset_x, dataset_y, test_size=0.25, random_state=27)

In [8]:
dp_treino = DataProcesser(X=x_treino, y=y_treino)
mean_dict = dp_treino.get_means_by_column()

In [9]:
dp_validacao = DataProcesser(X=x_validacao, mean_dict=mean_dict)

In [10]:
x_treino_processado = dp_treino.process_train_data(with_target_column=False)
x_validacao_processado = dp_validacao.process_test_data()

In [11]:
n_evals = 10
trial_timeout = 300

random_forest_estimator = HyperoptEstimator(classifier=random_forest('rf'),
                                            algo=tpe.suggest,
                                            preprocessing=[],
                                            max_evals=n_evals,
                                            trial_timeout=trial_timeout)

knn_estimator = HyperoptEstimator(classifier=knn('knn'),
                                  algo=tpe.suggest,
                                  preprocessing=[],
                                  max_evals=n_evals,
                                  trial_timeout=trial_timeout)

svc_estimator = HyperoptEstimator(classifier=svc_sigmoid('svc'),
                                  algo=tpe.suggest,
                                  preprocessing=[],
                                  max_evals=n_evals,
                                  trial_timeout=trial_timeout)

sgd_estimator = HyperoptEstimator(classifier=sgd('sgd'),
                                  algo=tpe.suggest,
                                  preprocessing=[],
                                  max_evals=n_evals,
                                  trial_timeout=trial_timeout)  

gaussian_nb_estimator = HyperoptEstimator(classifier=gaussian_nb('gnb'),
                                          algo=tpe.suggest,
                                          preprocessing=[],
                                          max_evals=n_evals,
                                          trial_timeout=trial_timeout)                                                                       

In [12]:
estimators_list = [random_forest_estimator, knn_estimator, svc_estimator, sgd_estimator, gaussian_nb_estimator]
names = ['rf', 'knn', 'svc', 'sgd', 'gnb']

models_dict = {}
score_dict = {}

In [13]:
for estimator, name in zip(estimators_list, names):
    print(f'\nsearching {name.upper()} best parameters...')

    estimator.fit(x_treino_processado, y_treino)
    estimator.retrain_best_model_on_full_data(x_treino_processado, y_treino)

    best_model = estimator.best_model()['learner']

    models_dict[name] = best_model
    score_dict[name] = estimator.score(x_validacao_processado, y_validacao)


searching RF best parameters...
100%|██████████| 1/1 [00:02<00:00,  2.76s/trial, best loss: 0.15555555555555556]
100%|██████████| 2/2 [00:04<00:00,  4.68s/trial, best loss: 0.11111111111111116]
100%|██████████| 3/3 [00:06<00:00,  6.89s/trial, best loss: 0.11111111111111116]
100%|██████████| 4/4 [00:03<00:00,  3.28s/trial, best loss: 0.11111111111111116]
100%|██████████| 5/5 [00:02<00:00,  2.90s/trial, best loss: 0.11111111111111116]
100%|██████████| 6/6 [00:02<00:00,  2.70s/trial, best loss: 0.11111111111111116]
100%|██████████| 7/7 [00:02<00:00,  2.60s/trial, best loss: 0.11111111111111116]
100%|██████████| 8/8 [00:04<00:00,  4.62s/trial, best loss: 0.11111111111111116]
100%|██████████| 9/9 [00:02<00:00,  2.72s/trial, best loss: 0.11111111111111116]
100%|██████████| 10/10 [00:02<00:00,  2.79s/trial, best loss: 0.11111111111111116]

searching KNN best parameters...
100%|██████████| 1/1 [00:02<00:00,  2.51s/trial, best loss: 0.15555555555555556]
100%|██████████| 2/2 [00:02<00:00,  2.53

In [14]:
best_model_of_all = models_dict[max(score_dict, key=score_dict.get)]
standard_rf = RandomForestClassifier()

In [15]:
score_dict

{'rf': 0.7, 'knn': 0.6266666666666667, 'svc': 0.66, 'sgd': 0.64, 'gnb': 0.78}

In [16]:
hyperopt_model_eval = np.mean(cross_val_score(estimator=best_model_of_all,
                                              X=dataset_processado,
                                              y=dataset_y,
                                              scoring='accuracy',
                                              cv=3))

In [17]:
standard_model_eval = np.mean(cross_val_score(estimator=standard_rf,
                                              X=dataset_processado,
                                              y=dataset_y,
                                              scoring='accuracy',
                                              cv=3))

In [18]:
print(f'hyperopt model scored: {hyperopt_model_eval:0.2f}')
print(f'standard model scored: {standard_model_eval:0.2f}')

if hyperopt_model_eval > standard_model_eval:
    print(f'\nTraining hyperopt model...')
    final_model = best_model_of_all.fit(dataset_processado, dataset_y)
else:
    print(f'\nTraining standard model...')
    final_model = standard_rf.fit(dataset_processado, dataset_y)

hyperopt model scored: 0.72
standard model scored: 0.89

Training standard model...


In [19]:
stacked_dataframe = pd.DataFrame()
stacked_dataframe_val = pd.DataFrame()

for model in models_dict.values():

    try:
        model.fit(x_treino_processado, y_treino)

        predictions_df = pd.DataFrame(model.predict_proba(x_treino_processado))
        predictions_df_val = pd.DataFrame(model.predict_proba(x_validacao_processado))

    except Exception:
        model.fit(x_treino_processado, y_treino)

        predictions_df = pd.DataFrame(model.predict(x_treino_processado), columns=[1])
        predictions_df_val = pd.DataFrame(model.predict(x_validacao_processado), columns=[1])

    stacked_dataframe = pd.concat([stacked_dataframe, predictions_df], axis=1)
    stacked_dataframe_val = pd.concat([stacked_dataframe_val, predictions_df_val], axis=1)

In [20]:
x_treino_stacked = stacked_dataframe.loc[:, 1].copy()
x_val_stacked = stacked_dataframe_val.loc[:, 1].copy()

x_treino_stacked.columns = models_dict.keys()
x_val_stacked.columns = models_dict.keys()

In [21]:
np.mean(cross_val_score(estimator=standard_rf,
                        X=x_treino_stacked,
                        y=y_treino,
                        scoring='accuracy',
                        cv=3))

1.0

In [22]:
standard_rf2 = RandomForestClassifier()
stacked_model = standard_rf2.fit(x_treino_stacked, y_treino)

In [23]:
print(accuracy_score(stacked_model.predict(x_val_stacked), y_validacao))

0.6666666666666666


In [24]:
f.save_model(final_model, 'models', 'ia_doctor')
f.save_model(mean_dict, 'imputers', 'mean')

In [2]:
loaded_model = f.load_last_model('models')
imputer = f.load_last_model('imputers')

models/ia_doctor_v2.h5
imputers/mean_v2.h5


In [26]:
dataset_teste = pd.read_csv('../datasets/teste.csv')

In [27]:
predictions_proba = f.process_and_predict_proba(dataset=dataset_teste, model=loaded_model, imputer=imputer, drop_cols='id')
pd.DataFrame(predictions_proba).head(10)

Unnamed: 0,0,1
0,0.68,0.32
1,0.65,0.35
2,0.61,0.39
3,0.73,0.27
4,0.55,0.45
5,0.54,0.46
6,0.44,0.56
7,0.99,0.01
8,0.64,0.36
9,0.92,0.08


In [28]:
predictions = f.process_and_predict(dataset=dataset_teste, model=loaded_model, imputer=imputer, drop_cols='id')

In [29]:
dataset_teste['classe'] = predictions
resposta = dataset_teste.loc[:, ['id', 'classe']]
resposta.to_csv('../datasets/best_answer.csv', index=False)