In [None]:
# Plotar Curva ROC
# https://queirozf.com/entries/visualizing-machine-learning-models-examples-with-scikit-learn-and-matplotlib#plot-roc-curve-and-auc
# https://rstudio-pubs-static.s3.amazonaws.com/439808_98cf197211d94a68a1728e29f7a5c253.html

### Carregando a Base de Dados

In [1]:
import pandas as pd
import numpy as np

import import_ipynb
import modelofuncoes as mf

importing Jupyter notebook from modelofuncoes.ipynb


In [6]:
# Location of the spreadsheet (relative path) and name of the target column.
urlBase = "Vacinas/Regioes/OC/oc-vacina-7030-sudeste.xlsx"
alvo = 'evolucaoCaso'

# Read the workbook into a DataFrame; the cell displays its (rows, columns) shape.
modeloDados = pd.read_excel(io=urlBase)
modeloDados.shape

(15335, 25)

In [7]:
# Columns removed from the dataset before modelling.
# The commented-out lists are the selections used in earlier experiments, kept for
# reference; only the final assignment is active.

# (unlabelled earlier selections)
# colunasRemovidas = ['dataPrimeiraDose', 'dataSegundaDose']
#
# colunasRemovidas = ['disturbiosOlfativos', 'disturbiosGustatorios', 'puerpera', 'fragilidadeImuno',
#                     'assintomatico', 'dataPrimeiraDose', 'dataSegundaDose', 'obesidade', 'renal',
#                     'respiratoria']

# OC 6040
# colunasRemovidas = ['dataPrimeiraDose', 'dataSegundaDose', 'assintomatico', 'puerpera', 'disturbiosGustatorios',
#                     'disturbiosOlfativos', 'gestante', 'tosse', 'respiratoria', 'fragilidadeImuno']

# IC 6040
# colunasRemovidas = ['dataPrimeiraDose', 'dataSegundaDose', 'respiratoria', 'assintomatico', 'fragilidadeImuno',
#                     'puerpera', 'disturbiosGustatorios', 'disturbiosOlfativos']

# IC 7030
# colunasRemovidas = ['dataPrimeiraDose', 'dataSegundaDose', 'assintomatico', 'puerpera', 'disturbiosGustatorios',
#                     'disturbiosOlfativos', 'obesidade', 'febre', 'tosse', 'obesidade', 'respiratoria']

# OC 6040 (attributes < 1%)
# colunasRemovidas = ['dataPrimeiraDose', 'dataSegundaDose', 'assintomatico', 'puerpera', 'disturbiosGustatorios',
#                     'disturbiosOlfativos', 'gestante', 'tosse', 'respiratoria', 'fragilidadeImuno', 'racaCor',
#                     'sexo', 'febre', 'renal', 'imunossupressao', 'obesidade']

# OC 7030 (attributes < 1%) -- active selection
colunasRemovidas = [
    'dataPrimeiraDose',
    'dataSegundaDose',
    'disturbiosGustatorios',
    'disturbiosOlfativos',
    'puerpera',
    'gestante',
    'tosse',
    'assintomatico',
    'respiratoria',
    'racaCor',
    'fragilidadeImuno',
    'febre',
    'sexo',
    'imunossupressao',
    'obesidade',
    'renal',
]

In [8]:
# Remove the selected columns.
# The cell rebinds modeloDados to its own transformed value, so without errors="ignore"
# running it a second time raises KeyError (the columns are already gone). With it the
# cell is idempotent. Trade-off: a misspelled name in colunasRemovidas is skipped
# silently, so check the displayed shape against the expected column count.
modeloDados = modeloDados.drop(columns=colunasRemovidas, errors="ignore")
modeloDados.shape

(15335, 9)

### Criando o Modelo

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    modeloDados.drop(columns=[alvo]),  # features: every column except the target
    modeloDados[alvo],                 # target column
    test_size=0.3,
    random_state=42,
)

In [10]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting classifier with library-default hyperparameters and a fixed seed.
# The previous call spelled out every constructor argument, including `presort`,
# `min_impurity_split` and `loss='deviance'`, which newer scikit-learn releases have
# removed or renamed, so the constructor fails there. All of those values were the
# defaults (the estimator's repr shows only `random_state`), so dropping them keeps
# the same model while staying compatible across versions.
modeloClassificador = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=3,
    random_state=2333,
)

In [11]:
# Display the estimator's repr to check the configured hyperparameters.
modeloClassificador

GradientBoostingClassifier(random_state=2333)

### Analisando a Acurácia

In [12]:
from sklearn.metrics import accuracy_score

# Train on the training split, then predict the held-out rows
# (fit returns the fitted estimator, so the two calls are chained).
resultado = modeloClassificador.fit(x_train, y_train).predict(x_test)

# Accuracy on the held-out split
print(accuracy_score(y_true=y_test, y_pred=resultado))

0.8676374701151923


### Cross Validation

In [13]:
from sklearn.model_selection import cross_val_score

# Cross-validated accuracy of the classifier over the full dataset, one score per fold.
kfolds = 10
data = modeloDados.drop(columns=[alvo])
target = modeloDados[alvo]
scores = cross_val_score(
    estimator=modeloClassificador,
    X=data,
    y=target,
    scoring="accuracy",
    cv=kfolds,
)

In [14]:
# Report the per-fold scores, then their mean and standard deviation.
print("-> Resultado dos k testes", scores)
print("-> Média dos testes", np.mean(scores))
print("-> Desvio Padrão dos testes", np.std(scores))

-> Resultado dos k testes [0.87288136 0.86571056 0.86831812 0.84028683 0.85658409 0.88193085
 0.89889106 0.8838878  0.8714938  0.84670581]
-> Média dos testes 0.868669029291272
-> Desvio Padrão dos testes 0.016701601980872616


### Importância das Variáveis

In [15]:
# Table of the fitted model's feature importances, one row per training column,
# sorted from most to least important.
feature_importances = pd.DataFrame(
    {'importance': modeloClassificador.feature_importances_},
    index=x_train.columns,
).sort_values(by='importance', ascending=False)
feature_importances

Unnamed: 0,importance
faixaetaria,0.721222
dispneia,0.135519
qntVacinas,0.040218
dorDeGarganta,0.034481
diabetes,0.017638
cardiaca,0.017625
dorDeCabeca,0.017518
coriza,0.015778


### Avaliando os Resultados

In [16]:
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.linear_model import LogisticRegression

# Candidate estimators compared in the evaluation below, all with default hyperparameters.
# The tree ensembles are randomised, so they are seeded to make the results table
# reproducible across runs; the seed matches the one used for modeloClassificador.
# LogisticRegression is left unseeded: its default solver does not use random_state.
RANDOM_STATE = 2333

hiperparametrosAB = AdaBoostClassifier(random_state=RANDOM_STATE)
hiperparametrosRF = RandomForestClassifier(random_state=RANDOM_STATE)
hiperparametrosLR = LogisticRegression()
hiperparametrosGB = GradientBoostingClassifier(random_state=RANDOM_STATE)

In [17]:
# Inputs for mf.testarModelos: dataset label -> file path, the target values,
# and short model name -> estimator instance.
base = {"Sudeste OC [70/30]": urlBase}
valoresAlvo = [0, 1]
modelos = {
    "AB": hiperparametrosAB,
    "RF": hiperparametrosRF,
    "LR": hiperparametrosLR,
    "GB": hiperparametrosGB,
}

In [18]:
# Evaluate every candidate model with the helper imported from the modelofuncoes notebook.
# NOTE(review): testarModelos is defined outside this notebook -- presumably it loads each
# file in `base`, drops colunasRemovidas and computes the metrics tabulated below; confirm there.
desempenhoDosModelos = mf.testarModelos(base, alvo, colunasRemovidas, modelos, valoresAlvo)

Index(['evolucaoCaso', 'dorDeGarganta', 'dispneia', 'coriza', 'dorDeCabeca',
       'diabetes', 'cardiaca', 'faixaetaria', 'qntVacinas'],
      dtype='object')


In [19]:
# Display the results table without the 'Hiperparâmetros' column. The result of drop is
# not assigned, so desempenhoDosModelos is not rebound by this cell.
desempenhoDosModelos.drop(columns=['Hiperparâmetros'])

Unnamed: 0,Base,Modelo,Acurácia,Precisão (0),Recall (0),F1-Score (0),Precisão (1),Recall (1),F1-Score (1),3 Kfolds,5 Kfolds
0,Sudeste OC [70/30],AB,86.568%,79.003%,56.792%,66.081%,88.07%,95.482%,91.626%,86.619%,86.554%
0,Sudeste OC [70/30],RF,86.047%,77.143%,56.038%,64.918%,87.836%,95.03%,91.291%,86.195%,85.973%
0,Sudeste OC [70/30],LR,86.438%,78.238%,56.981%,65.939%,88.091%,95.256%,91.533%,86.704%,86.704%
0,Sudeste OC [70/30],GB,86.764%,80.35%,56.321%,66.223%,87.999%,95.877%,91.769%,86.841%,86.834%


### Curva ROC

In [None]:
import matplotlib.pyplot as plt
from sklearn import metrics

# Class-membership probabilities for the held-out rows: one column per class,
# in the order given by modeloClassificador.classes_.
y_preds = modeloClassificador.predict_proba(x_test)

# Use the second column as the score and tell roc_curve explicitly which label it
# belongs to, so the curve stays correct even if the target is not encoded as {0, 1}.
pos_label = modeloClassificador.classes_[1]
preds = y_preds[:, 1]

# fpr (false-positive rate) and tpr (true-positive rate) at each decision threshold.
# The thresholds are bound to a named variable rather than `_`, which IPython uses
# for the last cell output.
fpr, tpr, _thresholds = metrics.roc_curve(y_test, preds, pos_label=pos_label)

# Area under the ROC curve
auc_score = metrics.auc(fpr, tpr)

# Build the plot on an explicit Figure/Axes instead of the implicit pyplot state.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='AUC = {:.2f}'.format(auc_score))
ax.plot([0, 1], [0, 1], 'r--')  # chance-level diagonal

ax.set(
    title='ROC Curve',
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    xlim=(-0.1, 1.1),
    ylim=(-0.1, 1.1),
)
ax.legend(loc='lower right')
plt.show()

In [None]:
# Inspect the raw class-probability array returned by predict_proba in the ROC cell.
y_preds