In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("Dados/diabetes.csv")

In [3]:
df.head()

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,tested_positive
1,1,85,66,29,0,26.6,0.351,31,tested_negative
2,8,183,64,0,0,23.3,0.672,32,tested_positive
3,1,89,66,23,94,28.1,0.167,21,tested_negative
4,0,137,40,35,168,43.1,2.288,33,tested_positive


In [4]:
df.shape

(768, 9)

In [5]:
df.columns

Index(['preg', 'plas', 'pres', 'skin', 'insu', 'mass', 'pedi', 'age', 'class'], dtype='object')

In [6]:
#Criando um dicionário para mapaeamento
name_to_class = {
    'tested_positive': 1,
    'tested_negative': 0
}

df['class'] = df['class'].map(name_to_class)

df.head()

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [7]:
df.describe()

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [8]:
#Transformando em array
labels = np.array(df['class'])

#Ordem das feautures
feauture_list = list(df.columns)

#Removendo a coluna do DF
df = df.drop('class', axis=1)

#Verificando
df.head()

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [9]:
data = np.array(df)

In [10]:
from sklearn.model_selection import train_test_split

#Divindo em treino e teste
train_data, test_data, train_labels, test_labels = train_test_split(data, labels, test_size=0.25, random_state=42)

In [11]:
#importando o modelo SVM
from sklearn.svm import SVC

#Treinando o SVM com o Kernel RBF
classifier1 = SVC(kernel='rbf').fit(train_data, train_labels)

#Aplicando o modelo treinado
predicitions1_labels = classifier1.predict(test_data)

In [12]:
#Avaliando o Modelo
from sklearn import metrics

print('\nMatriz de Confusão:\n', metrics.confusion_matrix(test_labels, predicitions1_labels))
print('\nAcurácia:\n', metrics.accuracy_score(test_labels, predicitions1_labels))
print('\nF1:\n', metrics.f1_score(test_labels, predicitions1_labels))
print('\nAUROC:\n', metrics.roc_auc_score(test_labels, predicitions1_labels))


Matriz de Confusão:
 [[106  17]
 [ 35  34]]

Acurácia:
 0.7291666666666666

F1:
 0.5666666666666667

AUROC:
 0.6772711205372923


In [13]:
#TREINANDO O SVM COM O KERNEL SIGMOIDAL
classifier2 = SVC(kernel='sigmoid').fit(train_data, train_labels)

#Aplicando o modelo treinado
predicitions2_labels = classifier2.predict(test_data)

In [14]:
#Avaliando o Modelo

print('\nMatriz de Confusão:\n', metrics.confusion_matrix(test_labels, predicitions2_labels))
print('\nAcurácia:\n', metrics.accuracy_score(test_labels, predicitions2_labels))
print('\nF1:\n', metrics.f1_score(test_labels, predicitions2_labels))
print('\nAUROC:\n', metrics.roc_auc_score(test_labels, predicitions2_labels))


Matriz de Confusão:
 [[94 29]
 [57 12]]

Acurácia:
 0.5520833333333334

F1:
 0.21818181818181817

AUROC:
 0.46907034287734173


In [15]:
#Treinando o SVM com o Kernel RBF, com largura da gaussiana alterada
classifier3 = SVC(kernel='rbf', gamma='auto').fit(train_data, train_labels)

#Aplicando o modelo treinado
predicitions3_labels = classifier3.predict(test_data)

In [16]:
#Avaliando o Modelo

print('\nMatriz de Confusão:\n', metrics.confusion_matrix(test_labels, predicitions3_labels))
print('\nAcurácia:\n', metrics.accuracy_score(test_labels, predicitions3_labels))
print('\nF1:\n', metrics.f1_score(test_labels, predicitions3_labels))
print('\nAUROC:\n', metrics.roc_auc_score(test_labels, predicitions3_labels))


Matriz de Confusão:
 [[123   0]
 [ 69   0]]

Acurácia:
 0.640625

F1:
 0.0

AUROC:
 0.5


In [17]:
#Treinando o SVM com o Kernel RBF, com largura da gaussiana alterada
classifier4 = SVC(C = 0.1, kernel='rbf', gamma=0.0001).fit(train_data, train_labels)

#Aplicando o modelo treinado
predicitions4_labels = classifier4.predict(test_data)

In [18]:
#Avaliando o Modelo

print('\nMatriz de Confusão:\n', metrics.confusion_matrix(test_labels, predicitions4_labels))
print('\nAcurácia:\n', metrics.accuracy_score(test_labels, predicitions4_labels))
print('\nF1:\n', metrics.f1_score(test_labels, predicitions4_labels))
print('\nAUROC:\n', metrics.roc_auc_score(test_labels, predicitions4_labels))


Matriz de Confusão:
 [[112  11]
 [ 43  26]]

Acurácia:
 0.71875

F1:
 0.49056603773584906

AUROC:
 0.6436903499469776


In [19]:
#UTILIZANDO O RANDOM FOREST REGRESSOR
from sklearn.ensemble import RandomForestClassifier

#Treinando o MOdelo
classifier5 = RandomForestClassifier(n_estimators=10, random_state=42).fit(train_data, train_labels)

#Predições
predicitions5_labels = classifier5.predict(test_data)

In [20]:
print('\nMatriz de Confusão:\n', metrics.confusion_matrix(test_labels, predicitions5_labels))
print('\nAcurácia:\n', metrics.accuracy_score(test_labels, predicitions5_labels))
print('\nF1:\n', metrics.f1_score(test_labels, predicitions5_labels))
print('\nAUROC:\n', metrics.roc_auc_score(test_labels, predicitions5_labels))


Matriz de Confusão:
 [[97 26]
 [24 45]]

Acurácia:
 0.7395833333333334

F1:
 0.6428571428571428

AUROC:
 0.72039589961117


In [21]:
#Treinando o Modelo
classifier6 = RandomForestClassifier(n_estimators=100, random_state=42).fit(train_data, train_labels)
#Predições
predicitions6_labels = classifier6.predict(test_data)

In [22]:
print('\nMatriz de Confusão:\n', metrics.confusion_matrix(test_labels, predicitions6_labels))
print('\nAcurácia:\n', metrics.accuracy_score(test_labels, predicitions6_labels))
print('\nF1:\n', metrics.f1_score(test_labels, predicitions6_labels))
print('\nAUROC:\n', metrics.roc_auc_score(test_labels, predicitions6_labels))


Matriz de Confusão:
 [[96 27]
 [24 45]]

Acurácia:
 0.734375

F1:
 0.6382978723404256

AUROC:
 0.7163308589607635


In [23]:
#Treinando o Modelo
classifier7 = RandomForestClassifier(n_estimators=100, bootstrap=False, random_state=42).fit(train_data, train_labels)
#Predições
predicitions7_labels = classifier7.predict(test_data)

In [24]:
print('\nMatriz de Confusão:\n', metrics.confusion_matrix(test_labels, predicitions7_labels))
print('\nAcurácia:\n', metrics.accuracy_score(test_labels, predicitions7_labels))
print('\nF1:\n', metrics.f1_score(test_labels, predicitions7_labels))
print('\nAUROC:\n', metrics.roc_auc_score(test_labels, predicitions7_labels))


Matriz de Confusão:
 [[93 30]
 [24 45]]

Acurácia:
 0.71875

F1:
 0.6250000000000001

AUROC:
 0.704135737009544


In [25]:
#GRID SEARCH
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer

In [26]:
#Definindo Hiperparâmetros
hiperparam = {'kernel': ('sigmoid', 'rbf'), 'C': [0.01, 1, 10]}

#Definindo o tipo de validação cruzada e o numero de folds
cv_strat = StratifiedKFold(n_splits=10)

#Instanciando o classificador
classifier = SVC()

#Definindo a estratégia de score a partir da metrica f1
f1 = make_scorer(f1_score)

#Instanciando e modelando o grid search com os hiperparametros e a validação definidas
grid_cv = GridSearchCV(classifier, hiperparam, cv = cv_strat, scoring=f1)
grid_cv.fit(data, labels)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             estimator=SVC(),
             param_grid={'C': [0.01, 1, 10], 'kernel': ('sigmoid', 'rbf')},
             scoring=make_scorer(f1_score))

In [27]:
#Vamos olhar todos os resultados
print('Resumo de todos os resultados encontrados:\n\n', grid_cv.cv_results_)

Resumo de todos os resultados encontrados:

 {'mean_fit_time': array([0.01704819, 0.01192367, 0.0149945 , 0.01393597, 0.00998812,
       0.0135056 ]), 'std_fit_time': array([0.00409958, 0.00408502, 0.00447338, 0.00438877, 0.00045597,
       0.00562489]), 'mean_score_time': array([0.00151806, 0.00204821, 0.00299091, 0.0010716 , 0.00298698,
       0.        ]), 'std_score_time': array([0.00231106, 0.00409667, 0.00324855, 0.00198416, 0.00456452,
       0.        ]), 'param_C': masked_array(data=[0.01, 0.01, 1, 1, 10, 10],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_kernel': masked_array(data=['sigmoid', 'rbf', 'sigmoid', 'rbf', 'sigmoid', 'rbf'],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'C': 0.01, 'kernel': 'sigmoid'}, {'C': 0.01, 'kernel': 'rbf'}, {'C': 1, 'kernel': 'sigmoid'}, {'C': 1, 'kernel': 'rbf'}, {'C': 10, 'kernel': 'sigmoid

In [30]:
print('Melhor Resultado F1:', grid_cv.best_score_)
print('\nMelhor configuração de hiperparametros', grid_cv.best_params_)
print('\n Configurações de todos os hiperparâmetros do melhor estimado encontrado pelo GridSearch\n', grid_cv.best_estimator_)

Melhor Resultado F1: 0.6086487385278134

Melhor configuração de hiperparametros {'C': 10, 'kernel': 'rbf'}

 Configurações de todos os hiperparâmetros do melhor estimado encontrado pelo GridSearch
 SVC(C=10)


In [31]:
#RANDOM SEARCH
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

In [33]:
#Definindo o tipo de validação cruzada e o numero de folds
cv_strat = StratifiedKFold(n_splits=10)

#Definindo a estratégia de score a partir da metrica f1
f1 = make_scorer(f1_score)

#Definindo os hiperparametros
distributions = dict(kernel=['sigmoid', 'rbf'], C = uniform(loc=0, scale=10))

#Instanciando o classificador
classifier = SVC()

#Instanciando e modelando com os hiperparametros e a validação definidas
random_csv = RandomizedSearchCV(classifier, distributions, cv = cv_strat, scoring=f1, random_state = 42, n_iter = 10)
random_csv.fit(data, labels)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
                   estimator=SVC(),
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000293702A4040>,
                                        'kernel': ['sigmoid', 'rbf']},
                   random_state=42, scoring=make_scorer(f1_score))

In [34]:
#Vamos olhar todos os resultados
print('Resumo de todos os resultados encontrados:\n\n', random_csv.cv_results_)

Resumo de todos os resultados encontrados:

 {'mean_fit_time': array([0.01316307, 0.01142123, 0.00922818, 0.01026418, 0.01072474,
       0.00833545, 0.00937338, 0.0100117 , 0.01306586, 0.01003559]), 'std_fit_time': array([0.00515548, 0.00458689, 0.00243467, 0.00019359, 0.00340272,
       0.00332225, 0.00260722, 0.00391016, 0.00695126, 0.00708602]), 'mean_score_time': array([0.        , 0.00123353, 0.00124259, 0.00225949, 0.00144415,
       0.00203166, 0.00242414, 0.00227056, 0.        , 0.00312521]), 'std_score_time': array([0.        , 0.00244727, 0.00245329, 0.00306195, 0.00241373,
       0.0036252 , 0.00370297, 0.00305626, 0.        , 0.00625045]), 'param_C': masked_array(data=[3.745401188473625, 1.834347898661638,
                   5.986584841970366, 4.458327528535912,
                   0.5808361216819946, 3.337086111390218,
                   7.080725777960454, 0.5641157902710026,
                   8.324426408004218, 0.007787658410143283],
             mask=[False, False, False