In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("Dados/diabetes.csv")

In [3]:
df.head()

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,tested_positive
1,1,85,66,29,0,26.6,0.351,31,tested_negative
2,8,183,64,0,0,23.3,0.672,32,tested_positive
3,1,89,66,23,94,28.1,0.167,21,tested_negative
4,0,137,40,35,168,43.1,2.288,33,tested_positive


In [4]:
df.shape

(768, 9)

In [5]:
df.columns

Index(['preg', 'plas', 'pres', 'skin', 'insu', 'mass', 'pedi', 'age', 'class'], dtype='object')

In [6]:
#Pre Processamento
name_to_class = {
    'tested_negative': 0,
    'tested_positive': 1
}

#Substituindo o valores categóricos
df['class'] = df['class'].map(name_to_class)

In [7]:
df.head()

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [8]:
df.describe()

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [9]:
#Armazenando os Labels
labels = np.array(df['class'])

#Salvando a ordem das features
features_list = list(df.columns)

#Removendo a coluna de labels
df = df.drop('class', axis=1)

In [10]:
#Convertendo o df para array
data = np.array(df)

In [12]:
#TREINO E TESTE
from sklearn.model_selection import train_test_split

In [13]:
#Dividindo em treino e teste
train_data1, test_data1, train_labels1, test_labels1 = train_test_split(data, labels, test_size=0.25, random_state=42)

In [14]:
train_data2, test_data2, train_labels2, test_labels2 = train_test_split(data, labels, test_size=0.25, random_state=123)

In [16]:
train_data3, test_data3, train_labels3, test_labels3 = train_test_split(data, labels, test_size=0.35, random_state=42)

In [17]:
from sklearn.ensemble import RandomForestClassifier

#Treinando o modelo com diferentes separações
classifier1 = RandomForestClassifier(n_estimators=10, random_state=42).fit(train_data1, train_labels1)
classifier2 = RandomForestClassifier(n_estimators=10, random_state=42).fit(train_data2, train_labels2)
classifier3 = RandomForestClassifier(n_estimators=10, random_state=42).fit(train_data3, train_labels3)

In [18]:
#Fazendo as predições
predicitions1_labels = classifier1.predict(test_data1)
predicitions2_labels = classifier2.predict(test_data2)
predicitions3_labels = classifier3.predict(test_data3)

In [20]:
#Importando biblioteca para calcular as métricas
from sklearn import metrics

print('\nAcurácia Modelo 1:\n', metrics.accuracy_score(test_labels1, predicitions1_labels))
print('\nAcurácia Modelo 2:\n', metrics.accuracy_score(test_labels2, predicitions2_labels))
print('\nAcurácia Modelo 3:\n', metrics.accuracy_score(test_labels3, predicitions3_labels))


Acurácia Modelo 1:
 0.7395833333333334

Acurácia Modelo 2:
 0.7447916666666666

Acurácia Modelo 3:
 0.758364312267658


In [23]:
print("\nClassification Report M1\n", metrics.classification_report(test_labels1, predicitions1_labels))
print("\nClassification Report M2\n", metrics.classification_report(test_labels2, predicitions2_labels))
print("\nClassification Report M3\n", metrics.classification_report(test_labels3, predicitions3_labels))


Classification Report M1
               precision    recall  f1-score   support

           0       0.80      0.79      0.80       123
           1       0.63      0.65      0.64        69

    accuracy                           0.74       192
   macro avg       0.72      0.72      0.72       192
weighted avg       0.74      0.74      0.74       192


Classification Report M2
               precision    recall  f1-score   support

           0       0.77      0.84      0.80       119
           1       0.69      0.59      0.64        73

    accuracy                           0.74       192
   macro avg       0.73      0.71      0.72       192
weighted avg       0.74      0.74      0.74       192


Classification Report M3
               precision    recall  f1-score   support

           0       0.81      0.85      0.82       181
           1       0.65      0.58      0.61        88

    accuracy                           0.76       269
   macro avg       0.73      0.71      0.72    

In [61]:
#VALIDAÇÃO CRUZADA
from sklearn.model_selection import cross_val_score

classifier_cv = RandomForestClassifier(n_estimators=10, random_state=42)
scores_cv = cross_val_score(classifier_cv, data, labels, cv=10)
scores_cv

array([0.7012987 , 0.80519481, 0.72727273, 0.7012987 , 0.77922078,
       0.77922078, 0.81818182, 0.81818182, 0.73684211, 0.77631579])

In [62]:
print("Acurária: ", round(scores_cv.mean(),2))
print("Desvio Padrão: ", round(scores_cv.std(),3) * 2)

Acurária:  0.76
Desvio Padrão:  0.086


In [65]:
#Validação cruzada Estratificada
from sklearn.model_selection import StratifiedKFold

classifier_cv = RandomForestClassifier(n_estimators=10, random_state=42)
cv_strat = StratifiedKFold(n_splits=10)
scores_cv_strat = cross_val_score(classifier_cv, data, labels, cv=cv_strat)

In [66]:
print("Acurária: ", round(scores_cv_strat.mean(),2))
print("Desvio Padrão: ", round(scores_cv_strat.std(),3) * 2)

Acurária:  0.76
Desvio Padrão:  0.086


In [72]:
#VALIDAÇÃO lEAVE ONE OUT
from sklearn.model_selection import LeaveOneOut

classifier_cv = RandomForestClassifier(n_estimators=10, random_state=42)

loo = LeaveOneOut()

scores_loo = cross_val_score(classifier_cv, data, labels, cv=loo)

In [73]:
print("Acurária: ", round(scores_loo.mean(),2))
print("Desvio Padrão: ", round(scores_loo.std(),3) * 2)

Acurária:  0.75
Desvio Padrão:  0.87
