In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("Dados/diabetes.csv")

In [3]:
df.head()

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,tested_positive
1,1,85,66,29,0,26.6,0.351,31,tested_negative
2,8,183,64,0,0,23.3,0.672,32,tested_positive
3,1,89,66,23,94,28.1,0.167,21,tested_negative
4,0,137,40,35,168,43.1,2.288,33,tested_positive


In [4]:
df.shape

(768, 9)

In [5]:
df.columns

Index(['preg', 'plas', 'pres', 'skin', 'insu', 'mass', 'pedi', 'age', 'class'], dtype='object')

In [6]:
name_to_class = {
    'tested_negative': 0,
    'tested_positive' : 1
}

In [7]:
#Substituindo os valores categóricos pelo mapamento
df['class'] = df['class'].map(name_to_class)

df.head()

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [8]:
#Analisando um resumo da base
df.describe()

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [9]:
#Amazenando os labels em um array
labels = np.array(df['class'])

#Salvando a ordem das features
feauture_list = list(df.columns)

In [10]:
#Removendo  a coluna de labels do df original
df = df.drop('class', axis = 1)

In [11]:
#Convertendo o df para um array
data = np.array(df)

In [12]:
from sklearn.model_selection import train_test_split

#Divindo em treino e teste
train_data, test_data, train_labels, test_labels = train_test_split(data, labels, test_size=0.25, random_state = 42)

In [13]:
#Baseline
baseline_preds = np.random.choice([0,1], size = len(test_labels))

print(baseline_preds)

[1 1 0 0 1 0 1 1 1 1 0 0 0 1 1 1 1 1 1 0 0 0 1 0 1 1 1 1 0 0 0 0 1 1 0 1 0
 0 1 0 0 1 1 0 1 0 1 1 1 1 0 0 0 0 0 1 0 0 1 0 1 0 0 1 0 1 1 1 1 1 1 1 1 1
 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 1 1 0 0 0 1 1 1 1 0 1 1 0 0 0 1 0 1 0 1 1
 0 0 0 0 0 0 0 1 0 1 1 1 1 0 1 1 1 0 1 0 0 0 0 0 1 1 1 1 0 1 0 1 1 0 1 0 1
 0 0 1 0 0 0 0 0 0 1 1 0 0 1 0 0 0 1 1 0 1 1 0 0 1 1 1 1 0 0 0 1 0 0 1 0 1
 0 1 0 1 0 0 1]


In [14]:
#Calculando Metricas
from sklearn import metrics

print('Matriz de Confusão\n', metrics.confusion_matrix(test_labels, baseline_preds))
print('Acuracia:', metrics.accuracy_score(test_labels, baseline_preds))
print('Acuracia Balanceada por Classe:', metrics.balanced_accuracy_score(test_labels, baseline_preds))
print('Precision:', metrics.precision_score(test_labels, baseline_preds))
print('Recall:', metrics.recall_score(test_labels, baseline_preds))
print('F1:', metrics.f1_score(test_labels, baseline_preds))
print('AUCROC', metrics.roc_auc_score(test_labels, baseline_preds))

Matriz de Confusão
 [[58 65]
 [35 34]]
Acuracia: 0.4791666666666667
Acuracia Balanceada por Classe: 0.4821491693177802
Precision: 0.3434343434343434
Recall: 0.4927536231884058
F1: 0.40476190476190477
AUCROC 0.4821491693177801


In [15]:
print('Classification Report\n', metrics.classification_report(test_labels, baseline_preds))

Classification Report
               precision    recall  f1-score   support

           0       0.62      0.47      0.54       123
           1       0.34      0.49      0.40        69

    accuracy                           0.48       192
   macro avg       0.48      0.48      0.47       192
weighted avg       0.52      0.48      0.49       192



In [16]:
#KNN
from sklearn.neighbors import KNeighborsClassifier

#Treinanmento
knn_reg = KNeighborsClassifier().fit(train_data, train_labels)

In [19]:
#Predição
predictions1_labels = knn_reg.predict(test_data)

p = pd.DataFrame({'Real': test_labels, 'Previsto': predictions1_labels})
p.head(10)

Unnamed: 0,Real,Previsto
0,0,1
1,0,0
2,0,0
3,0,1
4,0,0
5,0,1
6,0,0
7,0,0
8,0,1
9,0,1


In [20]:
#Avaliando o Modelo
print('Matriz de Confusão\n', metrics.confusion_matrix(test_labels, predictions1_labels))
print('Acuracia:', metrics.accuracy_score(test_labels, predictions1_labels))
print('Acuracia Balanceada por Classe:', metrics.balanced_accuracy_score(test_labels, predictions1_labels))
print('Precision:', metrics.precision_score(test_labels, predictions1_labels))
print('Recall:', metrics.recall_score(test_labels, predictions1_labels))
print('F1:', metrics.f1_score(test_labels, predictions1_labels))
print('AUCROC', metrics.roc_auc_score(test_labels, predictions1_labels))

Matriz de Confusão
 [[88 35]
 [31 38]]
Acuracia: 0.65625
Acuracia Balanceada por Classe: 0.6330858960763521
Precision: 0.5205479452054794
Recall: 0.5507246376811594
F1: 0.5352112676056338
AUCROC 0.633085896076352


In [21]:
print('Classification Report\n', metrics.classification_report(test_labels, predictions1_labels))

Classification Report
               precision    recall  f1-score   support

           0       0.74      0.72      0.73       123
           1       0.52      0.55      0.54        69

    accuracy                           0.66       192
   macro avg       0.63      0.63      0.63       192
weighted avg       0.66      0.66      0.66       192



In [22]:
#SVM
from sklearn.svm import SVC

svc_class = SVC(kernel='rbf')

svc_class.fit(train_data, train_labels)

SVC()

In [31]:
#Predição
predictions2_labels = svc_class.predict(test_data)

p = pd.DataFrame({'Real': test_labels, 'Previsto': predictions2_labels})
p.head(10)

Unnamed: 0,Real,Previsto
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,1
9,0,1


In [32]:
#Avaliando o Modelo
print('Matriz de Confusão\n', metrics.confusion_matrix(test_labels, predictions2_labels))
print('Acuracia:', metrics.accuracy_score(test_labels, predictions2_labels))
print('Acuracia Balanceada por Classe:', metrics.balanced_accuracy_score(test_labels, predictions2_labels))
print('Precision:', metrics.precision_score(test_labels, predictions2_labels))
print('Recall:', metrics.recall_score(test_labels, predictions2_labels))
print('F1:', metrics.f1_score(test_labels, predictions2_labels))
print('AUCROC', metrics.roc_auc_score(test_labels, predictions2_labels))

Matriz de Confusão
 [[106  17]
 [ 35  34]]
Acuracia: 0.7291666666666666
Acuracia Balanceada por Classe: 0.6772711205372923
Precision: 0.6666666666666666
Recall: 0.4927536231884058
F1: 0.5666666666666667
AUCROC 0.6772711205372923


In [33]:
print('Classification Report\n', metrics.classification_report(test_labels, predictions2_labels))

Classification Report
               precision    recall  f1-score   support

           0       0.75      0.86      0.80       123
           1       0.67      0.49      0.57        69

    accuracy                           0.73       192
   macro avg       0.71      0.68      0.68       192
weighted avg       0.72      0.73      0.72       192



In [34]:
#Random Forest Classifers
from sklearn.ensemble import RandomForestClassifier

rdf_class = RandomForestClassifier(n_estimators=10, random_state=42).fit(train_data, train_labels)

In [35]:
#Predição
predictions3_labels = rdf_class.predict(test_data)

p = pd.DataFrame({'Real': test_labels, 'Previso': predictions3_labels})
p.head(10)

Unnamed: 0,Real,Previso
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,1
6,0,0
7,0,1
8,0,1
9,0,1


In [36]:
print('Matriz de Confusão\n', metrics.confusion_matrix(test_labels, predictions3_labels))
print('Acuracia:', metrics.accuracy_score(test_labels, predictions3_labels))
print('Acuracia Balanceada por Classe:', metrics.balanced_accuracy_score(test_labels, predictions3_labels))
print('Precision:', metrics.precision_score(test_labels, predictions3_labels))
print('Recall:', metrics.recall_score(test_labels, predictions3_labels))
print('F1:', metrics.f1_score(test_labels, predictions3_labels))
print('AUCROC', metrics.roc_auc_score(test_labels, predictions3_labels))

Matriz de Confusão
 [[97 26]
 [24 45]]
Acuracia: 0.7395833333333334
Acuracia Balanceada por Classe: 0.72039589961117
Precision: 0.6338028169014085
Recall: 0.6521739130434783
F1: 0.6428571428571428
AUCROC 0.72039589961117


In [37]:
print('Classification Report\n', metrics.classification_report(test_labels, predictions3_labels))

Classification Report
               precision    recall  f1-score   support

           0       0.80      0.79      0.80       123
           1       0.63      0.65      0.64        69

    accuracy                           0.74       192
   macro avg       0.72      0.72      0.72       192
weighted avg       0.74      0.74      0.74       192

