## Algoritms spot-checking 

Import Libraries

In [36]:
import pandas as pd 
import numpy as np

#pre-process
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

#models
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

#metrics
from sklearn.metrics import accuracy_score, classification_report

#plot
from sklearn.metrics import confusion_matrix, roc_curve, auc, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

Load database

In [4]:
data = pd.read_csv('../diabd_dataset.csv')

### Qualquer pré processamento (só pra teste)

Train and test split

In [19]:
# Agora separar X e y e fazer o split (robusto a variações no nome da coluna alvo)
# Normaliza nomes de colunas (remove espaços acidentais) e detecta a coluna alvo
data.columns = data.columns.str.strip()
target = 'diabetic' if 'diabetic' in data.columns else data.columns[-1]
print('Usando coluna alvo:', target)
# Separar features e target
X = data.drop(target, axis=1)
y = data[target].copy()
# Se y for texto com 'Yes'/'No', codifica para 1/0 (mantém se já for numérico)
if y.dtype == object:
    y = y.str.strip().map({'Yes': 1, 'No': 0}).fillna(y)
# Agora o split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('Shapes:', X_train.shape, X_test.shape, y_train.shape, y_test.shape)

Usando coluna alvo: diabetic_Yes
Shapes: (4230, 14) (1058, 14) (4230,) (1058,)


Pre process

In [23]:
# Verifica quantos NaNs existem antes
print('NaNs em X_train (antes):', X_train.isna().sum().sum())
print('NaNs em X_test (antes):', X_test.isna().sum().sum())
# Garantir que y seja numérico (mapear Yes/No para 1/0 se necessário)
if y_train.dtype == object or y_test.dtype == object:
    y_train = y_train.astype(str).str.strip().map({'Yes': 1, 'No': 0}).fillna(y_train)
    y_test = y_test.astype(str).str.strip().map({'Yes': 1, 'No': 0}).fillna(y_test)
# Se houver valores não-numéricos em y, tenta converter e remove linhas inválidas do treino
y_train = pd.to_numeric(y_train, errors='coerce')
if y_train.isna().any():
    mask = y_train.notna()
    print('Removendo', (~mask).sum(), 'amostras com y faltante do treino')
    X_train = X_train.loc[mask]
    y_train = y_train.loc[mask]
# As dependências sklearn e numpy já foram importadas em outra célula, não reimportamos aqui

# Separar colunas numéricas e categóricas
numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X_train.select_dtypes(exclude=[np.number]).columns.tolist()
print('numéricas:', numeric_cols)
print('categóricas:', categorical_cols)
# Construir pré-processador: média para numéricas, moda+one-hot para categóricas
numeric_transformer = SimpleImputer(strategy='mean')
# Use sparse_output=False (novo parâmetro) em vez de sparse=False para compatibilidade com versões recentes do sklearn
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols)
])
# Aplicar pré-processamento
X_train_proc = preprocessor.fit_transform(X_train)
X_test_proc = preprocessor.transform(X_test)
print('Shape X_train_proc:', X_train_proc.shape)
# Verifica NaNs após pré-processamento (não deve haver)
print('NaNs em X_train_proc:', np.isnan(X_train_proc).sum())


#print a table with 5 first rows of X_train_proc
print('First 5 rows of X_train_proc:')
print(pd.DataFrame(X_train_proc).head())   

NaNs em X_train (antes): 4230
NaNs em X_test (antes): 1058
numéricas: ['age', 'gender', 'pulse_rate', 'systolic_bp', 'diastolic_bp', 'glucose', 'height', 'weight', 'bmi', 'family_diabetes', 'hypertensive', 'family_hypertension', 'cardiovascular_disease', 'stroke']
categóricas: []
Shape X_train_proc: (4230, 13)
NaNs em X_train_proc: 0
First 5 rows of X_train_proc:
     0     1      2     3     4     5     6      7    8    9    10   11   12
0  22.0  72.0  132.0  76.0  5.74  1.55  65.1  27.12  1.0  0.0  1.0  0.0  0.0
1  45.0  78.0  172.0  99.0  6.49  1.42  53.7  26.54  0.0  0.0  0.0  0.0  0.0
2  35.0  96.0  105.0  69.0  5.04  1.50  49.0  21.82  0.0  0.0  0.0  0.0  0.0
3  51.0  57.0  121.0  78.0  6.07  1.57  42.6  17.18  0.0  0.0  0.0  0.0  0.0
4  36.0  68.0  126.0  88.0  9.74  1.55  52.0  21.66  0.0  0.0  0.0  0.0  0.0




###  Nayve Bayes

In [None]:
model = GaussianNB()
model.fit(X_train_proc, y_train)
predictions = model.predict(X_test_proc)
print("Accuracy:", accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))

Accuracy: 0.8950850661625709
              precision    recall  f1-score   support

       False       0.97      0.91      0.94       982
        True       0.37      0.67      0.48        76

    accuracy                           0.90      1058
   macro avg       0.67      0.79      0.71      1058
weighted avg       0.93      0.90      0.91      1058



### Random Forest

In [27]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train_proc, y_train)
predictions = model.predict(X_test_proc)
print("Random Forest Accuracy:", accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))

Random Forest Accuracy: 0.94234404536862
              precision    recall  f1-score   support

       False       0.94      1.00      0.97       982
        True       0.94      0.21      0.34        76

    accuracy                           0.94      1058
   macro avg       0.94      0.60      0.66      1058
weighted avg       0.94      0.94      0.92      1058



### Decision Tree

In [29]:
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train_proc, y_train)    
predictions = model.predict(X_test_proc)
print("Decision Tree Accuracy:", accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions)) 

Decision Tree Accuracy: 0.9035916824196597
              precision    recall  f1-score   support

       False       0.95      0.95      0.95       982
        True       0.33      0.33      0.33        76

    accuracy                           0.90      1058
   macro avg       0.64      0.64      0.64      1058
weighted avg       0.90      0.90      0.90      1058



### Logistic Regression

In [31]:
model = LogisticRegression(max_iter=2000)   
model.fit(X_train_proc, y_train)
predictions = model.predict(X_test_proc)
print("Logistic Regression Accuracy:", accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))

Logistic Regression Accuracy: 0.9395085066162571
              precision    recall  f1-score   support

       False       0.95      0.99      0.97       982
        True       0.71      0.26      0.38        76

    accuracy                           0.94      1058
   macro avg       0.83      0.63      0.68      1058
weighted avg       0.93      0.94      0.93      1058



### Neural Network

In [32]:
model = MLPClassifier(max_iter=1000, random_state=42)
model.fit(X_train_proc, y_train)
predictions = model.predict(X_test_proc)
print("Neural Network Accuracy:", accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))

Neural Network Accuracy: 0.94234404536862
              precision    recall  f1-score   support

       False       0.95      0.99      0.97       982
        True       0.76      0.29      0.42        76

    accuracy                           0.94      1058
   macro avg       0.85      0.64      0.69      1058
weighted avg       0.93      0.94      0.93      1058



### KNN (5 instances)

In [41]:
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train_proc, y_train)
predictions = model.predict(X_test_proc)
print("KNN (5 instances) Accuracy:", accuracy_score(y_test, predictions))   
print(classification_report(y_test, predictions))

KNN (5 instances) Accuracy: 0.9291115311909263
              precision    recall  f1-score   support

       False       0.93      0.99      0.96       982
        True       0.55      0.08      0.14        76

    accuracy                           0.93      1058
   macro avg       0.74      0.54      0.55      1058
weighted avg       0.91      0.93      0.90      1058

