In [19]:
import pandas as pd
from pathlib import Path
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.combine import SMOTEENN
from imblearn.pipeline import Pipeline as ImbPipeline

In [20]:
# Load the diabetes dataset into `df`.
# The original machine-specific absolute path is tried first, so existing
# setups behave exactly as before. If that file does not exist (e.g. the
# notebook is run on another machine), fall back to a `diabetes.csv` located
# in the current working directory instead of failing outright.
data_path = Path(r"C:\Users\fabri\Desktop\my_repo\env\projects\project_diabetes\diabetes.csv")
if not data_path.exists():
    data_path = Path("diabetes.csv")
df = pd.read_csv(data_path)

In [21]:
# Target vector: the 'Outcome' column.
y = df['Outcome']

# Feature matrix: every column except the target.
X = df.drop(columns=['Outcome'])

# Replace 0 with NaN in specific columns before imputation, so that the
# imputer later in the workflow treats those zeros as missing values.
cols_with_zero = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
X = X.assign(**{col: X[col].replace(0, np.nan) for col in cols_with_zero})

In [22]:
# Pipeline with SMOTEENN and RandomForest.
# imblearn's Pipeline is used (rather than sklearn's) so that the resampling
# step can sit between preprocessing and the classifier.
smote_enn = SMOTEENN(random_state=42)

forest = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)

pipeline_steps = [
    ('imputer', SimpleImputer(strategy='median')),  # fill NaN with the column median
    ('scaler', StandardScaler()),                   # standardize features
    ('smote_enn', smote_enn),                       # resample the classes
    ('classifier', forest),
]
pipeline = ImbPipeline(steps=pipeline_steps)

In [23]:
# Stratified 5-fold cross-validation of the whole pipeline, scored with F1.
# Shuffling uses a fixed seed so the fold assignment is reproducible.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)
scores = cross_val_score(
    estimator=pipeline,
    X=X,
    y=y,
    scoring='f1',
    cv=cv,
)

# Report mean and standard deviation across the folds.
print(f"CV F1-score: {scores.mean():.3f} ± {scores.std():.3f}")

CV F1-score: 0.689 ± 0.033


In [24]:
# Hold out 20% of the rows as a test set; the split is stratified on the
# target and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Fit the pipeline on the training split only, then predict the held-out rows.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# sep="\n" prints each result under its heading. With print's default " "
# separator, a stray space was inserted after the newline, shifting the first
# line of the report / matrix one column to the right of the other lines.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.67      0.75       100
           1       0.57      0.80      0.66        54

    accuracy                           0.71       154
   macro avg       0.71      0.73      0.71       154
weighted avg       0.76      0.71      0.72       154

Confusion Matrix:
 [[67 33]
 [11 43]]
