# In [None]:

# Caso 2: Predicción de Diabetes

# --- Teoría ---
"""
Clasificación binaria: predecir diagnóstico de diabetes.
Los ceros en Glucose, BloodPressure, SkinThickness, Insulin y BMI se tratan
como datos faltantes y se imputan con la mediana.
Modelos: LogReg, RF, XGB, AdaBoost.
Métrica clínica clave: Recall (sensibilidad).
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier

# --- Data loading and preprocessing ---
data = pd.read_csv("diabetes.csv")

# Columns in which a value of 0 is treated as a missing measurement.
cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Assign the result back instead of calling `inplace=True` on `data[c]`:
# that form operates on a temporary Series, which pandas warns about and,
# with Copy-on-Write enabled, leaves `data` itself unchanged.
data[cols] = data[cols].replace(0, np.nan)

X = data.drop('Outcome', axis=1)
y = data['Outcome']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Take the imputation medians from the training rows only, so the held-out
# rows do not influence the preprocessing statistics (avoids data leakage).
train_medians = X_train[cols].median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Keep `data` and `X` free of NaNs in these columns as well, as they were
# after the original imputation, in case later code reads them.
data[cols] = data[cols].fillna(train_medians)
X[cols] = X[cols].fillna(train_medians)

# Fit the scaler on the training split and reuse its statistics on the test
# split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Candidate classifiers, keyed by the display name used in the printed
# reports and in the comparison plot.
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(random_state=42),
    # `use_label_encoder` is intentionally not passed: the parameter is
    # deprecated in XGBoost and newer releases only warn that it is unused.
    # The seed matches the one given to the other ensemble models.
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
}
# Fit every candidate on the training split and score it on the test split.
# `results` maps each model name to its classification report (as a dict)
# and its ROC AUC.
results = {}
for n, m in models.items():
    m.fit(X_train, y_train)

    # Hard class predictions feed the classification report; the second
    # column of `predict_proba` is used as the score for the ROC AUC.
    yp = m.predict(X_test)
    yp_prob = m.predict_proba(X_test)[:, 1]

    report_dict = classification_report(y_test, yp, output_dict=True)
    auc_value = roc_auc_score(y_test, yp_prob)
    results[n] = {
        'report': report_dict,
        'roc_auc': auc_value,
    }

    # Human-readable version of the same report.
    print(f"\nModelo: {n}")
    print(classification_report(y_test, yp))

# Bar chart comparing the ROC AUC obtained by each model.
roc_scores = {
    model_name: metrics['roc_auc']
    for model_name, metrics in results.items()
}
plt.bar(list(roc_scores), list(roc_scores.values()))
plt.title('Comparación AUC ROC')
plt.show()
