In [1]:
# Modelo Predictivo – Diabetes Dataset

# Este notebook demuestra paso a paso la creación de un modelo predictivo
# utilizando datos sintéticos basados en el dataset original de condiciones médicas.

# Objetivos:
# - Generar datos de entrenamiento realistas
# - Entrenar un modelo de clasificación
# - Visualizar resultados
# - Preparar datos para persistencia en PostgreSQL



In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix




In [None]:
df = pd.read_csv("../data/cleaned/diabetes_cleaned.csv")
df.head()


Unnamed: 0,id,full_name,age,gender,smoking_status,bmi,blood_pressure,glucose_levels,condition
0,1,User0001,53,male,non-smoker,27.340637,134.832624,135.082933,Pneumonia
1,2,User0002,30,male,non-smoker,27.340637,105.315064,135.082933,Diabetic
2,3,User0003,18,male,non-smoker,35.612486,134.832624,135.082933,Pneumonia
3,4,User0004,53,male,non-smoker,27.340637,99.119829,135.082933,Pneumonia
4,5,User0005,76,male,non-smoker,27.340637,134.832624,135.082933,Diabetic


In [None]:
df.info()
df.describe()
df["condition"].value_counts()



condition
Diabetic     6013
Pneumonia    2527
Cancer       1460
Name: count, dtype: int64

In [None]:
df_model = df[
    [
        "age",
        "bmi",
        "blood_pressure",
        "glucose_levels",
        "gender",
        "smoking_status",
        "condition",
    ]
]

In [None]:
encoder = LabelEncoder()

df_model["gender"] = encoder.fit_transform(df_model["gender"])
df_model["smoking_status"] = encoder.fit_transform(df_model["smoking_status"])
df_model["condition"] = encoder.fit_transform(df_model["condition"])



In [None]:
X = df_model.drop("condition", axis=1)
y = df_model["condition"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y
)

In [None]:
model = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train)




ValueError: could not convert string to float: 'User9070'

In [None]:
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

In [None]:
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.title("Matriz de Confusión")
plt.xlabel("Predicción")
plt.ylabel("Real")
plt.show()


In [None]:
results = X_test.copy()
results["real_condition"] = y_test.values
results["predicted_condition"] = y_pred

results.head(10)



In [None]:
proba = model.predict_proba(X_test)

plt.hist(proba.max(axis=1), bins=20)
plt.title("Distribución de Confianza del Modelo")
plt.xlabel("Probabilidad máxima")
plt.ylabel("Cantidad de casos")
plt.show()
