In [64]:
!pip install xgboost imbalanced-learn
!pip install scikeras
!pip install imbalanced-learn




In [65]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Read the dataset from the mounted Drive location and preview the first rows.
csv_path = '/content/drive/MyDrive/diabetes.csv'
df = pd.read_csv(csv_path)
print(df.head())


   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [66]:
# ---------------------------------------------------------------------------
# Imputation: in the columns below a value of 0 is treated as "missing".
# Zeros become NaN and are then filled with that column's median.
# NOTE(review): the medians are taken over every row of `df`; if a hold-out
# split is made later, these statistics include the hold-out rows.
# ---------------------------------------------------------------------------
zero_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[zero_cols] = (
    df[zero_cols]
    .replace(0, np.nan)
    .pipe(lambda cols: cols.fillna(cols.median()))
)

# ---------------------------------------------------------------------------
# Hand-crafted features (added to `df` in place)
# ---------------------------------------------------------------------------
df['Age_BMI'] = df['Age'].mul(df['BMI'])        # age x BMI interaction
df['Glucose2'] = df['Glucose'].pow(2)           # squared glucose
df['Is_Obese'] = df['BMI'].gt(30).astype(int)   # 1 when BMI is above 30, else 0

# ---------------------------------------------------------------------------
# Target vector and feature matrix (every column except 'Outcome')
# ---------------------------------------------------------------------------
y = df['Outcome']
X = df.drop(columns='Outcome')

# Pairwise interaction terms only: no pure powers, no constant column.
poly = PolynomialFeatures(
    degree=2,
    interaction_only=True,
    include_bias=False,
)
X_poly = poly.fit_transform(X)

# Standardise each expanded feature.
# NOTE(review): the scaler is fitted on all rows, not on a training subset.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_poly)

# Oversample with SMOTE on the full scaled matrix; X_res / y_res therefore
# contain both the original rows and synthetic ones.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_scaled, y)

In [None]:
# Train-test split
# Split the original (scaled, NOT yet oversampled) samples first, so the
# hold-out set contains only real observations at the natural class ratio.
# Splitting the SMOTE output (X_res / y_res) instead would put synthetic
# points -- interpolated between neighbouring minority samples -- on both
# sides of the split and inflate every metric computed on X_test.
# NOTE(review): the imputation medians and the scaler that produced X_scaled
# were still fitted on all rows upstream; refit them on the training rows
# only (e.g. inside a Pipeline) to remove that remaining, smaller leak.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# Oversample the minority class on the training portion only.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train_raw, y_train_raw)

# Compute class weights from the real (pre-SMOTE) training labels, so that no
# test label contributes to them.
# NOTE(review): X_train is already balanced by SMOTE, so these weights (and
# scale_pos_weight below) favour class 1 a second time -- confirm that this
# double compensation is intended.
from sklearn.utils.class_weight import compute_class_weight
classes = np.unique(y_train_raw)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train_raw)
class_weights = dict(zip(classes, weights))

# NOTE(review): the three grid searches below cross-validate on X_train, which
# already contains synthetic rows, so their internal CV scores are optimistic.
# They are used only to pick hyper-parameters, not reported as results.

# XGBoost tuning
xgb_params = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.8, 1.0]
}
xgb_base = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, scale_pos_weight=weights[1]/weights[0])
xgb_grid = GridSearchCV(xgb_base, xgb_params, cv=3, scoring='roc_auc', n_jobs=-1)
xgb_grid.fit(X_train, y_train)
best_xgb = xgb_grid.best_estimator_

# Random Forest tuning
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [6, 8],
    'max_features': ['sqrt', 'log2']
}
rf = RandomForestClassifier(random_state=42, class_weight=class_weights)
rf_grid = GridSearchCV(rf, rf_params, cv=3, scoring='roc_auc', n_jobs=-1)
rf_grid.fit(X_train, y_train)
best_rf = rf_grid.best_estimator_

# Logistic Regression meta learner tuning
# NOTE(review): C is tuned here on the raw feature matrix, whereas inside the
# stack the final estimator sees the base models' outputs plus the
# passthrough features; consider tuning it on that input instead.
meta_params = {
    'C': [0.1, 1, 10],
    'solver': ['lbfgs'],
    'penalty': ['l2']
}
meta = LogisticRegression(max_iter=1000, class_weight=class_weights)
meta_grid = GridSearchCV(meta, meta_params, cv=3, scoring='roc_auc', n_jobs=-1)
meta_grid.fit(X_train, y_train)
best_meta = meta_grid.best_estimator_

# Stacking ensemble: tuned XGBoost and Random Forest as base learners, tuned
# logistic regression as final estimator, original features passed through.
stack = StackingClassifier(
    estimators=[('xgb', best_xgb), ('rf', best_rf)],
    final_estimator=best_meta,
    passthrough=True,
    n_jobs=-1
)

# Fold-averaged ensemble: refit the stack on the training part of each of the
# 5 stratified folds and average its predictions on the hold-out set. The
# validation part of each fold is not used, so this is bagging over folds
# evaluated on X_test rather than a cross-validation estimate.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
probs = np.zeros(len(y_test))   # mean predicted probability of class 1
preds = np.zeros(len(y_test))   # fraction of fold models voting for class 1

for train_idx, _ in kf.split(X_train, y_train):
    X_tr = X_train[train_idx]
    y_tr = y_train.iloc[train_idx]

    stack.fit(X_tr, y_tr)
    probs += stack.predict_proba(X_test)[:, 1] / kf.n_splits
    preds += stack.predict(X_test) / kf.n_splits

# Majority vote across the fold models.
final_preds = (preds >= 0.5).astype(int)

In [None]:
# Results
# Score the ensemble's averaged predictions against the hold-out labels:
# per-class precision/recall/F1 from the hard votes, ROC AUC from the
# averaged class-1 probabilities.
print("Cross-Validated Stacking Ensemble Results")
holdout_report = classification_report(y_test, final_preds)
print(holdout_report)
holdout_auc = roc_auc_score(y_test, probs)
print("ROC AUC (CV ensemble):", holdout_auc)