In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from imblearn.pipeline import Pipeline

# Read the diabetes dataset from its GitHub URL (point this at a local copy if you have one)
df = pd.read_csv('https://github.com/YBIFoundation/Dataset/raw/main/Diabetes.csv')

# 'diabetes' is the label column; every remaining column is used as a feature
y = df['diabetes']
X = df.drop(columns='diabetes')

# Hold out 30% of the rows as a test set, stratified on the label so both
# partitions keep the same class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# imblearn Pipeline: standardise features -> SMOTE oversampling -> XGBoost classifier
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('clf', XGBClassifier(eval_metric='logloss', random_state=42)),
])

# Search space for the classifier step ('clf__' routes each key to that step).
# 3 * 3 * 3 * 2 = 54 candidate combinations.
param_grid = dict(
    clf__n_estimators=[100, 200, 300],
    clf__max_depth=[None, 10, 20],
    clf__learning_rate=[0.01, 0.1, 0.2],
    clf__subsample=[0.7, 1],
)

# Exhaustive grid search: 5-fold cross-validation, scored on accuracy,
# using every available core and logging progress for each fit
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training partition only (this is the expensive step)
grid.fit(X_train, y_train)

# Report the winning hyperparameter combination
print("Best parameters:", grid.best_params_)

# Keep a handle on the best pipeline exposed by the fitted search object
best_model = grid.best_estimator_

# Score the held-out test set: hard class predictions plus the second
# predict_proba column (presumably P(label == 1) -- confirm against the
# label encoding). NOTE(review): y_prob is not read by the cells shown here.
y_pred = grid.predict(X_test)
y_prob = best_model.predict_proba(X_test)[:, 1]

# Build a results table: the test features plus human-readable predicted and
# true labels. `assign` returns a new frame, so X_test itself is not modified.
# np.where yields plain arrays, so the labels are attached by row position.
df_results = X_test.assign(
    Prediction=np.where(y_pred == 1, 'Diabetic', 'Non-Diabetic'),
    Actual=np.where(y_test == 1, 'Diabetic', 'Non-Diabetic'),
)

# Persist every test row with its labels (the row index is not written)
df_results.to_csv("diabetes_predictions_xgb.csv", index=False)

# Boolean masks reused by the three selections below; each label column only
# ever holds 'Diabetic' or 'Non-Diabetic', so "not Diabetic" means 'Non-Diabetic'
predicted_diabetic = df_results['Prediction'] == 'Diabetic'
actually_diabetic = df_results['Actual'] == 'Diabetic'

# Every row where the predicted label disagrees with the true label
mismatches = df_results[predicted_diabetic != actually_diabetic]

# Predicted 'Diabetic' but actually 'Non-Diabetic'
false_positives = df_results[predicted_diabetic & ~actually_diabetic]
print(false_positives)

# Predicted 'Non-Diabetic' but actually 'Diabetic'
false_negatives = df_results[~predicted_diabetic & actually_diabetic]
print(false_negatives)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best parameters: {'clf__learning_rate': 0.01, 'clf__max_depth': 10, 'clf__n_estimators': 300, 'clf__subsample': 0.7}
     pregnancies  glucose  diastolic  triceps  insulin   bmi    dpf  age  \
660           10      162         84        0        0  27.7  0.182   54   
531            0      107         76        0        0  45.3  0.686   24   
756            7      137         90       41        0  32.0  0.391   39   
247            0      165         90       33      680  52.3  0.427   23   
335            0      165         76       43      255  47.9  0.259   26   
389            3      100         68       23       81  31.6  0.949   28   
727            0      141         84       26        0  32.4  0.433   22   
724            1      111         94        0        0  32.8  0.265   45   
212            7      179         95       31        0  34.2  0.164   60   
260            3      191         68       15      130  30.9 

In [5]:
# Write all misclassified rows, and the false-positive / false-negative
# subsets, to their own CSV files (the row index is not written)
for csv_name, frame in (
    ("diabetes_mismatches_xgb.csv", mismatches),
    ("false_positives_xgb.csv", false_positives),
    ("false_negatives_xgb.csv", false_negatives),
):
    frame.to_csv(csv_name, index=False)