In [None]:
import pandas as pd
import numpy as np
from imblearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load dataset (replace with your path)
df = pd.read_csv('https://github.com/YBIFoundation/Dataset/raw/main/Diabetes.csv')

# Prepare features and target
X = df.drop('diabetes', axis=1)
y = df['diabetes']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('clf', LGBMClassifier(random_state=42))
])

param_grid = {
    'clf__n_estimators': [50, 100, 150],
    'clf__max_depth': [3, 5, 7, -1],  # -1 means no limit
    'clf__learning_rate': [0.01, 0.1, 0.2],
    'clf__num_leaves': [31, 50, 100],
    'clf__subsample': [0.7, 1],
    'clf__colsample_bytree': [0.7, 1]
}

grid = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)

y_pred = grid.predict(X_test)
y_prob = grid.predict_proba(X_test)[:, 1]

df_results = X_test.copy()

df_results['Prediction'] = np.where(y_pred == 1, 'Diabetic', 'Non-Diabetic')
df_results['Actual'] = np.where(y_test == 1, 'Diabetic', 'Non-Diabetic')

df_results.to_csv("diabetes_predictions_lgbm.csv", index=False)

# Find mismatches
mismatches = df_results[df_results['Prediction'] != df_results['Actual']]

# Separate into types
false_positives = df_results[
    (df_results['Prediction'] == 'Diabetic') &
    (df_results['Actual'] == 'Non-Diabetic')
]
false_negatives = df_results[
    (df_results['Prediction'] == 'Non-Diabetic') & 
    (df_results['Actual'] == 'Diabetic')
]


In [3]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7878787878787878

Confusion Matrix:
 [[115  35]
 [ 14  67]]

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.77      0.82       150
           1       0.66      0.83      0.73        81

    accuracy                           0.79       231
   macro avg       0.77      0.80      0.78       231
weighted avg       0.81      0.79      0.79       231

