In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import (
    accuracy_score,
    auc,
    classification_report,
    confusion_matrix,
    log_loss,
    roc_curve,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the preprocessed dataset from a path relative to the notebook's working directory
DATASET_PATH = './dataset/processed_dataset.csv'
df = pd.read_csv(DATASET_PATH)

In [3]:
# Separate the predictors from the 'diabetes' target column
X = df.drop('diabetes', axis=1)
y = df['diabetes']

# Hold out 30% of the rows for testing. The split is stratified on the target so
# that the train and test sets keep the same class proportions; the pipeline
# below oversamples with SMOTE and is tuned on F1, so an unstratified split could
# leave the test set with a skewed share of the positive class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [4]:
# Assemble the modelling pipeline: standardise -> univariate feature selection
# (ANOVA F-test) -> SMOTE oversampling -> gradient boosting classifier.
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('feature_selection', SelectKBest(f_classif)),
    ('smote', SMOTE(random_state=42)),
    ('gb', GradientBoostingClassifier(random_state=42)),
]
pipeline = Pipeline(pipeline_steps)

In [5]:
# Hyper-parameter grid for GridSearchCV, keyed as '<pipeline step>__<parameter>'.
# The full grid has 3 * 3 * 3 * 3 * 2 * 2 = 324 candidate combinations.
selector_grid = {
    'feature_selection__k': [5, 7, 9],
}
booster_grid = {
    'gb__n_estimators': [100, 200, 300],
    'gb__learning_rate': [0.01, 0.1, 0.3],
    'gb__max_depth': [3, 4, 5],
    'gb__min_samples_split': [2, 5],
    'gb__subsample': [0.8, 1.0],
}
param_grid = {**selector_grid, **booster_grid}

In [6]:
# Exhaustive search over param_grid with 5-fold cross-validation, ranking
# candidates by F1 and using all available CPU cores (n_jobs=-1).
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

Starting Grid Search...


KeyboardInterrupt: 

In [None]:
# Take the pipeline refitted with the best parameter combination and score the
# held-out test set: hard class labels plus the probability in column 1 of
# predict_proba.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
test_probabilities = best_model.predict_proba(X_test)
y_pred_proba = test_probabilities[:, 1]

In [None]:
# Report the winning hyper-parameters and the per-class test-set metrics
test_report = classification_report(y_test, y_pred)

print("\nBest Parameters:", grid_search.best_params_)
print("\nClassification Report:")
print(test_report)

In [None]:
# ROC curve on the test set, annotated with its area under the curve
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve - Gradient Boosting Model')
ax.legend(loc="lower right")
plt.show()

In [None]:
# Confusion matrix of test-set predictions, drawn as an annotated heatmap
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix - Gradient Boosting Model')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.show()

In [None]:
# Feature importance of the fitted booster, restricted to the columns that the
# pipeline's SelectKBest step kept.
feature_selector = best_model.named_steps['feature_selection']
selected_features_mask = feature_selector.get_support()
selected_features = X.columns[selected_features_mask].tolist()

# sort_values keeps the original row labels, so after sorting each row's index
# is still that feature's position within selected_features.
feature_importance = (
    pd.DataFrame({
        'feature': selected_features,
        'importance': best_model.named_steps['gb'].feature_importances_,
    })
    .sort_values('importance', ascending=False)
)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x='importance', y='feature', data=feature_importance, ax=ax)
ax.set_title('Feature Importance (Gradient Boosting)')
ax.set_xlabel('Importance Score')
fig.tight_layout()
plt.show()

In [None]:
# Learning Curves
from sklearn.model_selection import learning_curve

# Refit the best pipeline on 10 growing fractions (10% .. 100%) of the training
# data with 5-fold CV, recording accuracy on the training and validation folds.
size_fractions = np.linspace(0.1, 1.0, 10)
train_sizes, train_scores, val_scores = learning_curve(
    best_model,
    X_train,
    y_train,
    train_sizes=size_fractions,
    cv=5,
    scoring='accuracy',
)

# Average across the CV folds (axis 1) to get one score per training size
mean_train_score = train_scores.mean(axis=1)
mean_val_score = val_scores.mean(axis=1)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(train_sizes, mean_train_score, label='Training score')
ax.plot(train_sizes, mean_val_score, label='Cross-validation score')
ax.set_xlabel('Training Size')
ax.set_ylabel('Accuracy Score')
ax.set_title('Learning Curves - Gradient Boosting Model')
ax.legend(loc='best')
ax.grid(True)
plt.show()

In [None]:
# Plot training vs. test deviance for every boosting iteration of the best model
gb_model = best_model.named_steps['gb']

# The booster was fitted on scaled, feature-selected inputs, so the raw test
# frame has to go through the same fitted transformers before it is scored
# (the SMOTE step only resamples while fitting and is skipped here).
X_test_gb = best_model.named_steps['feature_selection'].transform(
    best_model.named_steps['scaler'].transform(X_test)
)

# Number of boosting stages actually fitted for the selected model
n_stages = gb_model.n_estimators_
boosting_iterations = np.arange(n_stages) + 1

# train_score_ for a binary target is reported as binomial deviance, i.e. twice
# the mean log-loss; scale the test log-loss the same way so both curves are
# on a comparable scale. NOTE(review): confirm the factor for the installed
# scikit-learn version if the target is not binary.
deviance_scale = 2.0 if len(gb_model.classes_) == 2 else 1.0

test_score = np.zeros((n_stages,), dtype=np.float64)
# Use a dedicated loop variable so the global y_pred (final test predictions)
# is not overwritten, and score staged probabilities rather than hard labels.
for i, stage_proba in enumerate(gb_model.staged_predict_proba(X_test_gb)):
    test_score[i] = deviance_scale * log_loss(
        y_test, stage_proba, labels=gb_model.classes_
    )

plt.figure(figsize=(10, 6))
plt.plot(boosting_iterations, gb_model.train_score_,
         'b-', label='Training Set Deviance')
plt.plot(boosting_iterations, test_score, 'r-',
         label='Test Set Deviance')
plt.legend(loc='upper right')
plt.xlabel('Boosting Iterations')
plt.ylabel('Deviance')
plt.title('Training and Test Deviance vs Boosting Iterations')
plt.show()

In [None]:
# Cross-validation Scores Distribution
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy of the best pipeline on the training data
cv_scores = cross_val_score(
    best_model,
    X_train,
    y_train,
    cv=10,
    scoring='accuracy',
)

# Histogram of the fold scores with a dashed red line at their mean
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(cv_scores, bins=20)
ax.axvline(cv_scores.mean(), color='red', linestyle='dashed', linewidth=2)
ax.set_xlabel('Accuracy Score')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Cross-validation Scores - Gradient Boosting Model')
plt.show()

# Summary line: mean accuracy with a +/- band of two standard deviations
print("\nCross-validation Scores Summary:")
print(f"Mean Accuracy: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

In [None]:
# Feature Partial Dependence Plots
from sklearn.inspection import partial_dependence

def plot_partial_dependence(model, X, features, feature_names):
    """Draw one average partial-dependence curve per feature, side by side.

    Parameters
    ----------
    model : fitted estimator
        Passed straight to ``sklearn.inspection.partial_dependence``.
    X : array-like
        Data the partial dependence is evaluated on; it must have the same
        columns the model was fitted on.
    features : sequence
        Feature identifiers understood by ``partial_dependence``; each one is
        also used to look up its axis label in ``feature_names``.
    feature_names : sequence or mapping
        Axis labels, indexed by the entries of ``features``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    plt.figure(figsize=(15, 5))
    for i, feature in enumerate(features):
        plt.subplot(1, len(features), i+1)
        # kind='average' makes the call return a dict-like Bunch instead of the
        # legacy tuple, so the result is read by key rather than by position.
        pd_result = partial_dependence(model, X, [feature], kind='average')
        # The grid is stored under 'grid_values' in newer scikit-learn releases
        # and under 'values' in older ones.
        grid_key = 'grid_values' if 'grid_values' in pd_result else 'values'
        # [0] selects the single requested feature's grid and the first output
        plt.plot(pd_result[grid_key][0], pd_result['average'][0])
        plt.xlabel(feature_names[feature])
        plt.ylabel('Partial dependence')
    plt.suptitle('Partial Dependence Plots')
    plt.tight_layout()
    plt.show()

# Select top 3 important features for partial dependence plots.
# feature_importance is sorted by importance but keeps its original row labels,
# so the index values are the features' positions within selected_features,
# which is also their column position in the booster's input.
top_features = feature_importance.head(3).index.tolist()

# The 'gb' step was fitted on scaled, feature-selected inputs, so the raw test
# frame must pass through the same fitted transformers first. As a result the
# x-axes of the plots are in standardised units.
X_test_selected = best_model.named_steps['feature_selection'].transform(
    best_model.named_steps['scaler'].transform(X_test)
)

plot_partial_dependence(best_model.named_steps['gb'],
                        X_test_selected,
                        top_features,
                        selected_features)