Assignment 17
Title:
Diabetes Prediction using Ensemble Learning: Bagging and Boosting

In [1]:
# ================================================================
# 🧪 Practical 17
# Title: Diabetes Prediction using Ensemble Learning (Bagging & Boosting)
# ================================================================

# ------------------------------------------------
# 1. Import Required Libraries
# ------------------------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# ------------------------------------------------
# 2. Load and Explore Dataset
# ------------------------------------------------
# Read the CSV from the working directory and preview the first rows.
# The preview is printed BEFORE the float format is changed, so it uses
# pandas' default number rendering.
df = pd.read_csv("diabetes.csv")
print("âœ… Data Loaded Successfully!")
print(df.head(n=5))

# From here on floats are rendered with two decimals; the summary
# statistics below are the first output affected by this option.
pd.options.display.float_format = '{:.2f}'.format
summary_stats = df.describe()
print(summary_stats)

# ------------------------------------------------
# 3. Check for Missing or Zero Values
# ------------------------------------------------
# Predictor columns used throughout the rest of the notebook.
feature_columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'Pedigree',
    'Age',
]

# Report, per feature, how many rows hold an exact zero.
separator = "--------------------------------------------"
for name in feature_columns:
    zero_rows = df[df[name] == 0]
    print(separator)
    print(f"{name} ==> Missing zeros : {len(zero_rows)}")

# ------------------------------------------------
# 4. Split Data into Training and Testing Sets
# ------------------------------------------------
# The split is done BEFORE imputation so the replacement means are learned
# from the training rows only. Fitting the imputer on the full dataset
# would leak test-set statistics into the training data.
X = df[feature_columns]
y = df.Outcome
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Work on explicit copies so the column assignments below never write
# through to (or raise a chained-assignment warning about) the source frame.
X_train = X_train.copy()
X_test = X_test.copy()

# ------------------------------------------------
# 5. Handle Missing Values (replace zeros with the training mean)
# ------------------------------------------------
# Zero is a legitimate value for 'Pregnancies', so that column is left
# untouched; in the remaining columns a zero is treated as "not recorded".
zero_as_missing_columns = [c for c in feature_columns if c != 'Pregnancies']

fill_values = SimpleImputer(missing_values=0, strategy="mean")
# Learn the per-column means on the training rows only ...
X_train[zero_as_missing_columns] = fill_values.fit_transform(
    X_train[zero_as_missing_columns]
)
# ... and reuse those same means for the held-out rows.
X_test[zero_as_missing_columns] = fill_values.transform(
    X_test[zero_as_missing_columns]
)

# Confirm that no zeros remain in the imputed columns of either split.
for column in zero_as_missing_columns:
    remaining_zeros = int(
        (X_train[column] == 0).sum() + (X_test[column] == 0).sum()
    )
    print("--------------------------------------------")
    print(f"{column} ==> Missing zeros : {remaining_zeros}")

# ------------------------------------------------
# 6. Define Evaluation Function
# ------------------------------------------------
def evaluate(model, X_train, X_test, y_train, y_test):
    """Print confusion matrix, accuracy and classification report for both splits.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_train: Training features passed to ``model.predict``.
        X_test: Testing features passed to ``model.predict``.
        y_train: True labels compared against the training predictions.
        y_test: True labels compared against the testing predictions.

    Returns:
        None. All results are written to stdout.
    """
    # Run both predictions up front, before anything is printed.
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)

    report_sections = (
        ("\n========== TRAINING RESULTS ==========", y_train, train_predictions),
        ("\n========== TESTING RESULTS ==========", y_test, test_predictions),
    )
    for banner, actual, predicted in report_sections:
        print(banner)
        print("Confusion Matrix:\n", confusion_matrix(actual, predicted))
        # Accuracy is rounded to four decimals for display only.
        print("Accuracy Score:", round(accuracy_score(actual, predicted), 4))
        print("Classification Report:\n", classification_report(actual, predicted))

# ------------------------------------------------
# 7. BAGGING (Using Decision Trees)
# ------------------------------------------------
print("\n================ BAGGING ================\n")

# A bag of ten decision trees; both the base tree and the ensemble are
# seeded so repeated runs give the same model.
base_model = DecisionTreeClassifier(random_state=42)
bagging_model = BaggingClassifier(
    estimator=base_model, n_estimators=10, random_state=42
).fit(X_train, y_train)

evaluate(bagging_model, X_train, X_test, y_train, y_test)

# Keep the raw (unrounded) accuracies for the comparison table.
bagging_scores = {
    f"{split} Accuracy": accuracy_score(targets, bagging_model.predict(features))
    for split, features, targets in (
        ("Train", X_train, y_train),
        ("Test", X_test, y_test),
    )
}
print("\nBagging Accuracy Scores:", bagging_scores)

# ------------------------------------------------
# 8. BOOSTING - AdaBoost
# ------------------------------------------------
print("\n================ ADABOOST ================\n")

# Thirty boosting rounds with the library's default base learner.
ada_boost_clf = AdaBoostClassifier(n_estimators=30, random_state=42).fit(
    X_train, y_train
)
evaluate(ada_boost_clf, X_train, X_test, y_train, y_test)

# Keep the raw (unrounded) accuracies for the comparison table.
ada_scores = {
    f"{split} Accuracy": accuracy_score(targets, ada_boost_clf.predict(features))
    for split, features, targets in (
        ("Train", X_train, y_train),
        ("Test", X_test, y_test),
    )
}
print("\nAdaBoost Accuracy Scores:", ada_scores)

# ------------------------------------------------
# 9. BOOSTING - Gradient Boosting
# ------------------------------------------------
print("\n================ GRADIENT BOOSTING ================\n")

# One hundred boosting stages, seeded for repeatable results.
grad_boost_clf = GradientBoostingClassifier(
    n_estimators=100, random_state=42
).fit(X_train, y_train)
evaluate(grad_boost_clf, X_train, X_test, y_train, y_test)

# Keep the raw (unrounded) accuracies for the comparison table.
grad_scores = {
    f"{split} Accuracy": accuracy_score(targets, grad_boost_clf.predict(features))
    for split, features, targets in (
        ("Train", X_train, y_train),
        ("Test", X_test, y_test),
    )
}
print("\nGradient Boosting Accuracy Scores:", grad_scores)

# ------------------------------------------------
# 10. Summary of All Models
# ------------------------------------------------
print("\n================ COMPARISON SUMMARY ================\n")

# Pair each display name with the score dict produced in its section.
model_scores = [
    ('Bagging', bagging_scores),
    ('AdaBoost', ada_scores),
    ('Gradient Boosting', grad_scores),
]
metric_names = ['Train Accuracy', 'Test Accuracy']

# One row per model, one column per metric, in the order listed above.
comparison = pd.DataFrame({
    'Model': [label for label, _ in model_scores],
    **{
        metric: [scores[metric] for _, scores in model_scores]
        for metric in metric_names
    },
})
print(comparison)


âœ… Data Loaded Successfully!
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   Pedigree  Age  Outcome  
0     0.627   50        1  
1     0.351   31        0  
2     0.672   32        1  
3     0.167   21        0  
4     2.288   33        1  
       Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin    BMI  \
count       768.00   768.00         768.00         768.00   768.00 768.00   
mean          3.85   120.89          69.11          20.54    79.80  31.99   
std           3.37    31.97          19.36          15.95   115.24   7.88   
min           0.00     0.00           0.00           0.00    