# Assignment 13 — LightGBM vs XGBoost (Polished)

This notebook is prepared to fully match the assignment requirements. Run it in Google Colab or your local environment to execute all cells (install packages if needed).

## Objective
Compare LightGBM and XGBoost on the provided `diabetes (3).csv` dataset. The notebook includes: EDA, preprocessing, imbalance handling, model training, evaluation metrics, and comparison notes.

In [None]:
# 1. Imports
import pandas as pd, numpy as np
import matplotlib.pyplot as plt, seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.ensemble import GradientBoostingClassifier
import warnings
warnings.filterwarnings('ignore')

# Optional dependencies: probe each one and record the outcome in a boolean flag
# that later cells read to choose between the real library and a fallback.
# Exception (not only ImportError) is caught, so a package that is installed but
# fails while loading is also reported as unavailable.

# LightGBM
try:
    import lightgbm as lgb
except Exception:
    use_lgb = False
    print('lightgbm not available')
else:
    use_lgb = True
    print('lightgbm available')

# XGBoost
try:
    import xgboost as xgb
except Exception:
    use_xgb = False
    print('xgboost not available')
else:
    use_xgb = True
    print('xgboost available')

# SMOTE (from imbalanced-learn)
try:
    from imblearn.over_sampling import SMOTE
except Exception:
    use_smote = False
    print('imblearn not available')
else:
    use_smote = True
    print('imblearn SMOTE available')

In [None]:
# 2. Load dataset
# The original absolute location is tried first; the working-directory fallback
# lets the notebook run unchanged on Colab or a local machine where /mnt/data
# does not exist.
DATA_CANDIDATES = [
    '/mnt/data/diabetes (3).csv',
    'diabetes (3).csv',
]
for data_path in DATA_CANDIDATES:
    try:
        df = pd.read_csv(data_path)
    except FileNotFoundError:
        continue
    print('Loaded:', data_path)
    break
else:
    # No candidate existed: fail with an actionable message instead of a bare
    # traceback that only mentions the last path tried.
    raise FileNotFoundError(
        'Dataset not found. Looked in: ' + ', '.join(DATA_CANDIDATES)
        + '. Upload the CSV next to this notebook or add its path to DATA_CANDIDATES.'
    )
print('Shape:', df.shape)
df.head()

In [None]:
# 3. Quick EDA
# Per-column count of missing entries.
missing_per_column = df.isna().sum()
print('Missing values:\n', missing_per_column)

# Class balance, read from the last column of the frame.
print('\nTarget value counts (last column):')
last_column = df.iloc[:, -1]
print(last_column.value_counts())

# One histogram per numeric column; the returned axes array is bound to `_`
# so it is not echoed as cell output.
_ = df.hist(figsize=(12,8), bins=20)
plt.tight_layout()
plt.show()

In [None]:
# 4. Preprocessing
# Label column: use 'Outcome' when present, otherwise the last column.
target = 'Outcome' if 'Outcome' in df.columns else df.columns[-1]
print('Target:', target)

# Keep only rows that have a label, then separate the label from the features.
df = df.loc[df[target].notna()].reset_index(drop=True)
y = df[target]
X = df.drop(columns=[target])

# Replace text / categorical features with their integer category codes.
categorical_cols = X.select_dtypes(include=['object','category']).columns
for col in categorical_cols:
    X[col] = X[col].astype('category').cat.codes

# Impute missing numeric values with the median of their own column.
numeric_cols = X.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    X[col] = X[col].fillna(X[col].median())

# Standardise every feature to zero mean / unit variance.
# NOTE(review): the scaler (and the medians above) are computed on the full
# dataset, i.e. before any train/test split, so held-out rows influence these
# statistics. Consider fitting them on the training split only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print('Features shape:', X_scaled.shape)

In [None]:
# 5-6. Train/test split, then handle imbalance on the training data only
# The split happens BEFORE any resampling. Oversampling the full dataset first
# would (a) put synthetic samples into the test set and (b) let SMOTE interpolate
# from rows that later land in the test set, leaking test information into
# training and inflating every reported metric.
print('Original class distribution:\n', pd.Series(y).value_counts())

# Stratify only when there is more than one class to stratify on.
stratify_labels = y if len(pd.Series(y).unique()) > 1 else None
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.25, random_state=42, stratify=stratify_labels
)

if use_smote:
    # Oversample the minority class using the training fold only; the test fold
    # keeps its original, real-world class distribution.
    sm = SMOTE(random_state=42)
    X_train, y_train = sm.fit_resample(X_train, y_train)
    print('After SMOTE (train only):', pd.Series(y_train).value_counts())
else:
    print('Using original distribution; consider class_weight or resampling')

# Kept so existing references to these names still resolve; they now refer to
# the (possibly resampled) training data rather than the whole dataset.
X_res, y_res = X_train, y_train
print('Train/test sizes:', X_train.shape, X_test.shape)

In [None]:
# 7. Helper function for evaluation
from sklearn.metrics import classification_report, confusion_matrix

def evaluate(name, model, X_test, y_test):
    """Print hold-out classification metrics for an already-fitted model.

    Parameters
    ----------
    name : str
        Label printed in the header line of the report.
    model : object
        Fitted estimator exposing ``predict``; ``predict_proba`` is used for
        ROC AUC when it is available.
    X_test : array-like
        Features of the held-out samples, passed straight to the model.
    y_test : array-like
        True labels for ``X_test``.

    Returns
    -------
    None
        Everything is printed: accuracy, weighted precision / recall / F1,
        ROC AUC (only when ``y_test`` has exactly two distinct values), the
        classification report and the confusion matrix.
    """
    y_pred = model.predict(X_test)
    print('----', name, '----')
    print('Accuracy:', round(accuracy_score(y_test, y_pred),4))
    print('Precision (weighted):', round(precision_score(y_test, y_pred, average='weighted', zero_division=0),4))
    print('Recall (weighted):', round(recall_score(y_test, y_pred, average='weighted', zero_division=0),4))
    print('F1 (weighted):', round(f1_score(y_test, y_pred, average='weighted', zero_division=0),4))
    # ROC AUC stays best-effort (a model without predict_proba must not abort the
    # report), but a failure is now reported instead of being silently dropped,
    # so a missing ROC AUC line is never mistaken for "not applicable".
    try:
        if len(pd.Series(y_test).unique())==2:
            # Column 1 of predict_proba is used as the score for roc_auc_score.
            y_proba = model.predict_proba(X_test)[:,1]
            print('ROC AUC:', round(roc_auc_score(y_test, y_proba),4))
    except Exception as exc:
        print('ROC AUC: skipped (' + type(exc).__name__ + ': ' + str(exc) + ')')
    print('\nClassification report:\n', classification_report(y_test, y_pred))
    print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))


In [None]:
# 8. Train LightGBM (or fallback)
# Choose the estimator class up front: LightGBM when it imported successfully,
# otherwise scikit-learn's gradient boosting. Both take the same two settings.
lgb_estimator_cls = lgb.LGBMClassifier if use_lgb else GradientBoostingClassifier
model_lgb = lgb_estimator_cls(n_estimators=100, random_state=42)

# Fit on the training split, then report metrics on the held-out split.
model_lgb.fit(X_train, y_train)
evaluate('LightGBM/Fallback', model_lgb, X_test, y_test)

In [None]:
# 9. Train XGBoost (or fallback)
if use_xgb:
    # `use_label_encoder` is intentionally not passed: it is deprecated, and
    # recent XGBoost releases no longer recognise it and warn that the parameter
    # is unused. Labels are expected to already be 0..n_classes-1.
    model_xgb = xgb.XGBClassifier(n_estimators=100, eval_metric='logloss', random_state=42)
else:
    # scikit-learn stand-in when xgboost could not be imported.
    model_xgb = GradientBoostingClassifier(n_estimators=100, random_state=7)
# Fit on the training split, then report metrics on the held-out split.
model_xgb.fit(X_train, y_train)
evaluate('XGBoost/Fallback', model_xgb, X_test, y_test)

In [None]:
# 10. Feature importances (if available)
# Both fitted models are reported. Previously the second model sat behind an
# `elif`, so its importances were only shown when the first model had none.
# NOTE(review): the libraries may compute importances on different scales
# (e.g. split counts vs. normalised gain) -- compare rankings, not raw values.
for model_name, fitted_model in [('LightGBM/Fallback', model_lgb), ('XGBoost/Fallback', model_xgb)]:
    try:
        if not hasattr(fitted_model, 'feature_importances_'):
            print(model_name, '- no feature_importances_ attribute; skipping')
            continue
        # Label each importance with its feature name and sort most important first.
        imp = pd.Series(fitted_model.feature_importances_, index=X.columns).sort_values(ascending=False)
        display(imp)
        fig, ax = plt.subplots()
        sns.barplot(x=imp.values, y=imp.index, ax=ax)
        ax.set_title('Feature importances - ' + model_name)
        plt.show()
    except Exception as e:
        # Best-effort section: report the problem for this model and move on.
        print('Feature importance error (' + model_name + '):', e)

## Conclusions & Notes
- Bagging vs Boosting: Bagging trains independent models and averages (reduces variance). Boosting trains sequential models focusing on mistakes (reduces bias). 
- Imbalance handling: SMOTE (oversampling), undersampling, or class_weight can be used. Prefer F1/ROC metrics when imbalance exists.

**To run this notebook fully:**
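- On Google Colab, run `!pip install lightgbm xgboost imbalanced-learn` in a cell before any other cell if you want full LightGBM/XGBoost/SMOTE support. Without these packages the notebook falls back to scikit-learn's `GradientBoostingClassifier` and skips SMOTE.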


## End