# Assignment 17: XGBoost and LightGBM

## Dataset: Titanic Survival

**Topics Covered:**
- XGBoost (Extreme Gradient Boosting)
- LightGBM (Light Gradient Boosting Machine)
- Ensemble Methods Comparison

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
import lightgbm as lgb

# Read the Titanic training CSV into a DataFrame and report its dimensions
df = pd.read_csv('Titanic_train.csv')
dataset_shape = df.shape
print("Dataset loaded! Shape:", dataset_shape)
# Last expression of the cell: the notebook renders the first five rows
df.head(n=5)

In [None]:
# Preprocess data
# Decide the train/test partition first so the imputation values below are
# learned from the training rows only. Filling with statistics computed on the
# full dataset would leak information from the held-out rows into training.
# NOTE(review): splitting row positions with the same test_size/random_state is
# expected to select the same rows as splitting X and y directly - confirm.
train_pos, test_pos = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42
)

# Fill missing values using training-set statistics
train_rows = df.iloc[train_pos]
age_fill = train_rows['Age'].median()
embarked_fill = train_rows['Embarked'].mode()[0]
df['Age'] = df['Age'].fillna(age_fill)
df['Embarked'] = df['Embarked'].fillna(embarked_fill)

# Encode categorical columns as integer codes
df['Sex_encoded'] = LabelEncoder().fit_transform(df['Sex'])
df['Embarked_encoded'] = LabelEncoder().fit_transform(df['Embarked'].astype(str))

# Select features and target
features = ['Pclass', 'Sex_encoded', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_encoded']
X = df[features]
y = df['Survived']

# Split: apply the partition chosen above to the features and the target
X_train, X_test = X.iloc[train_pos], X.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]
print("Training:", len(X_train), "Testing:", len(X_test))

In [None]:
# XGBoost
print("=== XGBoost ===")

# Hyperparameters for the classifier, gathered in one place
xgb_params = {
    'n_estimators': 100,
    'max_depth': 5,
    'learning_rate': 0.1,
    'random_state': 42,
}
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(X_train, y_train)

# Predict on the test split and report the fraction of correct labels
xgb_pred = xgb_model.predict(X_test)
xgb_acc = accuracy_score(y_true=y_test, y_pred=xgb_pred)
print("XGBoost Accuracy:", round(xgb_acc, 4))

In [None]:
# LightGBM
print("=== LightGBM ===")

# Hyperparameters for the classifier; verbose=-1 is passed through unchanged
lgb_params = dict(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    random_state=42,
    verbose=-1,
)
lgb_model = lgb.LGBMClassifier(**lgb_params)
lgb_model.fit(X_train, y_train)

# Predict on the test split and report the fraction of correct labels
lgb_pred = lgb_model.predict(X_test)
lgb_acc = accuracy_score(y_true=y_test, y_pred=lgb_pred)
print("LightGBM Accuracy:", round(lgb_acc, 4))

In [None]:
# Compare models: print both test accuracies, then chart them side by side
print("=== Model Comparison ===")
print("XGBoost:", round(xgb_acc, 4))
print("LightGBM:", round(lgb_acc, 4))

# Bar chart, drawn through explicit figure/axes objects
model_names = ['XGBoost', 'LightGBM']
model_scores = [xgb_acc, lgb_acc]
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(model_names, model_scores, color=['blue', 'green'])
ax.set_ylabel('Accuracy')
ax.set_title('XGBoost vs LightGBM')
# The y-axis starts at 0.7, so a bar for an accuracy below that is not visible
ax.set_ylim(0.7, 1.0)
# Write the PNG before show() so the file is saved while the figure is still open
fig.savefig('model_comparison.png')
plt.show()

In [None]:
# Feature Importance
def _ranked_importance(model):
    """Return a Feature/Importance table for *model*, sorted ascending by importance."""
    table = pd.DataFrame({'Feature': features, 'Importance': model.feature_importances_})
    return table.sort_values('Importance')


# One table per model; ascending order puts the largest bar at the top of barh
xgb_imp = _ranked_importance(xgb_model)
lgb_imp = _ranked_importance(lgb_model)

# NOTE(review): the two libraries may use different default importance metrics
# (presumably gain-based for XGBoost and split counts for LightGBM) - confirm
# before comparing bar lengths across the two panels.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
panels = [
    (axes[0], xgb_imp, 'XGBoost Feature Importance', 'blue'),
    (axes[1], lgb_imp, 'LightGBM Feature Importance', 'green'),
]
for panel_ax, panel_table, panel_title, panel_color in panels:
    panel_ax.barh(panel_table['Feature'], panel_table['Importance'], color=panel_color)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.savefig('feature_importance.png')
plt.show()

## Summary

- Both XGBoost and LightGBM are powerful boosting algorithms
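- LightGBM is generally faster to train than XGBoost, especially on large datasets (training time was not measured in this notebook)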
- Both provide feature importance rankings