In [None]:
# titanic_eda_xgbm_lgbm.py
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# === PATH SETUP ===
# The data folder defaults to the original location; set the TITANIC_BASE_PATH
# environment variable to run the notebook on another machine without editing it.
base_path = os.environ.get(
    "TITANIC_BASE_PATH",
    r"D:\DATA SCIENCE\ASSIGNMENTS\15 XGBM & LGBM\XGBM & LGBM",
)
train_path = os.path.join(base_path, "Titanic_train.csv")
test_path = os.path.join(base_path, "Titanic_test.csv")

In [None]:
# === LOAD DATA ===
# Read the raw train and test CSVs into DataFrames in one pass
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [None]:
# Report the dimensions of both frames and the columns available in train
print("✅ Datasets Loaded Successfully")
train_shape, test_shape = train_df.shape, test_df.shape
print("Train Shape:", train_shape)
print("Test Shape:", test_shape)
print("\n--- Columns ---")
column_names = list(train_df.columns)
print(column_names)

In [None]:
# === 1️⃣ MISSING VALUES ===
# Count of null entries per column in the training frame
print("\n--- Missing Values in Train Dataset ---")
missing_per_column = train_df.isna().sum()
print(missing_per_column)

In [None]:
# Heatmap of the null mask: one row per passenger, one column per feature
heatmap_fig, heatmap_ax = plt.subplots(figsize=(8, 5))
sns.heatmap(train_df.isnull(), cbar=False, cmap="viridis", ax=heatmap_ax)
heatmap_ax.set_title("Missing Values Heatmap - Train Data")
heatmap_fig.tight_layout()
heatmap_fig.savefig(os.path.join(base_path, "missing_values_heatmap.png"))
plt.close(heatmap_fig)

In [None]:
# === 2️⃣ FEATURE DISTRIBUTIONS ===
numeric_cols = ['Age', 'Fare', 'SibSp', 'Parch']
# DataFrame.hist creates its own figure (sized by its figsize argument), so no
# plt.figure() call is made beforehand: it would only leave an empty figure open.
hist_axes = train_df[numeric_cols].hist(bins=15, figsize=(12, 8), color='skyblue', edgecolor='black')
hist_fig = hist_axes.flat[0].figure  # the figure that owns the histogram grid
hist_fig.suptitle("Feature Distributions - Titanic Dataset", fontsize=14)
hist_fig.tight_layout()
hist_fig.savefig(os.path.join(base_path, "histograms.png"))
plt.close(hist_fig)

In [None]:
# Boxplots to see outliers: one panel per numeric column on a 2x2 grid
box_fig = plt.figure(figsize=(12, 8))
for panel_idx, col in enumerate(numeric_cols, start=1):
    panel_ax = box_fig.add_subplot(2, 2, panel_idx)
    sns.boxplot(y=train_df[col], color='salmon', ax=panel_ax)
    panel_ax.set_title(f"Boxplot of {col}")
box_fig.tight_layout()
box_fig.savefig(os.path.join(base_path, "boxplots.png"))
plt.close(box_fig)

## 3️⃣ RELATIONSHIPS WITH SURVIVAL

In [None]:
# Bar plot: Survival vs Sex (passenger counts per sex, split by outcome)
sex_fig, sex_ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=train_df, x='Sex', hue='Survived', palette='pastel', ax=sex_ax)
sex_ax.set_title("Survival Count by Sex")
sex_fig.tight_layout()
sex_fig.savefig(os.path.join(base_path, "survival_by_sex.png"))
plt.close(sex_fig)

In [None]:
# Bar plot: Survival vs Pclass (passenger counts per class, split by outcome)
pclass_fig, pclass_ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=train_df, x='Pclass', hue='Survived', palette='muted', ax=pclass_ax)
pclass_ax.set_title("Survival Count by Passenger Class")
pclass_fig.tight_layout()
pclass_fig.savefig(os.path.join(base_path, "survival_by_pclass.png"))
plt.close(pclass_fig)

In [None]:
# Scatter: Age vs Fare colored by survival
scatter_fig, scatter_ax = plt.subplots(figsize=(7, 5))
sns.scatterplot(
    data=train_df,
    x='Age',
    y='Fare',
    hue='Survived',
    palette='coolwarm',
    alpha=0.7,
    ax=scatter_ax,
)
scatter_ax.set_title("Age vs Fare — Colored by Survival")
scatter_fig.tight_layout()
scatter_fig.savefig(os.path.join(base_path, "scatter_age_fare.png"))
plt.close(scatter_fig)

In [None]:
# Boxplot: Age distribution by Survival
# NOTE(review): newer seaborn releases warn when `palette` is passed without `hue` — confirm against the installed version.
age_fig, age_ax = plt.subplots(figsize=(6, 4))
sns.boxplot(data=train_df, x='Survived', y='Age', palette='Set2', ax=age_ax)
age_ax.set_title("Age Distribution by Survival")
age_fig.tight_layout()
age_fig.savefig(os.path.join(base_path, "age_by_survival.png"))
plt.close(age_fig)

In [None]:
# Bar plot: Survival vs Embarked (only drawn when the column is present)
has_embarked = 'Embarked' in train_df.columns
if has_embarked:
    embarked_fig, embarked_ax = plt.subplots(figsize=(6, 4))
    sns.countplot(data=train_df, x='Embarked', hue='Survived', palette='pastel', ax=embarked_ax)
    embarked_ax.set_title("Survival by Embarked Port")
    embarked_fig.tight_layout()
    embarked_fig.savefig(os.path.join(base_path, "survival_by_embarked.png"))
    plt.close(embarked_fig)

In [None]:
# Final EDA status: prints the output folder and a static list of the plot
# files written by the cells above. The list is hard-coded, so
# survival_by_embarked.png is listed even if the 'Embarked' column was absent.
print("\n✅ EDA Completed Successfully.")
print("Plots saved in:", base_path)
print("""
Saved files:
- missing_values_heatmap.png
- histograms.png
- boxplots.png
- survival_by_sex.png
- survival_by_pclass.png
- scatter_age_fare.png
- age_by_survival.png
- survival_by_embarked.png
""")

In [None]:
# === QUICK INSIGHTS ===
# Static, hand-written takeaways; nothing below is computed from train_df,
# so re-check them against the plots if the data changes.
print("\n--- Insights ---")
print("1. Females had a higher survival rate compared to males.")
print("2. Higher Passenger Classes (1st class) show higher survival chances.")
print("3. Younger passengers and those paying higher fares tended to survive more.")
print("4. Age and Fare contain outliers but are still informative.")
print("5. Missing values primarily in 'Age', 'Cabin', and 'Embarked' columns.")

In [None]:
# titanic_preprocessing_xgbm_lgbm.py
import pandas as pd
import numpy as np
import os

In [None]:
# === PATH SETUP ===
# The data folder defaults to the original location; set the TITANIC_BASE_PATH
# environment variable to run the notebook on another machine without editing it.
base_path = os.environ.get(
    "TITANIC_BASE_PATH",
    r"D:\DATA SCIENCE\ASSIGNMENTS\15 XGBM & LGBM\XGBM & LGBM",
)
train_path = os.path.join(base_path, "Titanic_train.csv")
test_path = os.path.join(base_path, "Titanic_test.csv")

In [None]:
# === LOAD DATA ===
# Re-read the raw CSVs so preprocessing starts from unmodified frames
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [None]:
# Confirm the load and show (rows, columns) for both frames
print("✅ Data Loaded Successfully")
print("Train Shape:", train_df.shape, "| Test Shape:", test_df.shape)

## 1️⃣ IMPUTE MISSING VALUES

In [None]:
# Check missing values: null count per column before any imputation
print("\n--- Missing Values Before Imputation ---")
nulls_before = train_df.isna().sum()
print(nulls_before)

In [None]:
# Fill Age with median (more robust to outliers).
# The median is taken from the training set only and reused for the test set.
# Assign the result back instead of `df[col].fillna(..., inplace=True)`: that form
# is chained in-place assignment, which warns in pandas 2.x and does not update the
# frame under Copy-on-Write.
age_median = train_df["Age"].median()
train_df["Age"] = train_df["Age"].fillna(age_median)
test_df["Age"] = test_df["Age"].fillna(age_median)

In [None]:
# Fill Embarked with mode (most common value), computed on the training set only.
# Assign the result back rather than using chained `fillna(..., inplace=True)`,
# which warns in pandas 2.x and does not update the frame under Copy-on-Write.
embarked_mode = train_df["Embarked"].mode()[0]
train_df["Embarked"] = train_df["Embarked"].fillna(embarked_mode)
test_df["Embarked"] = test_df["Embarked"].fillna(embarked_mode)

In [None]:
# Fill Fare in test set with the training-set median.
# Assign the result back rather than using chained `fillna(..., inplace=True)`,
# which warns in pandas 2.x and does not update the frame under Copy-on-Write.
fare_median = train_df["Fare"].median()
test_df["Fare"] = test_df["Fare"].fillna(fare_median)

In [None]:
# Drop Cabin (too many missing); errors="ignore" makes this a no-op if it is already gone
for frame in (train_df, test_df):
    frame.drop(columns=["Cabin"], inplace=True, errors="ignore")

## 2️⃣ FEATURE ENGINEERING (OPTIONAL BUT USEFUL)

In [None]:
# Extract Title from Name: the word immediately before the first "." that
# follows a space (e.g. "Braund, Mr. Owen Harris" -> "Mr").
# The pattern is a raw string so "\." is a regex escape, not an invalid Python
# string escape (which newer Python versions warn about).
common_titles = ["Mr", "Mrs", "Miss", "Master"]
for frame in (train_df, test_df):
    extracted = frame["Name"].str.extract(r" ([A-Za-z]+)\.", expand=False)
    # Every title outside the four common ones is grouped as "Rare". This covers
    # the previously listed titles (Mlle, Ms, Lady, Countess, Mme, Dr, Major, Col,
    # Capt, Sir, Don, Jonkheer, Rev) and also titles that occur only in one frame
    # (e.g. "Dona"), which would otherwise yield a dummy column missing from train.
    # Names with no match stay NaN, as before.
    frame["Title"] = extracted.where(extracted.isin(common_titles) | extracted.isna(), "Rare")

In [None]:
# Family size: siblings/spouses + parents/children + the passenger themself
for frame in (train_df, test_df):
    relatives = frame["SibSp"].add(frame["Parch"])
    frame["FamilySize"] = relatives.add(1)

In [None]:
# Drop non-useful columns (identifier and free-text fields); missing ones are ignored
cols_to_drop = ["PassengerId", "Name", "Ticket"]
for frame in (train_df, test_df):
    frame.drop(columns=cols_to_drop, inplace=True, errors="ignore")

In [None]:
# ======================================================
# 3️⃣ ENCODE CATEGORICAL VARIABLES
# ======================================================
# These columns are treated as categorical and one-hot encoded in the next step
cat_cols = [
    "Sex",
    "Embarked",
    "Title",
    "Pclass",
]

In [None]:
# One-hot encoding for categorical columns.
# Pin every categorical column to the levels observed in the training set before
# encoding. Encoding the two frames independently with drop_first=True can drop a
# different baseline level in each frame when their observed levels differ.
# With shared categories both frames yield the same dummy columns; a test value
# never seen in train becomes NaN and therefore all-zero dummies.
for col in cat_cols:
    train_levels = sorted(train_df[col].dropna().unique())
    train_df[col] = pd.Categorical(train_df[col], categories=train_levels)
    test_df[col] = pd.Categorical(test_df[col], categories=train_levels)
train_df = pd.get_dummies(train_df, columns=cat_cols, drop_first=True)
test_df = pd.get_dummies(test_df, columns=cat_cols, drop_first=True)

In [None]:
# Align test set to have same columns as train (minus the target):
# columns missing from test are created and filled with 0, extra ones are discarded
train_cols = train_df.columns
feature_cols = train_cols.drop("Survived")
test_df = test_df.reindex(columns=feature_cols, fill_value=0)

In [None]:
# ======================================================
# 4️⃣ FINAL CLEANUP
# ======================================================
# Null count per column after imputation and encoding
print("\n--- Missing Values After Imputation ---")
nulls_after = train_df.isna().sum()
print(nulls_after)

In [None]:
# Shapes after encoding; train keeps the 'Survived' target column that test lacks
print("\n✅ Categorical Encoding Completed")
print("Train shape:", train_df.shape, "| Test shape:", test_df.shape)

In [None]:
# ======================================================
# 5️⃣ SAVE CLEAN DATA
# ======================================================
# Output locations for the processed frames, next to the raw data
train_clean_path, test_clean_path = (
    os.path.join(base_path, csv_name)
    for csv_name in ("Titanic_train_processed.csv", "Titanic_test_processed.csv")
)

In [None]:
# Write both processed frames to CSV without the index column
for frame, out_path in ((train_df, train_clean_path), (test_df, test_clean_path)):
    frame.to_csv(out_path, index=False)

In [None]:
# Report where the processed CSVs were written
print("\n✅ Preprocessing Completed Successfully!")
print("Processed files saved as:")
print(" -", train_clean_path)
print(" -", test_clean_path)

In [None]:
# titanic_lgbm_xgbm_model.py
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder

In [None]:
import lightgbm as lgb
import xgboost as xgb

In [None]:
# === PATH SETUP ===
# The data folder defaults to the original location; set the TITANIC_BASE_PATH
# environment variable to run the notebook on another machine without editing it.
base_path = os.environ.get(
    "TITANIC_BASE_PATH",
    r"D:\DATA SCIENCE\ASSIGNMENTS\15 XGBM & LGBM\XGBM & LGBM",
)
train_path = os.path.join(base_path, "Titanic_train_processed.csv")

In [None]:
# === LOAD CLEAN DATA ===
# Reads the processed training CSV written by the preprocessing cells above;
# note that `df` was also used as a loop variable earlier in this notebook.
df = pd.read_csv(train_path)
print("✅ Preprocessed dataset loaded successfully:", df.shape)

In [None]:
# === 1️⃣ SPLIT DATA ===
# Target is the 'Survived' column; every other column is a feature
target_col = "Survived"
y = df[target_col]
X = df.drop(columns=target_col)

In [None]:
# 80/20 hold-out split, stratified on the target, with a fixed seed
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
print(f"Train shape: {X_train.shape} | Test shape: {X_test.shape}")

In [None]:
# === 2️⃣ EVALUATION FUNCTION ===
def evaluate_model(name, model, X_test, y_test):
    """Score a fitted classifier on held-out data, print a report and save its confusion matrix.

    Parameters
    ----------
    name : str
        Label used in the printed report, the plot title, the saved file name
        and the "Model" entry of the returned dict.
    model : fitted estimator
        Must provide ``predict`` and ``predict_proba``.
    X_test, y_test :
        Held-out features and true labels.

    Returns
    -------
    dict
        Keys "Model", "Accuracy", "Precision", "Recall", "F1" and "ROC_AUC".

    Side effects: prints the metrics and the classification report, and writes
    ``<name>_confusion_matrix.png`` into ``base_path``.
    """
    predictions = model.predict(X_test)
    # ROC-AUC is computed from the probability column at index 1
    positive_scores = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1_value = f1_score(y_test, predictions)
    auc_value = roc_auc_score(y_test, positive_scores)

    print(f"\n{name} Performance:")
    print("-----------------------------")
    print(f"Accuracy:  {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1 Score:  {f1_value:.4f}")
    print(f"ROC-AUC:   {auc_value:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, predictions))

    # Confusion matrix heatmap, saved next to the data and closed right away
    matrix = confusion_matrix(y_test, predictions)
    cm_fig, cm_ax = plt.subplots(figsize=(4, 3))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
    cm_ax.set_title(f"{name} - Confusion Matrix")
    cm_ax.set_xlabel("Predicted")
    cm_ax.set_ylabel("Actual")
    cm_fig.tight_layout()
    cm_fig.savefig(os.path.join(base_path, f"{name}_confusion_matrix.png"))
    plt.close(cm_fig)

    return {
        "Model": name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1": f1_value,
        "ROC_AUC": auc_value,
    }

## 3️⃣ LIGHTGBM MODEL

In [None]:
# Fit LightGBM with fixed hyperparameters, evaluate on the hold-out set, persist the model
lgbm_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": -1,
    "random_state": 42,
}
lgbm_model = lgb.LGBMClassifier(**lgbm_params)
lgbm_model.fit(X_train, y_train)
lgbm_results = evaluate_model("LightGBM", lgbm_model, X_test, y_test)
lgbm_model_file = os.path.join(base_path, "lightgbm_model.pkl")
joblib.dump(lgbm_model, lgbm_model_file)

## 4️⃣ XGBOOST MODEL

In [None]:
# Fit XGBoost with fixed hyperparameters, evaluate on the hold-out set, persist the model
xgb_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 4,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "eval_metric": 'logloss',
    "random_state": 42,
}
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(X_train, y_train)
xgb_results = evaluate_model("XGBoost", xgb_model, X_test, y_test)
xgb_model_file = os.path.join(base_path, "xgboost_model.pkl")
joblib.dump(xgb_model, xgb_model_file)

## 5️⃣ CROSS VALIDATION (for performance robustness)

In [None]:
# 5-fold cross-validated accuracy for each model on the full processed training data
cv_candidates = {"LightGBM": lgbm_model, "XGBoost": xgb_model}
for name, model in cv_candidates.items():
    cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    mean_accuracy, accuracy_spread = cv_scores.mean(), cv_scores.std()
    print(f"\n{name} 5-Fold CV Accuracy: {mean_accuracy:.4f} ± {accuracy_spread:.4f}")

## 6️⃣ HYPERPARAMETER TUNING (LightGBM example)

In [None]:
# Search space: 3 values for each of 4 parameters, i.e. 81 combinations
print("\nRunning LightGBM Hyperparameter Tuning (Grid Search)...")
param_grid = dict(
    num_leaves=[15, 31, 63],
    max_depth=[-1, 5, 10],
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[100, 300, 500],
)

In [None]:
# Exhaustive 3-fold grid search on the training split, scored by accuracy, on all cores
tuning_estimator = lgb.LGBMClassifier(random_state=42)
grid = GridSearchCV(
    tuning_estimator,
    param_grid,
    scoring='accuracy',
    cv=3,
    verbose=0,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

In [None]:
# Best parameter combination and its mean cross-validated accuracy from the grid
# search. The tuned estimator is only reported here; later cells keep using lgbm_model.
print("✅ Best LightGBM Parameters:")
print(grid.best_params_)
print(f"Best CV Accuracy: {grid.best_score_:.4f}")

## 7️⃣ FEATURE IMPORTANCE COMPARISON

In [None]:
# Top-10 LightGBM feature importances.
# Pass the axes explicitly: without `ax`, plot_importance opens its own figure, so
# the 10x5 size is ignored and the figure created beforehand is left open and empty.
lgbm_imp_fig, lgbm_imp_ax = plt.subplots(figsize=(10, 5))
lgb.plot_importance(lgbm_model, ax=lgbm_imp_ax, max_num_features=10, title="LightGBM Feature Importance")
lgbm_imp_fig.tight_layout()
lgbm_imp_fig.savefig(os.path.join(base_path, "lightgbm_feature_importance.png"))
plt.close(lgbm_imp_fig)

In [None]:
# Top-10 XGBoost feature importances.
# Pass the axes explicitly: without `ax`, plot_importance opens its own figure, so
# the 10x5 size is ignored and the figure created beforehand is left open and empty.
xgb_imp_fig, xgb_imp_ax = plt.subplots(figsize=(10, 5))
xgb.plot_importance(xgb_model, ax=xgb_imp_ax, max_num_features=10, title="XGBoost Feature Importance")
xgb_imp_fig.tight_layout()
xgb_imp_fig.savefig(os.path.join(base_path, "xgboost_feature_importance.png"))
plt.close(xgb_imp_fig)

## 8️⃣ COMPARISON SUMMARY

In [None]:
# One row per model, built from the metric dicts returned by evaluate_model
comparison_rows = [lgbm_results, xgb_results]
results_df = pd.DataFrame(comparison_rows)
comparison_csv = os.path.join(base_path, "model_comparison_results.csv")
results_df.to_csv(comparison_csv, index=False)
print("\n✅ Model comparison completed. Results saved to 'model_comparison_results.csv'")
print(results_df)

In [None]:
# ============================================================
# 9️⃣ QUICK INSIGHTS
# ============================================================
# Static, hand-written takeaways; none of these statements is derived from the
# metrics computed above, so verify them against results_df and the CV output.
print("\n--- Insights ---")
print("1. Both LightGBM and XGBoost perform strongly on Titanic survival prediction.")
print("2. LightGBM typically trains faster with similar or better accuracy.")
print("3. Feature importances often highlight 'Sex', 'Pclass', 'Fare', and 'Age' as top predictors.")
print("4. Hyperparameter tuning can yield slight accuracy improvements (~1–3%).")
print("5. ROC-AUC and cross-validation scores confirm the models are generalizing well.")

In [None]:
# titanic_model_comparison_analysis.py
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# === PATH SETUP ===
# The data folder defaults to the original location; set the TITANIC_BASE_PATH
# environment variable to run the notebook on another machine without editing it.
base_path = os.environ.get(
    "TITANIC_BASE_PATH",
    r"D:\DATA SCIENCE\ASSIGNMENTS\15 XGBM & LGBM\XGBM & LGBM",
)
results_path = os.path.join(base_path, "model_comparison_results.csv")

In [None]:
# === LOAD RESULTS ===
# Reads the metrics table written by the model-comparison cell above
results_df = pd.read_csv(results_path)
print("✅ Results Loaded Successfully:")
print(results_df, "\n")

In [None]:
# === 1️⃣ COMPARISON PLOT ===
plt.figure(figsize=(8, 6))
metrics = ["Accuracy", "Precision", "Recall", "F1", "ROC_AUC"]
results_melted = results_df.melt(id_vars="Model", value_vars=metrics, var_name="Metric", value_name="Score")

In [None]:
sns.barplot(data=results_melted, x="Metric", y="Score", hue="Model", palette="coolwarm")
plt.title("LightGBM vs XGBoost — Performance Comparison", fontsize=14)
plt.ylim(0, 1)
plt.legend(title="Model")
plt.tight_layout()
plt.savefig(os.path.join(base_path, "lgbm_xgbm_comparison.png"))
plt.close()

In [None]:
# Confirm the comparison chart was written to base_path
print("📊 Saved performance comparison chart as 'lgbm_xgbm_comparison.png'")

In [None]:
# === 2️⃣ INTERPRETATION ===
lgbm = results_df[results_df["Model"] == "LightGBM"].iloc[0]
xgb = results_df[results_df["Model"] == "XGBoost"].iloc[0]

In [None]:
print("\n--- Comparative Summary ---")
if lgbm["Accuracy"] > xgb["Accuracy"]:
    print("✅ LightGBM achieved higher accuracy than XGBoost.")
else:
    print("✅ XGBoost achieved higher accuracy than LightGBM.")

In [None]:
print(f"\nLightGBM → Accuracy: {lgbm['Accuracy']:.4f}, Precision: {lgbm['Precision']:.4f}, Recall: {lgbm['Recall']:.4f}, F1: {lgbm['F1']:.4f}")
print(f"XGBoost → Accuracy: {xgb['Accuracy']:.4f}, Precision: {xgb['Precision']:.4f}, Recall: {xgb['Recall']:.4f}, F1: {xgb['F1']:.4f}")

In [None]:
# Static, hand-written takeaways; none of these statements is derived from results_df
print("\n--- Insights ---")
print("1. LightGBM usually trains faster and handles large datasets more efficiently using histogram-based learning.")
print("2. XGBoost provides slightly more stable performance on smaller datasets due to its regularization controls.")
print("3. Both models identified similar top predictors — 'Sex', 'Pclass', 'Fare', and 'Age'.")
print("4. If computational speed is key → LightGBM wins.")
print("5. If interpretability and consistent performance are needed → XGBoost holds strong.")