# Titanic Survival Prediction using LightGBM and XGBoost (with LightGBM Tuning)

### Step 1: Load Dataset and Perform Exploratory Data Analysis (EDA)

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Titanic train and test datasets from their CSV files.
train_data = pd.read_csv('/mnt/data/Titanic_train.csv')
test_data = pd.read_csv('/mnt/data/Titanic_test.csv')

# Show the first few rows of the training dataset. This is printed explicitly:
# a notebook cell only auto-displays its *last* bare expression, so a bare
# `train_data.head()` in the middle of the cell would produce no output.
print(train_data.head())

# Count the missing values in each column of the training data.
missing_values = train_data.isnull().sum()
print('Missing values in training data:')
print(missing_values)

# Basic summary statistics of the training data. Kept as the final bare
# expression of the cell so the notebook renders it as the cell output.
train_data.describe()
    

### Step 2: Data Preprocessing

In [None]:

# Fill the gaps in Age with the column's median value.
age_median = train_data['Age'].median()
train_data['Age'] = train_data['Age'].fillna(age_median)

# Fill the gaps in Embarked with its most frequent value (the first mode).
embarked_mode = train_data['Embarked'].mode()[0]
train_data['Embarked'] = train_data['Embarked'].fillna(embarked_mode)

# Discard the columns that are not used as features (PassengerId, Name,
# Ticket, Cabin), then one-hot encode the categorical columns Sex and
# Embarked, dropping the first level of each.
train_data = train_data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
train_data = pd.get_dummies(train_data, columns=['Sex', 'Embarked'], drop_first=True)

# Split into the target (y) and the feature matrix (X = everything else).
y = train_data['Survived']
X = train_data.drop(columns=['Survived'])

# Preview the preprocessed features.
X.head()
    

### Step 3: Building Predictive Models (LightGBM and XGBoost with Tuning)

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import lightgbm as lgb
import xgboost as xgb

# Hold out 20% of the rows as a validation set; the fixed seed keeps the
# split reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Tuned LightGBM model. The L1/L2 regularisation strengths are given through
# the scikit-learn wrapper's own parameter names (reg_alpha / reg_lambda).
# These are the same settings as the native-API aliases lambda_l1 / lambda_l2,
# but passing the aliases alongside the wrapper's defaults makes LightGBM emit
# "... will be ignored" alias-conflict warnings.
lgbm = lgb.LGBMClassifier(
    random_state=42,
    min_child_samples=20,
    max_depth=6,
    reg_alpha=0.1,
    reg_lambda=0.1,
)
lgbm.fit(X_train, y_train)
y_pred_lgbm = lgbm.predict(X_val)

# XGBoost model with default hyperparameters. The deprecated
# `use_label_encoder` argument is intentionally not passed: recent XGBoost
# releases no longer use it and only warn about an unused parameter.
xgbm = xgb.XGBClassifier(random_state=42, eval_metric='logloss')
xgbm.fit(X_train, y_train)
y_pred_xgbm = xgbm.predict(X_val)

# Evaluate models using various metrics
def evaluate_model(y_true, y_pred, model_name, y_score=None):
    """Print classification metrics for one model's validation predictions.

    Args:
        y_true: Ground-truth binary labels.
        y_pred: Hard class predictions (0/1), used for accuracy, precision,
            recall and F1.
        model_name: Label printed in the report header.
        y_score: Optional continuous scores for the positive class (e.g. the
            second column of ``predict_proba``), used for ROC-AUC. When
            omitted, ROC-AUC falls back to ``y_pred`` as before.

    Returns:
        None. The metrics are printed to stdout.
    """
    # ROC-AUC measures how well the model *ranks* positives above negatives,
    # so it needs continuous scores. Feeding it hard 0/1 labels collapses the
    # ROC curve to a single operating point and misreports the AUC.
    auc_input = y_pred if y_score is None else y_score

    metrics = {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred),
        "Recall": recall_score(y_true, y_pred),
        "F1-Score": f1_score(y_true, y_pred),
        "ROC-AUC": roc_auc_score(y_true, auc_input),
    }

    print(f"Evaluation Metrics for {model_name}:")
    for label, value in metrics.items():
        print(f"{label}: {value:.4f}")

# Evaluate LightGBM (ROC-AUC from the predicted probability of class 1)
evaluate_model(
    y_val,
    y_pred_lgbm,
    "Tuned LightGBM",
    y_score=lgbm.predict_proba(X_val)[:, 1],
)

# Evaluate XGBoost (ROC-AUC from the predicted probability of class 1)
evaluate_model(
    y_val,
    y_pred_xgbm,
    "XGBoost",
    y_score=xgbm.predict_proba(X_val)[:, 1],
)
    