<a href="https://colab.research.google.com/github/Srishti6125/Titanic_Survival_prediction/blob/main/Titanic_Survival_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
px.colors.carto
import plotly.graph_objects as go

import warnings
warnings.filterwarnings('ignore')

from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

from sklearn.pipeline import Pipeline as SkPipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score,RocCurveDisplay,confusion_matrix, classification_report)

In [2]:
# Load the Titanic passenger data (the path is under Colab's Drive folder, so Drive must be mounted first)
DATA_PATH = "/content/drive/MyDrive/Colab Notebooks/dataset/Titanic-Dataset.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# (rows, columns) of the freshly loaded dataset
df.shape

(891, 12)

In [4]:
# names of all columns in the dataset
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

#### Column Information

* 'PassengerId' - passenger id number
* 'Survived' - Passenger survived or not (1-Yes,0-No)
* 'Pclass' - Passenger class travelling in
* 'Name' - Name of passenger
* 'Sex' - Gender of passenger
* 'Age' - Age of passenger
* 'SibSp' - No. of siblings / spouses aboard the Titanic
* 'Parch' - No. of parents / children aboard the Titanic
* 'Ticket' - Ticket Number
* 'Fare' - Trip fare
* 'Cabin' - Cabin number
* 'Embarked' - Port of Embarkation: C=Cherbourg, Q=Queenstown, S=Southampton

In [5]:
# per-column dtype and non-null count (shows which columns contain missing values)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
# first 5 rows of the dataset
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
# summary statistics (count, mean, std, min, quartiles, max) of the numerical columns
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [8]:
# number of missing (null) values in each column
df.isnull().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,177
SibSp,0
Parch,0
Ticket,0
Fare,0


In [9]:
# Null-value visualization: heatmap of the boolean missing-value mask (one cell per row/column)
null_mask = df.isnull()
fig = px.imshow(
    null_mask,
    color_continuous_scale=px.colors.sequential.Blugrn_r,
    height=500,
)
fig.update_layout(
    title='Null Value Heatmap',
    xaxis_title='Columns',
    yaxis_title='Rows',
    title_x=0.5,
)
fig.show()

In [10]:
# Fill missing values in the Age column with the column mean.
# NOTE(review): the imputer is fitted on the full dataset before the train/test split,
# so the mean also reflects rows that later end up in the test set.
imputer = SimpleImputer(strategy="mean")
df["Age"] = imputer.fit_transform(df[["Age"]])

In [11]:
# Fill missing values in the Embarked column with the most frequent port (the mode).
# The result is assigned back instead of calling fillna(inplace=True) on df['Embarked']:
# that chained in-place form operates on an intermediate object, is deprecated, and
# leaves df unchanged once pandas Copy-on-Write is active.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

In [12]:
# Drop the Cabin column: more than 50% of its values are missing (only 204 of 891 are non-null)
df = df.drop(columns=['Cabin'])

In [13]:
# Percentage of passengers who survived vs. those who did not
fig = px.pie(
    df,
    names='Survived',
    title='Survived vs Not Survived',
    height=500,
    width=500,
    color_discrete_sequence=px.colors.sequential.Blugrn_r,
)
fig.update_layout(legend_title_text='Survived', title_x=0.5)
fig.show()

In [14]:
# Passenger counts by gender, with side-by-side bars per survival outcome
fig = px.histogram(
    df,
    x='Sex',
    color='Survived',
    barmode='group',
    height=400,
    width=500,
    color_discrete_sequence=px.colors.sequential.Blugrn_r,
)
fig.update_layout(
    title='Passengers by Gender',
    xaxis_title='Gender',
    title_x=0.5,
)
fig.show()

In [15]:
# Passengers travelling in each class, with side-by-side bars per survival outcome
fig = px.histogram(
    df,
    x='Pclass',
    color='Survived',
    barmode='group',
    height=400,
    width=500,
    color_discrete_sequence=px.colors.sequential.Blugrn_r,
)
fig.update_layout(
    title='Passengers by Class',
    xaxis_title='Class',
    title_x=0.5,
)
fig.show()

In [16]:
# Age distribution, with the two survival outcomes overlaid on the same axes
fig = px.histogram(
    df,
    x='Age',
    color='Survived',
    barmode='overlay',
    nbins=50,
    height=400,
    width=1000,
    color_discrete_sequence=px.colors.sequential.Blugrn_r,
)
fig.update_layout(
    title='Age Distribution',
    xaxis_title='Age',
    title_x=0.5,
)
fig.show()

In [17]:
# Fare distribution within each passenger class (one box per class)
fig = px.box(
    df,
    x='Pclass',
    y='Fare',
    title="Fare v/s Class",  # was misspelled "Fair" in the rendered chart title
    height=500,
    width=1000,
    color='Pclass',
    color_discrete_sequence=px.colors.carto.Blugrn_r,
)
fig.update_layout(title_x=0.5)
fig.show()

In [18]:
# Survival counts (not rates) per outcome, with side-by-side bars for each port of embarkation
fig = px.histogram(
    df,
    x='Survived',
    color='Embarked',
    barmode='group',
    title="Survival Count by Embarkation Port",
    height=400,
    width=800,
    color_discrete_sequence=px.colors.sequential.Blugrn,
)
fig.update_layout(title_x=0.5)
fig.show()

In [19]:
# Average fare for each SibSp value, drawn as side-by-side bars for the two survival outcomes
fare_mean = df.groupby(['SibSp','Survived'])['Fare'].mean().reset_index()
# Plotly Express treats a numeric color column as continuous (a single trace with a colorbar),
# so barmode='group' cannot separate the outcomes. Casting to str yields one trace per outcome.
# Only this aggregated frame is changed; df['Survived'] stays numeric.
fare_mean['Survived'] = fare_mean['Survived'].astype(str)
fig = px.bar(
    fare_mean,
    x='SibSp',
    y='Fare',
    color='Survived',
    barmode='group',
    title='Average Fare vs SibSp (Grouped by Survival)',
    # reversed palette, matching the other survival-grouped charts in this notebook
    color_discrete_sequence=px.colors.sequential.Blugrn_r,
    height=500,
    width=1000,
)
fig.update_layout(title_x=0.5)
fig.show()

In [20]:
# Fare against Parch, with points coloured by the Survived value
fig = px.scatter(
    df,
    x='Parch',
    y='Fare',
    color='Survived',
    title='Fare vs Parch (Grouped by Survival)',
    color_continuous_scale=px.colors.carto.Temps,
    height=500,
    width=1000,
)
fig.update_layout(title_x=0.5)
fig.show()

In [21]:
# Pairwise scatter plots (pairplot) of the main numeric features, coloured by the Survived value
pair_dimensions = ['Age', 'Fare', 'Pclass', 'SibSp', 'Parch']
fig = px.scatter_matrix(
    df,
    dimensions=pair_dimensions,
    color='Survived',
    title="Scatter Matrix of Key Features",
    color_continuous_scale=px.colors.carto.Temps,
)
fig.update_layout(title_x=0.5)
fig.show()

In [22]:
# skewness of the Fare column (a large positive value indicates a long right tail)
df['Fare'].skew()

np.float64(4.787316519674893)

In [23]:
# Distribution of Fare before any transformation
fig = px.histogram(
    df,
    x='Fare',
    nbins=50,
    title="Distribution of Fare",
    height=500,
    width=1000,
    color_discrete_sequence=px.colors.carto.Blugrn_r,
)
fig.update_layout(title_x=0.5)
fig.show()

In [24]:
# log(1 + x) transformation of Fare, because its distribution is strongly right-skewed
# NOTE: this overwrites Fare in place, so re-running the cell applies the transform a second time
df['Fare'] = np.log1p(df['Fare'])

In [25]:
# Feature selection: discard the PassengerId, Name and Ticket columns, which are not used as model inputs
df = df.drop(columns=['PassengerId', 'Name', 'Ticket'])

In [26]:
# Feature extraction:
#   FamilySize = siblings/spouses + parents/children + the passenger themselves
#   IsAlone    = 1 when FamilySize is exactly 1, otherwise 0
df = df.assign(
    FamilySize=lambda frame: frame['SibSp'] + frame['Parch'] + 1,
    IsAlone=lambda frame: (frame['FamilySize'] == 1).astype(int),
)

In [27]:
# one-hot encode the Embarked, Sex and Pclass columns (each is replaced by one indicator column per category)
df=pd.get_dummies(df, columns=['Embarked','Sex','Pclass'])

In [28]:
# Correlation heatmap of the numeric columns, with the coefficient printed in every cell
corr = df.corr(numeric_only=True)
fig = px.imshow(
    corr,
    text_auto=True,
    color_continuous_scale='Temps',
    title="Correlation Heatmap",
    height=600,
    width=600,
)
fig.update_layout(title_x=0.5)
fig.show()

In [29]:
# correlation of every numeric column with the target Survived, sorted from highest to lowest
corr_target = df.corr(numeric_only=True)['Survived'].sort_values(ascending=False)
corr_target

Unnamed: 0,Survived
Survived,1.0
Sex_female,0.543351
Fare,0.329862
Pclass_1,0.285904
Embarked_C,0.16824
Pclass_2,0.093349
Parch,0.081629
FamilySize,0.016639
Embarked_Q,0.00365
SibSp,-0.035322


In [30]:
# Feature selection: remove the target itself and the columns that will not be used as model inputs
# NOTE(review): the rationale for this exact list is not stated - presumably weakly correlated or redundant columns; confirm
corr_target= corr_target.drop(['Survived','Pclass_2','Embarked_Q','Sex_male','SibSp','FamilySize'])

In [31]:
# storing the names of the selected features in a list
selected_features = corr_target.index.tolist()

In [32]:
# defining the feature matrix x (selected features only) and the target vector y
x=df[selected_features]
y=df['Survived']

In [33]:
# Split the data into train (80%) and test (20%) sets.
# stratify=y keeps the class ratio of Survived the same in both parts; random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10,stratify=y)

In [34]:
# Report the shape of every part produced by the train/test split
split_parts = (
    ("x_train", x_train),
    ("y_train", y_train),
    ("x_test", x_test),
    ("y_test", y_test),
)
for part_name, part in split_parts:
    print(f"Shape of {part_name} : ", part.shape)

Shape of x_train :  (712, 9)
Shape of y_train :  (712,)
Shape of x_test :  (179, 9)
Shape of y_test :  (179,)


In [35]:
# Class balance of the target, to judge whether the imbalance needs handling
df['Survived'].value_counts()

Unnamed: 0_level_0,count
Survived,Unnamed: 1_level_1
0,549
1,342


In [36]:
# Cross-validation splitter passed to the grid searches: 5 stratified folds, shuffled with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [37]:
# defining function to evaluate model
def eval_model(model, X_test, y_test, name="Model"):
    """Score a fitted classifier on held-out data and print a summary.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
    X_test : features to predict on.
    y_test : true labels for ``X_test``.
    name : label shown in the printed header.

    Returns
    -------
    tuple
        ``(y_proba, metrics)``: ``y_proba`` is the second column of
        ``predict_proba``; ``metrics`` is a dict with the keys ``accuracy``,
        ``precision``, ``recall``, ``f1``, ``roc_auc``, ``confusion_matrix``
        and ``classification_report``.
    """
    predictions = model.predict(X_test)
    positive_proba = model.predict_proba(X_test)[:, 1]

    # Scores computed from the hard label predictions
    label_scorers = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    metrics = {key: scorer(y_test, predictions) for key, scorer in label_scorers}
    # ROC AUC is computed from the predicted probabilities, not from the labels
    metrics['roc_auc'] = roc_auc_score(y_test, positive_proba)
    metrics['confusion_matrix'] = confusion_matrix(y_test, predictions)
    metrics['classification_report'] = classification_report(y_test, predictions)

    print(f"\n--- {name} Test Metrics ---")
    headline = (
        ("Accuracy:", 'accuracy'),
        ("Precision:", 'precision'),
        ("Recall:", 'recall'),
        ("F1:", 'f1'),
        ("ROC AUC:", 'roc_auc'),
    )
    for caption, key in headline:
        print(caption, round(metrics[key], 4))
    print("Confusion Matrix:\n", metrics['confusion_matrix'])
    print("\nClassification Report:\n", metrics['classification_report'])
    return positive_proba, metrics

In [38]:
# Logistic regression: standardise -> SMOTE oversampling -> classifier, tuned by grid search
lr_steps = [
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('clf', LogisticRegression(max_iter=2000, random_state=42)),
]
pipe_lr = ImbPipeline(lr_steps)

# 3 x 1 x 2 x 2 = 12 candidate settings
param_grid_lr = dict(
    clf__C=[0.01, 0.1, 1],
    clf__penalty=['l2'],
    clf__class_weight=['balanced', None],
    smote__k_neighbors=[3, 5],
)

# Candidates are ranked by cross-validated ROC AUC
gs_lr = GridSearchCV(
    estimator=pipe_lr,
    param_grid=param_grid_lr,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
    verbose=1,
    return_train_score=False,
)

print("\nFitting Logistic Regression grid...")
gs_lr.fit(x_train, y_train)

print("\n=== Logistic Regression Best ===")
print("Best CV ROC AUC:", gs_lr.best_score_)
print("Best params:", gs_lr.best_params_)

# Evaluate the best pipeline found by the search on the held-out test set
best_lr = gs_lr.best_estimator_
proba_lr, metrics_lr = eval_model(best_lr, x_test, y_test, name="Logistic Regression (best)")


Fitting Logistic Regression grid...
Fitting 5 folds for each of 12 candidates, totalling 60 fits

=== Logistic Regression Best ===
Best CV ROC AUC: 0.8452800866554785
Best params: {'clf__C': 0.01, 'clf__class_weight': 'balanced', 'clf__penalty': 'l2', 'smote__k_neighbors': 3}

--- Logistic Regression (best) Test Metrics ---
Accuracy: 0.7765
Precision: 0.6986
Recall: 0.7391
F1: 0.7183
ROC AUC: 0.8485
Confusion Matrix:
 [[88 22]
 [18 51]]

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.80      0.81       110
           1       0.70      0.74      0.72        69

    accuracy                           0.78       179
   macro avg       0.76      0.77      0.77       179
weighted avg       0.78      0.78      0.78       179



In [39]:
# Random forest: standardise -> SMOTE oversampling -> classifier, tuned by grid search
rf_steps = [
    # scaler is harmless for RF; keeps interface consistent
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('clf', RandomForestClassifier(random_state=42, n_jobs=-1)),
]
pipe_rf = ImbPipeline(rf_steps)

# 2 x 3 x 2 x 2 x 2 = 48 candidate settings
param_grid_rf = dict(
    clf__n_estimators=[100, 200],
    clf__max_depth=[4, 8, None],
    clf__min_samples_leaf=[1, 2],
    clf__class_weight=['balanced', None],
    smote__k_neighbors=[3, 5],
)

# Candidates are ranked by cross-validated ROC AUC
gs_rf = GridSearchCV(
    estimator=pipe_rf,
    param_grid=param_grid_rf,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
    verbose=1,
    return_train_score=False,
)

print("\nFitting Random Forest grid...")
gs_rf.fit(x_train, y_train)

print("\n=== Random Forest Best ===")
print("Best CV ROC AUC:", gs_rf.best_score_)
print("Best params:", gs_rf.best_params_)

# Evaluate the best pipeline found by the search on the held-out test set
best_rf = gs_rf.best_estimator_
proba_rf, metrics_rf = eval_model(best_rf, x_test, y_test, name="Random Forest (best)")


Fitting Random Forest grid...
Fitting 5 folds for each of 48 candidates, totalling 240 fits

=== Random Forest Best ===
Best CV ROC AUC: 0.8622158365261813
Best params: {'clf__class_weight': 'balanced', 'clf__max_depth': 8, 'clf__min_samples_leaf': 2, 'clf__n_estimators': 200, 'smote__k_neighbors': 3}

--- Random Forest (best) Test Metrics ---
Accuracy: 0.8324
Precision: 0.7746
Recall: 0.7971
F1: 0.7857
ROC AUC: 0.8839
Confusion Matrix:
 [[94 16]
 [14 55]]

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.85      0.86       110
           1       0.77      0.80      0.79        69

    accuracy                           0.83       179
   macro avg       0.82      0.83      0.82       179
weighted avg       0.83      0.83      0.83       179



In [40]:
# XGBoost: standardise -> SMOTE oversampling -> classifier, tuned by grid search
pipe_xgb = ImbPipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    # n_estimators and learning_rate are not fixed here: every grid candidate sets them,
    # so constructor values for them would never take effect.
    ('clf', XGBClassifier(
        random_state=42,
        eval_metric='logloss',
        n_jobs=-1
    ))
])

# 3 x 2 x 2 x 2 x 2 x 2 = 96 candidate settings
param_grid_xgb = {
    'clf__max_depth': [3, 4, 5],
    'clf__learning_rate': [0.05, 0.1],
    'clf__n_estimators': [200, 300],
    'clf__subsample': [0.8, 1],
    'clf__colsample_bytree': [0.8, 1],
    'smote__k_neighbors': [3, 5]
}

# Reuses the `cv` splitter defined earlier in the notebook, as the logistic regression and
# random forest searches do, instead of redefining an identical one in this cell.
gs_xgb = GridSearchCV(pipe_xgb, param_grid_xgb, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=1, return_train_score=False)

print("\nFitting XGBoost Grid...")
gs_xgb.fit(x_train, y_train)

print("\n=== XGBoost Best ===")
print("Best CV ROC AUC:", gs_xgb.best_score_)
print("Best Params:", gs_xgb.best_params_)

# Evaluate the best pipeline found by the search on the held-out test set
best_xgb = gs_xgb.best_estimator_
proba_xgb, metrics_xgb = eval_model(best_xgb, x_test, y_test, "XGBoost (best)")


Fitting XGBoost Grid...
Fitting 5 folds for each of 96 candidates, totalling 480 fits

=== XGBoost Best ===
Best CV ROC AUC: 0.8644901206421582
Best Params: {'clf__colsample_bytree': 0.8, 'clf__learning_rate': 0.05, 'clf__max_depth': 5, 'clf__n_estimators': 200, 'clf__subsample': 0.8, 'smote__k_neighbors': 3}

--- XGBoost (best) Test Metrics ---
Accuracy: 0.8436
Precision: 0.7808
Recall: 0.8261
F1: 0.8028
ROC AUC: 0.87
Confusion Matrix:
 [[94 16]
 [12 57]]

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.85      0.87       110
           1       0.78      0.83      0.80        69

    accuracy                           0.84       179
   macro avg       0.83      0.84      0.84       179
weighted avg       0.85      0.84      0.84       179



In [41]:
# Side-by-side table of the scalar test metrics of all three tuned models
metric_names = ['accuracy','precision','recall','f1','roc_auc']
metrics_by_model = {
    'logistic': metrics_lr,
    'random_forest': metrics_rf,
    'xgboost': metrics_xgb,
}
comp_columns = {'metric': metric_names}
for model_label, model_metrics in metrics_by_model.items():
    comp_columns[model_label] = [model_metrics[metric] for metric in metric_names]
comp = pd.DataFrame(comp_columns)
print("\nComparison (test):\n", comp)


Comparison (test):
       metric  logistic  random_forest   xgboost
0   accuracy  0.776536       0.832402  0.843575
1  precision  0.698630       0.774648  0.780822
2     recall  0.739130       0.797101  0.826087
3         f1  0.718310       0.785714  0.802817
4    roc_auc  0.848485       0.883926  0.869960


In [42]:
# reshape the comparison table from wide to long format (one row per metric/model pair) for plotting
comp_long = comp.melt(id_vars='metric', value_vars=['logistic', 'random_forest','xgboost'],var_name='model',value_name='score')

In [43]:
# Grouped bar chart: one group per metric, one bar per model
fig = px.bar(
    comp_long,
    x='metric',
    y='score',
    color='model',
    barmode='group',
    title='Model Performance Comparison',
    color_discrete_sequence=px.colors.sequential.Blugrn,
)
fig.update_layout(title_x=0.5)
fig.show()

Random Forest achieved the highest ROC-AUC (best at ranking probabilities), whereas **XGBoost** achieved the best accuracy, precision, recall and F1, so XGBoost is chosen as the final predictive model.

In [44]:
# plotting feature importance using SHAP
# NOTE(review): this import sits mid-notebook, while all other imports are in the first cell
import shap

# Extracting the fitted scaler and the fitted XGBoost classifier from the best pipeline
scaler = best_xgb.named_steps['scaler']
xgb_model = best_xgb.named_steps['clf']

# Apply only the scaler (no SMOTE step) to the train and test features
# NOTE(review): x_train_scaled is not used again in the visible cells
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Create SHAP TreeExplainer (best for XGBoost)
explainer = shap.TreeExplainer(xgb_model)

# Compute SHAP values for the scaled test data
shap_values = explainer.shap_values(x_test_scaled)

# Convert to DataFrame with one column per selected feature
shap_df = pd.DataFrame(shap_values, columns=selected_features)

# Mean absolute SHAP value per feature, sorted ascending, as a two-column frame
mean_abs_shap = (
    shap_df.abs().mean()
        .sort_values(ascending=True)
        .reset_index()
)
mean_abs_shap.columns = ['Feature', 'Mean |SHAP Value|']

# Horizontal bar chart: one bar per feature, with length and colour both set by the mean |SHAP| value
fig = px.bar(
    mean_abs_shap,
    x='Mean |SHAP Value|',
    y='Feature',
    orientation='h',
    title='XGBoost Feature Importance (SHAP)',
    color='Mean |SHAP Value|',color_continuous_scale='Temps'
)

# Centre the title and order the y-axis categories by their bar totals
fig.update_layout(title_x=0.5, yaxis=dict(categoryorder='total ascending'))
fig.show()