In [228]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report

In [229]:
df_train = pd.read_csv('titanicdata.csv')
df_train.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [230]:
df_train.shape

(891, 12)

In [231]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   passengerid  891 non-null    int64  
 1   survived     891 non-null    int64  
 2   pclass       891 non-null    int64  
 3   name         891 non-null    object 
 4   sex          891 non-null    object 
 5   age          714 non-null    float64
 6   sibsp        891 non-null    int64  
 7   parch        891 non-null    int64  
 8   ticket       891 non-null    object 
 9   fare         891 non-null    float64
 10  cabin        204 non-null    object 
 11  embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [232]:
df_train.describe()

Unnamed: 0,passengerid,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [233]:
df_train.isna().sum()

passengerid      0
survived         0
pclass           0
name             0
sex              0
age            177
sibsp            0
parch            0
ticket           0
fare             0
cabin          687
embarked         0
dtype: int64

In [234]:
#Filling in missing values
df_train['age'].fillna(df_train['age'].median(), inplace=True)



In [235]:
#Drop cabin, too many missing values
df_train.drop(columns=['cabin'], inplace=True)

In [282]:
#Feature engineering
df_train['family_size'] = df_train['sibsp'] + df_train['parch'] + 1
df_train['is_alone'] = (df_train['family_size']==1).astype(int)

In [237]:
df_train.isna().sum()

passengerid    0
survived       0
pclass         0
name           0
sex            0
age            0
sibsp          0
parch          0
ticket         0
fare           0
embarked       0
family_size    0
dtype: int64

In [238]:
df_train.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,family_size
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,2
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,1


In [284]:
X = df_train[['pclass', 'sex', 'age','embarked','family_size','is_alone']]
y = df_train['survived']

In [285]:
(X.shape,y.shape)

((891, 6), (891,))

In [286]:
X.dtypes

pclass           int64
sex             object
age            float64
embarked        object
family_size      int64
is_alone         int32
dtype: object

In [287]:
for col in X.columns:
    if X[col].dtype == object and  X[col].nunique() > 1:
        dummies = pd.get_dummies(X[col], drop_first=True).astype(float)
        X.drop(columns=[col], inplace = True)
        X = pd.concat([X, dummies], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop(columns=[col], inplace = True)


In [288]:
X.head()

Unnamed: 0,pclass,age,family_size,is_alone,male,C,Q,S
0,3,22.0,2,0,1.0,0.0,0.0,1.0
1,1,38.0,2,0,0.0,1.0,0.0,0.0
2,3,26.0,1,1,0.0,0.0,0.0,1.0
3,1,35.0,2,0,0.0,0.0,0.0,1.0
4,3,35.0,1,1,1.0,0.0,0.0,1.0


In [289]:
#Split data into Train & Test sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

((712, 8), (179, 8), (712,), (179,))

In [290]:
from imblearn.over_sampling import SMOTE
from collections import Counter

Counter(y_train)


Counter({0: 441, 1: 271})

In [291]:
#Applying SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [292]:
#After smote
Counter(y_train_smote)

Counter({0: 441, 1: 441})

In [293]:
scaler = StandardScaler()
X_train_smote_scaled = scaler.fit_transform(X_train_smote)  # Fit & transform on training data
X_test_scaled = scaler.transform(X_test) 

In [294]:
#Apply PCA(keep 95% of variance)
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95, random_state=42)
X_train_pca = pca.fit_transform(X_train_smote)
X_test_pca = pca.transform(X_test)

In [295]:
X.shape[1], X_train_pca.shape[1]

(8, 1)

In [296]:
#Train Random Forest on SMOTE data(No PCA)
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

rf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
rf.fit(X_train_smote, y_train_smote) #No need for scaling

In [297]:
#Train XGBoost on SMOTE data(No PCA)
xgb_model = xgb.XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42)
xgb_model.fit(X_train_smote, y_train_smote)

In [298]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

#Train Logistic Regression on smote data(No PCA)
lr_model = LogisticRegression(max_iter=500, random_state=42)
lr_model.fit(X_train_smote_scaled, y_train_smote)

In [299]:
from sklearn.svm import SVC
svm_model = SVC(kernel='rbf', C=1, gamma='scale', random_state=42)

svm_model.fit(X_train_smote_scaled, y_train_smote)

Train Logistic Regression and SVM with(PCA) - Cannot handle high dimensionality

In [300]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

#Train Logistic Regression on PCA-transformed- data
lr_model_pca = LogisticRegression(max_iter=500, random_state=42)
lr_model_pca.fit(X_train_pca, y_train_smote)

In [301]:
#Train SVM on PCA_transformed data
svm_model_pca = SVC(kernel='rbf', C=1, gamma='scale', random_state=42)
svm_model_pca.fit(X_train_pca, y_train_smote)

In [302]:
X_test_pca.shape, X_train_pca.shape

((179, 1), (882, 1))

In [303]:
y_pred_rf = rf.predict(X_test)
y_pred_xgb = xgb_model.predict(X_test)
y_pred_lr = lr_model.predict(X_test_scaled)
y_pred_svm = svm_model.predict(X_test_scaled)
y_pred_lr_pca = lr_model_pca.predict(X_test_pca)
y_pred_svm_pca= svm_model_pca.predict(X_test_pca)

In [304]:
#Evaluating the model performance
print("Accuracy scores")
print(f"Random Forest: {accuracy_score(y_test,y_pred_rf)}")
print(f"XGBoost: {accuracy_score(y_test, y_pred_xgb)}")
print(f'Logistic Regression(No PCA): {accuracy_score(y_test,y_pred_lr)}')
print(f'SVM(No PCA): {accuracy_score(y_test,y_pred_svm)}')
print(f'Logistic Regression(PCA): {accuracy_score(y_test,y_pred_lr_pca)}')
print(f'svm(PCA): {accuracy_score(y_test,y_pred_svm_pca)}')

Accuracy scores
Random Forest: 0.7988826815642458
XGBoost: 0.7877094972067039
Logistic Regression(No PCA): 0.8324022346368715
SVM(No PCA): 0.8044692737430168
Logistic Regression(PCA): 0.48044692737430167
svm(PCA): 0.547486033519553


In [305]:
from sklearn.metrics import classification_report

print("Classification Reports:\n")

print("Random Forest:")
print(classification_report(y_test, y_pred_rf))

print("XGBoost:")
print(classification_report(y_test, y_pred_xgb))

print("Logistic Regression (No PCA):")
print(classification_report(y_test, y_pred_lr))

print("SVM (No PCA):")
print(classification_report(y_test, y_pred_svm))

print("Logistic Regression (PCA):")
print(classification_report(y_test, y_pred_lr_pca))

print("SVM (PCA):")
print(classification_report(y_test, y_pred_svm_pca))


Classification Reports:

Random Forest:
              precision    recall  f1-score   support

           0       0.81      0.88      0.84       108
           1       0.79      0.68      0.73        71

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.78       179
weighted avg       0.80      0.80      0.80       179

XGBoost:
              precision    recall  f1-score   support

           0       0.81      0.85      0.83       108
           1       0.75      0.69      0.72        71

    accuracy                           0.79       179
   macro avg       0.78      0.77      0.77       179
weighted avg       0.79      0.79      0.79       179

Logistic Regression (No PCA):
              precision    recall  f1-score   support

           0       0.87      0.85      0.86       108
           1       0.78      0.80      0.79        71

    accuracy                           0.83       179
   macro avg       0.82      0.83      0.83      

### **🔹 Summary of Model Insights**  

- **Random Forest** → Good overall performance, but recall for survivors is slightly lower.  
- **XGBoost** → Well-balanced precision and recall for both classes.  
- **Logistic Regression (No PCA)** → High recall for survivors, making it better at identifying them.  
- **SVM (No PCA)** → Similar to Random Forest but with lower recall for survivors.  
- **Logistic Regression (PCA)** → Poor accuracy; PCA removed too much important information.  
- **SVM (PCA)** → Slightly better than Logistic Regression (PCA), but still weak performance.