In [3]:
import pandas as pd

In [4]:
from sklearn.linear_model import LogisticRegression

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
from sklearn.metrics import accuracy_score,classification_report

# Cleaning Pipeline

In [7]:
# Load the Titanic training table; the path is relative to this notebook's folder.
tc = pd.read_csv(filepath_or_buffer="../Day-3/titanic/train.csv")

In [8]:
# Replace missing ages with the median age of the whole table.
# NOTE(review): the median is computed before the train/test split further
# down, so test rows contribute to the imputed value.
tc['Age'] = tc['Age'].fillna(value=tc['Age'].median())

In [9]:
# Display (rows, columns) as a quick sanity check after imputing Age.
tc.shape

(891, 12)

In [10]:
# Flag whether a cabin was recorded: 1 when 'Cabin' is present, 0 when missing.
# Vectorised notna() replaces the per-row lambda; int64 matches the dtype the
# original apply() produced.
tc['Has_Cabin'] = tc['Cabin'].notna().astype('int64')

In [11]:
# The raw Cabin text is no longer needed once Has_Cabin has been derived.
tc = tc.drop(columns=['Cabin'])

In [12]:
# Fill missing embarkation ports with the first of the most frequent values.
tc['Embarked'] = tc['Embarked'].fillna(value=tc['Embarked'].mode().iloc[0])

In [13]:
# Encode Sex numerically: male -> 0, female -> 1.
# NOTE(review): Series.map leaves unmapped values as NaN, so re-running this
# cell on the already-encoded column would blank it out.
tc['Sex'] = tc['Sex'].map({'female': 1, 'male': 0})

In [14]:
# Per-column count of remaining nulls; every column should report 0.
print("Missing Value after Cleaning :  ", tc.isna().sum())

Missing Value after Cleaning :   PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
Has_Cabin      0
dtype: int64


# Data split

In [15]:
# MY PREDICTIONS:
# Training set rows: 881
# Test set rows: 400
# Which feature will matter most for survival?: Age??

In [16]:
# Baseline model inputs: six original columns plus the engineered Has_Cabin flag.
features = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Has_Cabin',
]

In [17]:
# Feature matrix: all rows, only the selected feature columns.
X = tc.loc[:, features]

In [18]:
# Target vector: the Survived label for every row.
Y = tc.loc[:, 'Survived']

In [19]:
# Split 80% train , 20 % test

In [20]:
# Hold out 20% of the rows for testing; the fixed seed keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [21]:
# Confirm the size of the training partition as (rows, feature columns).
print("Training set : ", X_train.shape)

Training set :  (712, 7)


In [22]:
# Confirm the size of the held-out partition as (rows, feature columns).
print("Test split : ", X_test.shape)

Test split :  (179, 7)


# Training Model

In [23]:
# What accuracy do I expect? (0-100%): 95% ?  
# Will model be better at predicting survivors or non-survivors?: training

In [24]:
# Train Model
# Logistic-regression baseline. max_iter is raised to 1000, presumably so the
# solver converges on the unscaled features (TODO confirm).
model = LogisticRegression(max_iter=1000)

In [25]:
# Fit on the training partition; the cell output is the estimator's parameter summary.
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [26]:
# Make Predictions
# Predicted Survived labels for the held-out test rows.

y_pred = model.predict(X_test)

In [27]:
#Evaluate

In [28]:
# Share of test rows whose predicted label matches the true label.
print("Accuracy:", accuracy_score(y_test,y_pred))

Accuracy: 0.8212290502793296


In [29]:
# Per-class precision / recall / F1 on the test set (class 0 = died, 1 = survived).
print("Detailed Report:")
print(classification_report(y_test,y_pred))

Detailed Report:
              precision    recall  f1-score   support

           0       0.83      0.88      0.85       105
           1       0.81      0.74      0.77        74

    accuracy                           0.82       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179



In [30]:
# List the columns currently in the frame before encoding Embarked.
print(list(tc.columns))

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Embarked', 'Has_Cabin']


In [31]:
# Survival rate per embarkation port (mean of the 0/1 Survived column).
print(tc.groupby('Embarked')['Survived'].mean())

Embarked
C    0.553571
Q    0.389610
S    0.339009
Name: Survived, dtype: float64


In [32]:
# One-hot encode Embarked. drop_first removes the first level (C), so only
# Embarked_Q and Embarked_S are added and C becomes the reference category.
tc = pd.get_dummies(data=tc, columns=['Embarked'], drop_first=True)

In [33]:
# Verify that the Embarked dummy columns replaced the original column.
print(list(tc.columns))

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Has_Cabin', 'Embarked_Q', 'Embarked_S']


# Retrain Model With Embarked

In [34]:
# Baseline features plus the two Embarked dummy columns (Embarked_Q, Embarked_S).
features = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Has_Cabin',
    'Embarked_Q',
    'Embarked_S',
]

In [35]:
# Rebuild the feature matrix with the updated feature list.
X = tc.loc[:, features]

In [36]:
# Target is unchanged: the Survived label.
Y = tc.loc[:, 'Survived']

In [37]:
# Same 80/20 split and seed as the baseline so the scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [38]:
# Fresh logistic-regression estimator with the same settings as the baseline.
model = LogisticRegression(max_iter=1000)

In [39]:
# Fit on the training partition that now includes the Embarked dummies.
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [40]:
# Predicted labels for the held-out rows.
y_pred = model.predict(X_test)

In [41]:
# "Previous Accuracy" is the baseline score copied by hand from the earlier
# cell's output; only the new accuracy and the report are computed here.
print("Previous Accuracy 0.8212")
print("new Accuracy: ", accuracy_score(y_test,y_pred))
print("Detailed  Report: ")
print(classification_report(y_test,y_pred))

Previous Accuracy 0.8212
new Accuracy:  0.8212290502793296
Detailed  Report: 
              precision    recall  f1-score   support

           0       0.83      0.87      0.85       105
           1       0.80      0.76      0.78        74

    accuracy                           0.82       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179



In [42]:
# Average passenger class for rows that embarked at S versus elsewhere, to see
# how closely Embarked tracks Pclass.
print(tc.groupby('Embarked_S')['Pclass'].mean())

Embarked_S
False    2.208163
True     2.346749
Name: Pclass, dtype: float64


In [43]:
# Adding more features doesn't always improve accuracy
# If two features are correlated, the second one adds no new signal
# Embarked ≈ Pclass in terms of information content
# More features can sometimes hurt by adding noise

In [44]:
# Pull the honorific out of each name: the first run of letters that is
# immediately followed by a period (e.g. "Mr." -> "Mr").
# The pattern is a raw string so that `\.` reaches the regex engine as an
# escaped dot instead of being an invalid Python string escape, which raises
# a SyntaxWarning on Python 3.12+.
tc['Title'] = tc['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
# Frequency of each title, then the survival rate per title.
print(tc['Title'].value_counts())
print(tc.groupby('Title')['Survived'].mean())

Title
Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Col           2
Mlle          2
Major         2
Ms            1
Mme           1
Don           1
Lady          1
Sir           1
Capt          1
Countess      1
Jonkheer      1
Name: count, dtype: int64
Title
Capt        0.000000
Col         0.500000
Countess    1.000000
Don         0.000000
Dr          0.428571
Jonkheer    0.000000
Lady        1.000000
Major       0.500000
Master      0.575000
Miss        0.697802
Mlle        1.000000
Mme         1.000000
Mr          0.156673
Mrs         0.792000
Ms          1.000000
Rev         0.000000
Sir         1.000000
Name: Survived, dtype: float64


In [45]:
# Collapse the long tail of infrequent titles into two buckets. The remaining
# titles (Mr, Miss, Mrs, Master) are left untouched.
title_groups = {
    'Rare': ['Lady', 'Countess', 'Mme', 'Mlle', 'Ms', 'Sir',
             'Don', 'Jonkheer'],
    'Officer': ['Col', 'Major', 'Capt', 'Rev', 'Dr'],
}
for bucket, titles in title_groups.items():
    tc['Title'] = tc['Title'].replace(titles, bucket)


In [46]:
# Re-check the counts and the survival rate per title after grouping.
print(tc['Title'].value_counts())
print(tc.groupby('Title')['Survived'].mean())

Title
Mr         517
Miss       182
Mrs        125
Master      40
Officer     18
Rare         9
Name: count, dtype: int64
Title
Master     0.575000
Miss       0.697802
Mr         0.156673
Mrs        0.792000
Officer    0.277778
Rare       0.777778
Name: Survived, dtype: float64


In [47]:
# One-hot encode Title. drop_first removes the first level (Master), which
# becomes the reference category.
tc = pd.get_dummies(data=tc, columns=['Title'], drop_first=True)

In [48]:
# Verify that the Title dummy columns were added.
print(list(tc.columns))

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Has_Cabin', 'Embarked_Q', 'Embarked_S', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Officer', 'Title_Rare']


# Retraining Model with more features

In [49]:
# Feature set for the Title experiment. 'Sex' is deliberately left out here
# because the Title dummies carry overlapping information.
features = [
    'Pclass',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Has_Cabin',
    'Embarked_Q',
    'Embarked_S',
    'Title_Miss',
    'Title_Mr',
    'Title_Mrs',
    'Title_Officer',
    'Title_Rare',
]

In [50]:
# Feature matrix for the Title experiment.
X = tc.loc[:, features]

In [51]:
# Target is unchanged: the Survived label.
Y = tc.loc[:, 'Survived']

In [52]:
# Same 80/20 split and seed as before so the scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [53]:
# Fresh logistic-regression estimator for the Title feature set.
model = LogisticRegression(max_iter=1000)

In [54]:
# Fit on the training partition built from the Title feature set.
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [55]:
# Predicted labels for the held-out rows using the Title feature set.
y_pred = model.predict(X_test)

In [56]:
# The first two figures are copied by hand from the earlier cells' outputs;
# only the Title score and the report are computed here.
print("Baseline accuracy:       0.8212")
print("After Embarked:          0.8212")
print("After Title:", accuracy_score(y_test, y_pred))
print("\nDetailed Report:")
print(classification_report(y_test, y_pred))

Baseline accuracy:       0.8212
After Embarked:          0.8212
After Title: 0.8100558659217877

Detailed Report:
              precision    recall  f1-score   support

           0       0.84      0.84      0.84       105
           1       0.77      0.77      0.77        74

    accuracy                           0.81       179
   macro avg       0.80      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179



In [57]:
# EXPERIMENT SUMMARY - Day 4
# Baseline (Pclass, Sex, Age, SibSp, Parch, Fare, Has_Cabin): 82.12%
# + Embarked: 82.12% - correlated with Pclass, no new signal
# + Title + Sex: 80.44% - multicollinearity between Title and Sex (this run is not kept in the notebook)
# + Title - Sex: 81.00% - better but Title alone weaker than Sex alone
# 
# Key lessons:
# 1. More features != better accuracy
# 2. Correlated features confuse the model
# 3. Feature engineering = adding AND removing
# 4. Always compare against baseline

# Random Forest

In [58]:
from sklearn.ensemble import RandomForestClassifier

In [59]:
# Re-create the split for the tree models. X, Y, test_size and the seed are
# unchanged from the previous split, so this reproduces the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [60]:
# Random forest with 100 trees and a fixed seed for repeatable results.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

In [61]:
# Fit the forest on the training partition.
rf_classifier.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [62]:
# Predicted labels from the random forest for the held-out rows.
y_pred = rf_classifier.predict(X_test)

In [63]:
# Report Random Forest performance on the held-out test set. The label names
# the model being scored so the output is not confused with the earlier
# logistic-regression "After Title" result.
print("Random Forest:", accuracy_score(y_test, y_pred))
print("\nDetailed Report:")
print(classification_report(y_test, y_pred))

After Title: 0.8156424581005587

Detailed Report:
              precision    recall  f1-score   support

           0       0.85      0.84      0.84       105
           1       0.77      0.78      0.78        74

    accuracy                           0.82       179
   macro avg       0.81      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179



In [64]:
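# Random Forest vs Logistic Regression
# LR:  82.12% — simpler, slightly better (baseline features, including Sex)
# RF:  81.56% — more complex, no improvement (Title features, without Sex)
# Note: these two scores come from different feature sets. On the same
# Title features LR scored ≈81.0%, so RF is marginally ahead like-for-like.
# 
# Lesson: algorithm choice matters less than 
# feature quality. Both models hit same ceiling.
# The data is the bottleneck, not the algorithm.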

# XGBoost

In [65]:
from xgboost import XGBClassifier

In [66]:
# Gradient-boosted trees: 100 estimators, fixed seed, log-loss as the evaluation metric.
model_xbg = XGBClassifier(
    n_estimators=100,
    random_state=42,
    eval_metric='logloss',
)

In [67]:
# Fit the boosted model on the same training partition used for the random forest.
model_xbg.fit(X_train, y_train)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [68]:
# Predicted labels from XGBoost for the held-out rows.
y_pred = model_xbg.predict(X_test)

In [69]:
# The first two figures are copied by hand from earlier outputs.
# NOTE(review): the Logistic Regression figure is the baseline-feature run
# (with Sex); Random Forest and XGBoost were trained on the Title feature set
# (without Sex), where logistic regression scored about 0.810.
print("Logistic Regression: 0.8212")
print("Random Forest:       0.8156")
print("XGBoost:            ", accuracy_score(y_test, y_pred))
print("\nDetailed Report:")
print(classification_report(y_test, y_pred))

Logistic Regression: 0.8212
Random Forest:       0.8156
XGBoost:             0.8156424581005587

Detailed Report:
              precision    recall  f1-score   support

           0       0.83      0.86      0.85       105
           1       0.79      0.76      0.77        74

    accuracy                           0.82       179
   macro avg       0.81      0.81      0.81       179
weighted avg       0.81      0.82      0.82       179

