### Imputing the age column

In [72]:
import pandas as pd

In [73]:
# Load the raw train and test splits; both paths are relative to the current working directory
train = pd.read_csv('../data/raw/train.csv')
test = pd.read_csv('../data/raw/test.csv')

In [74]:
# Function to fill Age based on median age of Pclass and SibSp group
def fill_age(row, reference=None):
    """Return the row's Age, imputing a missing value from a group median.

    Parameters
    ----------
    row : pandas.Series
        A single passenger record with 'Age', 'Pclass' and 'SibSp' entries.
    reference : pandas.DataFrame, optional
        Frame whose 'Pclass', 'SibSp' and 'Age' columns supply the medians.
        Defaults to the notebook-level ``train`` frame, which keeps
        ``df.apply(fill_age, axis=1)`` working unchanged.

    Returns
    -------
    float
        The recorded age when present; otherwise the median 'Age' of the
        reference rows sharing the row's Pclass and SibSp. The result is NaN
        when no reference row in that group has a known age, so callers need
        a further fallback for such groups.
    """
    if not pd.isnull(row['Age']):
        return row['Age']

    if reference is None:
        # Resolved at call time so the default follows the current `train`.
        reference = train

    # Passengers with the same Pclass and SibSp as this row
    same_group = (reference['Pclass'] == row['Pclass']) & (reference['SibSp'] == row['SibSp'])
    return reference.loc[same_group, 'Age'].median()

# Impute Age row by row, train first and then test; fill_age takes its
# medians from `train` for both splits
for split in (train, test):
    split['Age'] = split.apply(fill_age, axis=1)


In [75]:
# Number of ages still missing in each split after the group-median fill
for split in (train, test):
    print(split["Age"].isna().sum())

7
1


In [76]:
# Show the training rows whose Age is still missing
train.loc[train['Age'].isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
159,160,0,3,"Sage, Master. Thomas Henry",male,,8,2,CA. 2343,69.55,,S
180,181,0,3,"Sage, Miss. Constance Gladys",female,,8,2,CA. 2343,69.55,,S
201,202,0,3,"Sage, Mr. Frederick",male,,8,2,CA. 2343,69.55,,S
324,325,0,3,"Sage, Mr. George John Jr",male,,8,2,CA. 2343,69.55,,S
792,793,0,3,"Sage, Miss. Stella Anna",female,,8,2,CA. 2343,69.55,,S
846,847,0,3,"Sage, Mr. Douglas Bullen",male,,8,2,CA. 2343,69.55,,S
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.55,,S


- In these 7 rows, SibSp is 8 and Pclass is 3. They are the only passengers in that group and none of them has a recorded age, so the group median is undefined; we can fill them with the simple overall median of Age instead.

In [77]:
# Overall medians, used as the fallback where the (Pclass, SibSp) group
# median could not supply an age
median_age = train['Age'].median()
median_age_test = test['Age'].median()
# Fill Age where SibSp == 8 and Pclass == 3, but only where it is still
# missing, so a recorded age in that group is never overwritten.
# NOTE(review): test falls back to its own median while fill_age imputes from
# train statistics; consider using median_age for both splits.
train_gap = (train['SibSp'] == 8) & (train['Pclass'] == 3) & train['Age'].isnull()
test_gap = (test['SibSp'] == 8) & (test['Pclass'] == 3) & test['Age'].isnull()
train.loc[train_gap, 'Age'] = median_age
test.loc[test_gap, 'Age'] = median_age_test
# Verify the change
train[(train['SibSp'] == 8) & (train['Pclass'] == 3)]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
159,160,0,3,"Sage, Master. Thomas Henry",male,26.0,8,2,CA. 2343,69.55,,S
180,181,0,3,"Sage, Miss. Constance Gladys",female,26.0,8,2,CA. 2343,69.55,,S
201,202,0,3,"Sage, Mr. Frederick",male,26.0,8,2,CA. 2343,69.55,,S
324,325,0,3,"Sage, Mr. George John Jr",male,26.0,8,2,CA. 2343,69.55,,S
792,793,0,3,"Sage, Miss. Stella Anna",female,26.0,8,2,CA. 2343,69.55,,S
846,847,0,3,"Sage, Mr. Douglas Bullen",male,26.0,8,2,CA. 2343,69.55,,S
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,26.0,8,2,CA. 2343,69.55,,S


In [78]:
# Remove the 'Cabin' column from both splits
train = train.drop(columns='Cabin')
test = test.drop(columns='Cabin')

# Confirm the column is gone
train.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [79]:
# Cast Age to integers in both splits (fractional ages are truncated)
for split in (train, test):
    split['Age'] = split['Age'].astype(int)

In [80]:
# Encode Sex as a binary indicator: male -> 0, female -> 1
sex_codes = {'male': 0, 'female': 1}
for split in (train, test):
    split['Sex'] = split['Sex'].map(sex_codes)

In [81]:
# One-hot encode Embarked in each split, dropping the first level as the baseline
embarked_opts = dict(columns=['Embarked'], drop_first=True)
train = pd.get_dummies(train, **embarked_opts)
test = pd.get_dummies(test, **embarked_opts)

In [82]:
# Preview the first rows of the training frame after encoding
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",0,22,1,0,A/5 21171,7.25,False,True
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38,1,0,PC 17599,71.2833,False,False
2,3,1,3,"Heikkinen, Miss. Laina",1,26,0,0,STON/O2. 3101282,7.925,False,True
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35,1,0,113803,53.1,False,True
4,5,0,3,"Allen, Mr. William Henry",0,35,0,0,373450,8.05,False,True


In [83]:
# One-hot encode Pclass in each split, dropping the first level as the baseline
pclass_opts = dict(columns=['Pclass'], drop_first=True)
train = pd.get_dummies(train, **pclass_opts)
test = pd.get_dummies(test, **pclass_opts)

In [84]:
# Replace missing fares with the median fare of the same split
for split in (train, test):
    split['Fare'] = split['Fare'].fillna(split['Fare'].median())


In [85]:
# Target vector first, then the feature matrix without the target,
# the passenger id and the Name / Ticket columns
y = train['Survived']
X = train.drop(columns=['PassengerId', 'Survived', 'Name', "Ticket"])

In [86]:
# Per-column count of missing values in the feature matrix
X.isnull().sum()

Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Embarked_Q    0
Embarked_S    0
Pclass_2      0
Pclass_3      0
dtype: int64

In [87]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [88]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Standardise the features, then fit a logistic regression on the scaled values
steps = [
    ('scaler', StandardScaler()),
    ('log_reg', LogisticRegression(max_iter=1000)),
]
pipeline = Pipeline(steps=steps)

pipeline.fit(X_train, y_train)

In [89]:
# Predict class labels for the held-out split produced by train_test_split
y_pred = pipeline.predict(X_test)
print(y_pred[:10])  # first 10 predictions


[0 0 0 1 1 1 1 0 1 1]


In [90]:
# Predicted class probabilities for the same held-out split
y_prob = pipeline.predict_proba(X_test)
print(y_prob[:5])  # first 5 rows of [prob_not_survive, prob_survive]


[[0.88752456 0.11247544]
 [0.7369019  0.2630981 ]
 [0.8795547  0.1204453 ]
 [0.08744131 0.91255869]
 [0.25822701 0.74177299]]


In [91]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Compute every hold-out metric up front, then report them in order
accuracy = accuracy_score(y_test, y_pred)
conf_mat = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_mat)
print("Classification Report:\n", report)


Accuracy: 0.7988826815642458
Confusion Matrix:
 [[90 15]
 [21 53]]
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.86      0.83       105
           1       0.78      0.72      0.75        74

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179



In [92]:
# Select from the test split the same columns, in the same order, that the model was fitted on
features = X_train.columns
X_test_final = test.loc[:, features]

In [93]:
# Display the feature matrix that will be scored for the submission
X_test_final

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S,Pclass_2,Pclass_3
0,0,34,0,0,7.8292,True,False,False,True
1,1,47,1,0,7.0000,False,True,False,True
2,0,62,0,0,9.6875,True,False,True,False
3,0,27,0,0,8.6625,False,True,False,True
4,1,22,1,1,12.2875,False,True,False,True
...,...,...,...,...,...,...,...,...,...
413,0,26,0,0,8.0500,False,True,False,True
414,1,39,0,0,108.9000,False,False,False,False
415,0,38,0,0,7.2500,False,True,False,True
416,0,26,0,0,8.0500,False,True,False,True


In [95]:
# Predict labels for the test split with the fitted pipeline
test_predictions = pipeline.predict(X_test_final)

In [96]:
# Pair each test PassengerId with its predicted label and write the CSV without the index
submission = test[['PassengerId']].assign(Survived=test_predictions)
submission.to_csv("titanic_predictions.csv", index=False)
print("Submission file saved!")

Submission file saved!
