# Titanic Competition
This Jupyter notebook trains a machine-learning model that predicts the survival of passengers aboard the Titanic.

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the competition's training and test passenger tables.
# The paths are relative, so the notebook must be run from the directory
# that contains the Data/ folder.
train = pd.read_csv('Data/train.csv')
test = pd.read_csv('Data/test.csv')

In [22]:
def extract_features(df):
    """Build the model's feature matrix from a raw passenger frame.

    Selects the Pclass, Sex, SibSp, Parch and Embarked columns, one-hot
    encodes the non-numeric ones with ``pd.get_dummies`` and appends the
    Fare column unchanged (missing fares are left as NaN).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns listed above plus "Fare".

    Returns
    -------
    pandas.DataFrame
        New frame with the encoded columns followed by "Fare"; ``df`` itself
        is not modified.
    """
    selected = ["Pclass", "Sex", "SibSp", "Parch", "Embarked"]
    encoded = pd.get_dummies(df.loc[:, selected])
    # Fare is added after encoding so it ends up as the last column.
    return encoded.assign(Fare=df["Fare"])

In [23]:
# X: training feature matrix produced by extract_features.
# y: the target column ("Survived") from the training table.
X = extract_features(train)
y = train["Survived"]

In [24]:
# Base estimator for the grid search. A fixed random_state makes the
# bootstrap sampling and feature sub-sampling repeatable, so the search
# result and the final model are the same on every Restart & Run All.
rfc = RandomForestClassifier(random_state=42)

In [25]:
# Hyper-parameter grid explored by the grid search.
# - "log_loss" is omitted: scikit-learn documents it as the same Shannon
#   information-gain criterion as "entropy", so searching both only repeats
#   identical fits.
# - max_leaf_nodes previously listed 500 twice, which re-ran every
#   combination for that value. NOTE(review): the duplicate may have been
#   meant as a larger cap (e.g. 5000) -- confirm the intended value.
forest_params = [{
    "criterion": ["entropy", "gini"],
    "max_leaf_nodes": [5, 50, 500],
    "n_estimators": [10, 100, 1000],
    "max_features": ["sqrt", "log2", None]
}]

In [26]:
# Exhaustive search over forest_params, scoring each candidate by
# 10-fold cross-validated accuracy.
clf = GridSearchCV(
    estimator=rfc,
    param_grid=forest_params,
    scoring="accuracy",
    cv=10,
)

In [27]:
# Run the search (fit returns the search object itself) and report the
# winning hyper-parameter combination.
print(clf.fit(X, y).best_params_)

{'criterion': 'entropy', 'max_features': None, 'max_leaf_nodes': 50, 'n_estimators': 1000}


In [28]:
# GridSearchCV is constructed with its default refit=True, so
# best_estimator_ has already been refitted on the full (X, y) with the
# best parameters. Fitting it again is redundant work and, for a forest
# without a fixed seed, would replace it with a differently randomised model.
model = clf.best_estimator_
model

In [29]:
def verify(model, X, y):
    """Print the fraction of rows in ``X`` that ``model`` misclassifies.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X : array-like
        Feature matrix passed straight to ``model.predict``.
    y : array-like
        True labels, one per row of ``X``.

    Returns
    -------
    None
        The result is printed as ``Percentage Error: NN%``.
    """
    verification = np.asarray(model.predict(X))
    # Count mismatches instead of summing |prediction - label|: the two agree
    # for 0/1 labels, but the sum over-counts for other numeric encodings and
    # fails for non-numeric labels.
    error = np.mean(verification != np.asarray(y))
    print(
        f"Percentage Error: {error:.0%}"
    )

In [30]:
# NOTE: X and y are the same data the model was fitted on, so this is the
# training error. It is an optimistic figure and not an estimate of how the
# model performs on unseen passengers.
verify(model, X, y)

Percentage Error: 10%


In [31]:
# Apply the same feature extraction used for the training table to the
# test passengers.
X_test = extract_features(test)

In [32]:
# Count missing values per test feature column; any non-zero count must be
# imputed before the model can predict on these rows.
X_test.isnull().sum()

Pclass        0
SibSp         0
Parch         0
Sex_female    0
Sex_male      0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
Fare          1
dtype: int64

In [34]:
from sklearn.impute import KNNImputer

# Fit a default-configured KNN imputer on the training features; fit()
# returns the imputer itself, so construction and fitting can be chained.
imputer = KNNImputer().fit(X)
imputer

In [37]:
# Preview the first rows of the (not yet imputed) test features.
X_test.head()

Unnamed: 0,Pclass,SibSp,Parch,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Fare
0,3,0,0,False,True,False,True,False,7.8292
1,3,1,0,True,False,False,False,True,7.0
2,2,0,0,False,True,False,True,False,9.6875
3,3,0,0,False,True,False,False,True,8.6625
4,3,1,1,True,False,False,False,True,12.2875


In [36]:
# Preview of the imputed test features. The result is displayed only and is
# not assigned to a variable here.
imputer.transform(X_test)

array([[ 3.    ,  0.    ,  0.    , ...,  1.    ,  0.    ,  7.8292],
       [ 3.    ,  1.    ,  0.    , ...,  0.    ,  1.    ,  7.    ],
       [ 2.    ,  0.    ,  0.    , ...,  1.    ,  0.    ,  9.6875],
       ...,
       [ 3.    ,  0.    ,  0.    , ...,  0.    ,  1.    ,  7.25  ],
       [ 3.    ,  0.    ,  0.    , ...,  0.    ,  1.    ,  8.05  ],
       [ 3.    ,  1.    ,  1.    , ...,  0.    ,  0.    , 22.3583]])

In [47]:
# Impute the missing test values, then wrap the resulting array back into a
# DataFrame labelled with the feature names the imputer saw during fit.
imputed_values = imputer.transform(X_test)
feature_names = imputer.get_feature_names_out()
X_test_transformed = pd.DataFrame(data=imputed_values, columns=feature_names)
X_test_transformed.head()

Unnamed: 0,Pclass,SibSp,Parch,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Fare
0,3.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,7.8292
1,3.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,7.0
2,2.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,9.6875
3,3.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,8.6625
4,3.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,12.2875


In [45]:
# Compare the median Fare of the test features (missing values dropped)
# with the median Fare of the training features.
# NOTE(review): the original remark here said the estimated (imputed) value
# is very different from the median, meaning the previous approach was not
# good. This cell prints only the two medians, not the imputed value, so
# that comparison cannot be confirmed from this output.
test_fare_median = np.median(X_test["Fare"].dropna())
train_fare_median = np.median(X["Fare"])
print(test_fare_median, train_fare_median)

14.4542 14.4542


In [50]:
# X_test_transformed was built from imputer.transform(X_test), so it is
# already imputed. Passing it through the imputer a second time changes
# nothing except that it turns the DataFrame into a bare ndarray, dropping
# the column names the model was fitted with. Predict on the DataFrame directly.
predictions = model.predict(X_test_transformed)



In [52]:
# Build the submission table: the test PassengerId column plus the
# predicted Survived label, written without the index column.
output = test[['PassengerId']].assign(Survived=predictions)
output.to_csv('submission_iter4.csv', index=False)

In [58]:
# Print each training feature next to the importance the fitted forest
# assigned to it (two decimal places).
for feature_name, importance in zip(X.columns, model.feature_importances_):
    print(f"Feature {feature_name}: {importance:.2f}")

Feature Pclass: 0.14
Feature SibSp: 0.04
Feature Parch: 0.05
Feature Sex_female: 0.18
Feature Sex_male: 0.18
Feature Embarked_C: 0.01
Feature Embarked_Q: 0.01
Feature Embarked_S: 0.02
Feature Fare: 0.36
