# Titanic Competition
This jupyter notebook includes the training of ML model used to predict the survival of passengers aboard the titanic

In [101]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [102]:
train = pd.read_csv('Data/train.csv')
test = pd.read_csv('Data/test.csv')

In [103]:
def extract_features(df):
    features = ["Pclass", "Sex", "SibSp", "Parch", "Embarked"]
    X = pd.get_dummies(df[features])
    # X["Fare"] = df["Fare"]
    return X

In [104]:
X = extract_features(train)
y = train["Survived"]

In [105]:
rfc = RandomForestClassifier()

In [106]:
forest_params = [{
    "criterion": ["entropy", "gini", "log_loss"],
    "max_leaf_nodes": [5, 50, 500, 500],
    "n_estimators": [10, 100, 1000],
    "max_features": ["sqrt", "log2", None]
}]

In [107]:
clf = GridSearchCV(rfc, forest_params, cv=10, scoring="accuracy")

In [108]:
clf.fit(X, y)
print(clf.best_params_)

{'criterion': 'log_loss', 'max_features': None, 'max_leaf_nodes': 5, 'n_estimators': 100}


In [109]:
model = clf.best_estimator_
model.fit(X, y)

In [110]:
def verify(model, X, y):
    verification = model.predict(X)
    error = np.sum(abs(verification - y)) / len(y)
    print(
        f"Percentage Error: {error:.0%}"
    )

In [111]:
verify(model, X, y)

Percentage Error: 19%


In [112]:
X_test = extract_features(test)

In [113]:
X_test.isnull().sum()

Pclass        0
SibSp         0
Parch         0
Sex_female    0
Sex_male      0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64

In [98]:
# Replace the missing value with the median
# This is bad if the specific entry is extraordinary
# X_test.loc[X_test["Fare"].isnull(), "Fare"] = np.median(X_test["Fare"].dropna())

In [114]:
predictions = model.predict(X_test)

In [115]:
output = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': predictions
})
output.to_csv('submission_iter3.csv', index=False)

In [1]:
for score, name in zip(model.feature_importances_, X.columns.values):
    print(f"Feature {name}: {score:.2f}")

NameError: name 'model' is not defined