# Titanic Competition
This jupyter notebook includes the training of ML model used to predict the survival of passengers aboard the titanic

In [1]:
import pandas as pd  # data processing, CSV file, I/O
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np

In [2]:
train = pd.read_csv('Data/train.csv')
test = pd.read_csv('Data/test.csv')

In [3]:
y = train['Survived']

features = ["Pclass", "Sex", "SibSp", "Parch", "Embarked"]
X = pd.get_dummies(train[features])

In [4]:
rfc = RandomForestClassifier()
rfc.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [5]:
forest_params = [{
    "criterion": ["entropy", "gini", "log_loss"],
    "max_leaf_nodes": [5, 50, 500, 500],
    "n_estimators": [10, 100, 1000],
    "max_features": ["sqrt", "log2", None]
}]

In [6]:
clf = GridSearchCV(rfc, forest_params, cv=10, scoring="accuracy")

In [7]:
clf.fit(X, y)
print(clf.best_params_)

{'criterion': 'gini', 'max_features': None, 'max_leaf_nodes': 5, 'n_estimators': 100}


In [8]:
model = clf.best_estimator_
model.fit(X, y)

In [9]:
def verify(model, X, y):
    verification = model.predict(X)
    error = np.sum(abs(verification - y)) / len(y)
    print(
        f"Percentage Error: {error:.0%}"
    )

In [10]:
verify(model, X, y)

Percentage Error: 19%


In [11]:
X_test = pd.get_dummies(test[features])
predictions = model.predict(X_test)

In [12]:
output = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': predictions
})
output.to_csv('submission_iter2.csv', index=False)