In [6]:
import numpy as np
import pandas as pd
import os

# TUTORIAL: https://www.kaggle.com/alexisbcook/titanic-tutorial

# Import model type - Random Forest Model.
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Read in the CSV data files as DataFrames.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [4]:
# From the gender_submission, let's test the hypothesis - did all women survive?
women = train_data.loc[train_data['Sex'] == 'female']['Survived']
# Get the rate of women that survived (sum of 1's) vs the number of women (length of the return).
rate_women = sum(women)/len(women)

print(f'% of women that survived: {round(rate_women*100,1)}%.')

% of women that survived: 74.2%.


In [5]:
# And let's see that same idea for men.
men = train_data.loc[train_data['Sex'] == 'male']['Survived']
rate_men = sum(men)/len(men)

print(f'% of men that survived: {round(rate_women*100,1)}%.')

% of men that survived: 74.2%.


In [9]:
# Store the binary survival column in the training data as a variable.
y = train_data['Survived']

# Store the features (or columns) of the training data we wish to use.
features = ['Pclass', 'Sex', 'SibSp', 'Parch']
# Use get_dummies: Convert categorical variable into dummy/indicator variables.
# Creates a DF of the data with the values in the columns and 0's for missing ones.
# Also binaries the female/male Sex column - 1 for True and 0 for False.
X = pd.get_dummies(train_data[features])
# Create a test DF from the train data in the same way.
X_test = pd.get_dummies(test_data[features])

# Create the model with the imported Random Forest Model (sklearn).
# n_estimators is the number of trees, max_ depth is how deep the trees
# go before returning a prediction of survivability.
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
# https://en.wikipedia.org/wiki/Random_forest
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
# Traing the model using the training set (X) and the results (y).
model.fit(X, y)
# Pass the test data into the model and predict the survivors - save the predictions.
predictions = model.predict(X_test)

# Save the output to a DataFrame and that DF to a CSV file.
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('my_submission.csv', index=False)
print("Submission file successfully saved.")

Submission file successfully saved.
