# <center>Titanic: Machine Learning from Disaster</center>
### <center>Using Random Forest Regression to Predict Survival</center>

In [1]:
import numpy as np
import pandas as pd

# import training and testing datasets
csv_test = pd.read_csv("data/titanic/test.csv")
csv_train = pd.read_csv("data/titanic/train.csv")

In [2]:
csv_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
print(f"Training Dataset has {csv_train.shape[0]} rows and {csv_train.shape[1]} columns")
print(f"Testing Dataset has {csv_test.shape[0]} rows and {csv_test.shape[1]} columns")

Training Dataset has 891 rows and 12 columns
Testing Dataset has 418 rows and 11 columns


### Preprocess:
Convert strings to numbers      
Split features and value

In [4]:
def convert2Number(dataset, columnsNot2Convert):
    for column in list(dataset.columns.values):
        if column in columnsNot2Convert:
            continue
        dataset[column] = dataset[column].astype('category').cat.codes

In [5]:
columnsNot2Convert = ["PassengerId"]
convert2Number(csv_train, columnsNot2Convert)
convert2Number(csv_test, columnsNot2Convert)

In [6]:
X_train = csv_train.drop("Survived", axis=1)
y_train = csv_train["Survived"] 

In [7]:
csv_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,2,108,1,28,1,0,523,18,-1,2
1,2,1,0,190,0,51,1,0,596,207,81,0
2,3,1,2,353,0,34,0,0,669,41,-1,2
3,4,1,0,272,0,47,1,0,49,189,55,2
4,5,0,2,15,1,47,0,0,472,43,-1,2


### Random Forest

In [8]:
# import random forest
from sklearn.ensemble import RandomForestRegressor

# create
RANDOM_STATE = 105
rf = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=RANDOM_STATE)

In [9]:
# training
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
           oob_score=False, random_state=105, verbose=0, warm_start=False)

In [10]:
# predicting
X_test = csv_test
y_test = rf.predict(X_test)

In [11]:
# testing
# overfitting low training error score, high testing error score
# underfitting high training error score, high testing error score

# calculate r-squared that gives us the accuracy
# testing score:
#         0 = bad 
#         1 = perfect
print(f"r score of training set = {rf.score(X_train, y_train)}")
print(f"r score of testing set = {rf.score(X_test, y_test)}")

r score of training set = 0.9260006747195858
r score of testing set = 1.0


Random Forest prediction returns a percentage, whether or not that individual died or not.      
Simply convert that percentage to 0, 1 by rounding      
    0 = died      
    1 = survived

In [12]:
def survivedORdied(predictions):
    rounded = [int(round(number)) for number in predictions]
    return rounded    

In [13]:
y_test_rounded = survivedORdied(y_test)

In [14]:
y_test

array([0.092, 0.394, 0.136, 0.112, 0.536, 0.078, 0.538, 0.462, 0.635,
       0.081, 0.093, 0.081, 0.796, 0.203, 0.812, 0.946, 0.183, 0.362,
       0.36 , 0.467, 0.125, 0.628, 0.941, 0.276, 0.965, 0.068, 0.863,
       0.304, 0.586, 0.257, 0.066, 0.078, 0.336, 0.509, 0.374, 0.246,
       0.706, 0.643, 0.105, 0.262, 0.174, 0.404, 0.028, 0.859, 0.726,
       0.05 , 0.442, 0.116, 0.886, 0.33 , 0.395, 0.115, 0.872, 0.975,
       0.146, 0.41 , 0.05 , 0.348, 0.106, 0.966, 0.057, 0.157, 0.198,
       0.665, 0.512, 0.938, 0.7  , 0.269, 0.476, 0.965, 0.615, 0.111,
       0.591, 0.376, 0.984, 0.567, 0.073, 0.76 , 0.258, 0.497, 0.732,
       0.337, 0.288, 0.079, 0.168, 0.221, 0.647, 0.612, 0.745, 0.796,
       0.451, 0.097, 0.909, 0.084, 0.37 , 0.077, 0.833, 0.076, 0.698,
       0.067, 0.81 , 0.19 , 0.099, 0.098, 0.803, 0.109, 0.223, 0.107,
       0.113, 0.421, 0.205, 0.82 , 0.944, 0.639, 0.946, 0.291, 0.2  ,
       0.652, 0.704, 0.922, 0.943, 0.118, 0.845, 0.322, 0.118, 0.372,
       0.109, 0.636,

In [15]:
y_test_rounded

[0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,


### Results
Write results to a csv file

In [16]:
import csv

In [17]:
with open("submission_file.csv", "w") as submission_file:
    fieldnames = ["PassengerId", "Survived"]
    csv_writer = csv.DictWriter(submission_file, fieldnames=fieldnames)
    
    csv_writer.writeheader()
    for index in range(len(y_test)):
        csv_writer.writerow({"PassengerId": X_test["PassengerId"][index], "Survived": y_test_rounded[index]})