In [88]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
%matplotlib inline

In [89]:
# Read the preprocessed training set from disk
df_train = pd.read_csv('titanic_train_preprocessed2.csv')

# Separate the label column (y, what we predict) from the predictors (X)
y = df_train['Survived']
X = df_train.drop(columns='Survived')

# Hold out 25% of "titanic_train_preprocessed2.csv" for evaluation and train on
# the remaining 75%; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [90]:
# Configure the random forest; one hyperparameter per line so each is easy to tune
rfc = RandomForestClassifier(
    n_estimators=10000,
    criterion='gini',
    max_depth=7,
    min_samples_split=6,
    min_samples_leaf=5,
    max_features=3,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
# Train on the 75% training split, passing the features as a NumPy array
rfc.fit(X_train.to_numpy(), y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 6034 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 7184 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done 8434 tasks      | elapsed:   11.8s
[Parallel(n_jobs=-1)]: Done 9784 tasks      | elapsed:   13.6s
[Parallel(n_jobs=-1)]: Done 10000 out of 

In [91]:
# Predict the label of every row in the held-out 25% split
y_pred = rfc.predict(X_test.to_numpy())

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 2434 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 3184 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 4034 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 4984 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done 6034 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done 7184 tasks      | elapsed:    1.1s
[Parallel(n_jobs=8)]: Done 8434 tasks      | elapsed:    1.3s
[Parallel(n_jobs=8)]: Done 9784 tasks      | elapsed:    1.5s
[Parallel(n_jobs=8)]: Done 10000 out of 10000 | elapsed:

In [92]:
# Score the hold-out predictions against the true labels of the test split
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8251121076233184


In [93]:
# Evaluation is done: retrain the same classifier on the complete labelled set
# (train + hold-out rows) before scoring the unlabelled passengers
rfc.fit(X.to_numpy(), y)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 6034 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 7184 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-1)]: Done 8434 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done 9784 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done 10000 out of 

In [94]:
# Load the preprocessed features of the passengers that need a prediction
X_pred = pd.read_csv('titanic_test_preprocessed2.csv')
# Work around the NaN/inf error raised at prediction time: +/-inf is first turned
# into NaN, then every NaN is filled with the sentinel value 999.
# NOTE(review): the displayed features look scaled to [0, 1], so 999 lies far
# outside the training range - confirm this sentinel is the intended imputation.
X_pred = X_pred.replace([np.inf, -np.inf], np.nan).fillna(999)
X_pred.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Female,Male,Master,Miss,Mr,Mrs,Class 1,Class 2,Class 3
0,0.452723,0.0,0.0,0.015282,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.617566,0.125,0.0,0.013663,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0.815377,0.0,0.0,0.018909,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0.353818,0.0,0.0,0.016908,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,0.287881,0.125,0.111111,0.023984,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [95]:
# Convert the feature frame to a NumPy array for the classifier. nan_to_num would
# also zero out NaN and clamp +/-inf, although the loading cell has already
# replaced those values.
X_pred_convert = np.nan_to_num(X_pred, copy=True)

In [96]:
# Score the unlabelled passengers with the classifier refitted on all labelled data
y_pred = rfc.predict(X=X_pred_convert)


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 2434 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 3184 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 4034 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done 4984 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done 6034 tasks      | elapsed:    1.1s
[Parallel(n_jobs=8)]: Done 7184 tasks      | elapsed:    1.4s
[Parallel(n_jobs=8)]: Done 8434 tasks      | elapsed:    1.6s
[Parallel(n_jobs=8)]: Done 9784 tasks      | elapsed:    1.9s
[Parallel(n_jobs=8)]: Done 10000 out of 10000 | elapsed:

In [97]:
# Wrap the predicted labels (NumPy array) in a single-column DataFrame; the
# column is auto-named 0 until it is renamed below
predicted_data = pd.DataFrame(y_pred)

# Raw test file, read here for the passenger ids that go into the submission
df_test = pd.read_csv('titanic_test_data.csv')

# Predictions are paired with passengers purely by row position. If the raw file
# and the preprocessed feature file ever differ in length, concat would silently
# NaN-pad / misalign the submission, so fail loudly instead.
if len(df_test) != len(predicted_data):
    raise ValueError(
        f"Row count mismatch: {len(df_test)} passengers in 'titanic_test_data.csv' "
        f"but {len(predicted_data)} predictions"
    )

df_test = (
    pd.concat([df_test, predicted_data], axis=1)  # append predicted y as column 0
    .rename(columns={0: 'Survived'})              # give the new column its final name
    [['PassengerId', 'Survived']]                 # keep only the columns required for submission
)
df_test.head() # check if correct

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [98]:
# Write the two-column submission to disk, leaving out the DataFrame index
df_test.to_csv(path_or_buf='submission.csv', index=False)