# Titanic. Random Forest

# In [29]:
#data analysis libraries 
import numpy as np
import pandas as pd

#visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Load the train and test CSV files into DataFrames
train, test = map(pd.read_csv, ("../input/train.csv", "../input/test.csv"))

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score

# Column-name constants used throughout the script.

# Columns read from the CSV files
PASSENGER_ID = "PassengerId"
SURVIVED = "Survived"
PCLASS = "Pclass"
NAME = "Name"
SEX = "Sex"
AGE = "Age"
SIBSP = "SibSp"
PARCH = "Parch"
TICKET = "Ticket"
FARE = "Fare"
CABIN = "Cabin"
EMBARKED = "Embarked"

# Columns created by this script
TITLE = "Title"
AGEBIN = "AgeBin"
AGEBIN_CODE = "AgeBin_Code"

# Obtain title
def _extract_title(names):
    """Return, per name, the text after the first ", " up to the next "."."""
    return names.str.split(", ", expand=True)[1].str.split(".", expand=True)[0]


# Each frame must derive the title from its OWN names; the test titles were
# previously taken from the training names, so they belonged to unrelated
# passengers.
train[TITLE] = _extract_title(train[NAME])
test[TITLE] = _extract_title(test[NAME])

# Calculate Age
# Missing ages are replaced by the mean age of passengers with the same title.
# The means come from the training data only and are applied to both frames.
AverageAgeByTitle = train[[TITLE, AGE]].groupby(TITLE, as_index=False).mean()
age_by_title = AverageAgeByTitle.set_index(TITLE)[AGE]

# Fallback for titles with no known age in the training data (or titles that
# only occur in the test data); computed before any value is imputed.
fallback_age = train[AGE].mean()

for frame in (train, test):
    imputed_age = frame[TITLE].map(age_by_title).fillna(fallback_age)
    # Assign the column back instead of using chained indexing
    # (frame[AGE][mask] = ...), which may write to a temporary copy and is a
    # no-op under pandas Copy-on-Write.
    frame[AGE] = frame[AGE].fillna(imputed_age)

# Remove the Name and Fare columns from both frames, in place
for frame in (train, test):
    frame.drop(columns=[NAME, FARE], inplace=True)

# Fill NA
# Use the most frequent training value for both frames so the imputation is
# learnt from the training data only.
embarked_mode = train[EMBARKED].mode()[0]

# Assign the result back: fillna(inplace=True) on a column selection acts on
# an intermediate object, triggers a pandas warning and does not modify the
# frame under Copy-on-Write.
train[EMBARKED] = train[EMBARKED].fillna(embarked_mode)
test[EMBARKED] = test[EMBARKED].fillna(embarked_mode)

# One Hot Encoder
categories = [SEX, EMBARKED]

# The dense/sparse switch is deliberately not passed: it was renamed from
# `sparse` to `sparse_output` in scikit-learn 1.2 and the old keyword was
# removed in 1.4. The default sparse result is densified with .toarray(),
# which works on every version.
OH_encoder = OneHotEncoder(handle_unknown='ignore')
OH_encoder.fit(train[categories])

# Give the indicator columns string names ("<column>_<category>").
# Default integer names would be mixed with the string names of the other
# columns, which scikit-learn >= 1.2 rejects when fitting on a DataFrame.
OH_col_names = [
    f"{column}_{value}"
    for column, values in zip(categories, OH_encoder.categories_)
    for value in values
]

# Reuse each frame's index so that the column-wise concat below lines the
# rows up with their source rows.
OH_cols_train = pd.DataFrame(
    OH_encoder.transform(train[categories]).toarray(),
    columns=OH_col_names,
    index=train.index,
)
OH_cols_test = pd.DataFrame(
    OH_encoder.transform(test[categories]).toarray(),
    columns=OH_col_names,
    index=test.index,
)

# Remove categorical columns (will replace with one-hot encoding)
train.drop(categories, axis=1, inplace=True)
test.drop(categories, axis=1, inplace=True)

train = pd.concat([train, OH_cols_train], axis=1)
test = pd.concat([test, OH_cols_test], axis=1)

# Age
# Derive the five equal-width bin edges from the training ages only and reuse
# them for the test frame. Cutting each frame independently yields different
# edges, so the same bin code would stand for different age ranges.
_, age_bin_edges = pd.cut(train[AGE].astype(int), 5, retbins=True)

# Open the outer bins so that test ages outside the training range still
# fall into the first/last bin instead of becoming NaN.
age_bin_edges[0], age_bin_edges[-1] = -np.inf, np.inf

train[AGEBIN] = pd.cut(train[AGE].astype(int), age_bin_edges)
test[AGEBIN] = pd.cut(test[AGE].astype(int), age_bin_edges)

# Both columns share the same ordered categories, so the categorical codes
# (0..4) are directly comparable between train and test.
train[AGEBIN_CODE] = train[AGEBIN].cat.codes
test[AGEBIN_CODE] = test[AGEBIN].cat.codes

# Add Family size
# Sum of the SibSp and Parch columns of the same row.
FAMILY_SIZE = 'FamilySize'
train[FAMILY_SIZE] = train[SIBSP] + train[PARCH]
# Use the test frame's own columns; the training columns were used here
# before, which assigned the family sizes of unrelated training rows.
test[FAMILY_SIZE] = test[SIBSP] + test[PARCH]

# Remove the Ticket, Cabin, Title and AgeBin columns from both frames, in place
for frame in (train, test):
    frame.drop(columns=[TICKET, CABIN, TITLE, AGEBIN], inplace=True)

# Train model
# Feature matrices: everything except the label and the passenger id.
target = train[SURVIVED]
predictors = train.drop(columns=[SURVIVED, PASSENGER_ID])
real_predictors = test.drop(columns=[PASSENGER_ID])

# Hold out 12% of the training rows for validation.
x_train, x_val, y_train, y_val = train_test_split(
    predictors, target, test_size=0.12, random_state=0
)

randomforest = RandomForestClassifier(n_estimators=60, max_depth=4, random_state=1)
randomforest.fit(x_train, y_train)

# Validation accuracy as a percentage with two decimals.
y_pred = randomforest.predict(x_val)
acc_randomforest = round(accuracy_score(y_pred, y_val) * 100, 2)
print(acc_randomforest)


# Output: 85.05



from matplotlib.legend_handler import HandlerLine2D

# Compare train and validation accuracy for forests of 52, 54, ..., 68 trees.
estimators = list(range(52, 70, 2))
train_results, val_results = [], []

for estimator in estimators:
    randomforest = RandomForestClassifier(n_estimators=estimator, random_state=1)
    randomforest.fit(x_train, y_train)

    train_accuracy = accuracy_score(randomforest.predict(x_train), y_train)
    val_accuracy = accuracy_score(randomforest.predict(x_val), y_val)

    # Store accuracies as percentages with two decimals.
    train_results.append(round(train_accuracy * 100, 2))
    val_results.append(round(val_accuracy * 100, 2))

# Blue: training accuracy, red: validation accuracy.
(line1,) = plt.plot(estimators, train_results, "b", label="Train AS")
(line2,) = plt.plot(estimators, val_results, "r", label="Val AS")
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.xlabel("estimators")
plt.ylabel("AS score")
plt.show()

# Compare train and validation accuracy for tree depths 1..9.
train_results = []
val_results = []
depths = list(range(1, 10))
for depth in depths:
    randomforest = RandomForestClassifier(max_depth=depth, random_state=1)
    randomforest.fit(x_train, y_train)

    # Store accuracies as percentages with two decimals.
    train_pred = randomforest.predict(x_train)
    train_results.append(round(accuracy_score(train_pred, y_train) * 100, 2))
    val_pred = randomforest.predict(x_val)
    val_results.append(round(accuracy_score(val_pred, y_val) * 100, 2))

# Blue: training accuracy, red: validation accuracy.
line1, = plt.plot(depths, train_results, "b", label="Train AS")
line2, = plt.plot(depths, val_results, "r", label="Val AS")
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel("AS score")
# The x axis shows the swept max_depth values; the label used to say
# "estimators", copied from the previous plot.
plt.xlabel("max depth")
plt.show()



# In [30]:

# Fit the final model on all training rows and predict the test set.
real_model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
real_model.fit(predictors, target)
predictions = real_model.predict(real_predictors)

# Write one row per test passenger: its id and the predicted label.
output = pd.DataFrame(
    {
        'PassengerId': test['PassengerId'],
        'Survived': predictions,
    }
)
output.to_csv('my_submission.csv', index=False)