In [49]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [50]:
df_train = pd.read_csv('_Titanic_train.csv')
df_test = pd.read_csv('_Titanic_test.csv')

## Preprocessing

In [51]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [52]:
def impute_age_manual2(cols):  # add missing values to age manually with this function (try with imputer too)
    Age = cols[0]
    Pclass = cols[1]
    
    if pd.isnull(Age):
        if Pclass == 1:
            return 37
        elif Pclass == 2:
            return 29
        else:
            return 24
    else:
        return Age
    
df_train['Age'] = df_train[['Age', 'Pclass']].apply(impute_age_manual2,axis=1)

In [53]:
df_train.drop('Cabin', axis=1, inplace=True)  #remove Cabin column - too many missing values

In [54]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [55]:
df_train.dropna(inplace=True)

In [56]:
male = pd.get_dummies(df_train['Sex'],drop_first=True) #convert male/female to 0/1. Check video why drop_first=True
embark = pd.get_dummies(df_train['Embarked'],drop_first=True) #same conversion for embark
pcla = pd.get_dummies(df_train['Pclass'], drop_first=True) #make numbers 1,2,3 categories - its different

In [57]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [58]:
df_train = pd.concat([df_train,pcla,male,embark],axis=1)  # add those converted columns
df_train.drop(['Sex','Embarked','Pclass'],axis=1,inplace=True) # drop unnecesary columns
df_train.head()

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,2,3,male,Q,S
0,1,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,0,1,1,0,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,0,0,0,0,0
2,3,1,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,0,1,0,0,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,0,0,0,0,1
4,5,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,0,1,1,0,1


In [59]:
df_train.drop(['Ticket'],axis=1,inplace=True)

In [60]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Fare,2,3,male,Q,S
0,1,0,"Braund, Mr. Owen Harris",22.0,1,0,7.25,0,1,1,0,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,71.2833,0,0,0,0,0
2,3,1,"Heikkinen, Miss. Laina",26.0,0,0,7.925,0,1,0,0,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,53.1,0,0,0,0,1
4,5,0,"Allen, Mr. William Henry",35.0,0,0,8.05,0,1,1,0,1


In [62]:
df_train.drop(['Name'],axis=1,inplace=True)
df_train.head()

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare,2,3,male,Q,S
0,1,0,22.0,1,0,7.25,0,1,1,0,1
1,2,1,38.0,1,0,71.2833,0,0,0,0,0
2,3,1,26.0,0,0,7.925,0,1,0,0,1
3,4,1,35.0,1,0,53.1,0,0,0,0,1
4,5,0,35.0,0,0,8.05,0,1,1,0,1


In [63]:
df_train.to_csv('Titanic_train_preprocessed.csv')

In [64]:
def impute_age_manual(cols):  # add missing values to age manually with this function (try with imputer too)
    Age = cols[0]
    Pclass = cols[1]
    
    if pd.isnull(Age):
        if Pclass == 1:
            return 37
        elif Pclass == 2:
            return 29
        else:
            return 24
    else:
        return Age
    
df_test['Age'] = df_test[['Age', 'Pclass']].apply(impute_age_manual2,axis=1)

In [65]:
df_test.drop('Cabin', axis=1, inplace=True)  #remove Cabin column - too many missing values
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,S


In [66]:
male = pd.get_dummies(df_test['Sex'],drop_first=True) 
embark = pd.get_dummies(df_test['Embarked'],drop_first=True)
pcla = pd.get_dummies(df_test['Pclass'], drop_first=True)

df_test = pd.concat([df_test,pcla,male,embark],axis=1)
df_test.drop(['Sex','Embarked','Name','Ticket','Pclass'],axis=1,inplace=True) 

In [67]:
df_test.head()

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,2,3,male,Q,S
0,892,34.5,0,0,7.8292,0,1,1,1,0
1,893,47.0,1,0,7.0,0,1,0,0,1
2,894,62.0,0,0,9.6875,1,0,1,1,0
3,895,27.0,0,0,8.6625,0,1,1,0,1
4,896,22.0,1,1,12.2875,0,1,0,0,1


In [68]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare,2,3,male,Q,S
0,1,0,22.0,1,0,7.25,0,1,1,0,1
1,2,1,38.0,1,0,71.2833,0,0,0,0,0
2,3,1,26.0,0,0,7.925,0,1,0,0,1
3,4,1,35.0,1,0,53.1,0,0,0,0,1
4,5,0,35.0,0,0,8.05,0,1,1,0,1


In [70]:
df_test.to_csv('Titanic_test_preprocessed.csv')

In [71]:
df_test = pd.read_csv('Titanic_test_preprocessed.csv')
df_test.drop(['Unnamed: 0'],axis=1,inplace=True) 
df_test.head()

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,2,3,male,Q,S
0,892,34.5,0,0,7.8292,0,1,1,1,0
1,893,47.0,1,0,7.0,0,1,0,0,1
2,894,62.0,0,0,9.6875,1,0,1,1,0
3,895,27.0,0,0,8.6625,0,1,1,0,1
4,896,22.0,1,1,12.2875,0,1,0,0,1


## Train the data

In [72]:
X_train = df_train.drop('Survived',axis=1)
y_train = df_train['Survived']

In [81]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=10000)
rfc.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [82]:
df_test = pd.read_csv('Titanic_test_preprocessed.csv')
X_test = df_test[['PassengerId', 'Age', 'SibSp', 'Parch', 'Fare', '2', '3', 'male', 'Q', 'S']]
rfc_predict = rfc.predict(X_test)

In [83]:
rfc_pred_data = pd.DataFrame(rfc_predict)
df_test = pd.concat([df_test,rfc_pred_data],axis=1) 
df_test.head()

Unnamed: 0.1,Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,2,3,male,Q,S,0
0,0,892,34.5,0,0,7.8292,0,1,1,1,0,0
1,1,893,47.0,1,0,7.0,0,1,0,0,1,0
2,2,894,62.0,0,0,9.6875,1,0,1,1,0,0
3,3,895,27.0,0,0,8.6625,0,1,1,0,1,0
4,4,896,22.0,1,1,12.2875,0,1,0,0,1,0


In [84]:
df_test.columns

Index([ 'Unnamed: 0', 'PassengerId',         'Age',       'SibSp',
             'Parch',        'Fare',           '2',           '3',
              'male',           'Q',           'S',             0],
      dtype='object')

In [85]:
df_test.drop(['Unnamed: 0', 'Age', 'SibSp', 'Parch', 'Fare', '2', '3', 'male', 'Q', 'S'],axis=1,inplace=True) 
df_test.head()

Unnamed: 0,PassengerId,0
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [86]:
df_test.columns = ['PassengerId', 'Survived']
df_test.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [87]:
df_test.to_csv('Titanic_result_RndForest.csv')