In [249]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import preprocessing
import matplotlib.pyplot as plt
%matplotlib inline

In [250]:
# Load the raw train and test splits from CSV files in the working directory.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('_Titanic_train.csv', '_Titanic_test.csv')
)

## Preprocessing

### Training data

In [251]:
# Preview the first rows of the raw training data.
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [252]:
# Impute missing ages with the median age of the training set.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: that form is a chained in-place update, which pandas
# deprecates and which leaves df_train unchanged under Copy-on-Write.
df_train['Age'] = df_train['Age'].fillna(df_train['Age'].median())

In [253]:
# select the title in name - all names are in format "Surname, Title. Name"
def get_title(name):
    """Extract the honorific from a name formatted "Surname, Title. Given names".

    Args:
        name: Raw value from the ``Name`` column.

    Returns:
        The stripped text between the first comma and the following period
        (e.g. ``'Mr'``). If the name has no comma, the text before the first
        period is used instead. ``'Unknown'`` is returned when the value is
        not a string or contains no period.
    """
    # Missing names arrive as float NaN; `'.' in name` would raise TypeError.
    if not isinstance(name, str) or '.' not in name:
        return 'Unknown'

    # Indexing [1] unconditionally raised IndexError for names without a
    # comma; fall back to the whole string in that case.
    parts = name.split(',')
    after_surname = parts[1] if len(parts) > 1 else parts[0]
    return after_surname.split('.')[0].strip()

# Collect every distinct title found in the training names, sorted alphabetically.
titles = sorted({get_title(full_name) for full_name in df_train['Name']})
print('Different titles in dataset:')
print(len(titles), ':', titles)
print()

# Normalize the titles, returning 'Mr', 'Master', 'Miss' or 'Mrs'
def replace_titles(x):
    """Collapse a passenger's raw title into one of the common titles.

    Args:
        x: Row-like mapping with a ``'Title'`` entry; ``'Sex'`` is read only
            when the title is ``'Dr'``.

    Returns:
        ``'Mr'``, ``'Mrs'`` or ``'Miss'`` for the titles covered by the rules
        below; any other title (e.g. ``'Master'``) is returned unchanged.
    """
    raw_title = x['Title']

    # 'Dr' is the only title whose mapping depends on the passenger's sex.
    if raw_title == 'Dr':
        return 'Mr' if x['Sex'] == 'male' else 'Mrs'

    grouping_rules = (
        ('Mr', ('Capt', 'Col', 'Don', 'Jonkheer', 'Major', 'Rev', 'Sir')),
        ('Mrs', ('the Countess', 'Mme', 'Lady', 'Dona')),
        ('Miss', ('Mlle', 'Ms')),
    )
    for normalized, aliases in grouping_rules:
        if raw_title in aliases:
            return normalized

    return raw_title

# Extract the raw title from every name into a new 'Title' column.
df_train['Title'] = [get_title(full_name) for full_name in df_train['Name']]

# Normalize the raw titles row by row (replace_titles reads 'Title' and 'Sex').
df_train['Title'] = df_train.apply(replace_titles, axis=1)

# Show how many passengers ended up under each normalized title.
print('Title column values. Males and females are the same that for the "Sex" column:')
print(df_train['Title'].value_counts())

Different titles in dataset:
17 : ['Capt', 'Col', 'Don', 'Dr', 'Jonkheer', 'Lady', 'Major', 'Master', 'Miss', 'Mlle', 'Mme', 'Mr', 'Mrs', 'Ms', 'Rev', 'Sir', 'the Countess']

Title column values. Males and females are the same that for the "Sex" column:
Mr        537
Miss      185
Mrs       129
Master     40
Name: Title, dtype: int64


In [254]:
# Preview the training data with the new 'Title' column.
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr


In [255]:
# One-hot encode the categorical columns. drop_first=True omits the first
# category of each column, leaving it as the implicit baseline.
male, embark, pcla, salutation = (
    pd.get_dummies(df_train[column], drop_first=True)
    for column in ('Sex', 'Embarked', 'Pclass', 'Title')
)

# Append the indicator columns, then remove the columns they replace along
# with the free-text / identifier columns that are not used as features.
df_train = pd.concat([df_train, pcla, male, embark, salutation], axis=1).drop(
    columns=['Sex', 'Embarked', 'Pclass', 'Cabin', 'Title', 'Ticket', 'Name']
)
df_train.head()

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare,2,3,male,Q,S,Miss,Mr,Mrs
0,1,0,22.0,1,0,7.25,0,1,1,0,1,0,1,0
1,2,1,38.0,1,0,71.2833,0,0,0,0,0,0,0,1
2,3,1,26.0,0,0,7.925,0,1,0,0,1,1,0,0
3,4,1,35.0,1,0,53.1,0,0,0,0,1,0,0,1
4,5,0,35.0,0,0,8.05,0,1,1,0,1,0,1,0


In [256]:
# Save the encoded training frame before Age/Fare are scaled.
df_train.to_csv('Titanic_train_preprocessed.csv', index=False)

In [257]:
# Standardize the continuous features Age and Fare.
# The scaler is reached through the imported `preprocessing` module: the bare
# name `StandardScaler` is not imported by the import cell, so calling it
# directly raises NameError on a fresh kernel.
df_train[['Age', 'Fare']] = preprocessing.StandardScaler().fit_transform(df_train[['Age', 'Fare']])

In [258]:
# Preview the training data after Age and Fare were standardized.
df_train.head()

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare,2,3,male,Q,S,Miss,Mr,Mrs
0,1,0,-0.565736,1,0,-0.502445,0,1,1,0,1,0,1,0
1,2,1,0.663861,1,0,0.786845,0,0,0,0,0,0,0,1
2,3,1,-0.258337,0,0,-0.488854,0,1,0,0,1,1,0,0
3,4,1,0.433312,1,0,0.42073,0,0,0,0,1,0,0,1
4,5,0,0.433312,0,0,-0.486337,0,1,1,0,1,0,1,0


In [259]:
# Save the training frame with scaled Age/Fare.
df_train.to_csv('Titanic_train_preprocessed_scaled.csv', index=False)

### Test data

In [260]:
# Impute missing Age and Fare values in the test set with that column's median.
# Results are assigned back instead of calling fillna(inplace=True) on the
# selected column: that form is a chained in-place update, which pandas
# deprecates and which leaves df_test unchanged under Copy-on-Write.
for column in ('Age', 'Fare'):
    df_test[column] = df_test[column].fillna(df_test[column].median())

In [261]:
# same for test data
def get_title(name):
    """Extract the honorific from a name formatted "Surname, Title. Given names".

    This redefines the helper from the training-data section so the cell can
    run on its own; both definitions are meant to behave identically.

    Args:
        name: Raw value from the ``Name`` column.

    Returns:
        The stripped text between the first comma and the following period
        (e.g. ``'Mr'``). If the name has no comma, the text before the first
        period is used instead. ``'Unknown'`` is returned when the value is
        not a string or contains no period.
    """
    # Missing names arrive as float NaN; `'.' in name` would raise TypeError.
    if not isinstance(name, str) or '.' not in name:
        return 'Unknown'

    # Indexing [1] unconditionally raised IndexError for names without a
    # comma; fall back to the whole string in that case.
    parts = name.split(',')
    after_surname = parts[1] if len(parts) > 1 else parts[0]
    return after_surname.split('.')[0].strip()

# Collect every distinct title found in the test names, sorted alphabetically.
titles = sorted({get_title(full_name) for full_name in df_test['Name']})
print('Different titles found on the dataset:')
print(len(titles), ':', titles)
print()

# Normalize the titles, returning 'Mr', 'Master', 'Miss' or 'Mrs'
def replace_titles(x):
    """Collapse a passenger's raw title into one of the common titles.

    Args:
        x: Row-like mapping with a ``'Title'`` entry; ``'Sex'`` is read only
            when the title is ``'Dr'``.

    Returns:
        ``'Mr'``, ``'Mrs'`` or ``'Miss'`` for the titles covered by the rules
        below; any other title (e.g. ``'Master'``) is returned unchanged.
    """
    raw_title = x['Title']

    # 'Dr' is the only title whose mapping depends on the passenger's sex.
    if raw_title == 'Dr':
        return 'Mr' if x['Sex'] == 'male' else 'Mrs'

    grouping_rules = (
        ('Mr', ('Capt', 'Col', 'Don', 'Jonkheer', 'Major', 'Rev', 'Sir')),
        ('Mrs', ('the Countess', 'Mme', 'Lady', 'Dona')),
        ('Miss', ('Mlle', 'Ms')),
    )
    for normalized, aliases in grouping_rules:
        if raw_title in aliases:
            return normalized

    return raw_title

# Extract the raw title from every name into a new 'Title' column.
df_test['Title'] = [get_title(full_name) for full_name in df_test['Name']]

# Normalize the raw titles row by row (replace_titles reads 'Title' and 'Sex').
df_test['Title'] = df_test.apply(replace_titles, axis=1)

# Show how many passengers ended up under each normalized title.
print('Title column values. Males and females are the same that for the "Sex" column:')
print(df_test['Title'].value_counts())

Different titles found on the dataset:
9 : ['Col', 'Dona', 'Dr', 'Master', 'Miss', 'Mr', 'Mrs', 'Ms', 'Rev']

Title column values. Males and females are the same that for the "Sex" column:
Mr        245
Miss       79
Mrs        73
Master     21
Name: Title, dtype: int64


In [262]:
# One-hot encode the categorical columns of the test set. drop_first=True
# omits the first category of each column, leaving it as the implicit baseline.
male = pd.get_dummies(df_test['Sex'], drop_first=True)
embark = pd.get_dummies(df_test['Embarked'], drop_first=True)
pcla = pd.get_dummies(df_test['Pclass'], drop_first=True)
salutation = pd.get_dummies(df_test['Title'], drop_first=True)

# Append the indicator columns, then remove the columns they replace along
# with the free-text / identifier columns that are not used as features.
df_test = pd.concat([df_test, pcla, male, embark, salutation], axis=1)
df_test = df_test.drop(columns=['Sex', 'Embarked', 'Pclass', 'Cabin', 'Title', 'Ticket', 'Name'])

# Train and test are encoded independently, so a category present in only one
# of them would silently yield different feature columns. Fail loudly instead.
expected_columns = [column for column in df_train.columns if column != 'Survived']
assert list(df_test.columns) == expected_columns, (
    'Test columns differ from training columns: '
    f'{list(df_test.columns)} != {expected_columns}'
)

df_test.to_csv('Titanic_test_preprocessed.csv', index=False)

In [263]:
# Preview the encoded test data before scaling.
df_test.head()

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,2,3,male,Q,S,Miss,Mr,Mrs
0,892,34.5,0,0,7.8292,0,1,1,1,0,0,1,0
1,893,47.0,1,0,7.0,0,1,0,0,1,0,0,1
2,894,62.0,0,0,9.6875,1,0,1,1,0,0,1,0
3,895,27.0,0,0,8.6625,0,1,1,0,1,0,1,0
4,896,22.0,1,1,12.2875,0,1,0,0,1,0,0,1


In [264]:
# Standardize Age and Fare in the test set.
# The scaler is reached through the imported `preprocessing` module: the bare
# name `StandardScaler` is not imported by the import cell, so calling it
# directly raises NameError on a fresh kernel.
# NOTE(review): this fits a new scaler on the test data, so test values are
# scaled with different statistics than the training values. Consider reusing
# the scaler fitted on the training set.
df_test[['Age', 'Fare']] = preprocessing.StandardScaler().fit_transform(df_test[['Age', 'Fare']])

In [265]:
# Preview the test data after Age and Fare were standardized.
df_test.head()

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,2,3,male,Q,S,Miss,Mr,Mrs
0,892,0.386231,0,0,-0.497413,0,1,1,1,0,0,1,0
1,893,1.37137,1,0,-0.512278,0,1,0,0,1,0,0,1
2,894,2.553537,0,0,-0.4641,1,0,1,1,0,0,1,0
3,895,-0.204852,0,0,-0.482475,0,1,1,0,1,0,1,0
4,896,-0.598908,1,1,-0.417492,0,1,0,0,1,0,0,1


In [266]:
# Save the test frame with scaled Age/Fare.
df_test.to_csv('Titanic_test_preprocessed_scaled.csv', index=False)

## Train the model and predict

In [267]:
# Separate the target column from the feature columns.
# NOTE(review): PassengerId remains among the features (see X_train.head());
# confirm that feeding an identifier to the model is intended.
y_train = df_train['Survived']
X_train = df_train.drop(columns='Survived')

In [268]:
# Preview the feature matrix used for training.
X_train.head()

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,2,3,male,Q,S,Miss,Mr,Mrs
0,1,-0.565736,1,0,-0.502445,0,1,1,0,1,0,1,0
1,2,0.663861,1,0,0.786845,0,0,0,0,0,0,0,1
2,3,-0.258337,0,0,-0.488854,0,1,0,0,1,1,0,0
3,4,0.433312,1,0,0.42073,0,0,0,0,1,0,0,1
4,5,0.433312,0,0,-0.486337,0,1,1,0,1,0,1,0


In [269]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the training features.
# max_features='sqrt' is what the former 'auto' alias meant for classifiers;
# 'auto' is deprecated and rejected by newer scikit-learn releases.
# oob_score=True makes the out-of-bag accuracy available as rfc.oob_score_.
rfc = RandomForestClassifier(
    criterion='gini',
    n_estimators=100000,
    max_depth=5,
    min_samples_split=4,
    min_samples_leaf=5,
    max_features='sqrt',
    oob_score=True,
    random_state=42,  # fixed seed for reproducible forests
    n_jobs=-1,        # use all available cores
    verbose=1,
)
rfc.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 4992 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 6042 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 7192 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 8442 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 9792 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 11242 tasks  

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=4,
            min_weight_fraction_leaf=0.0, n_estimators=100000, n_jobs=-1,
            oob_score=True, random_state=42, verbose=1, warm_start=False)

In [270]:
# List the test columns to compare against the training features.
df_test.columns

Index(['PassengerId',         'Age',       'SibSp',       'Parch',
              'Fare',             2,             3,        'male',
                 'Q',           'S',        'Miss',          'Mr',
               'Mrs'],
      dtype='object')

In [271]:
# Select the test features by the training frame's own columns rather than a
# hand-copied list, so the model always sees the same features in the same
# order it was fitted on (a missing column raises KeyError here).
X_test = df_test[X_train.columns]
predictions = rfc.predict(X_test)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 2442 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 3192 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 4042 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 4992 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 6042 tasks      | elapsed:    1.0s
[Parallel(n_jobs=4)]: Done 7192 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done 8442 tasks      | elapsed:    1.4s
[Parallel(n_jobs=4)]: Done 9792 tasks      | elapsed:    1.6s
[Parallel(n_jobs=4)]: Done 11242 tasks      | elapsed:  

In [272]:
# Append the predictions to the test frame as a new column (named 0 by the
# DataFrame constructor). concat(axis=1) aligns rows on index labels, so the
# predictions reuse df_test's index to stay attached to the right passengers
# even if df_test does not have a default 0..n-1 index.
pred_data = pd.DataFrame(predictions, index=df_test.index)
df_test = pd.concat([df_test, pred_data], axis=1)
df_test.head()

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,2,3,male,Q,S,Miss,Mr,Mrs,0
0,892,0.386231,0,0,-0.497413,0,1,1,1,0,0,1,0,0
1,893,1.37137,1,0,-0.512278,0,1,0,0,1,0,0,1,1
2,894,2.553537,0,0,-0.4641,1,0,1,1,0,0,1,0,0
3,895,-0.204852,0,0,-0.482475,0,1,1,0,1,0,1,0,0
4,896,-0.598908,1,1,-0.417492,0,1,0,0,1,0,0,1,1


In [273]:
# Confirm the prediction column (named 0) was appended.
df_test.columns

Index(['PassengerId',         'Age',       'SibSp',       'Parch',
              'Fare',             2,             3,        'male',
                 'Q',           'S',        'Miss',          'Mr',
               'Mrs',             0],
      dtype='object')

In [274]:
# Keep only the passenger id and the prediction: drop every feature column,
# then give the two remaining columns their submission names.
feature_columns = [
    'Age', 'SibSp', 'Parch', 'Fare',
    2, 3,
    'male', 'Q', 'S',
    'Miss', 'Mr', 'Mrs',
]
df_test = df_test.drop(columns=feature_columns)
df_test.columns = ['PassengerId', 'Survived']
df_test.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [275]:
# Write the PassengerId/Survived predictions to the submission file.
df_test.to_csv('submission_scaled.csv', index=False)