In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import preprocessing
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df_train = pd.read_csv('_Titanic_train.csv')
df_test = pd.read_csv('_Titanic_test.csv')

## Preprocessing

### Training data

In [3]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df_train['Age'].fillna(df_train['Age'].median(), inplace=True)  #fill n/a age with mean

In [5]:
# select the title in name - all names are in format "Surname, Title. Name"
def get_title(name):
    if '.' in name:
        return name.split(',')[1].split('.')[0].strip()
    else:
        return 'Unknown'

# A list with the all the different titles
titles = sorted(set([x for x in df_train.Name.map(lambda x: get_title(x))]))
print('Different titles in dataset:')
print(len(titles), ':', titles)
print()

# Normalize the titles, returning 'Mr', 'Master', 'Miss' or 'Mrs'
def replace_titles(x):
    title = x['Title']
    if title in ['Capt', 'Col', 'Don', 'Jonkheer', 'Major', 'Rev', 'Sir']:
        return 'Mr'
    elif title in ['the Countess', 'Mme', 'Lady', 'Dona']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['Sex']=='male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title

# create a new column for the titles
df_train['Title'] = df_train['Name'].map(lambda x: get_title(x))

# And replace the titles, so the are normalized to 'Mr', 'Miss' and 'Mrs'
df_train['Title'] = df_train.apply(replace_titles, axis=1)

# Check that the number of Mr, Mrs and Miss are the same that 'male' and 'female'
print('Title column values. Males and females are the same that for the "Sex" column:')
print(df_train.Title.value_counts())

Different titles in dataset:
17 : ['Capt', 'Col', 'Don', 'Dr', 'Jonkheer', 'Lady', 'Major', 'Master', 'Miss', 'Mlle', 'Mme', 'Mr', 'Mrs', 'Ms', 'Rev', 'Sir', 'the Countess']

Title column values. Males and females are the same that for the "Sex" column:
Title
Mr        537
Miss      185
Mrs       129
Master     40
Name: count, dtype: int64


In [6]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr


In [7]:
#convert columns/table
male = pd.get_dummies(df_train['Sex'],drop_first=True) #convert male/female to 0/1. Check video why drop_first=True
embark = pd.get_dummies(df_train['Embarked'],drop_first=True) #same conversion for embark
pcla = pd.get_dummies(df_train['Pclass'], drop_first=True) #make numbers 1,2,3 categories - its different
salutation = pd.get_dummies(df_train['Title'], drop_first=True) #the same with title

# visualise the changes, update table
df_train = pd.concat([df_train,pcla,male,embark,salutation],axis=1)  # add those converted columns
df_train.drop(['Sex','Embarked','Pclass','Cabin','Title','Ticket','Name'],axis=1,inplace=True) #drop unnecesary cols
df_train.head()

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare,2,3,male,Q,S,Miss,Mr,Mrs
0,1,0,22.0,1,0,7.25,False,True,True,False,True,False,True,False
1,2,1,38.0,1,0,71.2833,False,False,False,False,False,False,False,True
2,3,1,26.0,0,0,7.925,False,True,False,False,True,True,False,False
3,4,1,35.0,1,0,53.1,False,False,False,False,True,False,False,True
4,5,0,35.0,0,0,8.05,False,True,True,False,True,False,True,False


In [8]:
df_train.to_csv('Titanic_train_preprocessed.csv', index=False)

In [9]:
#scaler = preprocessing.StandardScaler()
#normalized_cols = ['Age', 'Fare']
#scaled_df_train = scaler.fit_transform(df_train)
df_train[['Age', 'Fare']] = StandardScaler().fit_transform(df_train[['Age', 'Fare']])
#scaled_df_train.shape
#selcolumns = scaled_df_train['Age', 'Fare']

NameError: name 'StandardScaler' is not defined

In [None]:
df_train.head()

In [None]:
df_train.to_csv('Titanic_train_preprocessed_scaled.csv', index=False)

### Test data

In [None]:
df_test['Age'].fillna(df_test['Age'].median(), inplace=True)
df_test['Fare'].fillna(df_test['Fare'].median(), inplace=True)
#median_fare = df_test.groupby(['Pclass', 'Parch', 'SibSp']).Fare.median()[3][0][0]
#df_test['Fare'] = df_test['Fare'].fillna(median_fare)

In [None]:
# same for test data
def get_title(name):
    if '.' in name:
        return name.split(',')[1].split('.')[0].strip()
    else:
        return 'Unknown'

# A list with the all the different titles
titles = sorted(set([x for x in df_test.Name.map(lambda x: get_title(x))]))
print('Different titles found on the dataset:')
print(len(titles), ':', titles)
print()

# Normalize the titles, returning 'Mr', 'Master', 'Miss' or 'Mrs'
def replace_titles(x):
    title = x['Title']
    if title in ['Capt', 'Col', 'Don', 'Jonkheer', 'Major', 'Rev', 'Sir']:
        return 'Mr'
    elif title in ['the Countess', 'Mme', 'Lady', 'Dona']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['Sex']=='male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title

# Lets create a new column for the titles
df_test['Title'] = df_test['Name'].map(lambda x: get_title(x))

# And replace the titles, so the are normalized to 'Mr', 'Miss' and 'Mrs'
df_test['Title'] = df_test.apply(replace_titles, axis=1)

# Check that the number of Mr, Mrs and Miss are the same that 'male' and 'female'
print('Title column values. Males and females are the same that for the "Sex" column:')
print(df_test.Title.value_counts())

In [None]:
#convert columns/table
male = pd.get_dummies(df_test['Sex'],drop_first=True) #convert male/female to 0/1. Check video why drop_first=True
embark = pd.get_dummies(df_test['Embarked'],drop_first=True) #same conversion for embark
pcla = pd.get_dummies(df_test['Pclass'], drop_first=True) #make numbers 1,2,3 categories - its different
salutation = pd.get_dummies(df_test['Title'], drop_first=True) #the same with title

# visualise the changes, update table
df_test = pd.concat([df_test,pcla,male,embark,salutation],axis=1)  # add those converted columns
df_test.drop(['Sex','Embarked','Pclass','Cabin','Title','Ticket','Name'],axis=1,inplace=True) # drop unnecesary columns
df_test.head()

df_test.to_csv('Titanic_test_preprocessed.csv', index=False)

In [None]:
df_test.head()

In [None]:
df_test[['Age', 'Fare']] = StandardScaler().fit_transform(df_test[['Age', 'Fare']])
#scaled_df_test.shape

#scaler = preprocessing.StandardScaler()
#scaled_df_test = scaler.fit_transform(df_test)
#scaled_df_test = pd.DataFrame(scaled_df_test, columns=['Age', 'Fare'])
#scaled_df_test.head()

In [None]:
#df_test.drop(['Age','Fare'],axis=1,inplace=True) 
df_test.head()

In [None]:
df_test.to_csv('Titanic_test_preprocessed_scaled.csv', index=False)

## Train the data

In [None]:
X_train = df_train.drop('Survived',axis=1)
y_train = df_train['Survived']

In [None]:
X_train.head()

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(criterion='gini',n_estimators=100000,max_depth=5,min_samples_split=4,min_samples_leaf=5,
                                       max_features='auto',oob_score=True,random_state=42,n_jobs=-1,verbose=1)
rfc.fit(X_train,y_train)

In [None]:
df_test.columns

In [None]:
X_test=df_test[['PassengerId', 'Age', 'SibSp', 'Parch', 'Fare', 2, 3, 'male', 'Q', 'S', 'Miss', 'Mr', 'Mrs']]
predictions = rfc.predict(X_test)

In [None]:
# add result to dataframe
pred_data = pd.DataFrame(predictions)
df_test = pd.concat([df_test,pred_data],axis=1) 
df_test.head()

In [None]:
df_test.columns

In [None]:
# remove unnecessary columns and rename result column 
df_test.drop(['Age',       'SibSp',       'Parch',
              'Fare',           2,           3,        'male',
                 'Q',           'S',        'Miss',          'Mr',
               'Mrs'],axis=1,inplace=True) 
df_test.columns = ['PassengerId', 'Survived']
df_test.head()

In [None]:
df_test.to_csv('submission_scaled.csv', index=False)