In [24]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

In [25]:
# load dataset
df_train = pd.read_csv('titanic_train_data.csv')

In [26]:
#df_train['Age'].isnull().sum()
df_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [27]:
df_train.drop(['PassengerId', 'Ticket'],axis=1,inplace=True)
df_train

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.2500,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.9250,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1000,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,13.0000,,S
887,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,30.0000,B42,S
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,23.4500,,S
889,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,30.0000,C148,C


In [28]:
df_train['Deck'] = df_train['Cabin'].str.slice(0,1) 
df_train['Deck'] = df_train['Deck'].map({"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F":6,"G":7, "T":8})
df_train['Deck'] = df_train['Deck'].fillna(0)
df_train['Deck'] = df_train['Deck'].astype(np.int64)
df_train.drop(['Cabin'],axis=1,inplace=True) 
df_train

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Deck
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.2500,S,0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,3
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.9250,S,0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1000,S,3
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.0500,S,0
...,...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,13.0000,S,0
887,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,30.0000,S,2
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,23.4500,S,0
889,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,30.0000,C,3


In [29]:
# Fill the missing values of Age
df_train['Age'].fillna(df_train['Age'].mean(), inplace=True)  #fill n/a age with mean value

common_value = 'S'
df_train['Embarked'] = df_train['Embarked'].fillna(common_value)


# create Age categories
def age_type(age):
    if age>0 and age<=5:
        return 'baby'
    elif(age>=6 and age<=10):
        return 'child'
    elif(age>=11 and age<=19):
        return 'teenager'
    elif(age>=20 and age<=30):
        return 'early adult'
    elif(age>=31 and age<=45):
        return 'adult'
    elif(age>=46 and age<=60):
        return 'late adult'
    else :
        return 'senior'
    

df_train['Age_type']=df_train['Age'].apply(age_type)
df_train

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Age_type
0,0,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,7.2500,S,0,early adult
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,71.2833,C,3,adult
2,1,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,7.9250,S,0,early adult
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,53.1000,S,3,adult
4,0,3,"Allen, Mr. William Henry",male,35.000000,0,0,8.0500,S,0,adult
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",male,27.000000,0,0,13.0000,S,0,early adult
887,1,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,30.0000,S,2,teenager
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,23.4500,S,0,early adult
889,1,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,30.0000,C,3,early adult


In [30]:
# Create new features

df_train['Relatives'] = df_train['SibSp'] + df_train['Parch']
df_train['Age_Class'] = df_train['Age'] * df_train['Pclass']
df_train['FarePerPerson']=df_train['Fare']/(df_train['Relatives']+1)

df_train

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Age_type,Relatives,Age_Class,FarePerPerson
0,0,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,7.2500,S,0,early adult,1,66.000000,3.62500
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,71.2833,C,3,adult,1,38.000000,35.64165
2,1,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,7.9250,S,0,early adult,0,78.000000,7.92500
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,53.1000,S,3,adult,1,35.000000,26.55000
4,0,3,"Allen, Mr. William Henry",male,35.000000,0,0,8.0500,S,0,adult,0,105.000000,8.05000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",male,27.000000,0,0,13.0000,S,0,early adult,0,54.000000,13.00000
887,1,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,30.0000,S,2,teenager,0,19.000000,30.00000
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,23.4500,S,0,early adult,3,89.097353,5.86250
889,1,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,30.0000,C,3,early adult,0,26.000000,30.00000


In [31]:
df_train.isnull().sum()

Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Fare             0
Embarked         0
Deck             0
Age_type         0
Relatives        0
Age_Class        0
FarePerPerson    0
dtype: int64

In [32]:
# Functions that returns the title from a name. All the name in the dataset has the format "Surname, Title. Name"
def get_title(name):
    if '.' in name:
        return name.split(',')[1].split('.')[0].strip()
    else:
        return 'Unknown'

# A list with the all the different titles
titles = sorted(set([x for x in df_train.Name.map(lambda x: get_title(x))]))

# Normalize the titles, returning 'Mr', 'Master', 'Miss' or 'Mrs'
def replace_titles(x):
    title = x['Title']
    if title in ['Capt', 'Col', 'Don', 'Jonkheer', 'Major', 'Rev', 'Sir']:
        return 'Mr'
    elif title in ['the Countess', 'Mme', 'Lady', 'Dona']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['Sex']=='male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title

# Lets create a new column for the titles
df_train['Title'] = df_train['Name'].map(lambda x: get_title(x))

# And replace the titles, so the are normalized to 'Mr', 'Miss' and 'Mrs'
df_train['Title'] = df_train.apply(replace_titles, axis=1)


#convert columns/table
male = pd.get_dummies(df_train['Sex'],drop_first=False) #convert male/female to 0/1. Check video why drop_first=True
pcla = pd.get_dummies(df_train['Pclass'], drop_first=False) #make dummy variables
salutation = pd.get_dummies(df_train['Title'], drop_first=False) 
embark = pd.get_dummies(df_train['Embarked'], drop_first=False) 
agelevel = pd.get_dummies(df_train['Age_type'],drop_first=False)


# visualise the changes, update table
df_train = pd.concat([df_train, male, salutation, pcla, embark, agelevel],axis=1)  # add those converted columns
df_train.drop(['Sex', 'Title', 'Pclass', 'Name', 'Embarked', 'Age_type'],axis=1,inplace=True) #drop unnecesary cols
df_train = df_train.rename(columns = {'female':'Female', 'male':'Male', 1:'Class 1', 2:'Class 2', 3:'Class 3'})
df_train = df_train.rename(columns = {'C':'EmbarkC', 'Q':'EmbarkQ', 'S':'EmbarkS'})

df_train

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Deck,Relatives,Age_Class,FarePerPerson,Female,...,EmbarkC,EmbarkQ,EmbarkS,adult,baby,child,early adult,late adult,senior,teenager
0,0,22.000000,1,0,7.2500,0,1,66.000000,3.62500,False,...,False,False,True,False,False,False,True,False,False,False
1,1,38.000000,1,0,71.2833,3,1,38.000000,35.64165,True,...,True,False,False,True,False,False,False,False,False,False
2,1,26.000000,0,0,7.9250,0,0,78.000000,7.92500,True,...,False,False,True,False,False,False,True,False,False,False
3,1,35.000000,1,0,53.1000,3,1,35.000000,26.55000,True,...,False,False,True,True,False,False,False,False,False,False
4,0,35.000000,0,0,8.0500,0,0,105.000000,8.05000,False,...,False,False,True,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,27.000000,0,0,13.0000,0,0,54.000000,13.00000,False,...,False,False,True,False,False,False,True,False,False,False
887,1,19.000000,0,0,30.0000,2,0,19.000000,30.00000,True,...,False,False,True,False,False,False,False,False,False,True
888,0,29.699118,1,2,23.4500,0,3,89.097353,5.86250,True,...,False,False,True,False,False,False,True,False,False,False
889,1,26.000000,0,0,30.0000,3,0,26.000000,30.00000,False,...,True,False,False,False,False,False,True,False,False,False


In [33]:
# from sklearn.preprocessing import StandardScaler
# X = StandardScaler().fit_transform(X)
# OR
# import sklearn
# X = sklearn.preprocessing.StandardScaler().fit_transform(X)

from sklearn.preprocessing import StandardScaler
df_train.iloc[:,9:] = StandardScaler().fit_transform(df_train.iloc[:,9:])
df_train

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Deck,Relatives,Age_Class,FarePerPerson,Female,...,EmbarkC,EmbarkQ,EmbarkS,adult,baby,child,early adult,late adult,senior,teenager
0,0,22.000000,1,0,7.2500,0,1,66.000000,3.62500,-0.737695,...,-0.482043,-0.307562,0.615838,-0.537992,-0.227921,-0.151533,1.054217,-0.311914,-0.173372,-0.355559
1,1,38.000000,1,0,71.2833,3,1,38.000000,35.64165,1.355574,...,2.074505,-0.307562,-1.623803,1.858763,-0.227921,-0.151533,-0.948571,-0.311914,-0.173372,-0.355559
2,1,26.000000,0,0,7.9250,0,0,78.000000,7.92500,1.355574,...,-0.482043,-0.307562,0.615838,-0.537992,-0.227921,-0.151533,1.054217,-0.311914,-0.173372,-0.355559
3,1,35.000000,1,0,53.1000,3,1,35.000000,26.55000,1.355574,...,-0.482043,-0.307562,0.615838,1.858763,-0.227921,-0.151533,-0.948571,-0.311914,-0.173372,-0.355559
4,0,35.000000,0,0,8.0500,0,0,105.000000,8.05000,-0.737695,...,-0.482043,-0.307562,0.615838,1.858763,-0.227921,-0.151533,-0.948571,-0.311914,-0.173372,-0.355559
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,27.000000,0,0,13.0000,0,0,54.000000,13.00000,-0.737695,...,-0.482043,-0.307562,0.615838,-0.537992,-0.227921,-0.151533,1.054217,-0.311914,-0.173372,-0.355559
887,1,19.000000,0,0,30.0000,2,0,19.000000,30.00000,1.355574,...,-0.482043,-0.307562,0.615838,-0.537992,-0.227921,-0.151533,-0.948571,-0.311914,-0.173372,2.812472
888,0,29.699118,1,2,23.4500,0,3,89.097353,5.86250,1.355574,...,-0.482043,-0.307562,0.615838,-0.537992,-0.227921,-0.151533,1.054217,-0.311914,-0.173372,-0.355559
889,1,26.000000,0,0,30.0000,3,0,26.000000,30.00000,-0.737695,...,2.074505,-0.307562,-1.623803,-0.537992,-0.227921,-0.151533,1.054217,-0.311914,-0.173372,-0.355559


In [34]:
df_train.iloc[:,1:9] = StandardScaler().fit_transform(df_train.iloc[:,1:9])
df_train

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Deck,Relatives,Age_Class,FarePerPerson,Female,...,EmbarkC,EmbarkQ,EmbarkS,adult,baby,child,early adult,late adult,senior,teenager
0,0,-0.592481,0.432793,-0.473674,-0.502445,-0.488461,0.059160,0.031376,-0.454798,-0.737695,...,-0.482043,-0.307562,0.615838,-0.537992,-0.227921,-0.151533,1.054217,-0.311914,-0.173372,-0.355559
1,1,0.638789,0.432793,-0.473674,0.786845,1.398325,0.059160,-0.818937,0.438994,1.355574,...,2.074505,-0.307562,-1.623803,1.858763,-0.227921,-0.151533,-0.948571,-0.311914,-0.173372,-0.355559
2,1,-0.284663,-0.474545,-0.473674,-0.488854,-0.488461,-0.560975,0.395796,-0.334757,1.355574,...,-0.482043,-0.307562,0.615838,-0.537992,-0.227921,-0.151533,1.054217,-0.311914,-0.173372,-0.355559
3,1,0.407926,0.432793,-0.473674,0.420730,1.398325,0.059160,-0.910042,0.185187,1.355574,...,-0.482043,-0.307562,0.615838,1.858763,-0.227921,-0.151533,-0.948571,-0.311914,-0.173372,-0.355559
4,0,0.407926,-0.474545,-0.473674,-0.486337,-0.488461,-0.560975,1.215741,-0.331267,-0.737695,...,-0.482043,-0.307562,0.615838,1.858763,-0.227921,-0.151533,-0.948571,-0.311914,-0.173372,-0.355559
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,-0.207709,-0.474545,-0.473674,-0.386671,-0.488461,-0.560975,-0.333044,-0.193081,-0.737695,...,-0.482043,-0.307562,0.615838,-0.537992,-0.227921,-0.151533,1.054217,-0.311914,-0.173372,-0.355559
887,1,-0.823344,-0.474545,-0.473674,-0.044381,0.769396,-0.560975,-1.395935,0.281499,1.355574,...,-0.482043,-0.307562,0.615838,-0.537992,-0.227921,-0.151533,-0.948571,-0.311914,-0.173372,2.812472
888,0,0.000000,0.432793,2.008933,-0.176263,-0.488461,1.299429,0.732804,-0.392335,1.355574,...,-0.482043,-0.307562,0.615838,-0.537992,-0.227921,-0.151533,1.054217,-0.311914,-0.173372,-0.355559
889,1,-0.284663,-0.474545,-0.473674,-0.044381,1.398325,-0.560975,-1.183357,0.281499,-0.737695,...,2.074505,-0.307562,-1.623803,-0.537992,-0.227921,-0.151533,1.054217,-0.311914,-0.173372,-0.355559


In [35]:
# save preprocessed training data
df_train.to_csv('titanic_train_preprocessed.csv', index=False)

In [36]:
# load submission data
df_test = pd.read_csv('titanic_test_data.csv')

In [37]:
#df_test['Age'].isnull().sum()
df_test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [38]:
df_test.drop(['PassengerId', 'Ticket'],axis=1,inplace=True)
df_test

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,"Kelly, Mr. James",male,34.5,0,0,7.8292,,Q
1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,7.0000,,S
2,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,9.6875,,Q
3,3,"Wirz, Mr. Albert",male,27.0,0,0,8.6625,,S
4,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,12.2875,,S
...,...,...,...,...,...,...,...,...,...
413,3,"Spector, Mr. Woolf",male,,0,0,8.0500,,S
414,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,108.9000,C105,C
415,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,7.2500,,S
416,3,"Ware, Mr. Frederick",male,,0,0,8.0500,,S


In [39]:
df_test['Deck'] = df_test['Cabin'].str.slice(0,1) 
df_test['Deck'] = df_test['Deck'].map({"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F":6,"G":7, "T":8})
df_test['Deck'] = df_test['Deck'].fillna(0)
df_test['Deck'] = df_test['Deck'].astype(np.int64)
df_test.drop(['Cabin'],axis=1,inplace=True) 
df_test

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Deck
0,3,"Kelly, Mr. James",male,34.5,0,0,7.8292,Q,0
1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,7.0000,S,0
2,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,9.6875,Q,0
3,3,"Wirz, Mr. Albert",male,27.0,0,0,8.6625,S,0
4,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,12.2875,S,0
...,...,...,...,...,...,...,...,...,...
413,3,"Spector, Mr. Woolf",male,,0,0,8.0500,S,0
414,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,108.9000,C,3
415,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,7.2500,S,0
416,3,"Ware, Mr. Frederick",male,,0,0,8.0500,S,0


In [40]:
# Fill the missing values of Age
df_test['Age'].fillna(df_test['Age'].mean(), inplace=True)  #fill n/a age with mean value
common_value = 'S'
df_test['Embarked'] = df_test['Embarked'].fillna(common_value)
df_test['Fare'].fillna(df_test['Fare'].mean(), inplace=True)

# create Age categories
def age_type(age):
    if age>0 and age<=5:
        return 'baby'
    elif(age>=6 and age<=10):
        return 'child'
    elif(age>=11 and age<=19):
        return 'teenager'
    elif(age>=20 and age<=30):
        return 'early adult'
    elif(age>=31 and age<=45):
        return 'adult'
    elif(age>=46 and age<=60):
        return 'late adult'
    else :
        return 'senior'
    

df_test['Age_type']=df_test['Age'].apply(age_type)
df_test

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Age_type
0,3,"Kelly, Mr. James",male,34.50000,0,0,7.8292,Q,0,adult
1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.00000,1,0,7.0000,S,0,late adult
2,2,"Myles, Mr. Thomas Francis",male,62.00000,0,0,9.6875,Q,0,senior
3,3,"Wirz, Mr. Albert",male,27.00000,0,0,8.6625,S,0,early adult
4,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.00000,1,1,12.2875,S,0,early adult
...,...,...,...,...,...,...,...,...,...,...
413,3,"Spector, Mr. Woolf",male,30.27259,0,0,8.0500,S,0,senior
414,1,"Oliva y Ocana, Dona. Fermina",female,39.00000,0,0,108.9000,C,3,adult
415,3,"Saether, Mr. Simon Sivertsen",male,38.50000,0,0,7.2500,S,0,adult
416,3,"Ware, Mr. Frederick",male,30.27259,0,0,8.0500,S,0,senior


In [41]:
df_test['Relatives'] = df_test['SibSp'] + df_test['Parch']
df_test['Age_Class'] = df_test['Age'] * df_test['Pclass']
df_test['FarePerPerson']=df_test['Fare']/(df_test['Relatives']+1)
df_test


Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Age_type,Relatives,Age_Class,FarePerPerson
0,3,"Kelly, Mr. James",male,34.50000,0,0,7.8292,Q,0,adult,0,103.500000,7.829200
1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.00000,1,0,7.0000,S,0,late adult,1,141.000000,3.500000
2,2,"Myles, Mr. Thomas Francis",male,62.00000,0,0,9.6875,Q,0,senior,0,124.000000,9.687500
3,3,"Wirz, Mr. Albert",male,27.00000,0,0,8.6625,S,0,early adult,0,81.000000,8.662500
4,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.00000,1,1,12.2875,S,0,early adult,2,66.000000,4.095833
...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,3,"Spector, Mr. Woolf",male,30.27259,0,0,8.0500,S,0,senior,0,90.817771,8.050000
414,1,"Oliva y Ocana, Dona. Fermina",female,39.00000,0,0,108.9000,C,3,adult,0,39.000000,108.900000
415,3,"Saether, Mr. Simon Sivertsen",male,38.50000,0,0,7.2500,S,0,adult,0,115.500000,7.250000
416,3,"Ware, Mr. Frederick",male,30.27259,0,0,8.0500,S,0,senior,0,90.817771,8.050000


In [42]:
df_test.isnull().sum()

Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Fare             0
Embarked         0
Deck             0
Age_type         0
Relatives        0
Age_Class        0
FarePerPerson    0
dtype: int64

In [43]:
# Functions that returns the title from a name. All the name in the dataset has the format "Surname, Title. Name"
def get_title(name):
    if '.' in name:
        return name.split(',')[1].split('.')[0].strip()
    else:
        return 'Unknown'

# A list with the all the different titles
titles = sorted(set([x for x in df_test.Name.map(lambda x: get_title(x))]))

# Normalize the titles, returning 'Mr', 'Master', 'Miss' or 'Mrs'
def replace_titles(x):
    title = x['Title']
    if title in ['Capt', 'Col', 'Don', 'Jonkheer', 'Major', 'Rev', 'Sir']:
        return 'Mr'
    elif title in ['the Countess', 'Mme', 'Lady', 'Dona']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['Sex']=='male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title

# Lets create a new column for the titles
df_test['Title'] = df_test['Name'].map(lambda x: get_title(x))

# And replace the titles, so the are normalized to 'Mr', 'Miss' and 'Mrs'
df_test['Title'] = df_test.apply(replace_titles, axis=1)


#convert columns/table
male = pd.get_dummies(df_test['Sex'],drop_first=False) #convert male/female to 0/1. Check video why drop_first=True
pcla = pd.get_dummies(df_test['Pclass'], drop_first=False) #make dummy variables
embark = pd.get_dummies(df_test['Embarked'],drop_first=False)
salutation = pd.get_dummies(df_test['Title'], drop_first=False) 
agelevel = pd.get_dummies(df_test['Age_type'],drop_first=False)


# visualise the changes, update table
df_test = pd.concat([df_test, male, salutation, pcla, embark, agelevel],axis=1)  # add those converted columns
df_test.drop(['Sex', 'Title', 'Pclass', 'Name', 'Embarked', 'Age_type'],axis=1,inplace=True) #drop unnecesary cols
df_test = df_test.rename(columns = {'female':'Female', 'male':'Male', 1:'Class 1', 2:'Class 2', 3:'Class 3'})
df_test = df_test.rename(columns = {'C':'EmbarkC', 'Q':'EmbarkQ', 'S':'EmbarkS'})

df_test

Unnamed: 0,Age,SibSp,Parch,Fare,Deck,Relatives,Age_Class,FarePerPerson,Female,Male,...,EmbarkC,EmbarkQ,EmbarkS,adult,baby,child,early adult,late adult,senior,teenager
0,34.50000,0,0,7.8292,0,0,103.500000,7.829200,False,True,...,False,True,False,True,False,False,False,False,False,False
1,47.00000,1,0,7.0000,0,1,141.000000,3.500000,True,False,...,False,False,True,False,False,False,False,True,False,False
2,62.00000,0,0,9.6875,0,0,124.000000,9.687500,False,True,...,False,True,False,False,False,False,False,False,True,False
3,27.00000,0,0,8.6625,0,0,81.000000,8.662500,False,True,...,False,False,True,False,False,False,True,False,False,False
4,22.00000,1,1,12.2875,0,2,66.000000,4.095833,True,False,...,False,False,True,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,30.27259,0,0,8.0500,0,0,90.817771,8.050000,False,True,...,False,False,True,False,False,False,False,False,True,False
414,39.00000,0,0,108.9000,3,0,39.000000,108.900000,True,False,...,True,False,False,True,False,False,False,False,False,False
415,38.50000,0,0,7.2500,0,0,115.500000,7.250000,False,True,...,False,False,True,True,False,False,False,False,False,False
416,30.27259,0,0,8.0500,0,0,90.817771,8.050000,False,True,...,False,False,True,False,False,False,False,False,True,False


In [44]:
df_test.iloc[:,8:] = StandardScaler().fit_transform(df_test.iloc[:,8:])
df_test

Unnamed: 0,Age,SibSp,Parch,Fare,Deck,Relatives,Age_Class,FarePerPerson,Female,Male,...,EmbarkC,EmbarkQ,EmbarkS,adult,baby,child,early adult,late adult,senior,teenager
0,34.50000,0,0,7.8292,0,0,103.500000,7.829200,-0.755929,0.755929,...,-0.568142,2.843757,-1.350676,2.055480,-0.17192,-0.156556,-0.705838,-0.329778,-0.549710,-0.320784
1,47.00000,1,0,7.0000,0,1,141.000000,3.500000,1.322876,-1.322876,...,-0.568142,-0.351647,0.740370,-0.486504,-0.17192,-0.156556,-0.705838,3.032346,-0.549710,-0.320784
2,62.00000,0,0,9.6875,0,0,124.000000,9.687500,-0.755929,0.755929,...,-0.568142,2.843757,-1.350676,-0.486504,-0.17192,-0.156556,-0.705838,-0.329778,1.819142,-0.320784
3,27.00000,0,0,8.6625,0,0,81.000000,8.662500,-0.755929,0.755929,...,-0.568142,-0.351647,0.740370,-0.486504,-0.17192,-0.156556,1.416755,-0.329778,-0.549710,-0.320784
4,22.00000,1,1,12.2875,0,2,66.000000,4.095833,1.322876,-1.322876,...,-0.568142,-0.351647,0.740370,-0.486504,-0.17192,-0.156556,1.416755,-0.329778,-0.549710,-0.320784
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,30.27259,0,0,8.0500,0,0,90.817771,8.050000,-0.755929,0.755929,...,-0.568142,-0.351647,0.740370,-0.486504,-0.17192,-0.156556,-0.705838,-0.329778,1.819142,-0.320784
414,39.00000,0,0,108.9000,3,0,39.000000,108.900000,1.322876,-1.322876,...,1.760125,-0.351647,-1.350676,2.055480,-0.17192,-0.156556,-0.705838,-0.329778,-0.549710,-0.320784
415,38.50000,0,0,7.2500,0,0,115.500000,7.250000,-0.755929,0.755929,...,-0.568142,-0.351647,0.740370,2.055480,-0.17192,-0.156556,-0.705838,-0.329778,-0.549710,-0.320784
416,30.27259,0,0,8.0500,0,0,90.817771,8.050000,-0.755929,0.755929,...,-0.568142,-0.351647,0.740370,-0.486504,-0.17192,-0.156556,-0.705838,-0.329778,1.819142,-0.320784


In [45]:
df_test.iloc[:,0:8] = StandardScaler().fit_transform(df_test.iloc[:,0:8])
df_test

Unnamed: 0,Age,SibSp,Parch,Fare,Deck,Relatives,Age_Class,FarePerPerson,Female,Male,...,EmbarkC,EmbarkQ,EmbarkS,adult,baby,child,early adult,late adult,senior,teenager
0,0.334993,-0.499470,-0.400248,-0.498407,-0.476399,-0.553443,1.369105,-0.393918,-0.755929,0.755929,...,-0.568142,2.843757,-1.350676,2.055480,-0.17192,-0.156556,-0.705838,-0.329778,-0.549710,-0.320784
1,1.325530,0.616992,-0.400248,-0.513274,-0.476399,0.105643,2.665836,-0.515658,1.322876,-1.322876,...,-0.568142,-0.351647,0.740370,-0.486504,-0.17192,-0.156556,-0.705838,3.032346,-0.549710,-0.320784
2,2.514175,-0.499470,-0.400248,-0.465088,-0.476399,-0.553443,2.077984,-0.341661,-0.755929,0.755929,...,-0.568142,2.843757,-1.350676,-0.486504,-0.17192,-0.156556,-0.705838,-0.329778,1.819142,-0.320784
3,-0.259330,-0.499470,-0.400248,-0.483466,-0.476399,-0.553443,0.591066,-0.370485,-0.755929,0.755929,...,-0.568142,-0.351647,0.740370,-0.486504,-0.17192,-0.156556,1.416755,-0.329778,-0.549710,-0.320784
4,-0.655545,0.616992,0.619896,-0.418471,-0.476399,0.764728,0.072374,-0.498903,1.322876,-1.322876,...,-0.568142,-0.351647,0.740370,-0.486504,-0.17192,-0.156556,1.416755,-0.329778,-0.549710,-0.320784
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,0.000000,-0.499470,-0.400248,-0.494448,-0.476399,-0.553443,0.930560,-0.387709,-0.755929,0.755929,...,-0.568142,-0.351647,0.740370,-0.486504,-0.17192,-0.156556,-0.705838,-0.329778,1.819142,-0.320784
414,0.691586,-0.499470,-0.400248,1.313753,1.514950,-0.553443,-0.861272,2.448278,1.322876,-1.322876,...,1.760125,-0.351647,-1.350676,2.055480,-0.17192,-0.156556,-0.705838,-0.329778,-0.549710,-0.320784
415,0.651965,-0.499470,-0.400248,-0.508792,-0.476399,-0.553443,1.784059,-0.410205,-0.755929,0.755929,...,-0.568142,-0.351647,0.740370,2.055480,-0.17192,-0.156556,-0.705838,-0.329778,-0.549710,-0.320784
416,0.000000,-0.499470,-0.400248,-0.494448,-0.476399,-0.553443,0.930560,-0.387709,-0.755929,0.755929,...,-0.568142,-0.351647,0.740370,-0.486504,-0.17192,-0.156556,-0.705838,-0.329778,1.819142,-0.320784


In [46]:
# save submission data
df_test.to_csv('titanic_test_preprocessed.csv', index=False)