In [185]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

In [186]:
# load dataset
df_train = pd.read_csv('titanic_train_data.csv')

In [187]:
df_train.drop(['PassengerId', 'Ticket', 'Cabin', 'Embarked'],axis=1,inplace=True)
df_train

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.2500
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.9250
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1000
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.0500
...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,13.0000
887,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,30.0000
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,23.4500
889,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,30.0000


In [188]:
# Fill the missing values of Age
df_train['Age'].fillna(df_train['Age'].mean(), inplace=True)  #fill n/a age with mean value

# Functions that returns the title from a name. All the name in the dataset has the format "Surname, Title. Name"
def get_title(name):
    if '.' in name:
        return name.split(',')[1].split('.')[0].strip()
    else:
        return 'Unknown'

# A list with the all the different titles
titles = sorted(set([x for x in df_train.Name.map(lambda x: get_title(x))]))

# Normalize the titles, returning 'Mr', 'Master', 'Miss' or 'Mrs'
def replace_titles(x):
    title = x['Title']
    if title in ['Capt', 'Col', 'Don', 'Jonkheer', 'Major', 'Rev', 'Sir']:
        return 'Mr'
    elif title in ['the Countess', 'Mme', 'Lady', 'Dona']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['Sex']=='male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title

# Lets create a new column for the titles
df_train['Title'] = df_train['Name'].map(lambda x: get_title(x))

# And replace the titles, so the are normalized to 'Mr', 'Miss' and 'Mrs'
df_train['Title'] = df_train.apply(replace_titles, axis=1)


#convert columns/table
male = pd.get_dummies(df_train['Sex'],drop_first=False) #convert male/female to 0/1. Check video why drop_first=True
pcla = pd.get_dummies(df_train['Pclass'], drop_first=False) #make numbers 1,2,3 categories - its different
salutation = pd.get_dummies(df_train['Title'], drop_first=False) #the same with title

# visualise the changes, update table
df_train = pd.concat([df_train, male, salutation, pcla],axis=1)  # add those converted columns
df_train.drop(['Sex', 'Title', 'Pclass', 'Name'],axis=1,inplace=True) #drop unnecesary cols
df_train = df_train.rename(columns = {'female':'Female', 'male':'Male', 1:'Class 1', 2:'Class 2', 3:'Class 3'})

df_train

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Female,Male,Master,Miss,Mr,Mrs,Class 1,Class 2,Class 3
0,0,22.000000,1,0,7.2500,False,True,False,False,True,False,False,False,True
1,1,38.000000,1,0,71.2833,True,False,False,False,False,True,True,False,False
2,1,26.000000,0,0,7.9250,True,False,False,True,False,False,False,False,True
3,1,35.000000,1,0,53.1000,True,False,False,False,False,True,True,False,False
4,0,35.000000,0,0,8.0500,False,True,False,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,27.000000,0,0,13.0000,False,True,False,False,True,False,False,True,False
887,1,19.000000,0,0,30.0000,True,False,False,True,False,False,True,False,False
888,0,29.699118,1,2,23.4500,True,False,False,True,False,False,False,False,True
889,1,26.000000,0,0,30.0000,False,True,False,False,True,False,True,False,False


In [27]:
# x = df_train.values #returns a numpy array
# min_max_scaler = preprocessing.MinMaxScaler()
# x_scaled = min_max_scaler.fit_transform(x)
# df_train_scaled = pd.DataFrame(x_scaled)
# df_train_scaled

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.0,0.271174,0.125,0.000000,0.014151,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,1.0,0.472229,0.125,0.000000,0.139136,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,1.0,0.321438,0.000,0.000000,0.015469,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.434531,0.125,0.000000,0.103644,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,0.0,0.434531,0.000,0.000000,0.015713,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0.0,0.334004,0.000,0.000000,0.025374,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
887,1.0,0.233476,0.000,0.000000,0.058556,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
888,0.0,0.346569,0.125,0.333333,0.045771,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
889,1.0,0.321438,0.000,0.000000,0.058556,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [189]:
# df_train['Female'] = df_train['Female'].astype(int)
# df_train['Male'] = df_train['Male'].astype(int)
# df_train['Master'] = df_train['Master'].astype(int)
# df_train['Miss'] = df_train['Miss'].astype(int)
# df_train['Mr'] = df_train['Mr'].astype(int)
# df_train['Mrs'] = df_train['Mrs'].astype(int)
# df_train['Class 1'] = df_train['Class 1'].astype(int)
# df_train['Class 2'] = df_train['Class 2'].astype(int)
# df_train['Class 3'] = df_train['Class 3'].astype(int)
# df_train

df_train.iloc[:,5:] = scaler.fit_transform(df_train.iloc[:,5:])
df_train

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Female,Male,Master,Miss,Mr,Mrs,Class 1,Class 2,Class 3
0,0,22.000000,1,0,7.2500,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,1,38.000000,1,0,71.2833,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,1,26.000000,0,0,7.9250,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,1,35.000000,1,0,53.1000,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,0,35.000000,0,0,8.0500,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,27.000000,0,0,13.0000,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
887,1,19.000000,0,0,30.0000,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
888,0,29.699118,1,2,23.4500,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
889,1,26.000000,0,0,30.0000,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [190]:
#scaler = MinMaxScaler()
df_train.iloc[:,1:5] = scaler.fit_transform(df_train.iloc[:,1:5])
df_train

#alebo pomocou loc
#df_train.loc[:,['Age', 'SibSp', 'Parch', 'Fare']] = scaler.fit_transform(df_train.loc[:,['Age', 'SibSp', 'Parch', 'Fare']])

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Female,Male,Master,Miss,Mr,Mrs,Class 1,Class 2,Class 3
0,0,0.271174,0.125,0.000000,0.014151,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,1,0.472229,0.125,0.000000,0.139136,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,1,0.321438,0.000,0.000000,0.015469,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,1,0.434531,0.125,0.000000,0.103644,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,0,0.434531,0.000,0.000000,0.015713,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,0.334004,0.000,0.000000,0.025374,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
887,1,0.233476,0.000,0.000000,0.058556,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
888,0,0.367921,0.125,0.333333,0.045771,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
889,1,0.321438,0.000,0.000000,0.058556,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [191]:
# save preprocessed training data
df_train.to_csv('titanic_train_preprocessed2.csv', index=False)

In [192]:
# load submission data
df_test = pd.read_csv('titanic_test_data.csv')

In [193]:
df_test.drop(['PassengerId', 'Ticket', 'Cabin', 'Embarked'],axis=1,inplace=True)

In [194]:
# Fill the missing values of Age
df_test['Age'].fillna(df_test['Age'].mean(), inplace=True)  #fill n/a age with mean value

# Functions that returns the title from a name. All the name in the dataset has the format "Surname, Title. Name"
def get_title(name):
    if '.' in name:
        return name.split(',')[1].split('.')[0].strip()
    else:
        return 'Unknown'

# A list with the all the different titles
titles = sorted(set([x for x in df_test.Name.map(lambda x: get_title(x))]))

# Normalize the titles, returning 'Mr', 'Master', 'Miss' or 'Mrs'
def replace_titles(x):
    title = x['Title']
    if title in ['Capt', 'Col', 'Don', 'Jonkheer', 'Major', 'Rev', 'Sir']:
        return 'Mr'
    elif title in ['the Countess', 'Mme', 'Lady', 'Dona']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['Sex']=='male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title

# Lets create a new column for the titles
df_test['Title'] = df_test['Name'].map(lambda x: get_title(x))

# And replace the titles, so the are normalized to 'Mr', 'Miss' and 'Mrs'
df_test['Title'] = df_test.apply(replace_titles, axis=1)


#convert columns/table
male = pd.get_dummies(df_test['Sex'],drop_first=False) #convert male/female to 0/1. Check video why drop_first=True
pcla = pd.get_dummies(df_test['Pclass'], drop_first=False) #make numbers 1,2,3 categories - its different
salutation = pd.get_dummies(df_test['Title'], drop_first=False) #the same with title

# visualise the changes, update table
df_test = pd.concat([df_test, male, salutation, pcla],axis=1)  # add those converted columns
df_test.drop(['Sex', 'Title', 'Pclass', 'Name'],axis=1,inplace=True) #drop unnecesary cols
df_test = df_test.rename(columns = {'female':'Female', 'male':'Male', 1:'Class 1', 2:'Class 2', 3:'Class 3'})

df_test

Unnamed: 0,Age,SibSp,Parch,Fare,Female,Male,Master,Miss,Mr,Mrs,Class 1,Class 2,Class 3
0,34.50000,0,0,7.8292,False,True,False,False,True,False,False,False,True
1,47.00000,1,0,7.0000,True,False,False,False,False,True,False,False,True
2,62.00000,0,0,9.6875,False,True,False,False,True,False,False,True,False
3,27.00000,0,0,8.6625,False,True,False,False,True,False,False,False,True
4,22.00000,1,1,12.2875,True,False,False,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,30.27259,0,0,8.0500,False,True,False,False,True,False,False,False,True
414,39.00000,0,0,108.9000,True,False,False,False,False,True,True,False,False
415,38.50000,0,0,7.2500,False,True,False,False,True,False,False,False,True
416,30.27259,0,0,8.0500,False,True,False,False,True,False,False,False,True


In [195]:
df_test.iloc[:,4:] = scaler.fit_transform(df_test.iloc[:,4:])
df_test

Unnamed: 0,Age,SibSp,Parch,Fare,Female,Male,Master,Miss,Mr,Mrs,Class 1,Class 2,Class 3
0,34.50000,0,0,7.8292,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,47.00000,1,0,7.0000,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,62.00000,0,0,9.6875,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,27.00000,0,0,8.6625,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,22.00000,1,1,12.2875,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,30.27259,0,0,8.0500,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
414,39.00000,0,0,108.9000,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
415,38.50000,0,0,7.2500,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
416,30.27259,0,0,8.0500,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [196]:
df_test.iloc[:,0:4] = scaler.fit_transform(df_test.iloc[:,0:4])
df_test

Unnamed: 0,Age,SibSp,Parch,Fare,Female,Male,Master,Miss,Mr,Mrs,Class 1,Class 2,Class 3
0,0.452723,0.000,0.000000,0.015282,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.617566,0.125,0.000000,0.013663,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0.815377,0.000,0.000000,0.018909,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0.353818,0.000,0.000000,0.016908,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,0.287881,0.125,0.111111,0.023984,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,0.396975,0.000,0.000000,0.015713,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
414,0.512066,0.000,0.000000,0.212559,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
415,0.505473,0.000,0.000000,0.014151,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
416,0.396975,0.000,0.000000,0.015713,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [197]:
# save submission data
df_test.to_csv('titanic_test_preprocessed2.csv', index=False)