In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the raw Titanic training split from the working directory
df_train = pd.read_csv(filepath_or_buffer='titanic_train_data.csv')

In [3]:
# Count the missing values in every column of the training frame
df_train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
# PassengerId and Ticket are discarded before any feature engineering
df_train = df_train.drop(columns=['PassengerId', 'Ticket'])
df_train

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.2500,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.9250,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1000,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,13.0000,,S
887,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,30.0000,B42,S
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,23.4500,,S
889,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,30.0000,C148,C


In [5]:
# Encode the deck (first character of the cabin code) as an integer 1-8.
# Rows whose cabin is missing or whose letter is not in the mapping get 0.
df_train['Deck'] = (
    df_train['Cabin']
    .str.slice(0, 1)
    .map({"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "T": 8})
    .fillna(0)
    .astype(np.int64)
)
# The raw cabin string is no longer needed once Deck exists
df_train = df_train.drop(columns=['Cabin'])
df_train

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Deck
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.2500,S,0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,3
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.9250,S,0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1000,S,3
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.0500,S,0
...,...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,13.0000,S,0
887,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,30.0000,S,2
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,23.4500,S,0
889,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,30.0000,C,3


In [6]:
# Impute missing ages with the mean age of the training frame.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on a column selection is chained assignment, which newer pandas versions
# warn about and which does not modify the frame under copy-on-write.
df_train['Age'] = df_train['Age'].fillna(df_train['Age'].mean())

# Fill missing embarkation ports with 'S'
# NOTE(review): presumably the most frequent port - confirm with value_counts()
common_value = 'S'
df_train['Embarked'] = df_train['Embarked'].fillna(common_value)

df_train

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Deck
0,0,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,7.2500,S,0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,71.2833,C,3
2,1,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,7.9250,S,0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,53.1000,S,3
4,0,3,"Allen, Mr. William Henry",male,35.000000,0,0,8.0500,S,0
...,...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",male,27.000000,0,0,13.0000,S,0
887,1,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,30.0000,S,2
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,23.4500,S,0
889,1,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,30.0000,C,3


In [7]:
# Re-count missing values per column after imputation
df_train.isna().sum()

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
Deck        0
dtype: int64

In [8]:
# Engineered features:
#   Relatives - siblings/spouses plus parents/children travelling along
#   Age_Class - interaction term between age and passenger class
df_train['Relatives'] = df_train['SibSp'].add(df_train['Parch'])
df_train['Age_Class'] = df_train['Age'].mul(df_train['Pclass'])
df_train

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Relatives,Age_Class
0,0,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,7.2500,S,0,1,66.000000
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,71.2833,C,3,1,38.000000
2,1,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,7.9250,S,0,0,78.000000
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,53.1000,S,3,1,35.000000
4,0,3,"Allen, Mr. William Henry",male,35.000000,0,0,8.0500,S,0,0,105.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",male,27.000000,0,0,13.0000,S,0,0,54.000000
887,1,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,30.0000,S,2,0,19.000000
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,23.4500,S,0,3,89.097353
889,1,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,30.0000,C,3,0,26.000000


In [None]:
# Extract the title from a passenger name of the form "Surname, Title. Given names"
def get_title(name):
    """Return the title embedded in a passenger name.

    The title is the text between the first comma and the following period,
    with surrounding whitespace removed.

    Parameters
    ----------
    name : str
        Full passenger name, expected as "Surname, Title. Given names".

    Returns
    -------
    str
        The extracted title, or 'Unknown' when `name` is not a string,
        contains no period, or contains no comma.
    """
    # Non-string input (e.g. NaN for a missing name) cannot carry a title
    if not isinstance(name, str) or '.' not in name:
        return 'Unknown'
    pieces = name.split(',')
    # Without a comma there is no "Surname, ..." prefix to skip over
    if len(pieces) < 2:
        return 'Unknown'
    return pieces[1].split('.')[0].strip()

# Sorted list of the distinct titles found in the training names
titles = sorted(set(df_train['Name'].map(get_title)))

# Collapse rare titles into 'Mr', 'Mrs' or 'Miss'; other titles pass through
def replace_titles(x):
    """Return the normalized title for one passenger record.

    `x` is indexed by 'Title' and, only when the title is 'Dr', by 'Sex'.
    Titles that are not listed below are returned unchanged.
    """
    raw_title = x['Title']

    # 'Dr' is the only title that needs the passenger's sex to be resolved
    if raw_title == 'Dr':
        return 'Mr' if x['Sex'] == 'male' else 'Mrs'

    title_groups = (
        ('Mr', ('Capt', 'Col', 'Don', 'Jonkheer', 'Major', 'Rev', 'Sir')),
        ('Mrs', ('the Countess', 'Mme', 'Lady', 'Dona')),
        ('Miss', ('Mlle', 'Ms')),
    )
    for normalized, aliases in title_groups:
        if raw_title in aliases:
            return normalized

    return raw_title

# Raw title of every passenger, taken from the Name column
df_train['Title'] = df_train['Name'].map(get_title)

# Normalize the raw titles row by row (replace_titles reads 'Title' and 'Sex')
df_train['Title'] = df_train.apply(replace_titles, axis=1)

# One-hot encode the categorical columns; every level is kept (drop_first=False)
male = pd.get_dummies(df_train['Sex'], drop_first=False)  # one indicator column per sex
pcla = pd.get_dummies(df_train['Pclass'], drop_first=False)  # one indicator column per class
salutation = pd.get_dummies(df_train['Title'], drop_first=False)
embark = pd.get_dummies(df_train['Embarked'], drop_first=False)

# Append the indicator columns, remove their source columns (and Name),
# then give the indicators descriptive labels
df_train = (
    pd.concat([df_train, male, salutation, pcla, embark], axis=1)
    .drop(columns=['Sex', 'Title', 'Pclass', 'Name', 'Embarked'])
    .rename(columns={'female': 'Female', 'male': 'Male', 1: 'Class 1', 2: 'Class 2', 3: 'Class 3'})
    .rename(columns={'C': 'EmbarkC', 'Q': 'EmbarkQ', 'S': 'EmbarkS'})
)

df_train

In [None]:
# Standardise the one-hot indicator columns (every column from position 8 on).
# The columns are replaced by label rather than written through .iloc:
# get_dummies yields bool/uint8 columns, and storing floats into them
# positionally relies on silent upcasting that pandas has deprecated.
train_dummy_cols = df_train.columns[8:]
df_train[train_dummy_cols] = StandardScaler().fit_transform(df_train[train_dummy_cols])
df_train

In [None]:
#scaler = MinMaxScaler()
df_train.iloc[:,1:8] = StandardScaler().fit_transform(df_train.iloc[:,1:8])
df_train

#alebo pomocou loc
#df_train.loc[:,['Age', 'SibSp', 'Parch', 'Fare']] = scaler.fit_transform(df_train.loc[:,['Age', 'SibSp', 'Parch', 'Fare']])

In [None]:
# Persist the preprocessed training frame, omitting the index column
df_train.to_csv(path_or_buf='titanic_train_preprocessed2.csv', index=False)

In [None]:
# Read the raw submission (test) split from the working directory
df_test = pd.read_csv(filepath_or_buffer='titanic_test_data.csv')

In [None]:
# Count the missing values in every column of the submission frame
df_test.isna().sum()

In [None]:
# PassengerId and Ticket are discarded, mirroring the training frame
df_test = df_test.drop(columns=['PassengerId', 'Ticket'])
df_test

In [None]:
# Encode the deck (first character of the cabin code) as an integer 1-8.
# Rows whose cabin is missing or whose letter is not in the mapping get 0.
df_test['Deck'] = (
    df_test['Cabin']
    .str.slice(0, 1)
    .map({"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "T": 8})
    .fillna(0)
    .astype(np.int64)
)
# The raw cabin string is no longer needed once Deck exists
df_test = df_test.drop(columns=['Cabin'])
df_test

In [None]:
# Impute missing ages and fares with the respective column mean of this frame.
# Results are assigned back to the columns: calling fillna(..., inplace=True)
# on a column selection is chained assignment, which newer pandas versions
# warn about and which does not modify the frame under copy-on-write.
df_test['Age'] = df_test['Age'].fillna(df_test['Age'].mean())

# Fill missing embarkation ports with 'S'
# NOTE(review): presumably the most frequent port - confirm with value_counts()
common_value = 'S'
df_test['Embarked'] = df_test['Embarked'].fillna(common_value)

df_test['Fare'] = df_test['Fare'].fillna(df_test['Fare'].mean())
df_test

In [None]:
# Re-count missing values per column after imputation
df_test.isna().sum()

In [None]:
# Engineered features:
#   Relatives - siblings/spouses plus parents/children travelling along
#   Age_Class - interaction term between age and passenger class
df_test['Relatives'] = df_test['SibSp'].add(df_test['Parch'])
df_test['Age_Class'] = df_test['Age'].mul(df_test['Pclass'])
df_test

In [None]:
# Extract the title from a passenger name of the form "Surname, Title. Given names"
def get_title(name):
    """Return the title embedded in a passenger name.

    The title is the text between the first comma and the following period,
    with surrounding whitespace removed.

    Parameters
    ----------
    name : str
        Full passenger name, expected as "Surname, Title. Given names".

    Returns
    -------
    str
        The extracted title, or 'Unknown' when `name` is not a string,
        contains no period, or contains no comma.
    """
    # Non-string input (e.g. NaN for a missing name) cannot carry a title
    if not isinstance(name, str) or '.' not in name:
        return 'Unknown'
    pieces = name.split(',')
    # Without a comma there is no "Surname, ..." prefix to skip over
    if len(pieces) < 2:
        return 'Unknown'
    return pieces[1].split('.')[0].strip()

# Sorted list of the distinct titles found in the submission names
titles = sorted(set(df_test['Name'].map(get_title)))

# Collapse rare titles into 'Mr', 'Mrs' or 'Miss'; other titles pass through
def replace_titles(x):
    """Return the normalized title for one passenger record.

    `x` is indexed by 'Title' and, only when the title is 'Dr', by 'Sex'.
    Titles that are not listed below are returned unchanged.
    """
    raw_title = x['Title']

    # 'Dr' is the only title that needs the passenger's sex to be resolved
    if raw_title == 'Dr':
        return 'Mr' if x['Sex'] == 'male' else 'Mrs'

    title_groups = (
        ('Mr', ('Capt', 'Col', 'Don', 'Jonkheer', 'Major', 'Rev', 'Sir')),
        ('Mrs', ('the Countess', 'Mme', 'Lady', 'Dona')),
        ('Miss', ('Mlle', 'Ms')),
    )
    for normalized, aliases in title_groups:
        if raw_title in aliases:
            return normalized

    return raw_title

# Raw title of every passenger, taken from the Name column
df_test['Title'] = df_test['Name'].map(get_title)

# Normalize the raw titles row by row (replace_titles reads 'Title' and 'Sex')
df_test['Title'] = df_test.apply(replace_titles, axis=1)

# One-hot encode the categorical columns; every level is kept (drop_first=False)
male = pd.get_dummies(df_test['Sex'], drop_first=False)  # one indicator column per sex
pcla = pd.get_dummies(df_test['Pclass'], drop_first=False)  # one indicator column per class
embark = pd.get_dummies(df_test['Embarked'], drop_first=False)
salutation = pd.get_dummies(df_test['Title'], drop_first=False)

# Append the indicator columns, remove their source columns (and Name),
# then give the indicators descriptive labels
df_test = (
    pd.concat([df_test, male, salutation, pcla, embark], axis=1)
    .drop(columns=['Sex', 'Title', 'Pclass', 'Name', 'Embarked'])
    .rename(columns={'female': 'Female', 'male': 'Male', 1: 'Class 1', 2: 'Class 2', 3: 'Class 3'})
    .rename(columns={'C': 'EmbarkC', 'Q': 'EmbarkQ', 'S': 'EmbarkS'})
)

df_test

In [None]:
# Standardise the one-hot indicator columns (every column from position 7 on).
# A StandardScaler is created here, as in the training cells, because no
# `scaler` variable is assigned anywhere in the visible notebook.
# The columns are replaced by label rather than written through .iloc, so the
# bool/uint8 indicators can become float without deprecated silent upcasting.
# NOTE(review): this fits the scaler on the submission data itself; reusing
# the scaler fitted on the training data is usually preferable - confirm intent.
test_dummy_cols = df_test.columns[7:]
df_test[test_dummy_cols] = StandardScaler().fit_transform(df_test[test_dummy_cols])
df_test

In [None]:
# Standardise the numeric feature columns at positions 0-6.
# A StandardScaler is created here, as in the training cells, because no
# `scaler` variable is assigned anywhere in the visible notebook.
# The columns are replaced by label rather than written through .iloc, so the
# int64 columns can become float without deprecated silent upcasting.
# NOTE(review): this fits the scaler on the submission data itself; reusing
# the scaler fitted on the training data is usually preferable - confirm intent.
test_numeric_cols = df_test.columns[0:7]
df_test[test_numeric_cols] = StandardScaler().fit_transform(df_test[test_numeric_cols])
df_test

In [None]:
# Persist the preprocessed submission frame, omitting the index column
df_test.to_csv(path_or_buf='titanic_test_preprocessed2.csv', index=False)