In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw training data from CSV into a DataFrame
df_train = pd.read_csv(filepath_or_buffer='titanic_train_data.csv')

In [3]:
# Preprocessing the training data
# Fill missing ages with the median age of the training set.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on df_train['Age']: an inplace call on a selected column is
# chained assignment, which warns in pandas 2.x and does not modify df_train
# at all once Copy-on-Write is active.
df_train['Age'] = df_train['Age'].fillna(df_train['Age'].median())

# extract titles from names
# Print out the distribution of males and females
# print('Sex column values:')
# print(df_train.Sex.value_counts())
# print()

# Function that returns the title from a name. All the names in the dataset have the format "Surname, Title. Name"
def get_title(name):
    """Extract the title from a name formatted as "Surname, Title. Name".

    The title is taken from the segment that follows the first comma, cut at
    that segment's first period, with surrounding whitespace stripped.

    Parameters
    ----------
    name : str
        Full passenger name. Non-string values (e.g. NaN or None for a missing
        name) are accepted and treated as having no title.

    Returns
    -------
    str
        The extracted title, or 'Unknown' when the value is not a string,
        contains no period, or contains no comma.
    """
    # Missing or non-text names cannot carry a title
    if not isinstance(name, str) or '.' not in name:
        return 'Unknown'

    segments = name.split(',')
    # A period without a comma means the "Surname, Title." layout is absent;
    # fall back to 'Unknown' instead of failing on segments[1]
    if len(segments) < 2:
        return 'Unknown'

    return segments[1].split('.')[0].strip()

# Alphabetically sorted list of the distinct raw titles found in the training names
titles = sorted(set(df_train['Name'].map(get_title)))
# print('Different titles found on the dataset:')
# print(len(titles), ':', titles)
# print()

# Normalize the titles, returning 'Mr', 'Master', 'Miss' or 'Mrs'
def replace_titles(x):
    """Map the raw title of one passenger row onto a normalized title.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide 'Title'; 'Sex' is read only when the title is 'Dr'.

    Returns
    -------
    The normalized title ('Mr', 'Mrs' or 'Miss') for the titles listed below,
    or the raw title unchanged for any other value (e.g. 'Master').
    """
    raw = x['Title']

    # 'Dr' carries no gender on its own, so it is resolved with the Sex field
    if raw == 'Dr':
        return 'Mr' if x['Sex'] == 'male' else 'Mrs'

    # Each normalized title paired with the raw titles that collapse into it
    groups = (
        ('Mr', ('Capt', 'Col', 'Don', 'Jonkheer', 'Major', 'Rev', 'Sir')),
        ('Mrs', ('the Countess', 'Mme', 'Lady', 'Dona')),
        ('Miss', ('Mlle', 'Ms')),
    )
    for normalized, members in groups:
        if raw in members:
            return normalized

    # Anything not listed above is passed through as-is
    return raw

# Add a 'Title' column holding the raw title parsed from each passenger name
df_train['Title'] = df_train['Name'].map(get_title)
# Optional check of the raw titles: df_train.Title.value_counts()

# Collapse the raw titles into the normalized set returned by replace_titles.
# The function is applied row by row because the 'Dr' case also reads 'Sex'.
df_train['Title'] = df_train.apply(replace_titles, axis='columns')

# Check that the counts of Mr, Mrs and Miss are consistent with the 'male' and 'female' counts
# print('Title column values. Males and females are the same that for the "Sex" column:')
# print(df_train.Title.value_counts())

# Plot the result
# df_train.Title.value_counts().plot(kind='bar')

# One-hot encode the categorical columns. drop_first=True drops the first level
# of each column, so k categories become k-1 indicator columns.
# Pclass is numeric but is encoded as a category here, not as a quantity.
# NOTE(review): get_dummies adds no indicator for missing values by default, so
# rows with a missing category get all-zero indicators - confirm this is intended.
male, embark, pcla, salutation = (
    pd.get_dummies(df_train[column], drop_first=True)
    for column in ('Sex', 'Embarked', 'Pclass', 'Title')
)

# Append the indicator columns, then remove the source columns that were encoded
# together with Cabin, Ticket and Name
df_train = pd.concat([df_train, pcla, male, embark, salutation], axis=1).drop(
    columns=['Sex', 'Embarked', 'Pclass', 'Cabin', 'Title', 'Ticket', 'Name']
)

In [4]:
# Write the preprocessed training frame to CSV, leaving out the row index
df_train.to_csv(path_or_buf='titanic_train_preprocessed.csv', index=False)

In [5]:
# Read the raw submission/test data from CSV into a DataFrame
df_test = pd.read_csv(filepath_or_buffer='titanic_test_data.csv')

In [6]:
# Preprocessing submission/test data
# Fill missing ages with the median age.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on df_test['Age']: an inplace call on a selected column is
# chained assignment, which warns in pandas 2.x and does not modify df_test
# at all once Copy-on-Write is active.
# NOTE(review): this uses the test set's own median, not the training median -
# confirm this is intended.
df_test['Age'] = df_test['Age'].fillna(df_test['Age'].median())

# extract titles from names
# Print out the distribution of males and females
# print('Sex column values:')
# print(df_train.Sex.value_counts())
# print()

# Function that returns the title from a name. All the names in the dataset have the format "Surname, Title. Name"
def get_title(name):
    """Extract the title from a name formatted as "Surname, Title. Name".

    This shadows the earlier definition of the same name in the
    training-data cell.

    The title is taken from the segment that follows the first comma, cut at
    that segment's first period, with surrounding whitespace stripped.

    Parameters
    ----------
    name : str
        Full passenger name. Non-string values (e.g. NaN or None for a missing
        name) are accepted and treated as having no title.

    Returns
    -------
    str
        The extracted title, or 'Unknown' when the value is not a string,
        contains no period, or contains no comma.
    """
    # Missing or non-text names cannot carry a title
    if not isinstance(name, str) or '.' not in name:
        return 'Unknown'

    segments = name.split(',')
    # A period without a comma means the "Surname, Title." layout is absent;
    # fall back to 'Unknown' instead of failing on segments[1]
    if len(segments) < 2:
        return 'Unknown'

    return segments[1].split('.')[0].strip()

# Alphabetically sorted list of the distinct raw titles found in the test names
titles = sorted(set(df_test['Name'].map(get_title)))
# print('Different titles found on the dataset:')
# print(len(titles), ':', titles)
# print()

# Normalize the titles, returning 'Mr', 'Master', 'Miss' or 'Mrs'
def replace_titles(x):
    """Map the raw title of one passenger row onto a normalized title.

    Re-declared in this cell; it behaves the same as the definition in the
    training-data cell.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide 'Title'; 'Sex' is read only when the title is 'Dr'.

    Returns
    -------
    The normalized title ('Mr', 'Mrs' or 'Miss') for the titles listed below,
    or the raw title unchanged for any other value (e.g. 'Master').
    """
    raw = x['Title']

    # 'Dr' carries no gender on its own, so it is resolved with the Sex field
    if raw == 'Dr':
        return 'Mr' if x['Sex'] == 'male' else 'Mrs'

    # Each normalized title paired with the raw titles that collapse into it
    groups = (
        ('Mr', ('Capt', 'Col', 'Don', 'Jonkheer', 'Major', 'Rev', 'Sir')),
        ('Mrs', ('the Countess', 'Mme', 'Lady', 'Dona')),
        ('Miss', ('Mlle', 'Ms')),
    )
    for normalized, members in groups:
        if raw in members:
            return normalized

    # Anything not listed above is passed through as-is
    return raw

# Add a 'Title' column holding the raw title parsed from each passenger name
df_test['Title'] = df_test['Name'].map(get_title)
# Optional check of the raw titles: df_test.Title.value_counts()

# Collapse the raw titles into the normalized set returned by replace_titles.
# The function is applied row by row because the 'Dr' case also reads 'Sex'.
df_test['Title'] = df_test.apply(replace_titles, axis='columns')

# Check that the counts of Mr, Mrs and Miss are consistent with the 'male' and 'female' counts
# print('Title column values. Males and females are the same that for the "Sex" column:')
# print(df_train.Title.value_counts())

# Plot the result
# df_train.Title.value_counts().plot(kind='bar')

# One-hot encode the categorical columns. drop_first=True drops the first level
# of each column, so k categories become k-1 indicator columns.
# Pclass is numeric but is encoded as a category here, not as a quantity.
# NOTE(review): the indicators are built from df_test alone, so the resulting
# columns depend on which categories occur in the test data - confirm they
# match the columns of the preprocessed training data.
male, embark, pcla, salutation = (
    pd.get_dummies(df_test[column], drop_first=True)
    for column in ('Sex', 'Embarked', 'Pclass', 'Title')
)

# Append the indicator columns, then remove the source columns that were encoded
# together with Cabin, Ticket and Name
df_test = pd.concat([df_test, pcla, male, embark, salutation], axis=1).drop(
    columns=['Sex', 'Embarked', 'Pclass', 'Cabin', 'Title', 'Ticket', 'Name']
)

In [7]:
# Write the preprocessed submission/test frame to CSV, leaving out the row index
df_test.to_csv(path_or_buf='titanic_test_preprocessed.csv', index=False)