In [509]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

In [510]:
# Read the raw training set from the CSV next to the notebook.
df_train = pd.read_csv(filepath_or_buffer='titanic_train_data.csv')

In [511]:
# Number of missing values in each column of the training frame.
df_train.isna().sum()

In [512]:
# Remove the columns that are not used by the rest of the notebook.
df_train.drop(columns=['PassengerId', 'Ticket', 'Cabin'], inplace=True)
df_train

In [513]:
# Impute missing ages with the mean age.
# The result is assigned back to the column on purpose: calling
# fillna(..., inplace=True) on the df_train['Age'] selection is a chained
# in-place update, which emits a warning on recent pandas 2.x releases and
# does not modify df_train at all once copy-on-write is active.
df_train['Age'] = df_train['Age'].fillna(df_train['Age'].mean())

# Missing embarkation ports are replaced by 'S'
# (presumably the most frequent port -- verify against the data).
common_value = 'S'
df_train['Embarked'] = df_train['Embarked'].fillna(common_value)

df_train

# # create Age categories
# def age_type(age):
#     if age>0 and age<=5:
#         return 'baby'
#     elif(age>=6 and age<=10):
#         return 'child'
#     elif(age>=11 and age<=19):
#         return 'teenager'
#     elif(age>=20 and age<=30):
#         return 'early adult'
#     elif(age>=31 and age<=45):
#         return 'adult'
#     elif(age>=46 and age<=60):
#         return 'late adult'
#     else :
#         return 'senior'
    

# df_train['Age_type']=df_train['Age'].apply(age_type)
# df_train

In [514]:
# Re-count missing values per column after imputation.
df_train.isna().sum()

In [515]:
# Derived features:
#   Relatives = siblings/spouses + parents/children
#   Age_Class = age multiplied by passenger class
df_train['Relatives'] = df_train['SibSp'].add(df_train['Parch'])
df_train['Age_Class'] = df_train['Age'].mul(df_train['Pclass'])
df_train

In [516]:
# Extract the title from a name written as "Surname, Title. Given names".
def get_title(name):
    """Return the title embedded in a passenger name.

    The title is the text between the first comma and the next period,
    with surrounding whitespace removed.

    Parameters
    ----------
    name : str
        Passenger name, expected in the form "Surname, Title. Given names".

    Returns
    -------
    str
        The extracted title, or 'Unknown' when ``name`` is not a string,
        contains no period, or contains no comma.
    """
    # Non-string input (e.g. a missing value) cannot be searched for '.'.
    if not isinstance(name, str) or '.' not in name:
        return 'Unknown'
    pieces = name.split(',')
    # Without a comma there is no "after the surname" part to read from;
    # indexing pieces[1] would raise IndexError.
    if len(pieces) < 2:
        return 'Unknown'
    return pieces[1].split('.')[0].strip()

# Sorted list of the distinct titles present in the training names.
titles = sorted(set(df_train['Name'].map(get_title)))

# Collapse rare titles into 'Mr', 'Mrs' or 'Miss'; other titles pass through.
def replace_titles(x):
    """Map a row's raw title onto a normalized title.

    Parameters
    ----------
    x : mapping-like row
        Must provide ``x['Title']``; ``x['Sex']`` is read only when the
        title is 'Dr'.

    Returns
    -------
    The normalized title: 'Mr', 'Mrs' or 'Miss' for the listed aliases,
    'Mr'/'Mrs' for 'Dr' depending on sex, otherwise the title unchanged.
    """
    alias_groups = (
        ('Mr', ('Capt', 'Col', 'Don', 'Jonkheer', 'Major', 'Rev', 'Sir')),
        ('Mrs', ('the Countess', 'Mme', 'Lady', 'Dona')),
        ('Miss', ('Mlle', 'Ms')),
    )
    raw = x['Title']
    for normalized, aliases in alias_groups:
        if raw in aliases:
            return normalized
    if raw == 'Dr':
        # A doctor's title carries no gender, so the sex column decides.
        return 'Mr' if x['Sex'] == 'male' else 'Mrs'
    return raw

# Extract the raw title of every passenger, then collapse it to the
# normalized set ('Mr', 'Mrs', 'Miss', plus titles left unchanged).
df_train['Title'] = df_train['Name'].map(get_title)
df_train['Title'] = df_train.apply(replace_titles, axis=1)

# One-hot encode the categorical columns. drop_first=False keeps one
# indicator column for every level of each column.
male, salutation, pcla, embark = (
    pd.get_dummies(df_train[source_col], drop_first=False)
    for source_col in ('Sex', 'Title', 'Pclass', 'Embarked')
)
#agecategory = pd.get_dummies(df_train['Age_type'],drop_first=False)

# Append the indicator columns (sex, title, class, port -- in that order)
# and remove the columns they were derived from.
df_train = pd.concat([df_train, male, salutation, pcla, embark], axis=1)
df_train.drop(columns=['Sex', 'Title', 'Pclass', 'Name', 'Embarked'], inplace=True)

# Give the indicator columns readable names.
df_train = df_train.rename(columns={
    'female': 'Female', 'male': 'Male',
    1: 'Class 1', 2: 'Class 2', 3: 'Class 3',
    'C': 'EmbarkC', 'Q': 'EmbarkQ', 'S': 'EmbarkS',
})

df_train

In [517]:
# x = df_train.values #returns a numpy array
# min_max_scaler = preprocessing.MinMaxScaler()
# x_scaled = min_max_scaler.fit_transform(x)
# df_train_scaled = pd.DataFrame(x_scaled)
# df_train_scaled

In [518]:
# df_train['Female'] = df_train['Female'].astype(int)
# df_train['Male'] = df_train['Male'].astype(int)
# df_train['Master'] = df_train['Master'].astype(int)
# df_train['Miss'] = df_train['Miss'].astype(int)
# df_train['Mr'] = df_train['Mr'].astype(int)
# df_train['Mrs'] = df_train['Mrs'].astype(int)
# df_train['Class 1'] = df_train['Class 1'].astype(int)
# df_train['Class 2'] = df_train['Class 2'].astype(int)
# df_train['Class 3'] = df_train['Class 3'].astype(int)
# df_train

# Create the scaler here: the notebook has no live assignment to `scaler`,
# so on a fresh kernel this cell failed with NameError. StandardScaler is the
# scaler class the notebook imports.
# NOTE(review): confirm StandardScaler (not MinMaxScaler) is the intended choice.
scaler = StandardScaler()

# Columns from position 7 onward (presumably the one-hot indicator columns --
# verify against the column order shown above).
indicator_cols = list(df_train.columns[7:])

# Assign through column labels instead of iloc: this replaces the columns, so
# the float output does not have to fit into the existing bool/int dtypes.
df_train[indicator_cols] = scaler.fit_transform(df_train[indicator_cols].astype(float))
df_train

In [519]:
# Create the scaler here: the only assignment to `scaler` in the notebook was
# commented out (#scaler = MinMaxScaler()), so this cell failed with NameError
# on a fresh kernel. StandardScaler is the scaler class the notebook imports.
# NOTE(review): confirm StandardScaler (not MinMaxScaler) is the intended choice.
scaler = StandardScaler()

# Columns at positions 1..6 (presumably Age, SibSp, Parch, Fare, Relatives and
# Age_Class -- verify against the column order).
numeric_cols = list(df_train.columns[1:7])

# Assign through column labels instead of iloc: this replaces the columns, so
# the float output does not have to fit into the existing integer dtypes.
df_train[numeric_cols] = scaler.fit_transform(df_train[numeric_cols])
df_train

# Alternatively, select the columns by label with loc:
#df_train.loc[:,['Age', 'SibSp', 'Parch', 'Fare']] = scaler.fit_transform(df_train.loc[:,['Age', 'SibSp', 'Parch', 'Fare']])

In [520]:
# Write the preprocessed training frame to disk, without the index column.
df_train.to_csv(path_or_buf='titanic_train_preprocessed2.csv', index=False)

In [521]:
# Read the raw submission (test) set from the CSV next to the notebook.
df_test = pd.read_csv(filepath_or_buffer='titanic_test_data.csv')

In [522]:
# Number of missing values in each column of the test frame.
df_test.isna().sum()

In [523]:
# Remove the columns that are not used by the rest of the notebook.
df_test.drop(columns=['PassengerId', 'Ticket', 'Cabin'], inplace=True)
df_test

In [524]:
# Impute missing ages and fares with the column mean, and missing ports with 'S'.
# Results are assigned back to the columns on purpose: calling
# fillna(..., inplace=True) on a df_test[...] selection is a chained in-place
# update, which emits a warning on recent pandas 2.x releases and does not
# modify df_test at all once copy-on-write is active.
# NOTE(review): the means are computed from the test set itself; consider
# reusing the statistics of the (unscaled) training data instead.
df_test['Age'] = df_test['Age'].fillna(df_test['Age'].mean())
common_value = 'S'
df_test['Embarked'] = df_test['Embarked'].fillna(common_value)
df_test['Fare'] = df_test['Fare'].fillna(df_test['Fare'].mean())
df_test

# # create Age categories
# def age_type(age):
#     if age>0 and age<=5:
#         return 'baby'
#     elif(age>=6 and age<=10):
#         return 'child'
#     elif(age>=11 and age<=19):
#         return 'teenager'
#     elif(age>=20 and age<=30):
#         return 'early adult'
#     elif(age>=31 and age<=45):
#         return 'adult'
#     elif(age>=46 and age<=60):
#         return 'late adult'
#     else :
#         return 'senior'
    

# df_test['Age_type']=df_test['Age'].apply(age_type)
# df_test

In [525]:
# Re-count missing values per column after imputation.
df_test.isna().sum()

In [526]:
# Derived features:
#   Relatives = siblings/spouses + parents/children
#   Age_Class = age multiplied by passenger class
df_test['Relatives'] = df_test['SibSp'].add(df_test['Parch'])
df_test['Age_Class'] = df_test['Age'].mul(df_test['Pclass'])
df_test

In [527]:
# Extract the title from a name written as "Surname, Title. Given names".
# NOTE(review): this redefines the get_title already defined in the
# training-data section; keep the two in sync or define it only once.
def get_title(name):
    """Return the title embedded in a passenger name.

    The title is the text between the first comma and the next period,
    with surrounding whitespace removed.

    Parameters
    ----------
    name : str
        Passenger name, expected in the form "Surname, Title. Given names".

    Returns
    -------
    str
        The extracted title, or 'Unknown' when ``name`` is not a string,
        contains no period, or contains no comma.
    """
    # Non-string input (e.g. a missing value) cannot be searched for '.'.
    if not isinstance(name, str) or '.' not in name:
        return 'Unknown'
    pieces = name.split(',')
    # Without a comma there is no "after the surname" part to read from;
    # indexing pieces[1] would raise IndexError.
    if len(pieces) < 2:
        return 'Unknown'
    return pieces[1].split('.')[0].strip()

# Sorted list of the distinct titles present in the test names.
titles = sorted(set(df_test['Name'].map(get_title)))

# Collapse rare titles into 'Mr', 'Mrs' or 'Miss'; other titles pass through.
def replace_titles(x):
    """Map a row's raw title onto a normalized title.

    Parameters
    ----------
    x : mapping-like row
        Must provide ``x['Title']``; ``x['Sex']`` is read only when the
        title is 'Dr'.

    Returns
    -------
    The normalized title: 'Mr', 'Mrs' or 'Miss' for the listed aliases,
    'Mr'/'Mrs' for 'Dr' depending on sex, otherwise the title unchanged.
    """
    alias_groups = (
        ('Mr', ('Capt', 'Col', 'Don', 'Jonkheer', 'Major', 'Rev', 'Sir')),
        ('Mrs', ('the Countess', 'Mme', 'Lady', 'Dona')),
        ('Miss', ('Mlle', 'Ms')),
    )
    raw = x['Title']
    for normalized, aliases in alias_groups:
        if raw in aliases:
            return normalized
    if raw == 'Dr':
        # A doctor's title carries no gender, so the sex column decides.
        return 'Mr' if x['Sex'] == 'male' else 'Mrs'
    return raw

# Extract the raw title of every passenger, then collapse it to the
# normalized set ('Mr', 'Mrs', 'Miss', plus titles left unchanged).
df_test['Title'] = df_test['Name'].map(get_title)
df_test['Title'] = df_test.apply(replace_titles, axis=1)

# One-hot encode the categorical columns. drop_first=False keeps one
# indicator column for every level of each column.
male, salutation, pcla, embark = (
    pd.get_dummies(df_test[source_col], drop_first=False)
    for source_col in ('Sex', 'Title', 'Pclass', 'Embarked')
)
#agecategory = pd.get_dummies(df_test['Age_type'],drop_first=False)

# Append the indicator columns (sex, title, class, port -- in that order)
# and remove the columns they were derived from.
df_test = pd.concat([df_test, male, salutation, pcla, embark], axis=1)
df_test.drop(columns=['Sex', 'Title', 'Pclass', 'Name', 'Embarked'], inplace=True)

# Give the indicator columns readable names.
df_test = df_test.rename(columns={
    'female': 'Female', 'male': 'Male',
    1: 'Class 1', 2: 'Class 2', 3: 'Class 3',
    'C': 'EmbarkC', 'Q': 'EmbarkQ', 'S': 'EmbarkS',
})

df_test

In [528]:
# Create the scaler here: the notebook has no live assignment to `scaler`,
# so on a fresh kernel this cell failed with NameError. StandardScaler is the
# scaler class the notebook imports.
# NOTE(review): fit_transform refits on the test data, so train and test are
# scaled with different statistics; consider fitting on the training data only
# and calling transform() here.
scaler = StandardScaler()

# Columns from position 6 onward (presumably the one-hot indicator columns --
# verify against the column order shown above).
indicator_cols = list(df_test.columns[6:])

# Assign through column labels instead of iloc: this replaces the columns, so
# the float output does not have to fit into the existing bool/int dtypes.
df_test[indicator_cols] = scaler.fit_transform(df_test[indicator_cols].astype(float))
df_test

In [529]:
# Create the scaler here: the notebook has no live assignment to `scaler`,
# so on a fresh kernel this cell failed with NameError. StandardScaler is the
# scaler class the notebook imports.
# NOTE(review): fit_transform refits on the test data, so train and test are
# scaled with different statistics; consider fitting on the training data only
# and calling transform() here.
scaler = StandardScaler()

# Columns at positions 0..5 (presumably Age, SibSp, Parch, Fare, Relatives and
# Age_Class -- verify against the column order).
numeric_cols = list(df_test.columns[0:6])

# Assign through column labels instead of iloc: this replaces the columns, so
# the float output does not have to fit into the existing integer dtypes.
df_test[numeric_cols] = scaler.fit_transform(df_test[numeric_cols])
df_test

In [530]:
# Write the preprocessed submission frame to disk, without the index column.
df_test.to_csv(path_or_buf='titanic_test_preprocessed2.csv', index=False)