# Kaggle Titanic Dataset

In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import sklearn
from sklearn import preprocessing
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV,StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC,SVR

In [None]:
# Locations of the Kaggle Titanic train/test CSV files.
# Both paths are relative, so they resolve against the kernel's current
# working directory.
TRAIN_PATH = 'Desktop/data/kaggle_titanic/train.csv'
TEST_PATH = 'Desktop/data/kaggle_titanic/test.csv'

In [4]:
#uncomment for full dataset output
#pd.set_option('display.max_rows',None,'display.max_columns',None)
#pd.set_option('display.width', 1000)
#pd.reset_option('all')

## Data pre-processing
### 1. Load the dataset (Source: https://www.kaggle.com/c/titanic/data)
### 2. Extracting useful information from columns
### 3. Filling in NA values and dropping redundant columns
### 4. Scaling the dataset (min-max normalization)

In [5]:
# Read the train CSV first, then the test CSV, into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH)
)

In [6]:
#pull the honorific out of 'Name' and collapse it into a broader title group
def map_titles(df):
    """Add a grouped 'Title' column to ``df`` in place.

    The raw title is the text between the first comma of 'Name' and the
    following period, with surrounding whitespace removed (for example
    ``"Braund, Mr. Owen Harris"`` gives ``"Mr"``). The raw title is then
    replaced by its group: Officer, Royalty, Mrs, Miss, Mr or Master.
    Raw titles that are not listed below end up as NaN.

    Args:
        df: DataFrame with a 'Name' column; it is modified in place.

    Returns:
        None.
    """
    # Listed group-first; inverted below into a raw-title -> group lookup.
    members_by_group = {
        "Officer": ("Capt", "Col", "Major", "Dr", "Rev"),
        "Royalty": ("Jonkheer", "Don", "Sir", "the Countess", "Dona", "Lady"),
        "Mrs": ("Mme", "Ms", "Mrs"),
        "Miss": ("Mlle", "Miss"),
        "Mr": ("Mr",),
        "Master": ("Master",),
    }
    group_of = {
        raw: group
        for group, raws in members_by_group.items()
        for raw in raws
    }

    def raw_title(full_name):
        after_comma = full_name.split(',')[1]
        return after_comma.split('.')[0].strip()

    df['Title'] = df['Name'].apply(raw_title).map(group_of)


In [7]:
#pull the raw honorific out of every entry of the 'Name' column
def get_title(df):
    """Return the raw title of each passenger as a Series.

    The title is the text between the first comma of 'Name' and the
    following period, stripped of surrounding whitespace. ``df`` itself is
    not modified.
    """
    def _extract(full_name):
        after_comma = full_name.split(',')[1]
        before_period = after_comma.split('.')[0]
        return before_period.strip()

    return df['Name'].apply(_extract)

#collapse raw titles into a handful of broader groups
def map_new_titles(df):
    """Return the 'Title' column of ``df`` translated into title groups.

    Each raw title is looked up in a fixed table (Officer, Royalty, Mrs,
    Miss, Mr, Master); titles missing from the table come back as NaN.
    ``df`` itself is not modified.
    """
    officer = dict.fromkeys(
        ["Capt", "Col", "Major", "Dr", "Rev"], "Officer"
    )
    royalty = dict.fromkeys(
        ["Jonkheer", "Don", "Sir", "the Countess", "Dona", "Lady"], "Royalty"
    )
    common = {
        "Mme": "Mrs",
        "Ms": "Mrs",
        "Mrs": "Mrs",
        "Mlle": "Miss",
        "Miss": "Miss",
        "Mr": "Mr",
        "Master": "Master",
    }
    lookup = {**officer, **royalty, **common}

    return df['Title'].map(lookup)

In [8]:
def fill_null_age(df):
    """Fill missing 'Age' values in place with a per-group median.

    Each missing age is replaced by the median age of the rows that share
    the same ('Pclass', 'Title') pair within ``df``. Rows whose group has no
    known age at all keep their NaN.

    Args:
        df: DataFrame with 'Pclass', 'Title' and 'Age' columns; it is
            modified in place.

    Returns:
        None. As a side effect the index of ``df`` is reset to 0..n-1.
    """
    # Select 'Age' *before* aggregating: taking the median of the whole frame
    # raises TypeError on text columns such as 'Name' in pandas >= 2.0.
    # transform() returns one value per row, aligned with df's own index, so
    # no temporary MultiIndex is needed to line the medians up.
    group_median = df.groupby(['Pclass', 'Title'])['Age'].transform('median')

    # Assign the result back instead of calling fillna(inplace=True) on
    # df['Age']: the chained in-place form is deprecated and does not update
    # df under copy-on-write.
    df['Age'] = df['Age'].fillna(group_median)

    # Kept for backward compatibility with the previous implementation, which
    # always left the frame with a fresh positional index.
    df.reset_index(drop=True, inplace=True)

In [9]:
# Add the grouped 'Title' column to both frames. map_titles works in place
# and returns nothing, so the results are not reassigned.
map_titles(train_data)
map_titles(test_data)

In [10]:
# Impute missing ages in place. Each frame is filled from its own
# (Pclass, Title) medians, so the 'Title' column must already exist.
fill_null_age(train_data)
fill_null_age(test_data)

In [11]:
# Remove the 'Cabin', 'Name' and 'Ticket' columns from both frames
# ('Title' has already been derived from 'Name' at this point).
train_data.drop(columns = ['Cabin','Name','Ticket'], inplace = True)
test_data.drop(columns = ['Cabin','Name','Ticket'],inplace =True)
# Discard rows with no 'Embarked' value. The index is not reset afterwards,
# so it keeps gaps where rows were removed.
# NOTE(review): dropping rows from test_data would also drop those passengers
# from any predictions made later - confirm this is intended.
train_data.dropna(subset = ['Embarked'],inplace=True)
test_data.dropna(subset = ['Embarked'], inplace= True)

In [12]:
# Encode 'Sex' as integer labels. The encoder is fitted on the training data
# only and then reused, so both frames share the same label -> integer mapping.
le = preprocessing.LabelEncoder()
train_data['Sex'] = le.fit_transform(train_data['Sex'])
test_data['Sex'] = le.transform(test_data['Sex'])

# One-hot encode the remaining categorical columns.
train_data = pd.get_dummies(train_data, columns=['Title', 'Embarked'])
test_data = pd.get_dummies(test_data, columns=['Title', 'Embarked'])

# get_dummies only creates columns for categories that actually occur in the
# frame it is given, so the two frames can end up with different dummy columns.
# Align test to train's feature columns (everything except the 'Survived'
# label): a category absent from test becomes an all-zero column, and a
# category never seen in training is dropped.
test_data = test_data.reindex(
    columns=[col for col in train_data.columns if col != 'Survived'],
    fill_value=0,
)

In [16]:
# Min-max scale the feature columns. The per-column ranges are learned from the
# training data only and reused for the test data, so test values can fall
# outside the training range.
scaler = MinMaxScaler()

# Select features by name rather than by position: the identifier and label
# columns are skipped explicitly, and the very same column list is used for
# both frames, so a missing test column raises a KeyError instead of being
# silently scaled with another column's range.
feature_cols = [
    col for col in train_data.columns
    if col not in ('PassengerId', 'Survived')
]

# Assign by column label so each column is replaced by its float result;
# writing floats through .iloc into integer/boolean columns is deprecated
# in recent pandas.
train_data[feature_cols] = scaler.fit_transform(train_data[feature_cols])
test_data[feature_cols] = scaler.transform(test_data[feature_cols])

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty,Embarked_C,Embarked_Q,Embarked_S
0,1,0,1.0,1.0,0.271174,0.125,0.000000,0.014151,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,1,0.0,0.0,0.472229,0.125,0.000000,0.139136,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,3,1,1.0,0.0,0.321438,0.000,0.000000,0.015469,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,1,0.0,0.0,0.434531,0.125,0.000000,0.103644,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,5,0,1.0,1.0,0.434531,0.000,0.000000,0.015713,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,0.5,1.0,0.334004,0.000,0.000000,0.025374,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
887,888,1,0.0,0.0,0.233476,0.000,0.000000,0.058556,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
888,889,0,1.0,0.0,0.220910,0.125,0.333333,0.045771,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
889,890,1,0.0,1.0,0.321438,0.000,0.000000,0.058556,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
