In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re

In [None]:
# Download the Titanic passenger table from OpenML.
from sklearn import datasets
from sklearn.datasets import fetch_openml

# NOTE: return_X_y=True makes fetch_openml hand back a (data, target) pair,
# so despite its name `df_test` holds the target column (presumably the
# survival labels -- verify), not a held-out test split.
df_train, df_test = fetch_openml(
    name="titanic",
    version=1,
    as_frame=True,
    return_X_y=True,
)

In [None]:
#We need to do some data processing on this data.
#Our analysis will require numeric values, not strings.
#We also need to clean the data for the NaN / null values.
#Much of the following code borrows from towardsdatascience.com.

# Fill the gaps in "age" (with the column median) and in "embarked" (with "S").
age_median = df_train["age"].median()
df_train["age"] = df_train["age"].fillna(age_median)

fallback_port = "S"
df_train["embarked"] = df_train["embarked"].fillna(fallback_port)

# Family-size features.
#   relatives : sum of the "sibsp" and "parch" columns
#   not_alone : 1 when relatives > 0 (the passenger has company), otherwise 0,
#               so that the values agree with the column name.
df_train["relatives"] = df_train["sibsp"] + df_train["parch"]
df_train["not_alone"] = (df_train["relatives"] > 0).astype(int)

# Derive a numeric "deck" code from the cabin string.
# The first run of letters found in the cabin value is looked up in `deck`:
#   - missing cabins are filled with "U0" and therefore get code 8,
#   - letter runs that are not in the lookup get code 0,
#   - cabins containing no letters at all also get code 0 (str.extract yields
#     NaN for them instead of raising).
deck = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": 8}

df_train["cabin"] = df_train["cabin"].fillna("U0")
deck_letters = df_train["cabin"].str.extract(r"([a-zA-Z]+)", expand=False)
df_train["deck"] = deck_letters.map(deck).fillna(0).astype(int)

# Fallback imputation for any ages that are still missing, then cast to int.
# The median fill earlier in this cell normally leaves nothing to impute, so
# the random branch below is only a safety net. When it does run, it draws
# whole-number ages from [mean - std, mean + std) using a seeded generator so
# that a fresh "Restart & Run All" reproduces the same values.
missing_age = df_train["age"].isna()
n_missing = int(missing_age.sum())
if n_missing:
    age_mean = df_train["age"].mean()
    age_std = df_train["age"].std()
    rng = np.random.default_rng(42)
    df_train.loc[missing_age, "age"] = rng.integers(
        int(age_mean - age_std),   # explicit integer bounds
        int(age_mean + age_std),
        size=n_missing,
    )

# astype(int) raises if a NaN survived, so no separate null check is needed.
df_train["age"] = df_train["age"].astype(int)

# Fares: treat a missing fare as 0 and keep only the whole-number part.
# NOTE(review): this truncation happens before the fare-binning cell, whose
# first cut points (7.91 and 14.454) are fractional -- confirm that dropping
# the decimals first is intended.
df_train["fare"] = df_train["fare"].fillna(0).astype(int)

# Encode sex as an integer: male -> 0, female -> 1.
genders = {"male": 0, "female": 1}

# The explicit cast guarantees a plain integer column (fetch_openml with
# as_frame=True may deliver "sex" as a pandas categorical, which .map() alone
# would keep categorical -- TODO confirm) and makes any value missing from
# `genders` raise here instead of silently becoming NaN.
df_train["sex"] = df_train["sex"].map(genders).astype(int)



# Extract each passenger's title from "name" and encode it as an integer.
titles = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

# Uncommon titles are pooled under "Rare"; spelling variants are folded into
# their common form. One lookup replaces several successive .replace() passes.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
               'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_aliases = dict.fromkeys(rare_titles, 'Rare')
title_aliases.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

# Raw string: the pattern contains "\.", which is an invalid escape sequence
# in a normal string literal (DeprecationWarning / SyntaxWarning).
# The title is the run of letters that follows a space and precedes a period.
raw_titles = df_train["name"].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Titles absent from `titles` (and names with no match) fall back to 0; the
# final cast keeps the column integer-typed like the other encoded features.
df_train["title"] = (
    raw_titles
    .replace(title_aliases)
    .map(titles)
    .fillna(0)
    .astype(int)
)

# The raw name is no longer needed once the title has been extracted.
df_train = df_train.drop(columns=["name"])

# Encode the port of embarkation as an integer: S -> 0, C -> 1, Q -> 2.
ports = {"S": 0, "C": 1, "Q": 2}

# The explicit cast guarantees a plain integer column (fetch_openml with
# as_frame=True may deliver "embarked" as a pandas categorical, which .map()
# alone would keep categorical -- TODO confirm) and makes any value missing
# from `ports` raise here instead of silently becoming NaN.
df_train["embarked"] = df_train["embarked"].map(ports).astype(int)


# Remove the raw text columns that are not kept as features:
#   cabin  - superseded by the numeric "deck" code
#   ticket - too many distinct types to be of use
df_train = df_train.drop(columns=["cabin", "ticket"])

# Processing for this cell is finished; display the frame for a visual check.
df_train


In [None]:
# Bucket "age" and "fare" into small ordinal codes, like the other features.
#
# Bin edges are right-closed: pd.cut assigns a value v to code k when
# edges[k] < v <= edges[k + 1].
#   age  codes: 0 (<=11), 1 (12-18), 2 (19-22), 3 (23-27), 4 (28-33),
#               5 (34-40), 6 (>40)
#   fare codes: 0 (<=7.91), 1 (<=14.454), 2 (<=31), 3 (<=99), 4 (<=250),
#               5 (>250)
#
# NOTE: this cell is not idempotent. Running it a second time would bin the
# codes themselves, and every code falls into the first bin. Re-run the
# preprocessing cell first if this cell must be executed again.
AGE_BIN_EDGES = [-np.inf, 11, 18, 22, 27, 33, 40, np.inf]
FARE_BIN_EDGES = [-np.inf, 7.91, 14.454, 31, 99, 250, np.inf]

# Ages are truncated to whole years before binning.
age_years = df_train["age"].astype(int)
df_train["age"] = pd.cut(age_years, bins=AGE_BIN_EDGES, labels=False).astype(int)

# labels=False returns the integer bin index; astype(int) raises if a fare
# was missing (NaN) rather than leaving an unbinned value behind.
df_train["fare"] = pd.cut(df_train["fare"], bins=FARE_BIN_EDGES, labels=False).astype(int)

# Display the binned frame for a visual check.
df_train