In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pandas as pd
import os
import hashlib
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler
filepath = os.path.join(os.getcwd(),"Data")
train_initial = pd.DataFrame.from_csv(os.path.join(filepath,"train.csv"), index_col = None)
"Train (Rows, Columns): " + str(train_initial.shape)
survival = pd.DataFrame.from_csv(os.path.join(filepath,"gender_submission.csv"), index_col = None)
test_initial = pd.DataFrame.from_csv(os.path.join(filepath,"test.csv"), index_col = None)
test_initial = survival.merge(test_initial, on="PassengerId")
"Test (Rows, Columns): " + str(test_initial.shape)
combined_initial = train_initial.append(test_initial, ignore_index=True)
"Train & Test (Rows, Columns): " + str(combined_initial.shape)

'Train & Test (Rows, Columns): (1309, 12)'

Instead of relying on Kaggle's partitioning of the testing and training sets, we'll combine the sets together and do custom splits for cross-validation. There are 1309 rows, with 12 variables. Let's have a look.

In [2]:
combined_initial.head(n=7)
combined_initial.isnull().sum()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S


PassengerId       0
Survived          0
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64


There are a couple of variables that are of questionable value for surivival prediction.

PassengerID is just an index, and since it was assigned long after the sinking of the Titanic, it will not have any predictive power.

A passenger's name should not have any effect on survival. While names can give us information on the gender of the passenger (and an interesting task that's best suited for another analysis), there's already an pre-existing gender variable.

The ticket variable is probably the most confusing. There doesn't seem to be any kind of clear idea of what information the ticket variable is supposed to have. Some tickets are just digits, and others have a combination of letters, punctuation and numbers. It could be that the ticket number represents the order in which the passengers purchaes their tickets, which could have some predictive value, but without further information on how to extract meaning from the ticket variable, it should not be used as a predictive variable.

There are some other variables that might present issues besides the lack of predictive power.

There are passengers missing ages, cabin numbers, and to a lesser extent, fare and embarkation location.

More than 75% of passengers are missing cabin numbers. Cabin numbers might indicate what section the passenger is located in (the first letter) but with 75% of labels missing, the value that it provides is dubious; imputating 75% of passenger's cabin numbers based on the remaining 25% might lead to some spurious results. Also worth nothing, is that passenger class("Pclass") can be an excellent proxy for ship location, as the different passenger classes were housed in different, seperate sections of the ship.

The missing passenger age problem is a little more tractable, with only 20% of passengers missing age labels. We can attempt age label imputation with confidence. And as for the handful of missing fare/embarkation location labels, we can just use the most common value to fill them in. 

In [3]:
combined_initial["Embarked"] = combined_initial["Embarked"].fillna(combined_initial["Embarked"].mode())
combined_initial["Fare"] = combined_initial["Fare"].fillna(combined_initial["Fare"].mode()[0])

We'll also turn our categorical variables in dummy variables so they can be processed/interpreted better by our models that we'll build. 

In [4]:
drop_sex = "Sex_" + combined_initial["Sex"].mode()[0]
drop_embarked = "Embarked_" + combined_initial["Embarked"].mode()[0]
drop_cols = ["PassengerId", "Name", "Ticket", "Cabin", "Split"]
combined_initial = pd.get_dummies(data=combined_initial, columns=["Sex", "Embarked"])
combined_initial = combined_initial.drop([drop_sex, drop_embarked], axis=1)

# Partitioning the data into training/testing sets

Let's split the dataset: 80% for training, 20% for testing. 
We'll use the passenger's name concatenated with their ticket number(since there could be duplicate ticket numbers and names) to "fingerprint" the data by putting it through a hash function: if any new instances of data are added, the existing data won't get shuffled around the training/testing datasets - they'll stay in the same sets.

While we could have used the PassengerID index, if any other data instances are added, they must be added to the end of the data, otherwise the existing training/testing sets will be shuffled around. 

In [5]:
def test_split(id, seed, test_proportion):
    if type(test_proportion) not in [float, int] or test_proportion > 1 or test_proportion < 0:
        raise ValueError("Test proportion must be a real number between 0 and 1")
    test = str(id) + str(seed)
    test_digest = hashlib.md5(test.encode("ascii")).hexdigest()
    test_hex = int(test_digest[-6:], 16) #last 6 digits only
    split = test_hex/0xFFFFFF
    if split > test_proportion:
        return 0
    else:
        return 1
    
    
combined_initial["Split"] = (combined_initial["Name"] + combined_initial["Ticket"]).map(lambda x: test_split(id = x, seed = 42, test_proportion = 0.20))

train = combined_initial.loc[(combined_initial.Split == 0)]
test = combined_initial.loc[(combined_initial.Split == 1)]
train = train.drop(drop_cols, axis=1)
test = test.drop(drop_cols, axis=1)

train.shape
test.shape
train.isnull().sum()
test.isnull().sum()
train.head(n=7)

(1049, 9)

(260, 9)

Survived        0
Pclass          0
Age           210
SibSp           0
Parch           0
Fare            0
Sex_female      0
Embarked_C      0
Embarked_Q      0
dtype: int64

Survived       0
Pclass         0
Age           53
SibSp          0
Parch          0
Fare           0
Sex_female     0
Embarked_C     0
Embarked_Q     0
dtype: int64

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Embarked_C,Embarked_Q
0,0,3,22.0,1,0,7.25,0,0,0
1,1,1,38.0,1,0,71.2833,1,1,0
2,1,3,26.0,0,0,7.925,1,0,0
3,1,1,35.0,1,0,53.1,1,0,0
7,0,3,2.0,3,1,21.075,0,0,0
8,1,3,27.0,0,2,11.1333,1,0,0
9,1,2,14.0,1,0,30.0708,1,1,0


We'll need to find a way to fill in the missing age data. Let's use a K-Nearest Neighbors regression to estimate passenger age based on all the other remaining columns. 

In [6]:
combined_age = combined_initial.copy()
combined_age = combined_age.dropna(subset="Cabin") #this drops cabin!
combined_age["Split"] = (combined_age["Name"] + combined_age["Ticket"]).map(lambda x: test_split(id = x, seed = 123, test_proportion = 0.20))
age_train = combined_age.loc[(combined_age.Split == 0)]
age_test = combined_age.loc[(combined_age.Split == 1)]
age_train.head(n=7)
# age_train = age_train.drop(drop_cols, axis=1)
# age_test = age_test.drop(drop_cols, axis=1)

TypeError: Index(...) must be called with a collection of some kind, 'Cabin' was passed