# Titanic
Achieving 86% accuracy on a held-out test split, using an XGBoost classifier to predict whether a passenger survived the Titanic disaster.

In [None]:
import pandas as pd
import numpy as np
from math import ceil

%matplotlib inline

In [None]:
# train.csv -> df (used for fitting and evaluation), test.csv -> kaggle (used for the submission)
df, kaggle = map(pd.read_csv, ('train.csv', 'test.csv'))

df.head()

## Feature Engineering
### Creating has_cabin

In [None]:
# does the first letter in cabin correlate to floor of ship?
# Missing cabins are labelled 'Z' so they form their own group.
cabin_letter = df['Cabin'].fillna('Z').apply(lambda cabin: cabin[0])
grp = df.groupby(cabin_letter)

# It looks like passengers without a cabin were significantly less likely to survive
survived_by_cabin = grp['Survived']
survived_by_cabin.sum() / survived_by_cabin.count()

In [None]:
def create_has_cabin(cabin):
    """Return 1 when a Cabin value is present and 0 when it is null (NaN/None).

    Used to build a column that records whether or not Cabin is NaN.
    """
    return int(pd.notnull(cabin))

# Add the has_cabin flag to both the training frame and the Kaggle frame
for frame in (df, kaggle):
    frame['has_cabin'] = frame['Cabin'].apply(create_has_cabin)
df.head()

### Binarizing Sex

In [None]:
# Distinct values present in the Sex column
df['Sex'].unique()

In [None]:
# Encoding for Sex: male -> 1, female -> 0
map_sex = {'male': 1, 'female': 0}

# pop() removes 'Sex' from the frame and hands back the column, which is then
# mapped into the new is_male column.
for frame in (df, kaggle):
    frame['is_male'] = frame.pop('Sex').map(map_sex)

df.head()

### Creating has_age

In [None]:
def create_has_age(age):
    """Return 0 when Age is null (NaN/None) and 1 otherwise.

    Used to build a column that records whether or not Age is NaN.
    """
    return 0 if pd.isnull(age) else 1

# Add the has_age flag to both the training frame and the Kaggle frame
for frame in (df, kaggle):
    frame['has_age'] = frame['Age'].apply(create_has_age)

df.head()

In [None]:
# Survival rate split by whether an age was recorded
grp = df.groupby(df['has_age'])
survived_by_has_age = grp['Survived']

# passengers who have an age are more likely to have survived
survived_by_has_age.sum() / survived_by_has_age.count()

### Dealing with null Ages

In [None]:
def create_age_bin(age, bin_size=5, null_age_default=42):
    """Bucket an age into the upper edge of its ``bin_size``-wide bin.

    Parameters
    ----------
    age : number, NaN or None
        The passenger's age. Null values (NaN, None) are accepted.
    bin_size : number, default 5
        Width of each bin. An age is rounded *up* to the next multiple of
        ``bin_size`` (an age that is already a multiple stays unchanged).
    null_age_default : default 42
        Value returned for a null age. With the default ``bin_size`` of 5
        every real bin is a multiple of 5, so 42 cannot collide with one;
        pick a different sentinel if you change ``bin_size`` so that it can.

    Returns
    -------
    The bin's upper edge, or ``null_age_default`` when ``age`` is null.
    """
    # pd.isnull also covers None, which np.isnan would reject with a TypeError
    if pd.isnull(age):
        return null_age_default
    return ceil(age / bin_size) * bin_size

# Add the age_bin column to both the training frame and the Kaggle frame
for frame in (df, kaggle):
    frame['age_bin'] = frame['Age'].apply(create_age_bin)

df.head()

In [None]:
# Sorted list of the distinct age_bin values present in the training data.
# Derived from df instead of being pasted in by hand, so it stays correct if the
# binning or the data change.
my_list = sorted(df['age_bin'].unique().tolist())

print(my_list)

In [None]:
# Histogram of age_bin, drawn separately for each is_male value
df.hist(column='age_bin', by='is_male')

In [None]:
def get_survival_probability(group_df):
    """Return the fraction of rows in ``group_df`` whose 'Survived' value is 1.

    Raises ZeroDivisionError when ``group_df`` has no rows.
    """
    survived_mask = group_df['Survived'] == 1
    # int() keeps the division in plain Python numbers
    return int(survived_mask.sum()) / len(group_df)

# Survival probability for every (is_male, age_bin) combination, shown as a bar chart
sex_age_groups = df.groupby(['is_male', 'age_bin'])
series = sex_age_groups.apply(get_survival_probability)
series.plot(kind='bar')

### Creating is_child

In [None]:
def create_is_child(age):
    """Return 1 when ``age`` is known and at most 20, otherwise 0.

    A NaN age is treated as "not a child" and yields 0.
    """
    age_is_known = not np.isnan(age)
    return int(age_is_known and age <= 20)

# Add the is_child flag to both the training frame and the Kaggle frame
for frame in (df, kaggle):
    frame['is_child'] = frame['Age'].apply(create_is_child)

df.head()

### Creating Dummies for Pclass

In [None]:
# One-hot encode Pclass into class_<level> indicator columns.
#
# The category levels are taken from the training data and applied to BOTH frames,
# so the Kaggle frame always gets exactly the same dummy columns as the training
# frame, even if one of the levels never occurs in it.
class_dtype = pd.CategoricalDtype(sorted(df['Pclass'].dropna().unique()))

class_dummies = pd.get_dummies(df['Pclass'].astype(class_dtype), prefix='class')
kaggle_class_dummies = pd.get_dummies(kaggle['Pclass'].astype(class_dtype), prefix='class')

# Drop any dummy columns left over from a previous run of this cell before adding
# them again; otherwise re-running would produce duplicate class_* columns.
df = pd.concat(
    [df.drop(columns=class_dummies.columns, errors='ignore'), class_dummies],
    axis=1,
)
kaggle = pd.concat(
    [kaggle.drop(columns=kaggle_class_dummies.columns, errors='ignore'), kaggle_class_dummies],
    axis=1,
)

df.head()

### Creating family_size and family_bucket

In [None]:
# See if the person was traveling alone:
# family_size = parents/children + siblings/spouses + the passenger themselves
for frame in (df, kaggle):
    frame['family_size'] = frame['Parch'] + frame['SibSp'] + 1

grp = df.groupby(df['family_size'])
survived_by_family_size = grp['Survived']

# if you traveled alone, you had a 30% chance of survival
survived_by_family_size.sum() / survived_by_family_size.count()

In [None]:
# Number of passengers in each family_size group
grp['Survived'].count()

In [None]:
def create_family_buckets(family_size):
    """Map a family size onto one of three buckets.

    Returns 1 for a family size of exactly 1, 2 for any other size up to
    and including 4, and 3 for everything else.
    """
    if family_size == 1:
        bucket = 1
    else:
        bucket = 2 if family_size <= 4 else 3
    return bucket

df['family_bucket'] = df['family_size'].apply(create_family_buckets)
kaggle['family_bucket'] = kaggle['family_size'].apply(create_family_buckets)

# create_family_buckets only ever returns 1, 2 or 3. Fixing those levels up front
# guarantees that both frames get the same family_bucket_1/2/3 columns, even if
# one of the buckets happens to be empty in one of them.
family_bucket_dtype = pd.CategoricalDtype([1, 2, 3])

family_dummies = pd.get_dummies(
    df['family_bucket'].astype(family_bucket_dtype), prefix='family_bucket'
)
kaggle_family_dummies = pd.get_dummies(
    kaggle['family_bucket'].astype(family_bucket_dtype), prefix='family_bucket'
)

# Drop any dummy columns left over from a previous run of this cell before adding
# them again; otherwise re-running would produce duplicate family_bucket_* columns.
df = pd.concat(
    [df.drop(columns=family_dummies.columns, errors='ignore'), family_dummies],
    axis=1,
)
kaggle = pd.concat(
    [kaggle.drop(columns=kaggle_family_dummies.columns, errors='ignore'), kaggle_family_dummies],
    axis=1,
)

In [None]:
# Preview the training frame, including the family_bucket columns
df.head()

### Deal With Fare

In [None]:
# Distribution of Fare in the training data, using 20 bins
df['Fare'].hist(bins=20)

In [None]:
def cheap_ticket(fare):
    """Return 1 when ``fare`` is strictly below 80, otherwise 0.

    A NaN fare compares as not-below-80 and therefore yields 0.
    """
    return 1 if fare < 80 else 0

# Add the cheap_ticket flag to both the training frame and the Kaggle frame
for frame in (df, kaggle):
    frame['cheap_ticket'] = frame['Fare'].apply(cheap_ticket)

df.head()

In [None]:
# Does having a cheap ticket affect survival rate?
grp = df.groupby(df['cheap_ticket'])
survived_by_ticket = grp['Survived']

# It looks like passengers who bought a cheap ticket were significantly less likely to survive
survived_by_ticket.sum() / survived_by_ticket.count()

### Splitting into Train/Test

In [None]:
# Making sure we have target values for each of our data points
df['Survived'].unique()

The input to our model will be a subset of our numeric columns; the target will be 'Survived'.

In [None]:
from sklearn.model_selection import train_test_split

# Feature columns fed to the model, in the order the model will see them
model_input_columns = [
    'Fare',
    'family_bucket_1', 'family_bucket_2', 'family_bucket_3',
    'has_cabin',
    'is_male',
    'has_age',
    'age_bin',
    'is_child',
    'class_1', 'class_2', 'class_3',
]

X = df[model_input_columns]
y = df['Survived']

# Hold out 25% of the rows for evaluation; random_state=0 fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.25, random_state=0
)

## xgboost Model Creation

In [None]:
from xgboost import XGBClassifier

In [None]:
# Hyper-parameters passed to the classifier
xgb_params = {
    'n_estimators': 100,
    'max_depth': 3,
    'learning_rate': .1,
    'subsample': .9,
    'colsample_bytree': 1,
    'gamma': 1,
}

xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy on the same rows the model was fitted on (X_train / y_train)
y_pred = xgb.predict(X_train)
accuracy = accuracy_score(y_train, y_pred)

print("Accuracy: %.2f%%" % (accuracy * 100.0))

## xgboost Model Evaluation

In [None]:
# Predict on the held-out split; every predicted value is passed through round()
y_pred = xgb.predict(X_test)
predictions = list(map(round, y_pred))

# Accuracy of the rounded predictions against the held-out targets
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
# confusion matrix
from sklearn.metrics import confusion_matrix

# Keep the result under its own name. Assigning it back to `confusion_matrix`
# would replace the imported function with an array for the rest of the session.
# With labels=[0, 1], cm[i][j] counts rows whose true label is i and whose
# predicted label is j (0 = dead, 1 = survived).
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
print("Dead Correctly Classified as Dead:\t\t"+str(cm[0][0]))
print("Survivors Incorrectly Classified as Dead:\t"+str(cm[1][0]))
print("Dead Incorrectly Classified as Survivors:\t"+str(cm[0][1]))
print("Survivors Correctly Classified as Survivors:\t"+str(cm[1][1]))

## Kaggle Competition Submission
Running our model on the dataset in test.csv and exporting the results to CSV

In [None]:
# Preview the Kaggle frame before running the model on it
kaggle.head()

In [None]:
# Select the model's feature columns and store the predictions as 'Survived'
kaggle_X = kaggle.loc[:, model_input_columns]
kaggle['Survived'] = xgb.predict(kaggle_X)

In [None]:
# Only the passenger id and the predicted label go into the submission
submission_columns = ['PassengerId', 'Survived']
submission = kaggle.loc[:, submission_columns]

In [None]:
# Write the submission frame to kaggle-submission.csv, leaving out the index column
submission.to_csv('kaggle-submission.csv', index=False)