# Titanic - Machine Learning from Disaster
This notebook provides end to end steps in solving the Titanic Problem

In [155]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action = 'ignore')

Reading the data from csv files.

In [156]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

train_labels = train["Survived"]
train_data = train.drop(["Survived"], axis = 1)

NO_SAMPLES = train_data.shape[0]
NO_FEATURES = train_data.shape[1]
FEATURES = train_data.columns.to_numpy()

TotalData = pd.concat([train_data, test], axis=0)

print(TotalData.shape)

print(f"{'Number of Train Samples:'.ljust(25)}{NO_SAMPLES}\n{'Number of Features:'.ljust(25)}{NO_FEATURES}")
print(f"{'Features:'.ljust(15)}{FEATURES}")

(1309, 11)
Number of Train Samples: 891
Number of Features:      11
Features:      ['PassengerId' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch' 'Ticket' 'Fare'
 'Cabin' 'Embarked']


Below are the utility functions

In [157]:
def check_unique_count(df: pd.DataFrame):
    print(df.nunique(axis=0))

def check_missing_count(df: pd.DataFrame) -> dict:
    return {column: df[column].isna().sum() for column in df.columns if df[column].isna().sum() > 0}

check_unique_count(train_data)

PassengerId    891
Pclass           3
Name           891
Sex              2
Age             88
SibSp            7
Parch            7
Ticket         681
Fare           248
Cabin          147
Embarked         3
dtype: int64


# Feature Processing

Following features are handled as follows:-

FEATURE                         ACTION

PassengerId                     DROP
Pclass                          Ordinal Data , No Action
Name                            Fetch Title
Sex                             Nominal Data, 1 Hot Encode
Age                             Bin Age into ['Child', 'Young Adult', 'Middle Aged', 'Senior'] [0, 18, 35, 50, 100]
SibSp & Parch                   Add them to create a feature # Relatives
Ticket                          Based on regex, split in groups and 1 hot encode
Fare                            Normalization
Cabin                           -
Embarked                        Nominal Data, 1 Hot Encode

STEPS:-

1) Drop any feature which has more than 50% missing value
2) Process all features as described above
3) Impute Missing Values with KNN

In [158]:
train_missing_count = check_missing_count(train_data)
test_missing_count = check_missing_count(test)

features_to_drop = list(dict(filter(lambda x: (x[1] > int(NO_SAMPLES/2)), train_missing_count.items())).keys())

print(f"Train Data Missing Count : \n{check_missing_count(train_data)}\n")
print(f"Test Data Missing Count : \n{check_missing_count(test)}\n")
print(f"Training Features with missing value > 50%: \n{features_to_drop}\n")

Train Data Missing Count : 
{'Age': 177, 'Cabin': 687, 'Embarked': 2}

Test Data Missing Count : 
{'Age': 86, 'Fare': 1, 'Cabin': 327}

Training Features with missing value > 50%: 
['Cabin']


Utility function for pre-processing features

In [159]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# One Hot Encode
# pd.get_dummies(original_dataframe, columns=feature_to_encode, dtype=int, prefix="", prefix_sep="")
def encode_and_bind(original_dataframe: pd.DataFrame, feature_to_encode: list) -> pd.DataFrame:
    dummies = [original_dataframe]
    for feature in feature_to_encode:
        dummy = pd.get_dummies(original_dataframe[feature], prefix=feature, dtype=int)
        dummies.append(dummy)
    
    titanic_dummies = pd.concat(dummies, axis=1)
    encoded_df = titanic_dummies.drop(columns=feature_to_encode)
    
    return encoded_df

# Bin Age Column
def bin_age(df: pd.DataFrame, bins: list, labels: list[str], column: str, new_column: str) -> pd.DataFrame:
    df[new_column] = pd.cut(df[column], bins=bins, labels=labels, right=False)
    df.drop(columns=[column], inplace=True)
    return df

# normalize the titles
normalized_titles = {
    "Capt":       "Officer",
    "Col":        "Officer",
    "Major":      "Officer",
    "Jonkheer":   "Royalty",
    "Don":        "Royalty",
    "Sir" :       "Royalty",
    "Dr":         "Officer",
    "Rev":        "Officer",
    "the Countess":"Royalty",
    "Dona":       "Royalty",
    "Mme":        "Mrs",
    "Mlle":       "Miss",
    "Ms":         "Mrs",
    "Mr" :        "Mr",
    "Mrs" :       "Mrs",
    "Miss" :      "Miss",
    "Master" :    "Master",
    "Lady" :      "Royalty"
}

# Process all features together
def pre_process_features(df: pd.DataFrame, columns_to_drop: list[str], one_hot_encode: list[str]=None) -> pd.DataFrame:
    df = df.drop(columns = columns_to_drop, errors='ignore')
    df.Cabin = df.Cabin.fillna('U')
    df.Cabin = df.Cabin.map(lambda x: x[0])
    df['Title'] = df.Name.apply(lambda name: name.split(',')[1].split('.')[0].strip())
    df.Title = df.Title.map(normalized_titles)
    df = df.drop(columns = ["Name"], errors='ignore')
    df['Fare'] =  df['Fare'].fillna(df['Fare'].mean())
    scaler = StandardScaler()    
    df['Fare'] = scaler.fit_transform(df[['Fare']])
    df['relatives'] = df['SibSp'] + df['Parch']
    df["Ticket"] = df["Ticket"].str.lower().replace(regex={
    r'^w.*': 0,
    r'^s.*$':1,
    r'^p.*$':2,
    r'^l.*$':3,
    r'^f.*$':4,
    r'^c.*$':5,
    r'^a.*$':6,
    r'^\d.*$':7,})
    df.drop(columns = ['SibSp', 'Parch'], inplace=True)
    #df['Sex'].replace({'male': 1, 'female': 0}, inplace=True)
    #df['Embarked'].replace({'S': 0, 'C': 1, 'Q':2}, inplace=True)
    binned_age_df = bin_age(df, AGE_BINS, LABELS, 'Age', "AgeBin")
    nominal_features = ['Title', "Cabin"]
    one_hot_encode.extend(nominal_features) if one_hot_encode  else nominal_features
    encoded_df = encode_and_bind(binned_age_df, one_hot_encode) if one_hot_encode else binned_age_df
    return encoded_df

In [160]:
DF_TO_ONE_HOT_ENCODE = ["Embarked", "AgeBin", "Sex", "Ticket"] # one-hot encoding these features producing better result.
AGE_BINS = [0, 18, 35, 50, 100]
#LABELS = ['Child', 'Young Adult', 'Middle Aged', 'Senior']
LABELS = [0,1,2,3]
COLUMNS_TO_DROP = ['PassengerId']
processed_data = pre_process_features(TotalData, COLUMNS_TO_DROP, DF_TO_ONE_HOT_ENCODE)
#processed_testing_data = pre_process_features(test, COLUMNS_TO_DROP, DF_TO_ONE_HOT_ENCODE)


In [161]:
#processed_data.head()
processed_data.index

Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
       ...
       408, 409, 410, 411, 412, 413, 414, 415, 416, 417],
      dtype='int64', length=1309)

In [162]:
from sklearn.impute import KNNImputer

imputer = KNNImputer()

data = imputer.fit_transform(processed_data)
data = pd.DataFrame(data, columns=processed_data.columns)

print(data.head())

   Pclass      Fare  relatives  Embarked_C  Embarked_Q  Embarked_S  AgeBin_0  \
0     3.0 -0.503595        1.0         0.0         0.0         1.0       0.0   
1     1.0  0.734503        1.0         1.0         0.0         0.0       0.0   
2     3.0 -0.490544        0.0         0.0         0.0         1.0       0.0   
3     1.0  0.382925        1.0         0.0         0.0         1.0       0.0   
4     3.0 -0.488127        0.0         0.0         0.0         1.0       0.0   

   AgeBin_1  AgeBin_2  AgeBin_3  ...  Title_Royalty  Cabin_A  Cabin_B  \
0       1.0       0.0       0.0  ...            0.0      0.0      0.0   
1       0.0       1.0       0.0  ...            0.0      0.0      0.0   
2       1.0       0.0       0.0  ...            0.0      0.0      0.0   
3       0.0       1.0       0.0  ...            0.0      0.0      0.0   
4       0.0       1.0       0.0  ...            0.0      0.0      0.0   

   Cabin_C  Cabin_D  Cabin_E  Cabin_F  Cabin_G  Cabin_T  Cabin_U  
0      0.0   

# TRAINING DATA

In [163]:
'''
import matplotlib.pyplot as plt

f = plt.figure(figsize=(19, 15))
plt.matshow(processed_train_data.corr(), fignum=f.number)
plt.xticks(range(processed_train_data.select_dtypes(['number']).shape[1]), processed_train_data.select_dtypes(['number']).columns, fontsize=14, rotation=45)
plt.yticks(range(processed_train_data.select_dtypes(['number']).shape[1]), processed_train_data.select_dtypes(['number']).columns, fontsize=14)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)
plt.title('Correlation Matrix', fontsize=16);
'''

train_data = data[:NO_SAMPLES]
test_data = data[NO_SAMPLES:]


In [164]:
def create_submission_csv(predictions, prefix):
    submission_df = pd.DataFrame(predictions, index=test["PassengerId"], columns=["Survived"])
    submission_df.to_csv(f"gender_submission_{prefix}.csv")
    print("File saved successfully")

In [165]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score,KFold

kfold = KFold(n_splits=10, random_state=0, shuffle=True)
classifier_results = pd.DataFrame(columns=["DecisionTree", "RandomForest", "KNN", "XGB", "SVM", "Voting"], index=["accuracy", "CV_mean", "CV_std"])

# Decision Tree Classifier

In [166]:
dt = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=1, min_samples_split=4, max_depth=5)
cv_results = cross_val_score(dt, train_data, train_labels, cv=kfold, scoring='accuracy')
dt.fit(train_data, train_labels)
score = dt.score(train_data, train_labels)*100

print(f"DT CV Mean and Std{cv_results.mean() * 100}, {cv_results.std()}")
print(f'Decision Tree Training Score: {score}')
classifier_results.DecisionTree = [score, cv_results.mean(), cv_results.std()]

predictions_dt = dt.predict(test_data)
create_submission_csv(predictions_dt, "dt")

DT CV Mean and Std81.37203495630463, 0.03044648971540275
Decision Tree Training Score: 85.40965207631874
File saved successfully


# Random Forest Classifier

In [167]:
rfc=RandomForestClassifier(random_state=4)
cv_results = cross_val_score(rfc, train_data, train_labels, cv=kfold, scoring='accuracy')
rfc.fit(train_data, train_labels)
score = rfc.score(train_data, train_labels)*100

print(f"Random Forest CV Mean and Std:  {cv_results.mean() * 100}, {cv_results.std()}")
print(f'Random Forest Training Score: {score}')
classifier_results.RandomForest = [score, cv_results.mean(), cv_results.std()]

predictions_rf = rfc.predict(test_data)
create_submission_csv(predictions_rf, "rf")

Random Forest CV Mean and Std:  80.92384519350813, 0.03231408930355503
Random Forest Training Score: 95.95959595959596
File saved successfully


# KNN  Classifier

In [168]:
knn = KNeighborsClassifier()
cv_results = cross_val_score(knn, train_data, train_labels, cv=kfold, scoring='accuracy')
knn.fit(train_data, train_labels)

score = knn.score(train_data, train_labels)*100

print(f"KNN CV Mean and Std:  {cv_results.mean() * 100}, {cv_results.std()}")
print(f'KNN Training Score: {score}')
classifier_results.KNN = [score, cv_results.mean(), cv_results.std()]

predictions_knn = knn.predict(test_data)
create_submission_csv(predictions_knn, "knn")

KNN CV Mean and Std:  80.80898876404495, 0.022068285298320754
KNN Training Score: 85.85858585858585
File saved successfully


# XGB Classifier

In [169]:
seed = 10
test_size = 0.33

xgb = XGBClassifier()
cv_results = cross_val_score(xgb, train_data, train_labels, cv=kfold, scoring='accuracy')
xgb.fit(train_data, train_labels)
score = xgb.score(train_data, train_labels)*100

print(f"XGB CV Mean and Std:  {cv_results.mean() * 100}, {cv_results.std()}")
print(f'XGB Training Score: {score}')
classifier_results.XGB = [score, cv_results.mean(), cv_results.std()]

predictions_xgb = xgb.predict(test_data)
create_submission_csv(predictions_xgb, "xgb")

XGB CV Mean and Std:  81.37328339575531, 0.02818709771120117
XGB Training Score: 94.38832772166106
File saved successfully


# SVM Classifier

In [170]:
svm = SVC(C=0.5, kernel='rbf', tol=0.00001, degree=2)
cv_results = cross_val_score(svm, train_data, train_labels, cv=kfold, scoring='accuracy')
svm.fit(train_data, train_labels)
score = svm.score(train_data, train_labels)*100

print(f"SVM CV Mean and Std:  {cv_results.mean() * 100}, {cv_results.std()}")
print(f'SVM Training Score: {score}')
classifier_results.SVM = [score, cv_results.mean(), cv_results.std()]

predictions_svm = svm.predict(test_data)
create_submission_csv(predictions_svm, "svc")

SVM CV Mean and Std:  83.50062421972535, 0.019571071366547232
SVM Training Score: 83.72615039281706
File saved successfully


# Voting Classifier

In [171]:
voting_classifier = VotingClassifier(estimators=[
    ('svm', svm), 
    ('dt', dt),
    ('rfc', rfc),
    ('rf', rfc),
    ('knn', knn)], voting='hard')

cv_results = cross_val_score(voting_classifier, train_data, train_labels, cv=kfold, scoring='accuracy')
voting_classifier.fit(train_data, train_labels)

score = voting_classifier.score(train_data, train_labels)*100

print(f"voting_classifier CV Mean and Std:  {cv_results.mean() * 100}, {cv_results.std()}")
print(f'voting_classifier Training Score: {score}')
classifier_results.Voting = [score, cv_results.mean(), cv_results.std()]

predictions_voting = voting_classifier.predict(test_data)
create_submission_csv(predictions_voting, "voting")

voting_classifier CV Mean and Std:  81.93383270911362, 0.020631265554427555
voting_classifier Training Score: 91.02132435465768
File saved successfully


In [172]:
classifier_results

Unnamed: 0,DecisionTree,RandomForest,KNN,XGB,SVM,Voting
accuracy,85.409652,95.959596,85.858586,94.388328,83.72615,91.021324
CV_mean,0.81372,0.809238,0.80809,0.813733,0.835006,0.819338
CV_std,0.030446,0.032314,0.022068,0.028187,0.019571,0.020631


In [173]:
classifier_results.loc["Test_accuracies", 'DecisionTree'] = .76555
classifier_results.loc["Test_accuracies", 'RandomForest'] = .76794
classifier_results.loc["Test_accuracies", 'KNN'] = .76794
classifier_results.loc["Test_accuracies", 'XGB'] = .77272
classifier_results.loc["Test_accuracies", 'SVM'] = .77751
classifier_results.loc["Test_accuracies", 'Voting'] = .77272

In [174]:
classifier_results

Unnamed: 0,DecisionTree,RandomForest,KNN,XGB,SVM,Voting
accuracy,85.409652,95.959596,85.858586,94.388328,83.72615,91.021324
CV_mean,0.81372,0.809238,0.80809,0.813733,0.835006,0.819338
CV_std,0.030446,0.032314,0.022068,0.028187,0.019571,0.020631
Test_accuracies,0.76555,0.76794,0.76794,0.77272,0.77751,0.77272
