## Titanic Survival Prediction Model


#### Table of Contents

In [1]:
# import libraries
import pandas as pd
import numpy as np

In [22]:
# import data
df_train = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')

In [23]:
# view training set sample
df_train.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [24]:
# view training set info
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [25]:
# view testing set info 
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [26]:
# remove redundant columns
df_train = df_train.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
df_train.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C


In [27]:
# count null values
null_count = df_train.isna().sum()
print(null_count)

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64


In [28]:
# create a new column to indicate that age was null/missing
df_train['Age_was_missing'] = df_train['Age'].isnull().astype(int)
df_train.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Age_was_missing
0,0,3,male,22.0,1,0,7.25,S,0
1,1,1,female,38.0,1,0,71.2833,C,0


In [29]:
# impute age column with mean value
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')
imputer.fit(df_train[['Age']])
df_train['Age'] = imputer.transform(df_train[['Age']])

# view null values
null_count = df_train.isna().sum()
print(null_count)

Survived           0
Pclass             0
Sex                0
Age                0
SibSp              0
Parch              0
Fare               0
Embarked           2
Age_was_missing    0
dtype: int64


In [30]:
# remove records with null values - expect to see 889 records
df_train = df_train.dropna()
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Survived         889 non-null    int64  
 1   Pclass           889 non-null    int64  
 2   Sex              889 non-null    object 
 3   Age              889 non-null    float64
 4   SibSp            889 non-null    int64  
 5   Parch            889 non-null    int64  
 6   Fare             889 non-null    float64
 7   Embarked         889 non-null    object 
 8   Age_was_missing  889 non-null    int32  
dtypes: float64(2), int32(1), int64(4), object(2)
memory usage: 66.0+ KB


In [31]:
# create dummy variables
pclass_dummies = pd.get_dummies(df_train['Pclass'], prefix='Pclass', drop_first=True)
sex_dummies = pd.get_dummies(df_train['Sex'], prefix='Sex', drop_first=True)
embarked_dummies = pd.get_dummies(df_train['Embarked'], prefix='Embarked', drop_first=True)

In [32]:
# add variables to train_df
df_train = pd.concat([df_train, pclass_dummies], axis=1)
df_train = pd.concat([df_train, sex_dummies], axis=1)
df_train = pd.concat([df_train, embarked_dummies], axis=1)
df_train.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Age_was_missing,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S
0,0,3,male,22.0,1,0,7.25,S,0,0,1,1,0,1
1,1,1,female,38.0,1,0,71.2833,C,0,0,0,0,0,0


In [33]:
# normalize age 
df_train['N_Age'] = round(df_train['Age'] / df_train['Age'].max(), 2)
df_train.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Age_was_missing,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S,N_Age
0,0,3,male,22.0,1,0,7.25,S,0,0,1,1,0,1,0.28
1,1,1,female,38.0,1,0,71.2833,C,0,0,0,0,0,0,0.48


In [37]:
# normalize fare
df_train['Log_Fare'] = round(np.log10(df_train['Fare'] + 1),2)

# impute Log_Fare column with mean value
imputer = SimpleImputer(strategy='mean')
imputer.fit(df_train[['Log_Fare']])
df_train['Log_Fare'] = imputer.transform(df_train[['Log_Fare']])

df_train.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Age_was_missing,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S,N_Age,Log_Fare
0,0,3,male,22.0,1,0,7.25,S,0,0,1,1,0,1,0.28,0.92
1,1,1,female,38.0,1,0,71.2833,C,0,0,0,0,0,0,0.48,1.86


In [38]:
# remove columns
df_train = df_train.drop(['Pclass', 'Sex', 'Age', 'Fare', 'Embarked'], axis=1)
df_train.head(2)

Unnamed: 0,Survived,SibSp,Parch,Age_was_missing,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S,N_Age,Log_Fare
0,0,1,0,0,0,1,1,0,1,0.28,0.92
1,1,1,0,0,0,0,0,0,0,0.48,1.86


In [39]:
# import libraries for training models and measuring performance
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

In [40]:
# group data 
X = df_train.drop('Survived', axis=1)
y = df_train['Survived']

# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# view data shape
print(f'X_train: {X_train.shape}')
print(f'y_train: {y_train.shape}')
print(f'X_test: {X_test.shape}')
print(f'y_test: {y_test.shape}')

X_train: (711, 10)
y_train: (711,)
X_test: (178, 10)
y_test: (178,)


In [41]:
# initialize the models
models = [
    LogisticRegression(),
    RandomForestClassifier(),
    SVC(),
    GradientBoostingClassifier(),
    MLPClassifier()
]

# for stored trained models
trained_models = {}

print("Number of models:", len(models))
# fit the models and print the accuracy score
for model in models:
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(type(model).__name__)
    print("Accuracy score:", accuracy_score(y_test, predictions))
    trained_models[type(model).__name__] = model  # store the trained model


Number of models: 5
LogisticRegression
Accuracy score: 0.7921348314606742
RandomForestClassifier
Accuracy score: 0.7696629213483146
SVC
Accuracy score: 0.797752808988764
GradientBoostingClassifier
Accuracy score: 0.8370786516853933
MLPClassifier
Accuracy score: 0.8202247191011236




In [42]:
# clean df_test set to make predictions on for submission file

# remove redundant columns
df_test = df_test.drop(['Name', 'Ticket', 'Cabin'], axis=1)
passenger_ids = df_test['PassengerId'] # save for later when adding to the submission file
df_test = df_test.drop(['PassengerId'], axis=1)

# create a new column to indicate that age was null/missing
df_test['Age_was_missing'] = df_test['Age'].isnull().astype(int)

# impute age column with mean value
imputer = SimpleImputer(strategy='mean')
imputer.fit(df_test[['Age']])
df_test['Age'] = imputer.transform(df_test[['Age']])

# create dummy variables
pclass_dummies = pd.get_dummies(df_test['Pclass'], prefix='Pclass', drop_first=True)
sex_dummies = pd.get_dummies(df_test['Sex'], prefix='Sex', drop_first=True)
embarked_dummies = pd.get_dummies(df_test['Embarked'], prefix='Embarked', drop_first=True)

# add variables to train_df
df_test = pd.concat([df_test, pclass_dummies], axis=1)
df_test = pd.concat([df_test, sex_dummies], axis=1)
df_test = pd.concat([df_test, embarked_dummies], axis=1)

# normalize age and fare
df_test['N_Age'] = round(df_test['Age'] / df_test['Age'].max(), 2)
df_test['Log_Fare'] = round(np.log10(df_test['Fare'] + 1),2)
imputer = SimpleImputer(strategy='mean')
imputer.fit(df_test[['Log_Fare']])
df_test['Log_Fare'] = imputer.transform(df_test[['Log_Fare']])

# remove columns
df_test = df_test.drop(['Pclass', 'Sex', 'Age', 'Fare', 'Embarked'], axis=1)

# view data
df_test.head(2)

Unnamed: 0,SibSp,Parch,Age_was_missing,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S,N_Age,Log_Fare
0,0,0,0,0,1,1,1,0,0.45,0.95
1,1,0,0,0,1,0,0,1,0.62,0.9


In [45]:
import os 

# create csv submission files for every trained model
for model_name, model in trained_models.items():
    # create directory if it doesn't exist
    directory = f'../submissions/{model_name}'
    if not os.path.exists(directory):
        os.makedirs(directory)

    # make predictions
    predictions = model.predict(df_test)

    # create a dataframe
    submission = pd.DataFrame(predictions, columns=['Survived'])
    
    # insert passengerIDs
    submission.insert(0, 'PassengerId', passenger_ids)
    
    # save to csv
    submission.to_csv(f'{directory}/1stSubm{model_name}.csv', index=False)
