The Titanic project - predictor

Import the data and the required libraries

In [1]:
#importing the libraries and the Titanic dataset
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import joblib

# df_test holds the Titanic test data; model is the previously saved predictor.
df_test = pd.read_csv('test.csv') # read the Titanic test data from the working directory
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code --
# only load a Titanic_model.pkl that comes from a trusted source.
model = joblib.load('Titanic_model.pkl')

In [11]:
# Summarise the columns, non-null counts and dtypes of the test data.
# NOTE(review): the recorded output below already lists Family_Size and Alone, which
# this notebook only creates in the pre-processing section, and the execution count
# is out of sequence. Either test.csv already contains those columns or the output
# comes from an out-of-order run -- confirm with Restart Kernel -> Run All.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          418 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
 11  Family_Size  418 non-null    int64  
 12  Alone        418 non-null    int64  
dtypes: float64(2), int64(6), object(5)
memory usage: 42.6+ KB


Data pre-processing

In [13]:
# Family_Size is the sum of the SibSp and Parch columns.
df_test['Family_Size'] = df_test['SibSp'] + df_test['Parch']

# Alone is 1 where Family_Size is 0, otherwise 0.
df_test['Alone'] = (df_test['Family_Size'] == 0).astype('int64')

# Mean age per sex, plus a boolean mask of the rows whose Age is missing;
# both are used by the imputation cell that follows.
avg_age_M = df_test.loc[df_test['Sex'] == 'male', 'Age'].mean()
avg_age_F = df_test.loc[df_test['Sex'] == 'female', 'Age'].mean()
no_age_list = df_test['Age'].isna()

In [15]:
# Impute the gaps in Fare, Embarked and Age, then drop the columns that are not model inputs.

# Fare: median of the column. Embarked: most frequent value of the column.
fare_median = df_test['Fare'].median()
df_test['Fare'] = df_test['Fare'].fillna(fare_median)
embarked_mode = df_test['Embarked'].mode()[0]
df_test['Embarked'] = df_test['Embarked'].fillna(embarked_mode)

# Age: rows flagged in no_age_list receive the mean age of their own sex.
missing_age_male = no_age_list & (df_test['Sex'] == 'male')
missing_age_female = no_age_list & (df_test['Sex'] == 'female')
df_test.loc[missing_age_male, 'Age'] = avg_age_M
df_test.loc[missing_age_female, 'Age'] = avg_age_F

# SibSp and Parch are already summarised by Family_Size; Name, Ticket and Cabin are not used.
df_test_data = df_test.drop(columns=['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin'])

In [16]:
# Standardise the numeric columns of the test data with StandardScaler.
from sklearn.preprocessing import StandardScaler

# Despite the "train" in its name, this list holds the numeric columns of the test frame.
# PassengerId is part of the list and gets scaled too; that has no effect on the model
# input because PassengerId is dropped before prediction.
train_numerical_features = list(df_test_data.select_dtypes(include=['int64', 'float64', 'int32']).columns)
print(train_numerical_features)

# NOTE(review): the scaler is fitted on the test data itself. If the model was trained on
# features scaled with training-set statistics, the scaler fitted during training should be
# persisted and applied here with transform() instead -- confirm against the training notebook.
ss_scaler = StandardScaler()

# Take an explicit copy: pd.DataFrame(data=<DataFrame>) does not copy by default, so the
# scaled frame would otherwise share its underlying data with df_test_data.
df_scaled_test = df_test_data.copy()
df_scaled_test[train_numerical_features] = ss_scaler.fit_transform(df_scaled_test[train_numerical_features])
df_scaled_test.head()

['PassengerId', 'Pclass', 'Age', 'Fare', 'Family_Size', 'Alone']


Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,Family_Size,Alone
0,-1.727912,0.873482,male,0.334992,-0.497413,Q,-0.553443,0.807573
1,-1.719625,0.873482,female,1.325529,-0.512278,S,0.105643,-1.238278
2,-1.711337,-0.315819,male,2.514174,-0.4641,Q,-0.553443,0.807573
3,-1.70305,0.873482,male,-0.25933,-0.482475,S,-0.553443,0.807573
4,-1.694763,0.873482,female,-0.655545,-0.417492,S,0.764728,-1.238278


In [17]:
# One-hot encode the two categorical columns (Sex and Embarked) and append the
# resulting indicator columns to the scaled test frame, Sex first and Embarked second.
one_hot_Sex = pd.get_dummies(df_scaled_test['Sex'])
one_hot_embarked = pd.get_dummies(df_scaled_test['Embarked'])
for indicator_columns in (one_hot_Sex, one_hot_embarked):
    df_scaled_test = df_scaled_test.join(indicator_columns)
df_scaled_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,Family_Size,Alone,female,male,C,Q,S
0,-1.727912,0.873482,male,0.334992,-0.497413,Q,-0.553443,0.807573,0,1,0,1,0
1,-1.719625,0.873482,female,1.325529,-0.512278,S,0.105643,-1.238278,1,0,0,0,1
2,-1.711337,-0.315819,male,2.514174,-0.4641,Q,-0.553443,0.807573,0,1,0,1,0
3,-1.70305,0.873482,male,-0.25933,-0.482475,S,-0.553443,0.807573,0,1,0,0,1
4,-1.694763,0.873482,female,-0.655545,-0.417492,S,0.764728,-1.238278,1,0,0,0,1


In [18]:
# Remove from the test-set frame the columns that are no longer needed: the passenger
# identifier, the original Sex and Embarked text columns (now one-hot encoded) and the
# 'male' indicator, which is the complement of 'female'.
df_scaled_test = df_scaled_test.drop(columns=['PassengerId', 'Sex', 'Embarked', 'male'])
df_scaled_test.head()

Unnamed: 0,Pclass,Age,Fare,Family_Size,Alone,female,C,Q,S
0,0.873482,0.334992,-0.497413,-0.553443,0.807573,0,0,1,0
1,0.873482,1.325529,-0.512278,0.105643,-1.238278,1,0,0,1
2,-0.315819,2.514174,-0.4641,-0.553443,0.807573,0,0,1,0
3,0.873482,-0.25933,-0.482475,-0.553443,0.807573,0,0,0,1
4,0.873482,-0.655545,-0.417492,0.764728,-1.238278,1,0,0,1


Export results

In [44]:
# Build the submission table (PassengerId + predicted Survived) and write it to submission.csv.

# Assumes model.predict returns one prediction per input row, in row order --
# TODO confirm for the loaded model.
df_Survived = pd.DataFrame(model.predict(df_scaled_test), columns=['Survived'])

# Copy the id column so the new column is added to an independent frame rather than to a
# selection of df_test, and attach the predictions by position so the result does not
# depend on the two frames having matching index labels.
df_sub = df_test[['PassengerId']].copy()
df_sub['Survived'] = df_Survived['Survived'].to_numpy()
print(df_sub.head())

# to_csv returns None when given a path, so its return value is not kept.
df_sub.to_csv('submission.csv', index=False)

   PassengerId  Survived
0          892         0
1          893         0
2          894         0
3          895         0
4          896         0
