In [1]:
# imports
import pandas as pd
import numpy as np

import os

# ML
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Silence warnings for the rest of the notebook.
# NOTE: filterwarnings("ignore") without a category/module filter hides every
# warning from every library, not only the pandas ones.
import warnings
warnings.filterwarnings("ignore")

## Loading Data

In [2]:
# Path of the labelled CSV, relative to the notebook's working directory
data_path  = os.path.join("data","titanic_train.csv")
# Full labelled dataset; later cells split it and also refit on all of it
df = pd.read_csv(data_path)

In [3]:
# Hold out 20% of the labelled rows for validation; random_state pins the shuffle
train_data, val_data = train_test_split(df,test_size=0.2, random_state=40)

In [4]:
# Separate CSV that the final model predicts on at the end of the notebook
test_data_path  = os.path.join("data","test.csv")
test_data = pd.read_csv(test_data_path)

## Exploring Data

In [5]:
# Preview the first rows of the training split
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
661,662,0,3,"Badt, Mr. Mohamed",male,40.0,0,0,2623,7.225,,C
393,394,1,1,"Newell, Miss. Marjorie",female,23.0,1,0,35273,113.275,D36,C
266,267,0,3,"Panula, Mr. Ernesti Arvid",male,16.0,4,1,3101295,39.6875,,S
144,145,0,2,"Andrew, Mr. Edgardo Samuel",male,18.0,0,0,231945,11.5,,S
446,447,1,2,"Mellinger, Miss. Madeleine Violet",female,13.0,0,1,250644,19.5,,S


In [6]:
# Pairwise correlations between the numeric columns only.
# The frame still holds text columns (Name, Sex, Ticket, Cabin, Embarked);
# pandas >= 2.0 no longer drops those silently inside corr() and raises
# instead, so pick the numeric columns explicitly. This form works on both
# old and new pandas versions.
train_data.select_dtypes(include="number").corr()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.015867,-0.0496,0.03528,-0.073638,-0.020096,-0.007652
Survived,-0.015867,1.0,-0.330399,-0.087042,-0.021765,0.094115,0.241598
Pclass,-0.0496,-0.330399,1.0,-0.354429,0.088651,0.030263,-0.541928
Age,0.03528,-0.087042,-0.354429,1.0,-0.333575,-0.203402,0.068009
SibSp,-0.073638,-0.021765,0.088651,-0.333575,1.0,0.407568,0.174106
Parch,-0.020096,0.094115,0.030263,-0.203402,0.407568,1.0,0.212309
Fare,-0.007652,0.241598,-0.541928,0.068009,0.174106,0.212309,1.0


In [7]:
# Summary statistics (count, mean, std, quartiles) for the training split
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,712.0,712.0,712.0,573.0,712.0,712.0,712.0
mean,446.814607,0.373596,2.317416,29.723822,0.539326,0.383427,31.57796
std,258.674718,0.484098,0.830386,14.485922,1.129392,0.802882,48.754579
min,2.0,0.0,1.0,0.67,0.0,0.0,0.0
25%,221.5,0.0,2.0,21.0,0.0,0.0,7.8958
50%,448.0,0.0,3.0,28.0,0.0,0.0,14.45625
75%,670.25,1.0,3.0,38.0,1.0,0.0,30.5
max,891.0,1.0,3.0,74.0,8.0,5.0,512.3292


In [8]:
# Column dtypes and non-null counts; used to spot columns with missing values
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 661 to 326
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    int64  
 1   Survived     712 non-null    int64  
 2   Pclass       712 non-null    int64  
 3   Name         712 non-null    object 
 4   Sex          712 non-null    object 
 5   Age          573 non-null    float64
 6   SibSp        712 non-null    int64  
 7   Parch        712 non-null    int64  
 8   Ticket       712 non-null    object 
 9   Fare         712 non-null    float64
 10  Cabin        159 non-null    object 
 11  Embarked     711 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 72.3+ KB


## Feature Engineering

In [9]:
def remap_sex(x):
    """Encode a gender label as an integer: "male" -> 0, "female" -> 1.

    Any other value matches neither label and yields None.
    """
    # Position in the tuple doubles as the numeric code.
    for code, label in enumerate(("male", "female")):
        if x == label:
            return code
    return None

In [10]:
# Make minor preprocessing pipeline
def preprocessing(df):
    """One pipeline to perform full data preprocessing for a dataframe.

    The work is done on a copy, so the frame passed in is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Age', 'Fare' and 'Sex'.

    Returns
    -------
    pandas.DataFrame
        A new frame in which missing 'Age' and 'Fare' values are replaced by
        the mean of that column (computed from ``df`` itself) and a
        'Sex_bin' column has been added by applying ``remap_sex`` to 'Sex'.
    """
    # Copy first so the caller's frame (often a slice of a larger frame)
    # keeps its original values and no SettingWithCopy situation arises.
    df = df.copy()

    # Fill missing values. Assign the result back instead of calling
    # fillna(inplace=True) on df[col]: that is chained assignment, which is
    # deprecated in pandas 2.x and silently leaves df unchanged once
    # copy-on-write is active.
    for col in ('Age', 'Fare'):
        df[col] = df[col].fillna(df[col].mean())

    # Apply remapping
    df['Sex_bin'] = df['Sex'].apply(remap_sex)

    return df

In [11]:
# Preprocess all different datasets.
# Each call imputes with the mean of the frame it receives, so the validation
# and test frames are filled from their own statistics, not the training ones.
train_data = preprocessing(train_data)
val_data = preprocessing(val_data)
test_data = preprocessing(test_data)

In [12]:
# Drop the raw text column from the training split; the encoded Sex_bin
# column is what the later cells use.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell re-runnable: a second execution is a no-op rather than a KeyError.
train_data = train_data.drop(columns='Sex', errors='ignore')

In [13]:
# Check correlation again, now with binary sex.
# Text columns are still present in the frame, and pandas >= 2.0 raises on
# them inside corr(), so restrict to numeric columns before correlating.
train_data.select_dtypes(include="number").corr().loc['Survived']

PassengerId   -0.015867
Survived       1.000000
Pclass        -0.330399
Age           -0.078947
SibSp         -0.021765
Parch          0.094115
Fare           0.241598
Sex_bin        0.527924
Name: Survived, dtype: float64

In [14]:
# Correlation of every numeric column with the target.
# Numeric columns are selected explicitly because pandas >= 2.0 raises when
# corr() meets text columns.
train_data.select_dtypes(include="number").corr().loc['Survived']

PassengerId   -0.015867
Survived       1.000000
Pclass        -0.330399
Age           -0.078947
SibSp         -0.021765
Parch          0.094115
Fare           0.241598
Sex_bin        0.527924
Name: Survived, dtype: float64

## Get final data for models

In [15]:
# Separate the target column from the predictors for the training split
y_train = train_data['Survived']
X_train = train_data.drop(columns='Survived')

# Same separation for the held-out validation split
y_val = val_data['Survived']
X_val = val_data.drop(columns='Survived')

In [16]:
# Keep only the columns chosen for modelling, in both design matrices
features = ['Sex_bin', 'Fare', 'Pclass', 'Age']
X_train, X_val = X_train[features], X_val[features]

## Logistic Regression

In [17]:
# Build the logistic-regression classifier, then train it on the training split
clf = LogisticRegression(random_state=42)
clf = clf.fit(X_train, y_train)

In [18]:
# Get accuracy of the logistic regression on the validation split
clf.score(X_val, y_val)

0.8212290502793296

## Random Forest

In [19]:
# Random forest with max_depth=10; random_state pins the seed so runs repeat
forest = RandomForestClassifier(max_depth=10, random_state=0)
# Train on the training split (the fitted estimator is the cell's output)
forest.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, random_state=0)

In [20]:
# Get accuracy of the random forest on the validation split
forest.score(X_val, y_val)

0.8491620111731844

## Support Vector Machine

In [21]:
# Support vector classifier with a linear kernel
svm = SVC(kernel="linear")
# Train on the training split (the fitted estimator is the cell's output)
svm.fit(X_train, y_train)

SVC(kernel='linear')

In [22]:
# Get accuracy of the linear SVM on the validation split
svm.score(X_val, y_val)

0.8044692737430168

## Run Model on Final Test Data

In [23]:
# Preview the test frame; it already carries Sex_bin from the earlier preprocessing cell
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_bin
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1


In [24]:
# NOTE(review): test_data was already passed through preprocessing in the
# "Preprocess all different datasets" cell, so this second call looks
# redundant — the preview below matches the one from the previous cell.
test_data = preprocessing(test_data)
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_bin
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1


In [25]:
# Select right features.
# Keep the ids aside first: they are needed for the submission file but are
# not part of `features`.
passenger_id = test_data['PassengerId']
# NOTE: this overwrites test_data with the feature-only frame, so re-running
# this cell fails on the 'PassengerId' lookup above until test_data is reloaded.
test_data = test_data[features]

In [26]:
# Fit forest on entire dataset
full_y = df['Survived']
full_data = preprocessing(df.drop('Survived', axis=1))[features]

# Fit and run model.
# Train a fresh estimator with the same hyperparameters instead of refitting
# `forest` in place: `forest` was validated on X_val, and refitting it on data
# that includes the validation rows would make its score cell report a
# different (leaked) number if that cell were run again.
final_forest = RandomForestClassifier(**forest.get_params())
final_forest.fit(full_data, full_y)
predictions = final_forest.predict(test_data)

# Generate output csv with predictions
output = pd.DataFrame({'PassengerId': passenger_id, 'Survived': predictions})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
