In [1]:
# Notebook housekeeping: have Jupyter autosave this notebook every 120 seconds.
%autosave 120

Autosaving every 120 seconds


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from adam_prepare import titanic_pipeline

Let's read in our data from the titanic pipeline function!

In [3]:
# Acquire the three data splits from the project's prepare module.
train, val, test = titanic_pipeline()

# Display the (rows, columns) shape of every split as a quick sanity check.
tuple(split.shape for split in (train, val, test))

((623, 9), (134, 9), (134, 9))

We need to create our X and y subsets.

In [4]:
# Separate the feature matrix (everything except 'survived') from the
# target column for the training and validation splits.
X_train, y_train = train.drop(columns='survived'), train['survived']
X_val, y_val = val.drop(columns='survived'), val['survived']

In [5]:
# One-hot encode the categorical columns of the training features.
X_train = pd.get_dummies(X_train)

# Encoding the validation split on its own can produce a different set (or
# order) of dummy columns whenever a category is missing from one of the
# splits. Re-align it to the training columns so the model always receives the
# exact feature layout it was fitted on: dummy columns absent from validation
# are filled with False, and columns unseen during training are dropped.
X_val = pd.get_dummies(X_val).reindex(columns=X_train.columns, fill_value=False)

# Preview the encoded training features.
X_train.head()

Unnamed: 0,age,sibsp,parch,fare,alone,sex_female,sex_male,class_First,class_Second,class_Third,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
748,19.0,1,0,53.1,0,False,True,True,False,False,False,False,True
45,29.0,0,0,8.05,1,False,True,False,False,True,False,False,True
28,29.0,0,0,7.8792,1,True,False,False,False,True,False,True,False
633,29.0,0,0,0.0,1,False,True,True,False,False,False,False,True
403,28.0,1,0,15.85,0,False,True,False,False,True,False,False,True


Before modeling, it's always important to define a baseline!

In [6]:
# Baseline: share of training rows whose target equals 0, i.e. the accuracy
# obtained by predicting 0 for every passenger.
y_train.eq(0).mean()

0.6163723916532905

The majority of passengers in the training set died on the Titanic, so always guessing "died" gives a baseline accuracy of about 62%.

Now we are ready to create a [RandomForestClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html) object and start modeling!

In [7]:
# Re-display the first five encoded training rows before fitting the model.
X_train.head(n=5)

Unnamed: 0,age,sibsp,parch,fare,alone,sex_female,sex_male,class_First,class_Second,class_Third,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
748,19.0,1,0,53.1,0,False,True,True,False,False,False,False,True
45,29.0,0,0,8.05,1,False,True,False,False,True,False,False,True
28,29.0,0,0,7.8792,1,True,False,False,False,True,False,True,False
633,29.0,0,0,0.0,1,False,True,True,False,False,False,False,True
403,28.0,1,0,15.85,0,False,True,False,False,True,False,False,True


In [8]:
# Fixed seed so the forest is built identically on every run.
seed = 42

# Random forest whose trees are limited to a depth of 5.
rf = RandomForestClassifier(random_state=seed, max_depth=5)

# Fit on the training split only; the fitted estimator is the cell's output.
rf.fit(X=X_train, y=y_train)

In [9]:
# Mean accuracy of the fitted forest on the data it was trained on.
rf.score(X=X_train, y=y_train)

0.8507223113964687

In [10]:
# Mean accuracy of the fitted forest on the validation split.
rf.score(X=X_val, y=y_val)

0.8582089552238806

In [11]:
# Raw importance scores from the fitted forest, one value per feature column.
rf.feature_importances_

array([0.09672174, 0.03190949, 0.03734122, 0.12762573, 0.01634038,
       0.25532888, 0.26181189, 0.04777106, 0.0195783 , 0.07229473,
       0.01166075, 0.00479926, 0.01681657])

In [15]:
# Pair every training column name with the forest's importance score for it.
fi = pd.DataFrame(
    {
        'feature': X_train.columns,
        'importance': rf.feature_importances_,
    }
)

# Show the features ranked from most to least important.
fi.sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
6,sex_male,0.261812
5,sex_female,0.255329
3,fare,0.127626
0,age,0.096722
9,class_Third,0.072295
7,class_First,0.047771
2,parch,0.037341
1,sibsp,0.031909
8,class_Second,0.019578
12,embark_town_Southampton,0.016817
