In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from acquire import get_titanic_data
from prepare import prep_titanic_data

# Fetch the dataset with acquire.get_titanic_data, then hand it to
# prepare.prep_titanic_data to obtain the frame used in the rest of the notebook.
raw_titanic = get_titanic_data()
df = prep_titanic_data(raw_titanic)

# Column names, dtypes and non-null counts of the prepared frame
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 13 columns):
passenger_id       714 non-null int64
survived           714 non-null int64
pclass             714 non-null int64
sex                714 non-null object
age                714 non-null float64
sibsp              714 non-null int64
parch              714 non-null int64
fare               714 non-null float64
embarked           714 non-null object
class              714 non-null object
embark_town        714 non-null object
alone              714 non-null int64
embarked_encode    714 non-null int64
dtypes: float64(2), int64(7), object(4)
memory usage: 78.1+ KB


In [2]:
# Drop, in place, every row that has a missing value in any column.
# (The df.info() output above already reports 714 non-null values per column.)
df.dropna(inplace=True)

# Predictor columns fed to the model, and the target to predict
feature_cols = ['pclass', 'age', 'fare', 'sibsp', 'parch']
X = df[feature_cols]
y = df['survived']

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.30,
    random_state=123,
)

X_train.head()

Unnamed: 0,pclass,age,fare,sibsp,parch
60,3,22.0,7.2292,0,0
348,3,3.0,15.9,1,1
606,3,30.0,7.8958,0,0
195,1,58.0,146.5208,0,0
56,2,21.0,10.5,0,0


In [3]:
# Number of missing values per column (isna is the same check as isnull)
df.isna().sum()

passenger_id       0
survived           0
pclass             0
sex                0
age                0
sibsp              0
parch              0
fare               0
embarked           0
class              0
embark_town        0
alone              0
embarked_encode    0
dtype: int64

### Train Model
#### Create the Random Forest Object

In [4]:
# Hyperparameters for the forest, gathered in one place.
# min_samples_leaf is set to only 3 because the dataset is small.
rf_params = {
    'n_estimators': 100,
    'criterion': 'gini',
    'max_depth': 3,
    'min_samples_leaf': 3,
    'bootstrap': True,
    'class_weight': None,
    'random_state': 123,
}
rf = RandomForestClassifier(**rf_params)

#### Fit the model to the training data

In [5]:
# Train the forest on the training split; the fitted estimator is the
# cell's last expression, so its repr is displayed.
rf.fit(X=X_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=123, verbose=0, warm_start=False)

#### Print Feature Importances

In [6]:
# The values are listed in the same order as the columns of X
# (they are not sorted by importance):
# ['pclass','age','fare','sibsp','parch']
# Each value is the feature's impurity-based (Gini) importance; a larger
# value means the feature contributed more to the forest's splits.
# In the recorded output, fare has the largest value, followed by pclass.
importances = rf.feature_importances_
print(importances)

[0.32480013 0.13752631 0.37630011 0.07262912 0.08874434]


#### Estimate whether or not a passenger would survive, using the training data

In [7]:
# Predicted class (0 or 1) for every row of the training data
y_pred = rf.predict(X=X_train)

In [9]:
# Show the first ten predictions
y_pred[:10]

array([0, 1, 0, 1, 0, 0, 0, 1, 1, 1])

#### Estimate the probability of a passenger surviving, using the training data

In [8]:
# Predicted class probabilities for every row of the training data
y_pred_proba = rf.predict_proba(X=X_train)

In [10]:
# Each row is one observation:
# column 0 = prob of not surviving (0), column 1 = prob of surviving (1)
y_pred_proba[:10]

array([[0.80011203, 0.19988797],
       [0.44666637, 0.55333363],
       [0.81476779, 0.18523221],
       [0.27959219, 0.72040781],
       [0.62590135, 0.37409865],
       [0.80076013, 0.19923987],
       [0.80293215, 0.19706785],
       [0.29343826, 0.70656174],
       [0.48654958, 0.51345042],
       [0.43484357, 0.56515643]])

### Evaluate Model
#### Compute the accuracy

In [11]:
# Fraction of training rows whose predicted label matches the true label
train_accuracy = rf.score(X_train, y_train)
print('Accuracy of random forest classifier on training set: {:.2f}'.format(train_accuracy))

Accuracy of random forest classifier on training set: 0.75


#### Create a confusion matrix

In [12]:
# Rows are the actual labels (y_train); columns are the predicted labels (y_pred).
train_confusion = confusion_matrix(y_true=y_train, y_pred=y_pred)
print(train_confusion)

# 248 - pred died, died      | 45  - pred to survive, died
# 79  - pred died, survived  | 127 - pred to survive, survived

# accuracy = (248 + 127) / (248 + 79 + 45 + 127)
# recall of surviving = sensitivity = 127 / (79 + 127)
# recall of not surviving = specificity = 248 / (248 + 45)
# precision of surviving = 127 / (45 + 127)
# precision of not surviving = 248 / (248 + 79)
# false omission rate (predicted died, actually survived) = 79 / (248 + 79)
# false negative rate = 1 - sensitivity = 79 / (79 + 127)

[[248  45]
 [ 79 127]]


#### Create a classification report

In [13]:
# Per-class precision, recall and f1-score for the training-set predictions
train_report = classification_report(y_true=y_train, y_pred=y_pred)
print(train_report)

              precision    recall  f1-score   support

           0       0.76      0.85      0.80       293
           1       0.74      0.62      0.67       206

   micro avg       0.75      0.75      0.75       499
   macro avg       0.75      0.73      0.74       499
weighted avg       0.75      0.75      0.75       499



### Test Model
#### Compute the accuracy of the model when run on the test data

In [14]:
# Fraction of held-out test rows whose predicted label matches the true label
test_accuracy = rf.score(X_test, y_test)
print('Accuracy of random forest classifier on test set: {:.2f}'.format(test_accuracy))

Accuracy of random forest classifier on test set: 0.74


In [16]:
# Predict on the training data and return the accuracy, unrounded
rf.score(X=X_train, y=y_train)

0.751503006012024

In [15]:
# Predict on the test data and return the accuracy, unrounded
rf.score(X=X_test, y=y_test)

0.7441860465116279