In [184]:
import random
import joblib
import numpy as np
import pandas as pd
import seaborn as sns

In [185]:
df = pd.read_csv('full.csv')

In [186]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'WikiId', 'Name_wiki',
       'Age_wiki', 'Hometown', 'Boarded', 'Destination', 'Lifeboat', 'Body',
       'Class'],
      dtype='object')

In [187]:
df.drop(['PassengerId', 'Ticket', 'WikiId', 'Hometown', 'Boarded', 'Destination', 'Lifeboat', 'Body', 'Cabin', 'Name_wiki', 'Age_wiki', 'Embarked'], axis = 1, inplace = True)
df.drop(['SibSp', 'Parch', 'Class'], axis = 1, inplace = True)

In [188]:
df['Age'].fillna(value=df['Age'].mean(), inplace = True)

In [189]:
df.dropna(inplace = True)

In [190]:
def which_title(name):
    s = name[name.index(',')+2 : name.index('.')]
    acceptable = ['Mr', 'Miss', 'Mrs', 'Master']
    if s in acceptable:
        return s
    else:
        return 'Other'

In [191]:
df['Title'] = pd.DataFrame(df['Name'].apply(which_title))
df.drop('Name', axis = 1, inplace = True)

In [192]:
df

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Title
0,0.0,3,male,22.000000,7.2500,Mr
1,1.0,1,female,38.000000,71.2833,Mrs
2,1.0,3,female,26.000000,7.9250,Miss
3,1.0,1,female,35.000000,53.1000,Mrs
4,0.0,3,male,35.000000,8.0500,Mr
...,...,...,...,...,...,...
886,0.0,2,male,27.000000,13.0000,Other
887,1.0,1,female,19.000000,30.0000,Miss
888,0.0,3,female,29.881138,23.4500,Miss
889,1.0,1,male,26.000000,30.0000,Mr


In [193]:
#create dummy columns to keep track of gender, class, and embarked
gender = pd.get_dummies(df['Sex'],drop_first = True)
title = pd.get_dummies(df['Title'],drop_first = True)

In [194]:
#drop old categorical variables and replace with new df's
df.drop(['Sex','Title'], axis = 1, inplace = True)

df = pd.concat([df,gender,title], axis = 1)

In [195]:
X = df.drop('Survived', axis = 1)
y = df['Survived']

In [196]:
X

Unnamed: 0,Pclass,Age,Fare,male,Miss,Mr,Mrs,Other
0,3,22.000000,7.2500,1,0,1,0,0
1,1,38.000000,71.2833,0,0,0,1,0
2,3,26.000000,7.9250,0,1,0,0,0
3,1,35.000000,53.1000,0,0,0,1,0
4,3,35.000000,8.0500,1,0,1,0,0
...,...,...,...,...,...,...,...,...
886,2,27.000000,13.0000,1,0,0,0,1
887,1,19.000000,30.0000,0,1,0,0,0
888,3,29.881138,23.4500,0,1,0,0,0
889,1,26.000000,30.0000,1,0,1,0,0


In [197]:
from sklearn.model_selection import train_test_split

In [198]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [199]:
from sklearn.ensemble import RandomForestClassifier

In [200]:
rfc = RandomForestClassifier()

In [201]:
%%time
rfc.fit(X_train, y_train)

CPU times: user 214 ms, sys: 6.95 ms, total: 221 ms
Wall time: 261 ms


RandomForestClassifier()

In [202]:
predictions = rfc.predict(X_test)

In [203]:
from sklearn.metrics import classification_report

In [204]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         0.0       0.83      0.84      0.83       175
         1.0       0.76      0.74      0.75       120

    accuracy                           0.80       295
   macro avg       0.79      0.79      0.79       295
weighted avg       0.80      0.80      0.80       295



In [205]:
i = random.randint(0,200)
random_sample = X_test.iloc[i]
random_answer = y_test.iloc[i]

In [206]:
random_sample

Pclass     3.0000
Age       24.0000
Fare       7.4958
male       1.0000
Miss       0.0000
Mr         1.0000
Mrs        0.0000
Other      0.0000
Name: 514, dtype: float64

In [207]:
random_answer

0.0

In [208]:
rfc.predict_proba(np.array(random_sample).reshape(1,-1))

array([[0.91, 0.09]])

In [209]:
joblib.dump(rfc, "./random_forest.joblib")

['./random_forest.joblib']

In [210]:
print(rfc.feature_importances_)

[0.09007542 0.26852631 0.33321334 0.13191841 0.03210633 0.10052371
 0.03741233 0.00622414]


In [211]:
b = list(np.array(random_sample).reshape(1,-1)[0])

In [216]:
b[3] = 1

In [217]:
b

[3.0, 24.0, 7.4958, 1, 0.0, 1.0, 0.0, 0.0]

In [218]:
a = np.array(b).reshape(1,-1)

In [219]:
rfc.predict_proba(a)

array([[0.91, 0.09]])