In [1]:
import pandas as pd 
import sklearn.metrics
from sklearn.metrics import confusion_matrix
import numpy as np
import seaborn as sns
import scipy.stats as stats
import pandas as pd
import matplotlib.pyplot as plt
from pydataset import data
import env
import acquire as acq
import prepare as prp
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier,export_text,plot_tree
from sklearn.metrics import accuracy_score, classification_report

from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the Titanic data through the project's acquire module (acquire.py).
# 'titanic_db' is the argument passed to get_titanic_data - presumably the
# database name; confirm against acquire.py.
df = acq.get_titanic_data('titanic_db')
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [5]:
# Run the project's prepare step on the raw frame.
# NOTE(review): this rebinds `df` to its own prepared version, so the cell is not
# idempotent - running it a second time passes already-prepared data back into
# prep_titanic. Re-run the load cell first if this cell must be executed again.
df = prp.prep_titanic(df)
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,male,22.0,1,0,7.25,Southampton,0,1,0,1
1,1,1,female,38.0,1,0,71.2833,Cherbourg,0,0,0,0
2,1,3,female,26.0,0,0,7.925,Southampton,1,0,0,1
3,1,1,female,35.0,1,0,53.1,Southampton,0,0,0,1
4,0,3,male,35.0,0,0,8.05,Southampton,1,1,0,1


In [6]:
# Check column dtypes and non-null counts of the prepared frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   survived                 891 non-null    int64  
 1   pclass                   891 non-null    int64  
 2   sex                      891 non-null    object 
 3   age                      891 non-null    float64
 4   sibsp                    891 non-null    int64  
 5   parch                    891 non-null    int64  
 6   fare                     891 non-null    float64
 7   embark_town              891 non-null    object 
 8   alone                    891 non-null    int64  
 9   sex_male                 891 non-null    uint8  
 10  embark_town_Queenstown   891 non-null    uint8  
 11  embark_town_Southampton  891 non-null    uint8  
dtypes: float64(2), int64(5), object(2), uint8(3)
memory usage: 65.4+ KB


In [7]:
# Partition the prepared frame into train / validate / test with the project's
# split helper; 'survived' is passed as its second argument (presumably the
# column to stratify on - confirm in prepare.py).
train, validate, test = prp.split_data(df, 'survived')
# Show the (rows, columns) shape of each partition, in the same order.
tuple(part.shape for part in (train, validate, test))

((498, 12), (214, 12), (179, 12))

In [8]:
# Determine drivers of target.
# List the columns to explore. The [:-2] slice leaves off the last two columns of
# train purely by position (here the two embark_town_* dummy columns), so it
# depends on the column order produced by the prepare step.
train.columns[:-2]

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embark_town', 'alone', 'sex_male'],
      dtype='object')

In [9]:
# Sort the explored columns into two groups:
#   cat_cols - object-dtype columns, plus numeric columns with fewer than 10
#              distinct values in train (treated as discrete)
#   num_cols - every remaining (continuous) numeric column
cat_cols, num_cols = [], []
for col in train.columns[:-2]:
    # `or` short-circuits, so nunique() is only evaluated for non-object columns.
    is_discrete = train[col].dtype == 'O' or train[col].nunique() < 10
    (cat_cols if is_discrete else num_cols).append(col)

In [10]:
# Display the categorical and numeric column groups as a pair.
cat_cols,num_cols

(['survived',
  'pclass',
  'sex',
  'sibsp',
  'parch',
  'embark_town',
  'alone',
  'sex_male'],
 ['age', 'fare'])

In [11]:
# Single list of columns to explore: categorical columns first, then numeric ones.
explore_cols = [*cat_cols, *num_cols]
explore_cols

['survived',
 'pclass',
 'sex',
 'sibsp',
 'parch',
 'embark_town',
 'alone',
 'sex_male',
 'age',
 'fare']

In [None]:
# Columns to potentially drop include:
#  sibsp and parch - assuming data integrity, alone = 1 | 0 is sufficient
#  unless specific domain knowledge says otherwise... drop embark_town
#  sex is redundant with sex_male

# Numerical columns to potentially bin into discrete categories:
#  both age and fare are potential candidates for binning. Because this is classification and not regression,
#  we do not need to determine or use precise, exact values; we just need to know the categories

'''
Not surprisingly, for every categorical feature except sibsp and parch, the class that
contained the largest number of instances also had the lowest survival rate within
the feature.  For example, the survival rate for 3rd-class passengers was the lowest, but they made up the largest
share of passengers; the survival rate for males was lower than for females, and males also made up the largest
share of passengers within that feature.

'''

In [15]:
# Preview the first rows of the training partition.
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
542,0,3,female,11.0,4,2,31.275,Southampton,0,0,0,1
457,1,1,female,29.699118,1,0,51.8625,Southampton,0,0,0,1
205,0,3,female,2.0,0,1,10.4625,Southampton,0,0,0,1
208,1,3,female,16.0,0,0,7.75,Queenstown,1,0,1,0
485,0,3,female,29.699118,3,1,25.4667,Southampton,0,0,0,1


In [14]:
# Class counts of the target in train; the most frequent class is the natural
# choice for a baseline prediction.
train.survived.value_counts()

0    307
1    191
Name: survived, dtype: int64

In [16]:
# Baseline model: predict the most frequent `survived` class for every training row.
# The class is derived from the data (mode) rather than hard-coded, so the baseline
# stays correct if the data or the split changes. mode() returns the modal values
# in sorted order, so .iloc[0] picks the smallest class on a tie.
# NOTE: this adds a column to train in place; it must be kept out of the model features.
train['baseline_pred'] = train['survived'].mode().iloc[0]
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton,baseline_pred
542,0,3,female,11.0,4,2,31.275,Southampton,0,0,0,1,0
457,1,1,female,29.699118,1,0,51.8625,Southampton,0,0,0,1,0
205,0,3,female,2.0,0,1,10.4625,Southampton,0,0,0,1,0
208,1,3,female,16.0,0,0,7.75,Queenstown,1,0,1,0,0
485,0,3,female,29.699118,3,1,25.4667,Southampton,0,0,0,1,0


In [17]:
# Cross-tabulate baseline predictions (rows) against actual survived values (columns).
pd.crosstab(train.baseline_pred,train.survived) 

survived,0,1
baseline_pred,Unnamed: 1_level_1,Unnamed: 2_level_1
0,307,191


- 1 Baseline ACCURACY

In [18]:
# Baseline accuracy: the share of training rows whose baseline prediction equals
# the actual survived value (about 61.6% on this split).
baseline_acc = train.baseline_pred.eq(train.survived).mean()
baseline_acc

0.6164658634538153

- 2 MAKE, FIT, and USE the model on the training data

In [37]:
# Build the feature list from train's columns, leaving out everything that must
# not reach the model:
#   survived         - the target itself
#   baseline_pred    - the constant baseline column added to train in the baseline cell
#   sex, embark_town - raw string columns, already encoded as sex_male / embark_town_*
# Filtering (instead of chained list.remove calls) keeps the original column order
# and does not raise ValueError when one of these columns is absent, e.g. when the
# baseline cell has not been run in the current kernel session.
non_feature_cols = {'survived', 'baseline_pred', 'sex', 'embark_town'}
X_cols = [col for col in train.columns if col not in non_feature_cols]
# Name of the target column (a single string, despite the plural variable name).
y_cols = 'survived'

In [38]:
# Display the feature columns that will be passed to the model.
X_cols

['pclass',
 'age',
 'sibsp',
 'parch',
 'fare',
 'alone',
 'sex_male',
 'embark_town_Queenstown',
 'embark_town_Southampton']

In [28]:
# Display the target column name.
y_cols

'survived'

In [34]:
# Preview the target values for the first training rows.
train[y_cols].head()

542    0
457    1
205    0
208    1
485    0
Name: survived, dtype: int64

In [39]:
# Create the decision tree classifier (not fitted yet).
# random_state is fixed so that refitting after a kernel restart yields the same
# tree: scikit-learn randomly permutes the features at every split, so ties between
# equally good splits can otherwise be broken differently from run to run.
clf = DecisionTreeClassifier(random_state=123)
clf

In [40]:
# Fit the tree on the training features (X_cols) and the training target (y_cols).
clf.fit(train[X_cols],train[y_cols])

In [None]:
# Predict a class for each training row with the fitted tree.
clf.predict(train[X_cols]) # numpy array