# Decision Trees

In [1]:
# Imports and notebook-wide display settings
import os
import math  # not used in the cells visible here
import pandas as pd
# Show every column when a DataFrame is displayed instead of truncating wide frames
pd.set_option('display.max_columns', None)
import seaborn as sns
# White plot style and a 10x10 default figure size for all seaborn/matplotlib plots
sns.set(style='white', rc={'figure.figsize':(10,10)})
# Display the current working directory as this cell's output
os.getcwd()

'C:\\Users\\roger\\OneDrive\\Github\\Data-Mining'

## 1. Load Data
### Our first dataset covers surgical patients and their 30-day mortality

In [2]:
# Read the surgical dataset (Surgicaldeepnet.csv) from GitHub into a DataFrame
url = 'https://raw.githubusercontent.com/RogerCui-GitHub/Data-Mining/master/datasets/Surgicaldeepnet.csv'
surg_df = pd.read_csv(url)

## 2. View Data

In [3]:
# Preview the first three rows to see the columns and typical values
surg_df.head(3)

Unnamed: 0,bmi,Age,asa_status,baseline_cancer,baseline_charlson,baseline_cvd,baseline_dementia,baseline_diabetes,baseline_digestive,baseline_osteoart,baseline_psych,baseline_pulmonary,ahrq_ccs,ccsComplicationRate,ccsMort30Rate,complication_rsi,dow,gender,hour,month,moonphase,mort30,mortality_rsi,race,complication
0,19.31,59.2,1,1,0,0,0,0,0,0,0,0,19,0.18337,0.007424,-0.57,3,0,7.63,6,1,0,-0.43,1,0
1,18.73,59.1,0,0,0,0,0,0,0,0,0,0,1,0.312029,0.016673,0.21,0,0,12.93,0,1,0,-0.41,1,0
2,21.85,59.0,0,0,0,0,0,0,0,0,0,0,6,0.150706,0.001962,0.0,2,0,7.68,5,3,0,0.08,1,0


In [4]:
# describe() summarises every numeric column. All columns here are numeric, so
# integer-coded binary/ordinal columns (e.g. asa_status, baseline_cancer) are
# included as well; for those, the mean is a proportion or average code rather
# than a continuous measurement. Transposed (.T) so each variable is one row.
surg_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
bmi,14635.0,31.295642,8.152709,2.15,26.51,28.98,35.295,92.59
Age,14635.0,63.205268,18.088191,6.1,51.5,59.7,74.7,90.0
asa_status,14635.0,0.63232,0.539952,0.0,0.0,1.0,1.0,2.0
baseline_cancer,14635.0,0.262316,0.439909,0.0,0.0,0.0,1.0,1.0
baseline_charlson,14635.0,0.97752,1.758355,0.0,0.0,0.0,2.0,13.0
baseline_cvd,14635.0,0.620294,0.48533,0.0,0.0,1.0,1.0,1.0
baseline_dementia,14635.0,0.004851,0.069485,0.0,0.0,0.0,0.0,1.0
baseline_diabetes,14635.0,0.120875,0.325993,0.0,0.0,0.0,0.0,1.0
baseline_digestive,14635.0,0.189546,0.391955,0.0,0.0,0.0,0.0,1.0
baseline_osteoart,14635.0,0.34274,0.474642,0.0,0.0,0.0,1.0,1.0


In [5]:
# Check the DataFrame shape as (rows, columns)
surg_df.shape

(14635, 25)

In [6]:
# Count the missing values in each column (column-wise sum of the NA mask)
surg_df.isna().sum()

bmi                    0
Age                    0
asa_status             0
baseline_cancer        0
baseline_charlson      0
baseline_cvd           0
baseline_dementia      0
baseline_diabetes      0
baseline_digestive     0
baseline_osteoart      0
baseline_psych         0
baseline_pulmonary     0
ahrq_ccs               0
ccsComplicationRate    0
ccsMort30Rate          0
complication_rsi       0
dow                    0
gender                 0
hour                   0
month                  0
moonphase              0
mort30                 0
mortality_rsi          0
race                   0
complication           0
dtype: int64

In [7]:
# Inspect the dtype of each column to confirm everything was parsed as numeric
surg_df.dtypes

bmi                    float64
Age                    float64
asa_status               int64
baseline_cancer          int64
baseline_charlson        int64
baseline_cvd             int64
baseline_dementia        int64
baseline_diabetes        int64
baseline_digestive       int64
baseline_osteoart        int64
baseline_psych           int64
baseline_pulmonary       int64
ahrq_ccs                 int64
ccsComplicationRate    float64
ccsMort30Rate          float64
complication_rsi       float64
dow                      int64
gender                   int64
hour                   float64
month                    int64
moonphase                int64
mort30                   int64
mortality_rsi          float64
race                     int64
complication             int64
dtype: object

In [8]:
# Summarise the target. mort30 only takes the values 0 and 1, so its mean is the
# proportion of positive cases (30-day mortality, per the section heading above).
surg_df['mort30'].describe()

count    14635.000000
mean         0.003963
std          0.062830
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: mort30, dtype: float64

In [9]:
# Number of positive cases (mort30 == 1); vectorised sum, cast back to a plain int
int(surg_df['mort30'].sum())

58

## 3. Build the Model

In [10]:
# Load libraries
from sklearn.tree import DecisionTreeClassifier

In [11]:
# Create a decision tree classifier object using the Gini impurity criterion.
# All other hyperparameters are left at their defaults; random_state is fixed so
# that repeated runs build the same tree.
dt_clf = DecisionTreeClassifier(criterion='gini', random_state=0)

In [12]:
# Target vector: the binary mort30 column
target = surg_df['mort30']

In [13]:
# Predictor matrix: every column except the target.
# NOTE(review): `complication` looks like another outcome, and `ccsMort30Rate` /
# `mortality_rsi` appear to be mortality-derived scores. Confirm these would be
# known before the outcome; otherwise they leak information about mort30.
x_vars = surg_df.drop(columns=['mort30'])

In [14]:
# Hold out 20% of the rows as a test set; random_state makes the split repeatable.
# NOTE(review): the target has only 58 positives in 14,635 rows and this split is
# not stratified, so the number of positives landing in the test set is left to
# chance. Consider passing stratify=target (this will change the results below).
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_vars, 
                                                    target, 
                                                    test_size=0.2, 
                                                    random_state=1)

In [15]:
# Train the model on the training split.
# scikit-learn's fit() returns the estimator itself, so dt_model is the fitted dt_clf.
dt_model = dt_clf.fit(x_train, y_train)

### Let's check some attributes of our tree:

In [16]:
# Class labels seen during fitting; predict_proba columns follow this order
dt_model.classes_

array([0, 1], dtype=int64)

In [17]:
# Number of distinct classes in the target
dt_model.n_classes_

2

In [18]:
# Number of features considered at each split; with max_features left unset this
# equals the number of predictor columns (25 columns minus the target = 24)
dt_model.max_features_

24

In [19]:
# Full set of hyperparameters the tree was built with, including the defaults
# that were not set explicitly when the classifier was created
dt_model.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': 'deprecated',
 'random_state': 0,
 'splitter': 'best'}

## 4. Predict probabilities for this decision tree:

In [20]:
# Predict class probabilities for this simple decision tree on both splits.
# Each result has one column per class in dt_model.classes_ order, so column 1
# holds the predicted probability of mort30 == 1.
train_probs = dt_model.predict_proba(x_train)
test_probs = dt_model.predict_proba(x_test)

In [21]:
# One row per test sample and one column per class
test_probs.shape

(2927, 2)

In [22]:
# Wrap the probability arrays in DataFrames and label each column with the split
# name followed by the class position (e.g. 'trainprobs0', 'trainprobs1')
train_probs_df = pd.DataFrame(train_probs).add_prefix('trainprobs')
test_probs_df = pd.DataFrame(test_probs).add_prefix('testprobs')

In [23]:
# Peek at the first few predicted training probabilities
train_probs_df.head()

Unnamed: 0,trainprobs0,trainprobs1
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0


In [24]:
# Distribution of the predicted test probabilities for each class
test_probs_df.describe()

Unnamed: 0,testprobs0,testprobs1
count,2927.0,2927.0
mean,0.996242,0.003758
std,0.061199,0.061199
min,0.0,0.0
25%,1.0,0.0
50%,1.0,0.0
75%,1.0,0.0
max,1.0,1.0


## 5. Evaluation: AUC

In [25]:
# Let's see how our model performed in terms of AUC on our training data.
# roc_curve is given the probability of the positive class (column 1), and
# auc integrates the resulting ROC curve.
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_train, train_probs[:,1])
metrics.auc(fpr, tpr)

1.0

In [26]:
# Same AUC calculation on the held-out test data (overwrites fpr/tpr/thresholds)
fpr, tpr, thresholds = metrics.roc_curve(y_test, test_probs[:,1])
metrics.auc(fpr, tpr)

0.5698837722524642

## 6. Tune Hyperparameters

### Wow! We are badly overfit: the training AUC is 1.0, but the test AUC is only about 0.57. Let's manually tune hyperparameters to address this. Which hyperparameters could we adjust to fit our model more appropriately?

In [27]:
# Constrained tree to curb the overfitting seen above:
#   - max_depth / min_samples_leaf limit how finely the tree can partition the data
#     (at most 5 levels, and every leaf must hold at least 300 training samples)
#   - class_weight="balanced" re-weights the classes to offset the rare positive class
#   - max_features=None considers every predictor at each split; this avoids
#     hard-coding the column count (24), which would raise an error if the
#     predictor set ever changed, and builds the same tree on the current data
dt_clf = DecisionTreeClassifier(criterion='gini',
                                max_depth = 5,
                                min_samples_split = 2,
                                min_samples_leaf = 300,
                                max_features = None,
                                min_impurity_decrease = 0,
                                class_weight = "balanced",
                                random_state=0)

In [28]:
# Train the tuned model on the same training split (rebinds dt_model)
dt_model = dt_clf.fit(x_train, y_train)

In [29]:
# Recompute class probabilities with the tuned tree (overwrites the earlier arrays)
train_probs = dt_model.predict_proba(x_train)
test_probs = dt_model.predict_proba(x_test)

In [30]:
# Rebuild the labelled probability DataFrames from the tuned model's predictions;
# columns are named by split plus class position (e.g. 'testprobs0', 'testprobs1')
train_probs_df = pd.DataFrame(train_probs).add_prefix('trainprobs')
test_probs_df = pd.DataFrame(test_probs).add_prefix('testprobs')

In [31]:
# Let's see how the tuned model performed in terms of AUC on our training data.
# (metrics was already imported in an earlier cell; re-importing it is harmless.)
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_train, train_probs[:,1])
metrics.auc(fpr, tpr)

0.9702123472378101

In [32]:
# AUC of the tuned model on the held-out test data
fpr, tpr, thresholds = metrics.roc_curve(y_test, test_probs[:,1])
metrics.auc(fpr, tpr)

0.9354985042420676