# Part 29: Decision Trees

http://hamelg.blogspot.com/2015/11/python-for-data-analysis-part-29.html

A decision tree is essentially a flow chart for deciding how to classify an observation: it consists of a series of yes/no or if/else decisions that ultimately assign each observation to a certain probability or class. The series of yes/no decisions can be depicted as a series of branches that lead to final decisions, or "leaves", at the bottom of the tree.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

%matplotlib inline

In [2]:
# Directory that holds the Titanic CSV files. The TITANIC_DATA_DIR environment
# variable takes precedence; when it is unset, the original machine-specific
# location is used, so existing setups behave exactly as before.
TITANIC_DATA_DIR = os.environ.get(
    'TITANIC_DATA_DIR',
    '/home/sindhuvarun/github/ML-Learning/staticsAndProbability/PythonForDataAnalytics/dataset/Titanic',
)

# The working directory is still switched to the data directory so that any
# relative paths used after this cell keep resolving the same way.
os.chdir(TITANIC_DATA_DIR)

# Training split of the Titanic data set, loaded by explicit path.
titanic_train = pd.read_csv(os.path.join(TITANIC_DATA_DIR, 'train.csv'))

In [3]:
# Reduce every cabin label to its first character (the deck letter).
# The column is cast to str first, so a missing cabin is handled through its
# string form rather than raising on a float NaN.
cabin_letter = titanic_train["Cabin"].astype(str).str[0]

# Store the first-letter cabin variable back as a categorical column.
titanic_train["Cabin"] = pd.Categorical(cabin_letter)

# Impute missing ages with the median of the observed ages. The median is
# computed from the column itself (Series.median skips NaN by default)
# instead of relying on a hard-coded constant, so the imputation stays
# correct if the input data changes.
median_age = titanic_train["Age"].median()
titanic_train["Age"] = titanic_train["Age"].fillna(median_age)

In [4]:
from sklearn import tree
from sklearn import preprocessing

In [5]:
# Encode the textual Sex column as integer labels that scikit-learn can use.
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(titanic_train['Sex'])
encoded_sex = label_encoder.transform(titanic_train['Sex'])

# Single-feature design matrix: one unnamed column holding the encoded sex.
sex_only_features = pd.DataFrame(encoded_sex)

# Fit a decision tree that predicts Survived from sex alone. The fitted
# estimator is the last expression, so the notebook displays its repr.
tree_model = tree.DecisionTreeClassifier()
tree_model.fit(X=sex_only_features, y=titanic_train['Survived'])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [6]:
# Per-passenger class probabilities from the tree fitted on encoded sex.
sex_only_features = pd.DataFrame(encoded_sex)
preds = tree_model.predict_proba(X=sex_only_features)

# Column 0 is the probability of the model's first class
# (tree_model.classes_[0]) - presumably Survived == 0; confirm against classes_.
prob_class0 = preds[:, 0]

# Tabulate each distinct predicted probability against the passenger's sex.
pd.crosstab(prob_class0, titanic_train['Sex'])

Sex,female,male
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0.257962,314,0
0.811092,0,577


In [7]:
# Predict survival based on two factors  -  Sex, Pclass
# The two features are stacked as rows and then transposed, giving one row
# per passenger with the encoded sex first and the passenger class second.
sex_pclass_features = pd.DataFrame([encoded_sex, titanic_train['Pclass']]).T

# NOTE: this re-fits the existing tree_model object, replacing whatever it
# was previously trained on.
tree_model.fit(X=sex_pclass_features, y=titanic_train['Survived'])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [9]:
# Rebuild the (encoded sex, passenger class) feature table in the same layout
# that the model was fitted on, then get per-passenger class probabilities.
sex_pclass_features = pd.DataFrame([encoded_sex, titanic_train['Pclass']]).T
preds = tree_model.predict_proba(X=sex_pclass_features)

# Column 0 is the probability of the model's first class
# (tree_model.classes_[0]) - presumably Survived == 0; confirm against classes_.
prob_class0 = preds[:, 0]

# One row per distinct predicted probability, one column per (Pclass, Sex) pair.
group_columns = [titanic_train['Pclass'], titanic_train['Sex']]
pd.crosstab(prob_class0, columns=group_columns)

Pclass,1,1,2,2,3,3
Sex,female,male,female,male,female,male
row_0,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0.031915,94,0,0,0,0,0
0.078947,0,0,76,0,0,0
0.5,0,0,0,0,144,0
0.631148,0,122,0,0,0,0
0.842593,0,0,0,108,0,0
0.864553,0,0,0,0,0,347
