# Jupyter notebook - Decision Classifier Tree

In this exercise we estimate the probability of survival using a decision tree classifier.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pydot
from IPython.display import Image

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.externals.six import StringIO  
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, export_graphviz
# from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, BaggingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error,confusion_matrix, classification_report

%matplotlib inline
plt.style.use('seaborn-white')

In [None]:
# This function creates images of tree models using pydot
def print_tree(estimator, features, class_names=None, filled=True):
    """Export a fitted tree to DOT source and parse it with pydot.

    Parameters
    ----------
    estimator
        Fitted tree model, passed straight to ``export_graphviz``.
    features
        Iterable of feature names (a pandas ``Index`` works), or ``None``.
    class_names
        Forwarded unchanged to ``export_graphviz``.
    filled
        Forwarded unchanged to ``export_graphviz``.

    Returns
    -------
    The result of ``pydot.graph_from_dot_data`` for the exported DOT text.
    Callers in this notebook unpack it as a one-element sequence
    (``graph, = print_tree(...)``).
    """
    # Local import: io.StringIO is the class sklearn.externals.six re-exported
    # on Python 3, so this helper no longer depends on that vendored module.
    from io import StringIO

    # Materialize the names so any iterable is handed over as a plain list.
    feature_names = list(features) if features is not None else None

    dot_data = StringIO()
    export_graphviz(estimator, out_file=dot_data, feature_names=feature_names,
                    class_names=class_names, filled=filled)
    return pydot.graph_from_dot_data(dot_data.getvalue())

In [None]:
# Read the semicolon-delimited Titanic CSV and preview the first rows.
data = pd.read_csv("../Dataset/Titanic.csv", delimiter=";")
data.head(n=5)

Predict the probability of survival from the columns that remain after dropping PassengerId, Name, Ticket and Cabin (e.g. Pclass, Sex, Age, Fare, Embarked).

In [None]:
# Feature matrix: every column except the ones listed here (the target included).
non_feature_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Survived']
X = data.drop(columns=non_feature_columns)

# Target vector.
y = data['Survived']

In [None]:
# Number of missing values in each feature column.
X.isna().sum()

In [None]:
# Fill missing ages with the mean of the observed ages.
# NOTE(review): the mean is taken over all rows, before the train/test split.
mean_age = X['Age'].mean()
X['Age'] = X['Age'].fillna(mean_age)
X.head()

In [None]:
# Convert Sex to integer codes (1 or 0) with a fitted LabelEncoder.
from sklearn.preprocessing import LabelEncoder
X_sex = LabelEncoder()
X['Sex'] = X_sex.fit_transform(X['Sex'])

In [None]:
# Embarked: one-hot encode the column, dropping the first level so the
# remaining dummy columns are not redundant with each other.
X_embark = pd.get_dummies(X.Embarked, drop_first = True)

# Append the dummy columns and remove the original categorical column.
X = pd.concat([X, X_embark], axis = 1).drop(columns = 'Embarked')
X.head()

In [None]:
# Alternative to LabelEncoder: map the labels by hand (note that this yields 1/2 rather than 0/1)
# X['Sex'] = X.Sex.map({'male':1, 'female':2})

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

# Fit the model

In [None]:
# Gini-based classification tree limited to depth 2, with a fixed seed.
tree_classifier = DecisionTreeClassifier(
    max_depth=2,
    criterion='gini',
    random_state=10,
)

In [None]:
# Train the depth-2 tree on the training split.
tree_classifier.fit(X_train, y_train)

# Report the score on each split: first the training rows, then the held-out test rows.
for score_label, features_split, target_split in (
    ("Training Score: ", X_train, y_train),
    ("Test Score: ", X_test, y_test),
):
    print(score_label, tree_classifier.score(features_split, target_split))

In [None]:
# Render the depth-2 tree; the helper's result is unpacked as a one-element sequence.
[graph] = print_tree(
    tree_classifier,
    features=X.columns,
    class_names=['No', 'Yes'],
)
Image(graph.create_png())

In [None]:
# Same model with the depth limit raised to 3.
tree_classifier3 = DecisionTreeClassifier(
    max_depth=3,
    criterion='gini',
    random_state=10,
)
tree_classifier3.fit(X_train, y_train)

# Report the score on each split: first the training rows, then the held-out test rows.
for score_label, features_split, target_split in (
    ("Training Score: ", X_train, y_train),
    ("Test Score: ", X_test, y_test),
):
    print(score_label, tree_classifier3.score(features_split, target_split))

In [None]:
# Render the depth-3 tree; the helper's result is unpacked as a one-element sequence.
[graph] = print_tree(
    tree_classifier3,
    features=X.columns,
    class_names=['No', 'Yes'],
)
Image(graph.create_png())

In [None]:
# Same model with the depth limit raised to 4.
tree_classifier4 = DecisionTreeClassifier(
    max_depth=4,
    criterion='gini',
    random_state=10,
)
tree_classifier4.fit(X_train, y_train)

# Report the score on each split: first the training rows, then the held-out test rows.
for score_label, features_split, target_split in (
    ("Training Score: ", X_train, y_train),
    ("Test Score: ", X_test, y_test),
):
    print(score_label, tree_classifier4.score(features_split, target_split))

In [None]:
# Render the depth-4 tree; the helper's result is unpacked as a one-element sequence.
[graph] = print_tree(
    tree_classifier4,
    features=X.columns,
    class_names=['No', 'Yes'],
)
Image(graph.create_png())

In [None]:
# Same model with the depth limit raised to 10.
tree_classifier10 = DecisionTreeClassifier(
    max_depth=10,
    criterion='gini',
    random_state=10,
)
tree_classifier10.fit(X_train, y_train)

# Report the score on each split: first the training rows, then the held-out test rows.
for score_label, features_split, target_split in (
    ("Training Score: ", X_train, y_train),
    ("Test Score: ", X_test, y_test),
):
    print(score_label, tree_classifier10.score(features_split, target_split))

In [None]:
# Render the depth-10 tree fitted in the previous cell. The original referenced
# `tree_classifier6`, which is never defined in this notebook and raised NameError.
graph, = print_tree(tree_classifier10, features= X.columns, class_names = ['No', 'Yes'])
Image(graph.create_png())

In [None]:
# Comparison: print the training and test score of every fitted tree,
# one line per depth, separated by blank lines.
fitted_trees = (
    (2, tree_classifier),
    (3, tree_classifier3),
    (4, tree_classifier4),
    (10, tree_classifier10),
)
for position, (depth, fitted_tree) in enumerate(fitted_trees):
    if position > 0:
        print("")
    print("Depth = {}: Training Score = ".format(depth), fitted_tree.score(X_train, y_train), "-----",
          "Test Score = ", fitted_tree.score(X_test, y_test))


In [None]:
# Which variables matter most? Start a table, indexed by feature name,
# with the importances of the depth-2 tree.
features_imp = tree_classifier.feature_importances_
features_imp_df = pd.DataFrame({'Depth 2': features_imp}, index=X_train.columns)
features_imp_df

In [None]:
# Add one importance column per deeper tree to the same table.
for column_label, fitted_tree in (
    ('Depth 3', tree_classifier3),
    ('Depth 4', tree_classifier4),
    ('Depth 10', tree_classifier10),
):
    features_imp_df[column_label] = fitted_tree.feature_importances_
features_imp_df

In [None]:
# How can I predict the probability of survival when a new observation comes in, given the most important features? (e.g. with the fitted tree's predict_proba)