In [15]:
import pickle
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from sklearn.ensemble import RandomForestClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree, export_text

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## to make it possible to display multiple output inside one cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.options.display.max_rows = 50
pd.set_option('display.float_format', lambda x: '%9.8f' % x)

In [66]:
coupon_type = 'Restaurant20To50'

with open(f'model/phase2/{coupon_type}_best_model_RFC.pickle', "rb") as f: 
    best_model = pickle.load(f)
    
X_train = pd.read_csv(f'data/{coupon_type}_X_train_phase2.csv')
X_test = pd.read_csv(f'data/{coupon_type}_X_test_phase2.csv')
y_test = pd.read_csv(f'data/{coupon_type}_y_test_phase2.csv')

In [105]:
def explore_tree(estimator, X_test, y_test, sample_id, feature_names):
    """
    a simplified version of the code from https://stackoverflow.com/a/48884917
    """
    feature = estimator.tree_.feature
    threshold = estimator.tree_.threshold

    # First let's retrieve the decision path of each sample. The decision_path
    # method allows to retrieve the node indicator functions. A non zero element of
    # indicator matrix at the position (i, j) indicates that the sample i goes
    # through the node j.

    node_indicator = estimator.decision_path(X_test)

    # Get the leaf ids reached by each sample.

    leaf_ids = estimator.apply(X_test)

    # Get the tests that were used to predict a sample
    node_index = node_indicator.indices[node_indicator.indptr[sample_id]:
                                        node_indicator.indptr[sample_id + 1]]

    features_in_rules = []
    print('Rules used to predict sample %s:\n' % sample_id)
    for node_id in node_index:
        if (X_test[sample_id, feature[node_id]] <= threshold[node_id]):
            threshold_sign = "<="
        else:
            threshold_sign = ">"

        feature_name = feature_names[feature[node_id]]
        if not feature_name in features_in_rules:
            features_in_rules.append(feature_name)
            
        print(f"'{feature_name}' "
              f"= {X_test[sample_id, feature[node_id]]} {threshold_sign} {threshold[node_id]:0.2f}")
        
        if leaf_ids[sample_id] == node_id:
            print("\t==> end")

    if estimator.predict(X_test)[sample_id] == 0:
        ar = "Reject"
    elif estimator.predict(X_test)[sample_id] == 1:
        ar = "Accept"
    else: # add this clause just in case there are value other than 0 or 1
        ar = estimator.predict(X_test)[sample_id]
        
    if y_test[sample_id][0] == estimator.predict(X_test)[sample_id]:
        isok = "CORRECT"
    else:
        isok = "WRONG"
    print(f"Prediction: {ar} coupon ({isok})")
    
    return features_in_rules


In [107]:
# for i,e in enumerate(estimator.estimators_):
i=9
sampleid=10
print(f"Tree {i} ({best_model.estimators_[i]})")
features_in_rules = explore_tree(best_model.estimators_[i], X_test.values, y_test.values, 
                                 sampleid, X_test.columns.to_list())
X_test.loc[sampleid, features_in_rules]

Tree 9 (DecisionTreeClassifier(max_features=4, random_state=1914837113))
Rules used to predict sample 10:

'weather_Sunny' = 1.0 > 0.50
'occupation_Food Preparation & Serving Related' = 0.0 <= 0.50
'maritalStatus_Widowed' = 0.0 <= 0.50
'passanger_Partner' = 0.0 <= 0.50
'expiration_2h' = 1.0 > 0.50
'RestaurantLessThan20' = 4.0 > 1.50
'CarryAway' = 4.0 > 3.73
'occupation_Healthcare Practitioners & Technical' = 0.0 <= 0.50
'age_below21' = 0.0 <= 0.50
'CoffeeHouse' = 3.0 <= 3.99
'passanger_Kid(s)' = 0.0 <= 0.50
'age_31' = 0.0 <= 0.50
'income' = 2.0 <= 5.50
'destination_Work' = 1.0 > 0.50
'maritalStatus_Single' = 0.0 <= 0.50
'age_50plus' = 0.0 > -2.00
	==> end
Prediction: Reject coupon (CORRECT)


weather_Sunny                                     1.00000000
occupation_Food Preparation & Serving Related     0.00000000
maritalStatus_Widowed                             0.00000000
passanger_Partner                                 0.00000000
expiration_2h                                     1.00000000
RestaurantLessThan20                              4.00000000
CarryAway                                         4.00000000
occupation_Healthcare Practitioners & Technical   0.00000000
age_below21                                       0.00000000
CoffeeHouse                                       3.00000000
passanger_Kid(s)                                  0.00000000
age_31                                            0.00000000
income                                            2.00000000
destination_Work                                  1.00000000
maritalStatus_Single                              0.00000000
age_50plus                                        0.00000000
Name: 10, dtype: float64

In [110]:
print(export_text(best_model.estimators_[9], max_depth=30, feature_names=list(X_train)))

|--- weather_Sunny <= 0.50
|   |--- time <= 3.88
|   |   |--- destination_Work <= 0.50
|   |   |   |--- class: 1.0
|   |   |--- destination_Work >  0.50
|   |   |   |--- education <= 4.50
|   |   |   |   |--- Bar <= 3.50
|   |   |   |   |   |--- occupation_Arts Design Entertainment Sports & Media <= 0.50
|   |   |   |   |   |   |--- age_36 <= 0.50
|   |   |   |   |   |   |   |--- occupation_Installation Maintenance & Repair <= 0.50
|   |   |   |   |   |   |   |   |--- occupation_Transportation & Material Moving <= 0.50
|   |   |   |   |   |   |   |   |   |--- age_50plus <= 0.50
|   |   |   |   |   |   |   |   |   |   |--- Restaurant20To50 <= 2.80
|   |   |   |   |   |   |   |   |   |   |   |--- maritalStatus_Single <= 0.50
|   |   |   |   |   |   |   |   |   |   |   |   |--- age_41 <= 0.50
|   |   |   |   |   |   |   |   |   |   |   |   |   |--- occupation_Building & Grounds Cleaning & Maintenance <= 0.50
|   |   |   |   |   |   |   |   |   |   |   |   |   |   |--- occupation_Computer 

In [109]:
#Extracting the decision path for instance i = 12
i_data = X_train.iloc[12].values.reshape(1,-1)
d_path = best_model.decision_path(i_data)