In [15]:
import pickle
import numpy as np
import pandas as pd
import warnings
# NOTE(review): with no category/module argument this "ignore" filter matches every
# warning raised from here on (including deprecation notices from the libraries
# imported below) -- consider narrowing it to the specific warnings that are noisy.
warnings.filterwarnings("ignore")

from sklearn.ensemble import RandomForestClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree, export_text

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## Make it possible to display multiple outputs from one cell: with "all", every
## top-level expression in a cell is echoed, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# pandas display options: cap the number of rows shown for a frame/series at 50 ...
pd.options.display.max_rows = 50
# ... and render floats with 8 decimal places (minimum field width of 9 characters).
pd.set_option('display.float_format', lambda x: '%9.8f' % x)

In [127]:
def explore_tree(estimator, X_test, y_test, sample_id, feature_names):
    """
    Print the decision rules a single fitted tree applies to one test sample.

    A slightly modified version of the code from
    https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html#decision-path

    For every internal node on the sample's decision path, one line is printed
    showing the tested feature, the sample's value for it, and the node's
    threshold. A final line reports the tree's prediction (0 -> "Reject",
    1 -> "Accept", anything else printed as-is) and whether it equals the
    true label.

    Parameters
    ----------
    estimator : fitted tree estimator
        Must expose ``tree_.feature``, ``tree_.threshold``, ``decision_path``,
        ``apply`` and ``predict``.
    X_test : 2-D array-like
        Feature matrix; row ``sample_id`` is the sample that is explained.
    y_test : array-like
        True labels, either 1-D or 2-D with the label in the first column.
    sample_id : int
        Row position of the sample in ``X_test`` / ``y_test``.
    feature_names : sequence of str
        Feature names, indexable by the feature indices stored in the tree.

    Returns
    -------
    list of str
        Names of the features tested along the path, without duplicates, in
        the order they are first encountered.
    """
    feature = estimator.tree_.feature
    threshold = estimator.tree_.threshold

    # Only one sample is inspected, so run the estimator on that single row
    # rather than on the whole test matrix (the row's result does not depend
    # on the other rows).
    sample_row = np.asarray(X_test)[sample_id]
    sample_2d = sample_row.reshape(1, -1)

    # decision_path returns a sparse indicator matrix: a non-zero element at
    # position (i, j) means that sample i goes through node j. With a single
    # input row, row 0 holds the whole path.
    node_indicator = estimator.decision_path(sample_2d)
    node_index = node_indicator.indices[node_indicator.indptr[0]:
                                        node_indicator.indptr[1]]

    # Id of the leaf the sample ends up in.
    leaf_id = estimator.apply(sample_2d)[0]

    # Predict once and reuse the value for both the label text and the
    # correctness check.
    predicted = estimator.predict(sample_2d)[0]

    features_in_rules = []
    print('Rules used to predict sample %s:\n' % sample_id)
    for node_id in node_index:
        if leaf_id == node_id:
            # The leaf carries no test of its own.
            print("\t==> end")
            continue

        feature_idx = feature[node_id]
        value = sample_row[feature_idx]
        threshold_sign = "<=" if value <= threshold[node_id] else ">"

        feature_name = feature_names[feature_idx]
        if feature_name not in features_in_rules:
            features_in_rules.append(feature_name)

        print(f"(node {node_id})'{feature_name}' "
              f"= {value} {threshold_sign} {threshold[node_id]:0.2f}")

    if predicted == 0:
        ar = "Reject"
    elif predicted == 1:
        ar = "Accept"
    else:  # just in case there are values other than 0 or 1
        ar = predicted

    # Accept labels given as a 1-D vector as well as an (n, 1) column:
    # ravel turns either a scalar or a one-element row into a 1-D array.
    true_label = np.ravel(np.asarray(y_test)[sample_id])[0]
    isok = "CORRECT" if true_label == predicted else "WRONG"
    print(f"Prediction: {ar} coupon ({isok})")

    return features_in_rules


## Restaurant20To50

In [130]:
# Coupon type whose saved model and phase-2 data splits are loaded in this cell.
coupon_type = 'Restaurant20To50'

# NOTE(review): unpickling can execute arbitrary code -- only load model files that
# come from a trusted source. The path is relative to the working directory.
with open(f'model/phase2/{coupon_type}_best_model_RFC.pickle', "rb") as f: 
    best_model = pickle.load(f)
    
# Feature/label splits for the same coupon type (paths relative to the working directory).
# Within the cells shown here, X_train is referenced only by a commented-out export_text call.
X_train = pd.read_csv(f'data/{coupon_type}_X_train_phase2.csv')
X_test = pd.read_csv(f'data/{coupon_type}_X_test_phase2.csv')
y_test = pd.read_csv(f'data/{coupon_type}_y_test_phase2.csv')

In [131]:
i=73 # pick an arbitrary tree in the forest (index into best_model.estimators_)
sampleid=13 # pick an arbitrary test sample (row position in X_test / y_test)
print(f"Tree {i} ({best_model.estimators_[i]})")
# explore_tree indexes plain arrays, hence .values; column names label the printed rules.
features_in_rules = explore_tree(best_model.estimators_[i], X_test.values, y_test.values, 
                                 sampleid, X_test.columns.to_list())
## Sanity check -- do the values in the X_test record agree with the prediction path
## that was printed out? Yes! :)  (Displays the sample's values for the features on the path.)
X_test.loc[sampleid, features_in_rules]

Tree 73 (DecisionTreeClassifier(max_features=4, random_state=68574553))
Rules used to predict sample 13:

(node 0)'time' = 4.0 > 3.98
(node 768)'expiration_2h' = 0.0 <= 0.50
(node 769)'Restaurant20To50_freq_unknown' = 0.0 <= 0.50
(node 770)'occupation_Healthcare Support' = 0.0 <= 0.50
(node 771)'RestaurantLessThan20_freq_unknown' = 0.0 <= 0.50
(node 772)'occupation_Office & Administrative Support' = 0.0 <= 0.50
(node 773)'age_50plus' = 0.0 <= 0.50
(node 774)'occupation_Business & Financial' = 0.0 <= 0.50
(node 775)'CoffeeHouse_freq_unknown' = 0.0 <= 0.50
(node 776)'RestaurantLessThan20' = 2.0 <= 2.25
(node 777)'CoffeeHouse' = 2.0 <= 2.50
(node 778)'minsToCouponDest' = 0.0 <= 0.50
(node 779)'has_children' = 0.0 <= 0.50
(node 780)'CoffeeHouse' = 2.0 > 1.50
	==> end
Prediction: Reject coupon (CORRECT)


time                                         4.00000000
expiration_2h                                0.00000000
Restaurant20To50_freq_unknown                0.00000000
occupation_Healthcare Support                0.00000000
RestaurantLessThan20_freq_unknown            0.00000000
occupation_Office & Administrative Support   0.00000000
age_50plus                                   0.00000000
occupation_Business & Financial              0.00000000
CoffeeHouse_freq_unknown                     0.00000000
RestaurantLessThan20                         2.00000000
CoffeeHouse                                  2.00000000
minsToCouponDest                             0.00000000
has_children                                 0.00000000
Name: 13, dtype: float64

In [136]:
# TODO: the export_text output is hard to interpret; keeping the call below (commented out) in case it becomes useful later.
# print(export_text(best_model.estimators_[73], max_depth=10, feature_names=list(X_train)))

## Coffee House

In [132]:
# Coupon type whose saved model and phase-2 data splits are loaded in this cell.
# This rebinds coupon_type, best_model, X_train, X_test and y_test from the previous section.
coupon_type = 'CoffeeHouse'

# NOTE(review): unpickling can execute arbitrary code -- only load model files that
# come from a trusted source. The path is relative to the working directory.
with open(f'model/phase2/{coupon_type}_best_model_RFC.pickle', "rb") as f: 
    best_model = pickle.load(f)
    
# Feature/label splits for the same coupon type (paths relative to the working directory).
X_train = pd.read_csv(f'data/{coupon_type}_X_train_phase2.csv')
X_test = pd.read_csv(f'data/{coupon_type}_X_test_phase2.csv')
y_test = pd.read_csv(f'data/{coupon_type}_y_test_phase2.csv')

In [141]:
i=73 # pick an arbitrary tree in the forest (index into best_model.estimators_)
sampleid=3 # pick an arbitrary test sample (row position in X_test / y_test)
print(f"Tree {i} ({best_model.estimators_[i]})")
# explore_tree indexes plain arrays, hence .values; column names label the printed rules.
features_in_rules = explore_tree(best_model.estimators_[i], X_test.values, y_test.values, 
                                 sampleid, X_test.columns.to_list())

Tree 73 (DecisionTreeClassifier(max_features=8, random_state=68574553))
Rules used to predict sample 3:

(node 0)'expiration_2h' = 0.0 <= 0.50
(node 1)'occupation_Business & Financial' = 0.0 <= 0.50
(node 2)'income_$12500 - $24999' = 0.0 <= 0.50
(node 3)'income_$75000 - $87499' = 0.0 <= 0.50
(node 4)'occupation_Student' = 0.0 <= 0.50
(node 5)'time' = 0.0 <= 3.50
(node 6)'minsToCouponDest' = 0.0 <= 1.50
(node 7)'CoffeeHouse' = 5.0 > 1.50
(node 81)'passanger_Partner' = 0.0 <= 0.50
(node 82)'destination_No Urgent Place' = 0.0 <= 0.50
(node 83)'has_children' = 1.0 > 0.50
(node 179)'RestaurantLessThan20_freq_unknown' = 0.0 <= 0.50
(node 180)'direction_same' = 1.0 > 0.50
(node 208)'occupation_Computer & Mathematical' = 1.0 > 0.50
	==> end
Prediction: Accept coupon (CORRECT)
