In [1]:
from data_gen.models import Models
from data_gen.models_dict_v2 import model_dict
from data_gen.generate_synthetic_df import generate_synthetic_df
import numpy as np
import matplotlib.pyplot as plt

SEED = np.random.seed(1)
number_of_samples = 5000
train_ratio, valiation_ratio, test_ratio = 0.6,0.2,0.2 #i.e. 0.6*number_of_samples for training, etc.

# 1. Generate Data

In [2]:
# Create synthetic dataframe
df = generate_synthetic_df(number_of_samples, SEED)

# Instantiate an object from the class "Models"
models = Models(model_dict)

# Calculate the cost and price
cost = models.calculate_cost(df)
pricing = models.calculate_pricing(df)

# Calculate the profit on the synthetic dataframe
df["profit"] = models.calculate_profit(cost, pricing)

# 2. Clean Data

In [3]:
def standardize_one_column(column):
    mean = np.mean(column)
    standard_deviation = np.std(column)
    return (column - mean)/standard_deviation

def standardize(dataframe):
    #Given a pandas dataframe, we standardize every column.
    number_of_columns = len(dataframe.columns)
    for j in range(number_of_columns):
        dataframe.iloc[:,j] = standardize_one_column(dataframe.iloc[:,j])
    
    return dataframe

In [4]:
X = df.loc[:, df.columns != 'profit']
Y = df["profit"]

X.loc[X["MARITAL_STATUS"]=="Single", "MARITAL_STATUS"] = 0
X.loc[X["MARITAL_STATUS"]=="Not_Single", "MARITAL_STATUS"] = 0

X = X.astype(int)
Y = Y.astype(int)

X_TRAIN = X.iloc[:int(train_ratio*number_of_samples),:]
Y_TRAIN = Y.iloc[:int(train_ratio*number_of_samples)]
X_VALIDATION = X.iloc[int(train_ratio*number_of_samples):int((train_ratio+valiation_ratio)*number_of_samples),:]
Y_VALIDATION = Y.iloc[int(train_ratio*number_of_samples):int((train_ratio+valiation_ratio)*number_of_samples)]
X_TEST = X.iloc[int((train_ratio+valiation_ratio)*number_of_samples):,:]
Y_TEST = Y.iloc[int((train_ratio+valiation_ratio)*number_of_samples):]

# 3. Train the Decision Tree.

In [5]:
def MSE(y, y_hat):
    return np.sum( np.square(y-y_hat) )
    

In [6]:
from sklearn.tree import DecisionTreeRegressor

min_error = 999999999999999999999
best_depth = None
best_regressor = None

for k in range(1,15):
    regressor = DecisionTreeRegressor(criterion="squared_error", max_depth=k)
    regressor = regressor.fit(X_TRAIN, Y_TRAIN)
    Y_VALIDATION_HAT = regressor.predict(X_VALIDATION)
    error = MSE(Y_VALIDATION, Y_VALIDATION_HAT)
    if error < min_error:
        min_error = error
        best_depth = k
        best_regressor = regressor

print(k)
print(min_error)


14
497937.87520615227


In [7]:
Y_TEST_HAT = best_regressor.predict(X_TEST)
error = MSE(Y_TEST_HAT, Y_TEST)
print(error)

611439.1069543249


# 4. Get the Transition Matrix

By a **state**, we mean a leaf in the best regressor trained above.
Our first goal is to get a list of all the states and get the decision path (i.e. given X, what leaf does X fall into?).

It turns out that sklearn has one id associated to each tree node. Our `state` will thus be a list of integers corresponding to these leaf nodes.

In [13]:
def get_state(regressor):
    #Source: https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html
    #This method takes our regressor as input and returns an array of boolean plus an array of integers.
    #The array of booleans will have the length same as the total number of nodes and will indicate if each is a leaf.
    #The array of integers will have the same length as the total number of leaves and will correspond to the location of "True" in the boolean array.
    
    regressor_tree = regressor.tree_

    n_nodes = regressor_tree.node_count
    children_left = regressor_tree.children_left
    children_right = regressor_tree.children_right

    node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
    is_leaves = np.zeros(shape=n_nodes, dtype=bool)
    stack = [(0, 0)]  # start with the root node id (0) and its depth (0)
    while len(stack) > 0:
        # `pop` ensures each node is only visited once
        node_id, depth = stack.pop()
        node_depth[node_id] = depth

        # If the left and right child of a node is not the same we have a split
        # node
        is_split_node = children_left[node_id] != children_right[node_id]
        # If a split node, append left and right children and depth to `stack`
        # so we can loop through them
        if is_split_node:
            stack.append((children_left[node_id], depth + 1))
            stack.append((children_right[node_id], depth + 1))
        else:
            is_leaves[node_id] = True
    
    print(is_leaves)
    return is_leaves, np.nonzero(is_leaves)[0]

is_leaves, states = get_state(best_regressor)
print(states)
print(len(states))


[False False False False False False False  True  True False  True  True
 False False  True  True False  True  True False False False  True  True
 False  True  True False False  True  True False  True  True False False
 False False  True  True False  True  True False False  True  True False
  True  True False False False  True  True  True False False  True  True
  True False False False False False  True  True False  True  True False
 False  True  True False  True  True False False False  True  True False
  True  True False False  True  True False  True  True False False False
 False  True  True False  True  True False False  True  True False  True
  True False False False  True  True False  True  True False False  True
  True False  True  True False False False False False False  True  True
 False  True  True False False  True  True  True False False False  True
  True False  True  True False False  True  True False  True  True False
 False False False  True  True False  True  True Fa

Now let us print, say, the decision path for `X_TEST`.

In [26]:
x_test_leaf_id = best_regressor.apply(X_TEST)
print(x_test_leaf_id)
#print(np.where(states == x_test_leaf_id[0])[0][0])
X_TEST_STATES = np.empty(len(X_TEST), dtype = np.uint16)
for k in range(len(X_TEST_STATES)):
    X_TEST_STATES[k] = np.where(states == x_test_leaf_id[k])[0][0]

print(X_TEST_STATES)

[144  76  41  74 238 162  67 178  22  74 170  89 175 160 146 123  46 144
 144 115 147 191  89 194  76 100 107 162 192 244 162 175 169 181 170 192
  85 162 174 235  77 205  10 112 159  85  81  81  85 192 160  97 131  67
  89 138 236 167  66 205 214  81 178 133  69 144  66 130  67 115 174  66
 112  67 130 115 115 115  67 159 177 144 144 182  42 174 143 194  41 107
 170 224 100 144 230 191  59  74 191  46 175 174 105 207 192 195 178 115
  73 144 204 162 144 204 208 112 177  59  73 143  15  89  77 184  53 182
 204 235 174  66 115  67 182 100 221 160 162 131 177 178 208 159 120 174
 221 162 215 235  60 170 221 119 137  39 175 162 162  84 130 160  74 163
 195 191 115 101 112 182  74 177 178  49 100  17  69 170  98  48  67 143
 195 144  49 130 131 191  66 130 160 160  59 162  42 143  89 107  25 150
  59 207 204 112 221 221 167  73 160 221 184 112 194  73 162 130 182 159
 230 194 144 144 185 138 101 130  74 144 143 115  66  41 192 104 191  46
 211 144 194 159 160 131 181 144 163 235  74  81 17