In [1]:
from data_gen.models import Models
from data_gen.models_dict_v2 import model_dict
from data_gen.generate_synthetic_df import generate_synthetic_df
import numpy as np
import matplotlib.pyplot as plt

SEED = np.random.seed(1)
number_of_samples = 5000
train_ratio, valiation_ratio, test_ratio = 0.6,0.2,0.2 #i.e. 0.6*number_of_samples for training, etc.

# 1. Generate Data

In [2]:
def generate_data(sample_number, seed):
    # Create synthetic dataframe
    df = generate_synthetic_df(sample_number, seed)

    # Instantiate an object from the class "Models"
    models = Models(model_dict)

    # Calculate the cost and price
    cost = models.calculate_cost(df)
    pricing = models.calculate_pricing(df)

    # Calculate the profit on the synthetic dataframe
    df["profit"] = models.calculate_profit(cost, pricing)

    return df

df = generate_data(number_of_samples, SEED)

# 2. Clean Data

In [3]:
def standardize_one_column(column):
    mean = np.mean(column)
    standard_deviation = np.std(column)
    return (column - mean)/standard_deviation

def standardize(dataframe):
    #Given a pandas dataframe, we standardize every column.
    number_of_columns = len(dataframe.columns)
    for j in range(number_of_columns):
        dataframe.iloc[:,j] = standardize_one_column(dataframe.iloc[:,j])
    
    return dataframe

In [4]:
def obtain_X_Y(df):
    X = df.loc[:, df.columns != 'profit']
    Y = df["profit"]

    X.loc[X["MARITAL_STATUS"]=="Single", "MARITAL_STATUS"] = 0
    X.loc[X["MARITAL_STATUS"]=="Not_Single", "MARITAL_STATUS"] = 0

    X = X.astype(int)
    Y = Y.astype(int)

    return X,Y

In [5]:
X,Y = obtain_X_Y(df)

X_TRAIN = X.iloc[:int(train_ratio*number_of_samples),:]
Y_TRAIN = Y.iloc[:int(train_ratio*number_of_samples)]
X_VALIDATION = X.iloc[int(train_ratio*number_of_samples):int((train_ratio+valiation_ratio)*number_of_samples),:]
Y_VALIDATION = Y.iloc[int(train_ratio*number_of_samples):int((train_ratio+valiation_ratio)*number_of_samples)]
X_TEST = X.iloc[int((train_ratio+valiation_ratio)*number_of_samples):,:]
Y_TEST = Y.iloc[int((train_ratio+valiation_ratio)*number_of_samples):]

# 3. Train the Decision Tree.

In [6]:
def MSE(y, y_hat):
    return np.sum( np.square(y-y_hat) )
    

In [7]:
from sklearn.tree import DecisionTreeRegressor

min_error = 999999999999999999999
best_depth = None
best_regressor = None

for k in range(1,15):
    regressor = DecisionTreeRegressor(criterion="squared_error", max_depth=k)
    regressor = regressor.fit(X_TRAIN, Y_TRAIN)
    Y_VALIDATION_HAT = regressor.predict(X_VALIDATION)
    error = MSE(Y_VALIDATION, Y_VALIDATION_HAT)
    if error < min_error:
        min_error = error
        best_depth = k
        best_regressor = regressor

print(k)
print(min_error)


14
609317.4734580594


In [8]:
Y_TEST_HAT = best_regressor.predict(X_TEST)
error = MSE(Y_TEST_HAT, Y_TEST)
print(error)

678004.7402901736


# 4. Get the Transition Matrix

By a **state**, we mean a leaf in the best regressor trained above.
Our first goal is to get a list of all the states and get the decision path (i.e. given X, what leaf does X fall into?).

It turns out that sklearn has one id associated to each tree node. Our `state` will thus be a list of integers corresponding to these leaf nodes.

In [9]:
def get_state(regressor):
    #Source: https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html
    #This method takes our regressor as input and returns an array of boolean plus an array of integers.
    #The array of booleans will have the length same as the total number of nodes and will indicate if each is a leaf.
    #The array of integers will have the same length as the total number of leaves and will correspond to the location of "True" in the boolean array.
    
    regressor_tree = regressor.tree_

    n_nodes = regressor_tree.node_count
    children_left = regressor_tree.children_left
    children_right = regressor_tree.children_right

    node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
    is_leaves = np.zeros(shape=n_nodes, dtype=bool)
    stack = [(0, 0)]  # start with the root node id (0) and its depth (0)
    while len(stack) > 0:
        # `pop` ensures each node is only visited once
        node_id, depth = stack.pop()
        node_depth[node_id] = depth

        # If the left and right child of a node is not the same we have a split
        # node
        is_split_node = children_left[node_id] != children_right[node_id]
        # If a split node, append left and right children and depth to `stack`
        # so we can loop through them
        if is_split_node:
            stack.append((children_left[node_id], depth + 1))
            stack.append((children_right[node_id], depth + 1))
        else:
            is_leaves[node_id] = True
    
    print(is_leaves)
    return is_leaves, np.nonzero(is_leaves)[0]

is_leaves, states = get_state(best_regressor)
print(states)
print(len(states))


[False False False False False False False False  True  True False  True
  True False False  True  True False  True  True False False False  True
  True False  True  True False False  True  True False  True  True False
 False False False  True  True False  True  True False False  True  True
 False  True  True False False False  True  True  True False False  True
  True False  True  True False False False False False  True  True False
  True  True False False  True  True False  True  True False False False
  True  True False  True  True False False  True  True False  True  True
 False False False False  True  True False  True  True False False  True
  True False  True  True False False False  True  True False  True  True
 False False  True  True False  True  True False False False False False
 False  True  True False  True  True False False  True  True False  True
  True False False False  True  True  True False False  True  True False
  True  True False False False False  True  True Fa

Now let us print, say, the decision path for `X_TEST`.

In [10]:
def get_state_of_one_year(x, regressor, states):
    x_leaf_id = regressor.apply(x)
    x_states = np.empty(len(x), dtype = np.uint16)
    for k in range(len(x)):
        x_states[k] = np.where(states == x_leaf_id[k])[0][0]
    return x_states

get_state_of_one_year(X_TEST, best_regressor, states)

array([208, 107,  78,  68, 160, 124, 105, 185,   9,  17,  24, 103, 185,
        66, 152,  24, 225, 110,  31,  83, 102,  82, 188, 207, 223, 162,
        70, 136,   5, 223,   4, 187,  63, 123, 199,  65,  22,   1, 154,
        94,   9,  27,  12, 144,  98, 106, 113, 184,  14, 156, 150, 178,
        69, 207,  35, 154, 194, 136,  83,   4, 211,  70, 162,  98,  36,
       157, 156,  68, 156, 123,  95, 180,   1,  49,  79,  65,  63, 108,
       152,  12, 158, 144, 144,  64,   9, 217,  22,  65,  83, 183, 200,
       154,  38,  65,  86,  70,  95,  67, 156, 148,   9, 127, 180,  13,
         6,  16,  65, 168, 100, 159, 124,  63,  67,  94, 187,  83, 130,
        63, 154,  82, 126, 180, 162,   1,  15,  94,  38,  81,  67,  65,
       106, 107, 189, 144, 139,  76,  65, 107,  98, 161,   1,  24,  94,
        80, 127, 171,  82, 157, 154,  38, 200,  96, 177, 103, 124, 120,
       185, 126,  15, 105,  50,  24,  49,  95, 103, 153,  63,  68, 185,
        67,   1,  65,  70,  63, 212, 223,  63, 223,  38, 124,  9

Since the data is random, we will use different seeds to generate $k$ pieces of data of same amound of clients. We assume that such is how one client changes over $k$ years. We shall focus on the mechanism of getting the transition probabilities and will ignore the fact that the client does not age by exactly one year old in the next year.

We remark that, in order for the calculation to work, we must have at least one client for every state. This might not hold. Therefore, we use a large group of clients.

In [11]:
k = 50 #number of years
seeds = np.random.choice(1000, k)
#yearly_X = []
yearly_state = []
sample_number = 5000

for i in range(k):
    new_dataframe = generate_data(sample_number, seeds[i])
    x,_ = obtain_X_Y(new_dataframe)
    #yearly_X.append(x)
    yearly_state.append(get_state_of_one_year(x,best_regressor, states))

print(yearly_state[0])

[168  95 120 ...   0  64 173]


In [12]:
def compute_transition_matrix(yearly_state, number_of_states):
    number_of_years = len(yearly_state)
    number_of_clients = len(yearly_state[0])
    transition_matrix = np.zeros((number_of_states,number_of_states), dtype = np.uint16)


    #For each time t, we count the number of clients starting at state l and end at every state.
    #Once this is done, we make each row a probability vector.

    for t in range(number_of_years-1):
        for n in range(number_of_clients):
            state_this_year = int(yearly_state[t][n])
            state_next_year = int(yearly_state[t+1][n])
            transition_matrix[state_this_year,state_next_year] += 1
    
    transition_matrix = transition_matrix.astype(np.float64)
    for l in range(number_of_states):
        row_sum = np.sum(transition_matrix[l])
        transition_matrix[l] = transition_matrix[l]/row_sum
    
    return transition_matrix

transition_matrix = compute_transition_matrix(yearly_state, len(states))
print(transition_matrix)

[[0.00286944 0.0143472  0.         ... 0.         0.00143472 0.        ]
 [0.00319795 0.01183243 0.0003198  ... 0.0003198  0.0003198  0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
