In [1]:
from data_gen.models import Models
from data_gen.models_dict_v2 import model_dict
from data_gen.generate_synthetic_df import generate_synthetic_df
import numpy as np
import matplotlib.pyplot as plt

SEED = np.random.seed(1)
number_of_samples = 5000
train_ratio, valiation_ratio, test_ratio = 0.6,0.2,0.2 #i.e. 0.6*number_of_samples for training, etc.

# 1. Generate Data

In [2]:
def generate_data(sample_number, seed):
    # Create synthetic dataframe
    df = generate_synthetic_df(sample_number, seed)

    # Instantiate an object from the class "Models"
    models = Models(model_dict)

    # Calculate the cost and price
    cost = models.calculate_cost(df)
    pricing = models.calculate_pricing(df)

    # Calculate the profit on the synthetic dataframe
    df["profit"] = models.calculate_profit(cost, pricing)

    return df

df = generate_data(number_of_samples, SEED)

# 2. Clean Data

In [3]:
def standardize_one_column(column):
    mean = np.mean(column)
    standard_deviation = np.std(column)
    return (column - mean)/standard_deviation

def standardize(dataframe):
    #Given a pandas dataframe, we standardize every column.
    number_of_columns = len(dataframe.columns)
    for j in range(number_of_columns):
        dataframe.iloc[:,j] = standardize_one_column(dataframe.iloc[:,j])
    
    return dataframe

In [4]:
def obtain_X_Y(df):
    X = df.loc[:, df.columns != 'profit']
    Y = df["profit"]

    X.loc[X["MARITAL_STATUS"]=="Single", "MARITAL_STATUS"] = 0
    X.loc[X["MARITAL_STATUS"]=="Not_Single", "MARITAL_STATUS"] = 0

    X = X.astype(int)
    Y = Y.astype(int)

    return X,Y

In [5]:
X,Y = obtain_X_Y(df)

X_TRAIN = X.iloc[:int(train_ratio*number_of_samples),:]
Y_TRAIN = Y.iloc[:int(train_ratio*number_of_samples)]
X_VALIDATION = X.iloc[int(train_ratio*number_of_samples):int((train_ratio+valiation_ratio)*number_of_samples),:]
Y_VALIDATION = Y.iloc[int(train_ratio*number_of_samples):int((train_ratio+valiation_ratio)*number_of_samples)]
X_TEST = X.iloc[int((train_ratio+valiation_ratio)*number_of_samples):,:]
Y_TEST = Y.iloc[int((train_ratio+valiation_ratio)*number_of_samples):]

# 3. Train the Decision Tree.

In [6]:
def MSE(y, y_hat):
    return np.sum( np.square(y-y_hat) )
    

In [7]:
from sklearn.tree import DecisionTreeRegressor

min_error = 999999999999999999999
best_depth = None
best_regressor = None

for k in range(1,15):
    regressor = DecisionTreeRegressor(criterion="squared_error", max_depth=k)
    regressor = regressor.fit(X_TRAIN, Y_TRAIN)
    Y_VALIDATION_HAT = regressor.predict(X_VALIDATION)
    error = MSE(Y_VALIDATION, Y_VALIDATION_HAT)
    if error < min_error:
        min_error = error
        best_depth = k
        best_regressor = regressor

print(k)
print(min_error)

14
570894.8059498647


In [8]:
Y_TEST_HAT = best_regressor.predict(X_TEST)
error = MSE(Y_TEST_HAT, Y_TEST)
print(error)

626696.8027692847


# 4. Get the Transition Matrix

By a **state**, we mean a leaf in the best regressor trained above.
Our first goal is to get a list of all the states and get the decision path (i.e. given X, what leaf does X fall into?).

It turns out that sklearn has one id associated to each tree node. Our `state` will thus be a list of integers corresponding to these leaf nodes.

In [9]:
def get_state(regressor):
    """ Return the indices of the leaf nodes in the decision tree. """
    regressor_tree = regressor.tree_
    return np.where(regressor_tree.children_left == regressor_tree.children_right)[0]

states = get_state(best_regressor)
print(states)

[  8   9  11  12  15  16  18  19  23  24  26  27  30  31  33  34  39  40
  42  43  46  47  49  50  54  55  57  58  61  62  64  65  71  72  74  75
  78  79  81  82  86  87  89  90  93  94  96  97 102 103 105 106 109 110
 112 113 117 118 120 121 124 125 127 128 135 136 138 139 142 143 145 146
 150 151 153 154 157 158 160 161 166 167 169 170 173 174 176 177 181 182
 184 185 188 189 191 192 198 199 201 202 205 206 208 209 213 214 216 217
 220 221 223 224 229 230 232 233 234 238 239 241 242 245 246 248 249 257
 258 260 261 264 265 267 268 272 273 275 276 278 279 284 285 287 288 291
 292 294 295 299 300 302 303 306 307 309 310 316 317 319 320 323 324 326
 327 331 332 334 335 338 339 341 342 347 348 350 351 354 355 357 358 361
 363 364 366 367 374 375 377 378 381 382 384 385 389 390 392 393 396 397
 398 403 404 406 407 410 411 413 414 418 419 421 422 425 426 428 429 435
 436 438 439 442 443 445 446 450 451 453 454 457 458 460 461 466 467 469
 470 473 474 476 477 480 482 483 484]


In [10]:
def get_profit_list(regressor):
    #this method returns a list such that, the profit of state i is list[i].
    states = get_state(best_regressor)
    value_list = regressor.tree_.value.squeeze()
    return value_list[states]

get_profit_list(best_regressor)

array([-2.29000000e+02, -1.04250000e+02, -6.99473684e+01, -9.02162162e+01,
       -2.48000000e+02, -2.21000000e+02, -1.16500000e+02, -1.79750000e+02,
       -7.78571429e+01, -5.61304348e+01, -8.27727273e+01, -1.27500000e+02,
       -5.32222222e+01, -2.76470588e+01, -8.25000000e+01, -5.01290323e+01,
       -2.31000000e+02, -1.55000000e+02, -3.02000000e+02, -3.01000000e+02,
       -1.64368421e+02, -1.18000000e+02, -2.30000000e+02, -2.80000000e+02,
       -1.29000000e+02, -1.58764706e+02, -2.69000000e+02, -1.52500000e+02,
       -1.00153846e+02, -7.21250000e+01, -1.94200000e+02, -1.28600000e+02,
       -3.70769231e+01, -1.56000000e+01, -8.26363636e+01, -5.32777778e+01,
       -3.02727273e+01, -9.81250000e+00, -2.58235294e+01, -3.89600000e+01,
        9.40000000e+00, -1.00000000e+01, -1.60000000e+01, -3.60000000e+01,
        4.25000000e+00,  1.35000000e+01,  2.70000000e+01,  2.50000000e+01,
       -7.23333333e+01, -1.02400000e+02, -3.90000000e+01, -3.00000000e+01,
       -1.18000000e+02, -

Now let us print, say, the decision path for `X_TEST`.

In [11]:
def get_state_of_one_year(x, regressor, states):
    x_leaf_id = regressor.apply(x)
    x_states = np.empty(len(x), dtype = np.uint16)
    for k in range(len(x)):
        x_states[k] = np.where(states == x_leaf_id[k])[0][0]
    return x_states

get_state_of_one_year(X_TEST, best_regressor, states)

array([  3,  96,  15, 139, 163,   1,  73, 184, 196,  82,  73, 129, 193,
        58, 202,  96,  36,  71, 107, 159,  59, 103, 157,  97,  97, 130,
        97, 186, 156, 157, 129, 140,  36, 202, 162, 139,  38, 125, 213,
       129, 159,  35, 140, 129, 157, 213, 129,  15, 215,  64,  78, 157,
       112, 139,  80, 117, 157, 157, 162, 158,  96,  10, 167,   9,   3,
       130,  49, 141, 111,  35,  36, 129, 129,  37, 129,   2,   2,  35,
       150, 202,  32, 239,  97,  72, 174,  64,   2,  36, 190,   3, 186,
        15, 152, 119,   1,   2,  39, 118, 100, 109, 177, 167,   1,   0,
       170, 141, 160, 140,  97, 148, 215, 163,   5, 220, 159, 139, 139,
       228, 163,  49,  35,  36, 129,  15,  75, 103, 193,   3,  96,  15,
       157,  39, 159, 200,   3, 160, 163,  39,  99,   2, 100, 211,  37,
        68, 140,   9,  28, 161,  25, 139,  44, 174, 156, 139,  97, 156,
       139,  13, 140, 203,  44,  11, 160,  62, 125,  97, 159,  97, 155,
       203,  35, 156, 154, 178, 139,  80, 130, 219,  73,  71,  7

Since the data is random, we will use different seeds to generate $k$ pieces of data of same amound of clients. We assume that such is how one client changes over $k$ years. We shall focus on the mechanism of getting the transition probabilities and will ignore the fact that the client does not age by exactly one year old in the next year.

We remark that, in order for the calculation to work, we must have at least one client for every state. This might not hold. Therefore, we use a large group of clients.

In [12]:
k = 50 #number of years
seeds = np.random.choice(1000, k)
#yearly_X = []
yearly_state = []
sample_number = 5000

for i in range(k):
    new_dataframe = generate_data(sample_number, seeds[i])
    x,_ = obtain_X_Y(new_dataframe)
    #yearly_X.append(x)
    yearly_state.append(get_state_of_one_year(x,best_regressor, states))

print(yearly_state[0])

[167 209 186 ...  99 141  97]


In [13]:
def compute_transition_matrix(yearly_state, number_of_states):
    number_of_years = len(yearly_state)
    number_of_clients = len(yearly_state[0])
    transition_matrix = np.zeros((number_of_states,number_of_states), dtype = np.uint16)

    #For each time t, we count the number of clients starting at state l and end at every state.
    #Once this is done, we make each row a probability vector.

    for t in range(number_of_years-1):
        for n in range(number_of_clients):
            state_this_year = int(yearly_state[t][n])
            state_next_year = int(yearly_state[t+1][n])
            transition_matrix[state_this_year,state_next_year] += 1
    
    transition_matrix = transition_matrix.astype(np.float64)
    for l in range(number_of_states):
        row_sum = np.sum(transition_matrix[l])
        if row_sum == 0:
            transition_matrix[l] = (1/number_of_states)*np.ones(len(transition_matrix[l]))
        else:
            transition_matrix[l] = transition_matrix[l]/row_sum
    
    return transition_matrix

transition_matrix = compute_transition_matrix(yearly_state, len(states))
print(transition_matrix)

[[0.         0.00895522 0.02686567 ... 0.         0.         0.        ]
 [0.00152497 0.00991231 0.0186809  ... 0.         0.00038124 0.        ]
 [0.00131027 0.01048218 0.01677149 ... 0.         0.00052411 0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.00714286 0.00714286 ... 0.         0.         0.        ]
 [0.         0.06976744 0.02325581 ... 0.         0.02325581 0.        ]]


# 5. Computation of Customer Lifetime Value(CLV)

Fix a client of Intact.
We are now in a position of defining the concept of customer lifetime value and introduce our algorithm to compute it.

Let $A$ with some $\sigma$-algebra be a measurable space, called the **state space**. For each $t \in \{0,1,2,\cdots\}$, the **client state** is an $A$-valued random variable $S_t$, all defined on one common probability space $(\Omega, F, P)$. For fixed bounded measurable **profit function** $f: A \to \mathbb{R}$ and a **discounting factor** $\gamma = \frac{1}{1.15}$, we define, for every non-negative integer $t_0$ and every $a \in A$:

\begin{equation*}
CLV_{t_0}(a) = \mathbb{E}[\sum_{t=t_0 + 1}^{\tau} \gamma^t f(S_t) \mid S_{t_0} = a]
\end{equation*}

Here $\tau$ is a positive finite stopping time indicating the time which client first quits using Intact. Our goal is to compute $CLV_0(a)$ for every $a \in A$. 

[//]: <1. Enlarge the transition matrix with one row at bottom and one column at right. The bottom right corner of the matrix is $1$. Fill the last row with $0$ and the last column with $0.15$. Normalize the matrix so that it remains a transition matrix. The new state we have added represents the probability of client quiting Intact.>


Let $a \in A$ be given. For every $a' \in A$, we have:

\begin{equation*}
P\{S_1=a' \mid S_0=a\} = P\{S_1=a' \mid S_0=a, \text{client remains}\} P\{ \text{client remains} \mid S_0=a\} 
\end{equation*}

The term $P\{S_1=a' \mid S_0=a, \text{client remains}\}$ on the right hand side is taken care of by the transition matrix computed above. We take $P\{ \text{client remains} \mid S_0=a\} = 0.15$. Generate standard uniform $V$. If $V < P\{ \text{client remains} \mid S_0=a\}$, then the client quits. Otherwise, generate independent standard uniform $U$, which looks at the transition matrix and decide the value of $S_1$. Had simulated the state of $S_t$, repeat in this manner to generate the state of $S_{t+1}$. In doing so, we have generating a sample for the integrand. We then compute the expectation using Kolmogorov's strong law of large numbers.

In [14]:
def generate_sample_paths(number_of_paths, transition_matrix, initial_state, years_limit = 100, client_quit_rate = 0.15):
    sample_paths = []
    for _ in range(number_of_paths):
        sample_paths.append(generate_one_sample_path(transition_matrix, initial_state, years_limit, client_quit_rate))
    return sample_paths

def generate_one_sample_path(transition_matrix, initial_state, years_limit = 100, client_quit_rate = 0.15):
    number_of_states = transition_matrix.shape[0]
    assert initial_state < number_of_states, "State cannot be larger than dimension of matrix."
    sample_path = [initial_state]
    while True:
        v = np.random.uniform(low=0.0, high=1.0, size=1)[0]
        if v < client_quit_rate:
            sample_path.append(-1)
            return sample_path
        last_state = sample_path[-1]
        next_state = np.random.choice([i for i in range(number_of_states)], p=transition_matrix[last_state])
        sample_path.append(next_state)
        if len(sample_path) > years_limit:
            return sample_path

In [15]:
def CLV_one_path(path, profit_list, initial_time = 0, discounting_factor = 1/1.15):
    #Let only one sample path be given. We compute the CLV.
    clv = 0
    for t in range(len(path)):
        if path[t] != -1: #i.e. the client does not quit
            clv += discounting_factor**(initial_time + 1 + t)*profit_list[path[t]]
    return clv

def CLV_estimation(profit_list, initial_state, transition_matrix, initial_time = 0, discounting_factor = 1/1.15, number_of_paths = 10**4):
    sample_paths = generate_sample_paths(number_of_paths, transition_matrix, initial_state, years_limit = 100, client_quit_rate = 0.15)
    clv_samples = np.array([CLV_one_path(path, profit_list, initial_time, discounting_factor) for path in sample_paths])
    return np.mean(clv_samples)


In [16]:
CLV_0 = []
for i in range(len(states)):
    CLV_0.append(CLV_estimation(profit_list = get_profit_list(best_regressor), initial_state= i,
transition_matrix = transition_matrix, initial_time= 0, discounting_factor = 1/1.15, number_of_paths = 10))

print(CLV_0)

[-203.50669765715207, -77.15378044894227, -7.010352071103151, -91.19328063228552, -202.88298712348083, -206.8065889017592, -103.0836576550898, -160.9683468494196, -46.47859885076396, -24.591855593605068, -57.775594710112976, -100.03890380384334, -48.990025708284136, -12.535604364161658, -100.11403339659728, -7.016493745989683, -189.26918620913267, -109.38025164924052, -262.75592510871473, -261.97417774743064, -137.14757538762015, -69.80341893223341, -157.5435059382397, -284.6649921435943, -145.69462017201963, -112.28221849263643, -236.0579828281193, -155.59537460916474, -87.63872293334585, -53.46772296576118, -135.37600064023934, -81.99942686191726, -34.51250373159436, 0.5343804435717019, -50.18415586871921, -30.52701018486756, -20.67569563125925, -9.466942602310592, -16.745131108883847, -63.22835525248227, 76.57589726431415, -1.619321184245584, 11.213499566120504, -7.86284249371312, 22.976501538527863, 20.3784677566642, 23.17912116291539, 24.794581750858605, -68.19818975389032, -60.70