# CLTV Prediction

In [75]:
# Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.metrics import confusion_matrix,classification_report,precision_recall_fscore_support,accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split
import xgboost as xgb

## Data Preparation

In [76]:
import pickle

# Read the pickled `tx_class` object from the working directory; the cells
# below use it as a pandas DataFrame.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
pickle_path = "tx_class.pkl"
with open(pickle_path, mode="rb") as pickle_file:
    tx_class = pickle.load(pickle_file)

In [77]:
# Summary statistics (count, mean, std, min, quartiles, max) of the loaded table
tx_class.describe()

Unnamed: 0,CustomerID,Recency,RecencyCluster,Frequency,FrequencyCluster,Revenue,RevenueCluster,OverallScore,m6_Revenue,LTVCluster
count,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0
mean,15564.447059,91.649361,2.098721,83.230691,0.112788,1274.397832,0.057033,2.268542,722.131338,0.276215
std,1575.324626,100.365462,1.055131,131.120889,0.329833,1896.155427,0.231936,1.26992,1057.080028,0.526525
min,12346.0,0.0,0.0,1.0,0.0,-4287.63,0.0,0.0,-4287.63,0.0
25%,14212.25,16.0,1.0,16.25,0.0,279.23,0.0,1.0,101.25,0.0
50%,15575.0,49.5,2.0,40.0,0.0,618.04,0.0,2.0,358.585,0.0
75%,16913.75,144.0,3.0,98.0,0.0,1483.4975,0.0,3.0,917.29,0.0
max,18287.0,373.0,3.0,2782.0,2.0,21535.9,1.0,6.0,8432.68,2.0


In [78]:
# Pairwise correlation between all columns of the table
corr_matrix = tx_class.corr()

# Show how each column correlates with the LTVCluster label,
# strongest positive correlation first
ltv_corr = corr_matrix['LTVCluster']
ltv_corr.sort_values(ascending=False)

LTVCluster            1.000000
m6_Revenue            0.878757
Revenue               0.776924
RevenueCluster        0.608345
Frequency             0.568411
OverallScore          0.540214
FrequencyCluster      0.515850
Segment_High-Value    0.496705
RecencyCluster        0.355204
Segment_Mid-Value     0.188219
CustomerID           -0.027388
Recency              -0.346686
Segment_Low-Value    -0.377251
Name: LTVCluster, dtype: float64

In [79]:
# y is the label (LTV cluster); X is the feature set, i.e. every remaining
# column except m6_Revenue.
# NOTE(review): m6_Revenue is presumably the value the LTV clusters were derived
# from, which would make it a leak if kept as a feature -- confirm.
# NOTE(review): CustomerID is still part of X here; check whether an identifier
# should be used as a model feature.
label_col = 'LTVCluster'
y = tx_class[label_col]
X = tx_class.drop(columns=[label_col, 'm6_Revenue'])

# Hold out 5% of the rows as the test set; the fixed seed keeps the split
# reproducible.
# NOTE(review): the split is not stratified, so the rare cluster may be
# under-represented in the small test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=56,
)

## Models

Since there are three LTV clusters (low, mid and high LTV), we will perform multi-class classification.

### 1. Logistic Regression

In [94]:
from sklearn.linear_model import LogisticRegression

# Hyper-parameters under test. They are also encoded into the run label,
# e.g. "Logit_test_penaltyNone_classweightbalanced".
basemodelname = "Logit_test"
params = {
    "penalty": None,
    "class_weight": 'balanced'}
# Label suffix: "<key without underscores><value without dots>", joined by "_"
parsuf = '_'.join([key.replace('_','')+str(val).replace('.','') for key,val in params.items()])
modelname=f"{basemodelname}_{parsuf}"

# Unpenalised logistic regression; class_weight='balanced' reweights the
# classes inversely to their frequency in y_train.
# max_iter is raised above the library default, presumably so the solver
# converges on these features -- confirm.
ltv_logit = LogisticRegression(
    penalty=params['penalty'],
    class_weight=params['class_weight'],
    max_iter=1000
).fit(X_train, y_train)

# Evaluate the model fitted in this cell. The previous code referenced
# `ltv_logreg`, which this notebook never defines: on a fresh kernel that is a
# NameError, and in a stale kernel it silently scores a different model.
acc_train = ltv_logit.score(X_train, y_train)
# Index by the training columns so the test features are in the fitted order
acc_test = ltv_logit.score(X_test[X_train.columns], y_test)

print(f"Modelname: {modelname}")
print('Accuracy of Logit classifier on training set: {:.2f}'.format(acc_train))
print('Accuracy of Logit classifier on test set: {:.2f}'.format(acc_test))

# Per-class precision / recall / F1 on the held-out test set
y_pred = ltv_logit.predict(X_test)
clfreport = classification_report(y_test, y_pred)
print(clfreport)

Modelname: Logit_test_penaltyNone_classweightbalanced
Accuracy of Logit classifier on training set: 0.89
Accuracy of Logit classifier on test set: 0.85
              precision    recall  f1-score   support

           0       0.89      0.94      0.91       146
           1       0.69      0.61      0.65        44
           2       1.00      0.50      0.67         6

    accuracy                           0.85       196
   macro avg       0.86      0.68      0.74       196
weighted avg       0.85      0.85      0.85       196



### 2. XGBoost

In [80]:
import xgboost as xgb

# Hyper-parameters under test; they are also encoded into the run label below
basemodelname = "xgboost_test"
params = {
    "max_depth": 5,
    "learning_rate": 0.1,
}

# Run label: base name followed by one "<key without underscores><value without dots>"
# token per hyper-parameter, e.g. "xgboost_test_maxdepth5_learningrate01"
parsuf = '_'.join(
    name.replace('_','') + str(value).replace('.','')
    for name, value in params.items()
)
modelname=f"{basemodelname}_{parsuf}"

# Build the classifier from the parameter dict (n_jobs=-1 is passed through to
# XGBoost unchanged), then fit it on the training split; fit() returns the
# fitted estimator itself
xgb_classifier = xgb.XGBClassifier(n_jobs=-1, **params)
ltv_xgb = xgb_classifier.fit(X_train, y_train)

print(f"Modelname: {modelname}")

# Mean accuracy on both splits; the test features are indexed by the training
# columns so they are in the fitted order
acc_train = ltv_xgb.score(X_train, y_train)
acc_test = ltv_xgb.score(X_test[X_train.columns], y_test)

train_msg = 'Accuracy of XGB classifier on training set: {:.2f}'
test_msg = 'Accuracy of XGB classifier on test set: {:.2f}'
print(train_msg.format(acc_train))
print(test_msg.format(acc_test))

# Per-class precision / recall / F1 on the test split; printed in the next cell
y_pred = ltv_xgb.predict(X_test)
clfreport = classification_report(y_test, y_pred)

Modelname: xgboost_test_maxdepth5_learningrate01
Accuracy of XGB classifier on training set: 0.95
Accuracy of XGB classifier on test set: 0.86


In [81]:
# Print the classification report string built in the XGBoost cell above
print(clfreport)

              precision    recall  f1-score   support

           0       0.92      0.92      0.92       146
           1       0.70      0.68      0.69        44
           2       0.62      0.83      0.71         6

    accuracy                           0.86       196
   macro avg       0.75      0.81      0.77       196
weighted avg       0.86      0.86      0.86       196

