In [48]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.metrics import confusion_matrix,classification_report,precision_recall_fscore_support,accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split
import xgboost as xgb

import mlflow

In [49]:
import pickle

# Load the pickled customer table that the rest of the notebook works on.
# NOTE: pickle.load can execute arbitrary code -- only open files from a trusted source.
with open("tx_class.pkl", mode="rb") as pkl_file:
    tx_class = pickle.load(pkl_file)

In [50]:
# Summary statistics (count, mean, std, min, quartiles, max) of the loaded table,
# displayed as the cell output for a quick sanity check of value ranges.
tx_class.describe()

Unnamed: 0,CustomerID,Recency,RecencyCluster,Frequency,FrequencyCluster,Revenue,RevenueCluster,OverallScore,m6_Revenue,LTVCluster
count,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0
mean,15564.447059,91.649361,2.098721,83.230691,0.112788,1274.397832,0.057033,2.268542,722.131338,0.276215
std,1575.324626,100.365462,1.055131,131.120889,0.329833,1896.155427,0.231936,1.26992,1057.080028,0.526525
min,12346.0,0.0,0.0,1.0,0.0,-4287.63,0.0,0.0,-4287.63,0.0
25%,14212.25,16.0,1.0,16.25,0.0,279.23,0.0,1.0,101.25,0.0
50%,15575.0,49.5,2.0,40.0,0.0,618.04,0.0,2.0,358.585,0.0
75%,16913.75,144.0,3.0,98.0,0.0,1483.4975,0.0,3.0,917.29,0.0
max,18287.0,373.0,3.0,2782.0,2.0,21535.9,1.0,6.0,8432.68,2.0


In [51]:
# Pairwise correlation between all columns of the table.
corr_matrix = tx_class.corr()

# Keep only the correlations against the LTVCluster label and show them
# from the most positive to the most negative.
ltv_correlations = corr_matrix['LTVCluster']
ltv_correlations.sort_values(ascending=False)

LTVCluster            1.000000
m6_Revenue            0.878757
Revenue               0.776924
RevenueCluster        0.608345
Frequency             0.568411
OverallScore          0.540214
FrequencyCluster      0.515850
Segment_High-Value    0.496705
RecencyCluster        0.355204
Segment_Mid-Value     0.188219
CustomerID           -0.027388
Recency              -0.346686
Segment_Low-Value    -0.377251
Name: LTVCluster, dtype: float64

In [52]:
# Feature matrix X: every column except the label itself and m6_Revenue.
# NOTE(review): m6_Revenue is by far the column most correlated with the label;
# presumably LTVCluster is derived from it, hence it is excluded -- confirm.
# NOTE(review): CustomerID stays in X as a feature -- confirm this is intended.
X = tx_class.drop(columns=['LTVCluster', 'm6_Revenue'])

# Label vector y: the LTV cluster of each row.
y = tx_class['LTVCluster']

# Hold out 5% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=56, test_size=0.05
)

## Models

Because there are three LTV clusters (high, mid and low LTV), we will perform multi-class classification.

In [53]:
# Make "CLTV" the active MLflow experiment for the runs started in this notebook.
mlflow.set_experiment("CLTV")
# Turn scikit-learn autologging off; the cells below log params, metrics and
# models explicitly instead.
mlflow.sklearn.autolog(disable=True)

### 1. XGBoost 

In [54]:
# Baseline run: fit one XGBoost classifier with fixed hyper-parameters and record
# its parameters, skill metrics and the fitted model in a single MLflow run.
basemodelname = "xgboost_test"
with mlflow.start_run(run_name=basemodelname):
    params = {
        "max_depth": 5,
        "learning_rate": 0.1}
    # Compact suffix built from the parameters, e.g. "maxdepth5_learningrate01".
    parsuf = '_'.join([key.replace('_','')+str(val).replace('.','') for key,val in params.items()])
    modelname = f"{basemodelname}_{parsuf}"
    mlflow.set_tag("model_name", modelname)
    mlflow.log_params(params)

    ltv_xgb = xgb.XGBClassifier(
        max_depth=params['max_depth'],
        learning_rate=params['learning_rate'],
        n_jobs=-1
    ).fit(X_train, y_train)

    # Use one column-aligned view of the test features for both scoring and
    # prediction, so accuracy and F1 are computed on exactly the same input.
    X_test_aligned = X_test[X_train.columns]

    acc_train = ltv_xgb.score(X_train, y_train)
    acc_test = ltv_xgb.score(X_test_aligned, y_test)

    print(f"Modelname: {modelname}")
    print('Accuracy of XGB classifier on training set: {:.2f}'
           .format(acc_train))
    print('Accuracy of XGB classifier on test set: {:.2f}'
          .format(acc_test))

    y_pred = ltv_xgb.predict(X_test_aligned)
    # Tuple of per-class arrays: (precision, recall, f1-score, support).
    test_prf1s = precision_recall_fscore_support(y_test, y_pred)

    # log the skill metrics
    mlflow.log_metric('train_acc', acc_train)
    mlflow.log_metric('test_acc', acc_test)
    # Macro-averaged F1 = unweighted mean of the per-class F1 scores.
    # Must read test_prf1s (the name assigned above) so the cell does not depend
    # on a variable left over from an earlier kernel session.
    mlflow.log_metric('test_macroavg_f1', np.mean(test_prf1s[2]))

    # log the model as an artifact to enable later use
    mlflow.xgboost.log_model(ltv_xgb, "ltv_xgb")

Modelname: xgboost_test_maxdepth5_learningrate01
Accuracy of XGB classifier on training set: 0.95
Accuracy of XGB classifier on test set: 0.86


In [55]:
# Small grid search: one parent MLflow run that contains a nested child run for
# every (max_depth, learning_rate) combination. Each child run records its own
# parameters, skill metrics and fitted model.
basemodelname = "xgboost_paropt"
with mlflow.start_run(run_name=basemodelname):
    params_list = {
        "max_depth": [4, 5],
        "learning_rate": [0.05, 0.1]}

    # The column-aligned test features do not change between combinations,
    # so build them once and reuse them for scoring and prediction.
    X_test_aligned = X_test[X_train.columns]

    for max_depth in params_list['max_depth']:
        for learning_rate in params_list['learning_rate']:
            params = {
                "max_depth": max_depth,
                "learning_rate": learning_rate}
            # Compact suffix built from the parameters, e.g. "maxdepth4_learningrate005".
            parsuf = '_'.join([key.replace('_','')+str(val).replace('.','') for key,val in params.items()])

            with mlflow.start_run(run_name=parsuf, nested=True):
                modelname = f"{basemodelname}_{parsuf}"
                mlflow.set_tag("model_name", modelname)
                mlflow.log_params(params)

                ltv_xgb = xgb.XGBClassifier(
                    max_depth=params['max_depth'],
                    learning_rate=params['learning_rate'],
                    n_jobs=-1
                ).fit(X_train, y_train)

                acc_train = ltv_xgb.score(X_train, y_train)
                acc_test = ltv_xgb.score(X_test_aligned, y_test)

                print(f"Modelname: {modelname}")
                print('Accuracy of XGB classifier on training set: {:.2f}'
                       .format(acc_train))
                print('Accuracy of XGB classifier on test set: {:.2f}'
                      .format(acc_test))

                y_pred = ltv_xgb.predict(X_test_aligned)
                # Tuple of per-class arrays: (precision, recall, f1-score, support).
                test_prf1s = precision_recall_fscore_support(y_test, y_pred)

                # log the skill metrics
                mlflow.log_metric('train_acc', acc_train)
                mlflow.log_metric('test_acc', acc_test)
                # Macro-averaged F1 = unweighted mean of the per-class F1 scores.
                # Must read test_prf1s (the name assigned above) so the cell does
                # not depend on a variable left over from an earlier kernel session.
                mlflow.log_metric('test_macroavg_f1', np.mean(test_prf1s[2]))

                # log the model as an artifact to enable later use
                mlflow.xgboost.log_model(ltv_xgb, "ltv_xgb")

Modelname: xgboost_paropt_maxdepth4_learningrate005
Accuracy of XGB classifier on training set: 0.92
Accuracy of XGB classifier on test set: 0.86
Modelname: xgboost_paropt_maxdepth4_learningrate01
Accuracy of XGB classifier on training set: 0.93
Accuracy of XGB classifier on test set: 0.85
Modelname: xgboost_paropt_maxdepth5_learningrate005
Accuracy of XGB classifier on training set: 0.93
Accuracy of XGB classifier on test set: 0.85
Modelname: xgboost_paropt_maxdepth5_learningrate01
Accuracy of XGB classifier on training set: 0.95
Accuracy of XGB classifier on test set: 0.86


## Use a model that we saved earlier

In [58]:
import mlflow
import pickle
import xgboost
from sklearn.model_selection import train_test_split

# Reload the customer table so this section does not rely on variables from earlier cells.
# NOTE: pickle.load can execute arbitrary code -- only open files from a trusted source.
with open("tx_class.pkl", mode="rb") as pkl_file:
    tx_class = pickle.load(pkl_file)

# Rebuild features (X) and label (y) exactly as in the training section:
# drop the label and m6_Revenue from the feature set.
X = tx_class.drop(columns=['LTVCluster', 'm6_Revenue'])
y = tx_class['LTVCluster']

# Same test_size and random_state as the training split, so the same rows
# end up in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=56, test_size=0.05
)

# Fetch the model logged under artifact path "ltv_xgb" of a specific MLflow run.
# NOTE(review): the run id is hard-coded and only exists in the tracking store
# it was created in -- replace it with the id of the run you want to reuse.
ltv_xgb = mlflow.xgboost.load_model("runs:/e54c8f43068f45358ad5ba63fe42b589/ltv_xgb")

# Score the restored model on both partitions and report the accuracies.
train_accuracy = ltv_xgb.score(X_train, y_train)
test_accuracy = ltv_xgb.score(X_test[X_train.columns], y_test)
print('Accuracy of XGB classifier on training set: {:.2f}'.format(train_accuracy))
print('Accuracy of XGB classifier on test set: {:.2f}'.format(test_accuracy))

Accuracy of XGB classifier on training set: 0.95
Accuracy of XGB classifier on test set: 0.86
