<H1> CLTV Prediction with MLflow</H1>

We are going to build simple machine learning models that predict our customers' lifetime value (CLTV) and compare their performance, this time tracking the experiments with MLflow.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.metrics import confusion_matrix,classification_report,precision_recall_fscore_support,accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split

import mlflow

<a name="1"><h1>1. Feature Engineering</h1></a>

In [None]:
# Load the object stored in the local pickle file; it is used as a DataFrame below.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
import pickle

with open("tx_cluster.pkl", mode="rb") as pkl_file:
    tx_cluster = pickle.load(pkl_file)

# Preview the first rows
tx_cluster.head()

In [None]:
# Convert categorical columns into numeric indicator (dummy) columns so every feature is numeric.
tx_class = pd.get_dummies(tx_cluster)  # per the original note, the only categorical variable is the segment
tx_class.head()  # preview the encoded frame

In [None]:
# Summary statistics of the encoded columns, as a quick sanity check before modelling.
tx_class.describe()

In [None]:
# Pairwise correlations between all encoded columns
corr_matrix = tx_class.corr()

# Show how each column correlates with the label, strongest positive correlation first
ltv_corr = corr_matrix['LTVCluster']
ltv_corr.sort_values(ascending=False)

In [None]:
# Label: the LTV cluster of each customer
y = tx_class['LTVCluster']

# Features: every column except the label and m6_Revenue
# (presumably the revenue the clusters were derived from, which would leak the label -- TODO confirm)
X = tx_class.drop(columns=['LTVCluster', 'm6_Revenue'])

# Hold out 5% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=42,
)

## Models

Our LTV clusters come in three types (high, mid and low LTV), so we will perform multi-class classification.

In [None]:
# Group the runs of this section under one MLflow experiment (the name refers to the 0.05 test split above).
mlflow.set_experiment("CLTV_testsize005")
# Turn off scikit-learn autologging: parameters, metrics and models are logged explicitly in the cells below.
mlflow.sklearn.autolog(disable=True)

### 1. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

basemodelname = "Logit_test"
with mlflow.start_run(run_name=basemodelname):
    # Hyperparameters of this run: logged to MLflow and encoded in the model name
    params = dict(penalty=None, class_weight='balanced')
    param_suffix = '_'.join(
        f"{name.replace('_', '')}{str(value).replace('.', '')}"
        for name, value in params.items()
    )
    modelname = f"{basemodelname}_{param_suffix}"
    mlflow.set_tag("model_name", modelname)
    mlflow.log_params(params)

    # Fit the classifier on the training split
    ltv_logreg = LogisticRegression(max_iter=1000, **params)
    ltv_logreg.fit(X_train, y_train)

    # Predict the held-out split once; test accuracy and F1 are both derived from it
    y_pred = ltv_logreg.predict(X_test)
    acc_train = ltv_logreg.score(X_train, y_train)
    acc_test = accuracy_score(y_test, y_pred)

    print(f"Modelname: {modelname}")
    print('Accuracy of Logit classifier on training set: {:.2f}'.format(acc_train))
    print('Accuracy of Logit classifier on test set: {:.2f}'.format(acc_test))

    # Index 2 of the result holds the per-class F1 scores; their plain mean is the macro average
    f1_per_class = precision_recall_fscore_support(y_test, y_pred)[2]

    # log the skill metrics
    mlflow.log_metrics({
        'train_acc': acc_train,
        'test_acc': acc_test,
        'test_macroavg_f1': np.mean(f1_per_class),
    })

    # log the model as an artifact to enable later use
    mlflow.sklearn.log_model(ltv_logreg, "ltv_logreg")

### 2. XGBoost: testing

In [None]:
import xgboost as xgb

basemodelname = "xgboost_test"
with mlflow.start_run(run_name=basemodelname):
    # Hyperparameters of this run: logged to MLflow and encoded in the model name
    params = dict(max_depth=4, learning_rate=0.05)
    param_suffix = '_'.join(
        f"{name.replace('_', '')}{str(value).replace('.', '')}"
        for name, value in params.items()
    )
    modelname = f"{basemodelname}_{param_suffix}"
    mlflow.set_tag("model_name", modelname)
    mlflow.log_params(params)

    # Fit the classifier on the training split, using all available cores
    ltv_xgb = xgb.XGBClassifier(n_jobs=-1, **params)
    ltv_xgb.fit(X_train, y_train)

    # Predict the held-out split once; test accuracy and F1 are both derived from it
    y_pred = ltv_xgb.predict(X_test)
    acc_train = ltv_xgb.score(X_train, y_train)
    acc_test = accuracy_score(y_test, y_pred)

    print(f"Modelname: {modelname}")
    print('Accuracy of XGB classifier on training set: {:.2f}'.format(acc_train))
    print('Accuracy of XGB classifier on test set: {:.2f}'.format(acc_test))

    # Index 2 of the result holds the per-class F1 scores; their plain mean is the macro average
    f1_per_class = precision_recall_fscore_support(y_test, y_pred)[2]

    # log the skill metrics
    mlflow.log_metrics({
        'train_acc': acc_train,
        'test_acc': acc_test,
        'test_macroavg_f1': np.mean(f1_per_class),
    })

    # log the model as an artifact to enable later use
    mlflow.xgboost.log_model(ltv_xgb, "ltv_xgb")

### 2b. XGBoost: parameter optimization

In [None]:
import itertools

basemodelname = "xgboost_paropt"
# Parent run that groups one nested child run per hyperparameter combination.
with mlflow.start_run(run_name=basemodelname):
    # Grid to search. Every key is passed to XGBClassifier as a keyword argument,
    # so further hyperparameters can be added here without touching the loop.
    params_list = {
        "max_depth": [4, 5],
        "learning_rate": [0.05, 0.1]}

    # itertools.product varies the last key fastest, i.e. the same visiting order
    # as nested loops with the first key (max_depth) outermost.
    for combo in itertools.product(*params_list.values()):
        params = dict(zip(params_list.keys(), combo))
        parsuf = '_'.join([key.replace('_','')+str(val).replace('.','') for key,val in params.items()])

        with mlflow.start_run(run_name=parsuf, nested=True):
            modelname = f"{basemodelname}_{parsuf}"
            mlflow.set_tag("model_name", modelname)
            mlflow.log_params(params)

            # NOTE: ltv_xgb is overwritten on every iteration; after the loop it refers to
            # the model of the last combination, not necessarily the best one.
            ltv_xgb = xgb.XGBClassifier(n_jobs=-1, **params).fit(X_train, y_train)

            # Predict the held-out split once; test accuracy and F1 are both derived from it
            y_pred = ltv_xgb.predict(X_test)
            acc_train = ltv_xgb.score(X_train, y_train)
            acc_test = accuracy_score(y_test, y_pred)

            print(f"Modelname: {modelname}")
            print('Accuracy of XGB classifier on training set: {:.2f}'.format(acc_train))
            print('Accuracy of XGB classifier on test set: {:.2f}'.format(acc_test))

            # Index 2 of the result holds the per-class F1 scores; their plain mean is the macro average
            test_prf1s = precision_recall_fscore_support(y_test, y_pred)

            # log the skill metrics
            mlflow.log_metric('train_acc', acc_train)
            mlflow.log_metric('test_acc', acc_test)
            mlflow.log_metric('test_macroavg_f1', np.mean(test_prf1s[2]))

            # log the model as an artifact to enable later use
            mlflow.xgboost.log_model(ltv_xgb, "ltv_xgb")

## Test split: 0.2

In [None]:
# Re-split the same X and y with a larger (20%) test set.
# NOTE: this overwrites X_train / X_test / y_train / y_test from the 0.05 split above,
# so models fitted earlier must not be evaluated against these new variables.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Track the runs of this section in their own experiment, again without scikit-learn autologging.
mlflow.set_experiment("CLTV_testsize02")
mlflow.sklearn.autolog(disable=True)

In [None]:
from sklearn.linear_model import LogisticRegression

basemodelname = "TestSplit02_Logit_test"
with mlflow.start_run(run_name=basemodelname):
    # Hyperparameters of this run: logged to MLflow and encoded in the model name
    params = {
        "penalty": None,
        "class_weight": 'balanced'}
    parsuf = '_'.join([key.replace('_','')+str(val).replace('.','') for key,val in params.items()])
    modelname=f"{basemodelname}_{parsuf}"
    mlflow.set_tag("model_name", modelname)
    mlflow.log_params(params)

    # Fit on the training part of the 0.2 split
    ltv_logit = LogisticRegression(
        penalty=params['penalty'],
        class_weight=params['class_weight'],
        max_iter=1000
    ).fit(X_train, y_train)

    # Predict with the model fitted in THIS cell. The cell previously predicted with
    # `ltv_logreg`, the model from the 0.05-split section, so the logged F1 described a
    # different model that had been trained on most of this test set.
    # Predicting once and deriving both test metrics from y_pred keeps them consistent.
    y_pred = ltv_logit.predict(X_test)

    acc_train = ltv_logit.score(X_train, y_train)
    acc_test = accuracy_score(y_test, y_pred)

    print(f"Modelname: {modelname}")
    print('Accuracy of Logit classifier on training set: {:.2f}'.format(acc_train))
    print('Accuracy of Logit classifier on test set: {:.2f}'.format(acc_test))

    # Index 2 of the result holds the per-class F1 scores; their plain mean is the macro average
    test_prf1s = precision_recall_fscore_support(y_test, y_pred)

    # log the skill metrics
    mlflow.log_metric('train_acc', acc_train)
    mlflow.log_metric('test_acc', acc_test)
    mlflow.log_metric('test_macroavg_f1', np.mean(test_prf1s[2]))

    # log the model as an artifact to enable later use
    mlflow.sklearn.log_model(ltv_logit, "ltv_logit")

## Use a model that we saved earlier

In [None]:
# Example of re-using a model logged in an earlier run. The cell is disabled on purpose
# (`if False:`); change the condition to run it. It repeats its own imports and data
# preparation so that it can also be executed in a fresh kernel.
if False:
    import mlflow
    import pandas as pd
    import pickle
    import xgboost
    from sklearn.model_selection import train_test_split

    # NOTE(review): this reads "tx_class.pkl", while the data-loading cell at the top of the
    # notebook reads "tx_cluster.pkl" -- confirm which file is intended.
    # Only unpickle files from a trusted source.
    with open("tx_class.pkl", "rb") as f:
        tx_cluster = pickle.load(f)

    # convert categorical columns to numerical
    tx_class = pd.get_dummies(tx_cluster)  # per the original note, the only categorical variable is the segment
    tx_class.head()  # not the last expression of the cell, so this is not expected to display anything

    # create X and y: X is the feature set and y is the label (LTV cluster)
    X = tx_class.drop(['LTVCluster','m6_Revenue'],axis=1)
    y = tx_class['LTVCluster']

    # split training and test sets with the same size and seed as the 0.05 split used earlier
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

    # Load the XGBoost model logged under artifact path "ltv_xgb" of a specific run.
    # The run id is hard-coded and only exists in the tracking store that produced it;
    # replace it with a run id from your own MLflow store.
    ltv_xgb = mlflow.xgboost.load_model("runs:/f9f153aa679d4a8697cf1b23a0d479ac/ltv_xgb")

    # Accuracy of the reloaded model on both splits
    acc_train = ltv_xgb.score(X_train, y_train)
    acc_test = ltv_xgb.score(X_test[X_train.columns], y_test)

    print('Accuracy of XGB classifier on training set: {:.2f}'.format(acc_train))
    print('Accuracy of XGB classifier on test set: {:.2f}'.format(acc_test))