# Predictions for the 4th Quarter of 2022 - Recurring customers

Compare the revenue predictions obtained with and without clustering of customers. Without clustering, a single XGBoost model is trained on all data from the 1st Quarter of 2019 through the 3rd Quarter of 2022 and used to generate predictions for the 4th Quarter of 2022. With clustering, a separate XGBoost model is trained for each cluster on data from the same period (1st Quarter of 2019 through 3rd Quarter of 2022), and each model generates the 4th Quarter of 2022 predictions for the customers assigned to its cluster.

In [None]:
# Import libraries
import pandas as pd
import pickle
import numpy as np
import itertools
import math
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import tensorflow as tf
import random
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
from sklearn.preprocessing import StandardScaler, RobustScaler
from tensorflow.keras.optimizers import Adam
from keras_tuner import RandomSearch
from keras_tuner.engine.hyperparameters import HyperParameters
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score, make_scorer, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score
from sklearn.model_selection import cross_val_score, RepeatedKFold, RandomizedSearchCV
from xgboost import XGBRegressor, XGBClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.multitest import multipletests

## Without clustering

In [None]:
# Load the pickled data set on which the model without clustering is fitted
with open('Data final//data_recurring_imputed.pkl', mode='rb') as pickle_in:
    data = pickle.load(pickle_in)

In [None]:
# Target: total revenue; features: every remaining column
y = data["Total_revenue"]
X = data.drop(columns=["Total_revenue"])

In [None]:
# XGBoost regressor with fixed hyperparameters, used for all customers at once
xgb = XGBRegressor(
    booster='gbtree',
    objective='reg:squarederror',
    n_estimators=200,
    max_depth=6,
    min_child_weight=2,
    gamma=0.1,
    eta=0.3,
    subsample=0.9,
    colsample_bytree=0.8,
)

# Train on the complete feature matrix and target
xgb.fit(X, y)

In [None]:
# Load the pickled test set
with open('data_recurring_test.pkl', mode='rb') as pickle_in:
    data_recurring_test = pickle.load(pickle_in)

In [None]:
# Rebuild the test set as a DataFrame labelled with the column names below.
# The values are taken positionally from the loaded object, so this relabels
# the columns; it does not reorder them.
# NOTE(review): assumes the pickled columns are already in this order - confirm.
column_names_recurring = [
    "Number of accounts", "Age", "Longevity", "Insurance", "Total revenue",
    "Loan extensions", "Co-applicant", "Invoice accounts", "Buy-now-pay-later",
    "Credit cards A", "Credit cards B", "Credit cards C", "Consumer loans",
    "Default probability", "Minimum limit", "Maximum limit", "Minimum balance",
    "Maximum balance", "Late payment", "Number of transactions", "Exchange rate",
    "GDP growth", "Inflation", "Unemployment rate", "Consumer confidence index",
    "Consumption of durables", "Interest rate", "Gender",
    "Quarter_2", "Quarter_3", "Quarter_4",
]
# np.vstack is the supported spelling of the deprecated np.row_stack alias
data_recurring_test = pd.DataFrame(
    np.vstack(data_recurring_test.to_numpy()),
    columns=column_names_recurring,
)
# Overwrite the quarter dummies so every row is flagged as a 4th-quarter row
# NOTE(review): Quarter_2 is left as loaded - confirm it is 0 for these rows.
data_recurring_test["Quarter_3"] = 0
data_recurring_test["Quarter_4"] = 1
# Remove rows where default probability is missing
# (.notna() instead of negating the .isna() mask with unary minus)
data_recurring_test = data_recurring_test.loc[data_recurring_test["Default probability"].notna()]

In [None]:
# Target: total revenue of the test rows; features: every remaining column
y_test = data_recurring_test["Total revenue"]
X_test = data_recurring_test.drop(columns=["Total revenue"])

In [None]:
# Generate revenue predictions for the test features with the fitted model
# NOTE(review): the training target column is "Total_revenue" while the test
# frame uses space-separated labels - confirm that the feature names of X and
# X_test match, since the model is fitted on one and applied to the other.
pred = xgb.predict(X_test)

In [None]:
# Mean absolute error of the predictions without clustering
mean_absolute_error(y_true=y_test, y_pred=pred)

In [None]:
# Mean squared error of the predictions without clustering
mean_squared_error(y_true=y_test, y_pred=pred)

In [None]:
# Root mean squared error of the predictions without clustering.
# Computed as the square root of the MSE because the `squared=False` argument
# of mean_squared_error is deprecated and removed in recent scikit-learn.
np.sqrt(mean_squared_error(y_test, pred))

## With clustering

In [None]:
# Reload the pickled test set for the clustering-based workflow
with open("data_recurring_test.pkl", mode="rb") as pickle_in:
    data_recurring_test = pickle.load(pickle_in)

In [None]:
# Rebuild the test set as a DataFrame labelled with the column names below.
# The values are taken positionally from the loaded object, so this relabels
# the columns; it does not reorder them.
# NOTE(review): assumes the pickled columns are already in this order - confirm.
column_names_recurring = [
    "Number of accounts", "Age", "Longevity", "Insurance", "Total revenue",
    "Loan extensions", "Co-applicant", "Invoice accounts", "Buy-now-pay-later",
    "Credit cards A", "Credit cards B", "Credit cards C", "Consumer loans",
    "Default probability", "Minimum limit", "Maximum limit", "Minimum balance",
    "Maximum balance", "Late payment", "Number of transactions", "Exchange rate",
    "GDP growth", "Inflation", "Unemployment rate", "Consumer confidence index",
    "Consumption of durables", "Interest rate", "Gender",
    "Quarter_2", "Quarter_3", "Quarter_4",
]
# np.vstack is the supported spelling of the deprecated np.row_stack alias
data_recurring_test = pd.DataFrame(
    np.vstack(data_recurring_test.to_numpy()),
    columns=column_names_recurring,
)
# Overwrite the quarter dummies so every row is flagged as a 4th-quarter row
# NOTE(review): Quarter_2 is left as loaded - confirm it is 0 for these rows.
data_recurring_test["Quarter_3"] = 0
data_recurring_test["Quarter_4"] = 1
# Remove rows where default probability is missing
# (.notna() instead of negating the .isna() mask with unary minus)
data_recurring_test = data_recurring_test.loc[data_recurring_test["Default probability"].notna()]

In [None]:
# Load the cluster labels predicted for the test customers
with open("data_recurring_test_clusters.pkl", mode="rb") as pickle_in:
    data_recurring_test_clusters = pickle.load(pickle_in)

# Attach the labels to the test set as a new column
# NOTE(review): assumes the labels line up with the rows that remain after the
# missing-default-probability filter - confirm length/index of the pickle.
data_recurring_test["Predicted_cluster"] = data_recurring_test_clusters

In [None]:
# Candidate values sampled by the randomized hyperparameter search
params = {
    'n_estimators': [70, 100, 200],
    'min_child_weight': [1, 2, 3],
    'gamma': [0.0, 0.1, 0.2],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'max_depth': [2, 3, 4, 5, 6, 7],
    'objective': ['reg:squarederror'],
    'booster': ['gbtree'],
    'eta': [0.3, 0.4, 0.5],
}

# Base estimator; n_jobs is the scikit-learn-style replacement for the
# deprecated nthread argument of the XGBoost wrapper
reg = XGBRegressor(n_jobs=-1)

# Number of parameter combinations drawn per search
n_iter_search = 50

# 5-fold cross-validated search scored by (negated) mean absolute error.
# The same search object is re-fitted for each revenue cluster below.
# NOTE(review): no random_state is passed, so the sampled candidates can
# differ from run to run.
random_search = RandomizedSearchCV(
    reg,
    param_distributions=params,
    n_iter=n_iter_search,
    cv=5,
    scoring='neg_mean_absolute_error',
)

### Low revenue

In [None]:
# Load the data of the low revenue cluster (used below to fit its model)
with open('data_recurring_clusters_low.pkl', mode='rb') as pickle_in:
    data_low = pickle.load(pickle_in)

In [None]:
# Remove the column of cluster labels
data_low = data_low.drop(columns=["Cluster"])
# Relabel the remaining columns positionally with the names used by the test
# set. This list is also reused by the middle and high revenue sections.
# NOTE(review): assumes the remaining columns are already in this order - confirm.
column_names_recurring = [
    "Number of accounts", "Age", "Longevity", "Insurance", "Total revenue",
    "Loan extensions", "Co-applicant", "Invoice accounts", "Buy-now-pay-later",
    "Credit cards A", "Credit cards B", "Credit cards C", "Consumer loans",
    "Default probability", "Minimum limit", "Maximum limit", "Minimum balance",
    "Maximum balance", "Late payment", "Number of transactions", "Exchange rate",
    "GDP growth", "Inflation", "Unemployment rate", "Consumer confidence index",
    "Consumption of durables", "Interest rate", "Gender",
    "Quarter_2", "Quarter_3", "Quarter_4",
]
# np.vstack is the supported spelling of the deprecated np.row_stack alias
data_low = pd.DataFrame(np.vstack(data_low.to_numpy()), columns=column_names_recurring)

In [None]:
# Target and features of the low revenue cluster
y = data_low["Total revenue"]
X = data_low.drop(columns=["Total revenue"])

In [None]:
# Run the randomized search on the low revenue cluster and report the winner
print(random_search.fit(X, y).best_params_)

In [None]:
# XGBoost regressor for the low revenue cluster with fixed hyperparameters
# (presumably those reported by the random search above - confirm)
xgb = XGBRegressor(
    booster='gbtree',
    objective='reg:squarederror',
    n_estimators=200,
    max_depth=7,
    min_child_weight=1,
    gamma=0,
    eta=0.3,
    subsample=0.9,
    colsample_bytree=1,
)

# Train on the low revenue cluster
xgb.fit(X, y)

In [None]:
# Keep the test customers whose predicted cluster label is 0 (low revenue)
data_recurring_test_low = data_recurring_test[data_recurring_test["Predicted_cluster"].eq(0)]

In [None]:
# Actual revenues, and features without the cluster label and the target
data_recurring_test_low_y = data_recurring_test_low["Total revenue"]
data_recurring_test_low_X = data_recurring_test_low.drop(columns=["Predicted_cluster", "Total revenue"])

In [None]:
# Predict revenue for the low revenue test customers with the model fitted
# on the low revenue cluster
pred = xgb.predict(data_recurring_test_low_X)

In [None]:
# Persist the low revenue predictions. The later sections append their own
# predictions to this same file, hence "all_classes" in its name.
with open('data_recurring_clusters_test_predictions_XGB_all_classes.pkl', mode='wb') as pickle_out:
    pickle.dump(pred, pickle_out)
# Persist the matching actual revenues in the companion file
with open('data_recurring_clusters_test_predictions_XGB_y_all_classes.pkl', mode='wb') as pickle_out:
    pickle.dump(data_recurring_test_low_y, pickle_out)

### Middle revenue

In [None]:
# Load the data of the middle revenue cluster (used below to fit its model)
with open('data_recurring_clusters_mid.pkl', mode='rb') as pickle_in:
    data_mid = pickle.load(pickle_in)

In [None]:
# Remove the column of cluster labels
data_mid = data_mid.drop(columns=["Cluster"])
# Relabel the remaining columns positionally with column_names_recurring
# (defined in an earlier cell); np.vstack is the supported spelling of the
# deprecated np.row_stack alias
data_mid = pd.DataFrame(np.vstack(data_mid.to_numpy()), columns=column_names_recurring)

In [None]:
# Target and features of the middle revenue cluster
y = data_mid["Total revenue"]
X = data_mid.drop(columns=["Total revenue"])

In [None]:
# Run the randomized search on the middle revenue cluster and report the winner
print(random_search.fit(X, y).best_params_)

In [None]:
# XGBoost regressor for the middle revenue cluster with fixed hyperparameters
# (presumably those reported by the random search above - confirm)
xgb = XGBRegressor(
    booster='gbtree',
    objective='reg:squarederror',
    n_estimators=200,
    max_depth=7,
    min_child_weight=3,
    gamma=0.1,
    eta=0.3,
    subsample=0.9,
    colsample_bytree=0.7,
)

# Train on the middle revenue cluster
xgb.fit(X, y)

In [None]:
# Keep the test customers whose predicted cluster label is 1 (middle revenue)
data_recurring_test_mid = data_recurring_test[data_recurring_test["Predicted_cluster"].eq(1)]
# Actual revenues, and features without the cluster label and the target
data_recurring_test_mid_y = data_recurring_test_mid["Total revenue"]
data_recurring_test_mid_X = data_recurring_test_mid.drop(columns=["Predicted_cluster", "Total revenue"])
# Predict revenue with the model fitted on the middle revenue cluster
pred = xgb.predict(data_recurring_test_mid_X)

In [None]:
# Load the predictions saved so far (low revenue segment)
with open('data_recurring_clusters_test_predictions_XGB_all_classes.pkl', mode='rb') as pickle_in:
    pred_all_classes = pickle.load(pickle_in)

In [None]:
# Append the middle revenue predictions after the low revenue ones
pred_all_classes = np.concatenate((pred_all_classes, pred), axis=0)

In [None]:
# Write the combined low + middle predictions back to the shared file
with open('data_recurring_clusters_test_predictions_XGB_all_classes.pkl', mode='wb') as pickle_out:
    pickle.dump(pred_all_classes, pickle_out)

In [None]:
# Load the actual revenues saved so far (low revenue segment)
with open('data_recurring_clusters_test_predictions_XGB_y_all_classes.pkl', mode='rb') as pickle_in:
    y_all_classes = pickle.load(pickle_in)

In [None]:
# Append the middle revenue actuals in the same order as the predictions
y_all_classes = np.concatenate((y_all_classes, data_recurring_test_mid_y), axis=0)

In [None]:
# Write the combined low + middle actual revenues back to the shared file
with open('data_recurring_clusters_test_predictions_XGB_y_all_classes.pkl', mode='wb') as pickle_out:
    pickle.dump(y_all_classes, pickle_out)

### High revenue

In [None]:
# Load the data of the high revenue cluster (used below to fit its model)
with open('data_recurring_clusters_high.pkl', mode='rb') as pickle_in:
    data_high = pickle.load(pickle_in)

In [None]:
# Remove the column of cluster labels
data_high = data_high.drop(columns=["Cluster"])
# Relabel the remaining columns positionally with column_names_recurring
# (defined in an earlier cell); np.vstack is the supported spelling of the
# deprecated np.row_stack alias
data_high = pd.DataFrame(np.vstack(data_high.to_numpy()), columns=column_names_recurring)

In [None]:
# Target and features of the high revenue cluster
y = data_high["Total revenue"]
X = data_high.drop(columns=["Total revenue"])

In [None]:
# Run the randomized search on the high revenue cluster and report the winner
print(random_search.fit(X, y).best_params_)

In [None]:
# XGBoost regressor for the high revenue cluster with fixed hyperparameters
# (presumably those reported by the random search above - confirm)
xgb = XGBRegressor(
    booster='gbtree',
    objective='reg:squarederror',
    n_estimators=200,
    max_depth=7,
    min_child_weight=2,
    gamma=0.2,
    eta=0.3,
    subsample=1,
    colsample_bytree=0.8,
)

# Train on the high revenue cluster
xgb.fit(X, y)

In [None]:
# Keep the test customers whose predicted cluster label is 2 (high revenue)
data_recurring_test_high = data_recurring_test[data_recurring_test["Predicted_cluster"].eq(2)]
# Actual revenues, and features without the cluster label and the target
data_recurring_test_high_y = data_recurring_test_high["Total revenue"]
data_recurring_test_high_X = data_recurring_test_high.drop(columns=["Predicted_cluster", "Total revenue"])
# Predict revenue with the model fitted on the high revenue cluster
pred = xgb.predict(data_recurring_test_high_X)

In [None]:
# Load the predictions saved so far (low and middle revenue segments)
with open('data_recurring_clusters_test_predictions_XGB_all_classes.pkl', mode='rb') as pickle_in:
    pred_all_classes = pickle.load(pickle_in)

In [None]:
# Append the high revenue predictions after the low and middle ones
pred_all_classes = np.concatenate((pred_all_classes, pred), axis=0)

In [None]:
# Write the predictions of all three segments back to the shared file
with open('data_recurring_clusters_test_predictions_XGB_all_classes.pkl', mode='wb') as pickle_out:
    pickle.dump(pred_all_classes, pickle_out)

In [None]:
# Load the actual revenues saved so far (low and middle revenue segments)
with open('data_recurring_clusters_test_predictions_XGB_y_all_classes.pkl', mode='rb') as pickle_in:
    y_all_classes = pickle.load(pickle_in)

In [None]:
# Append the high revenue actuals in the same order as the predictions
y_all_classes = np.concatenate((y_all_classes, data_recurring_test_high_y), axis=0)

In [None]:
# Write the actual revenues of all three segments back to the shared file
with open('data_recurring_clusters_test_predictions_XGB_y_all_classes.pkl', mode='wb') as pickle_out:
    pickle.dump(y_all_classes, pickle_out)

## Evaluate models

In [None]:
# Load the predicted revenues accumulated over all clusters
with open('data_recurring_clusters_test_predictions_XGB_all_classes.pkl', mode='rb') as pickle_in:
    pred_all_classes = pickle.load(pickle_in)

In [None]:
# Load the actual revenues accumulated over all clusters
with open('data_recurring_clusters_test_predictions_XGB_y_all_classes.pkl', mode='rb') as pickle_in:
    y_all_classes = pickle.load(pickle_in)

In [None]:
# Mean absolute error of the cluster-wise predictions
mean_absolute_error(y_true=y_all_classes, y_pred=pred_all_classes)

In [None]:
# Mean squared error of the cluster-wise predictions
mean_squared_error(y_true=y_all_classes, y_pred=pred_all_classes)

In [None]:
# Root mean squared error of the cluster-wise predictions.
# Computed as the square root of the MSE because the `squared=False` argument
# of mean_squared_error is deprecated and removed in recent scikit-learn.
np.sqrt(mean_squared_error(y_all_classes, pred_all_classes))