# Predictions for the 4th Quarter of 2022 - New customers

First, train the XGBoost model on all the data from the 1st Quarter of 2019 until the 3rd Quarter of 2022 and make predictions for the 4th Quarter of 2022.

Next, train a separate model for each cluster on data from the 1st Quarter of 2019 until the 3rd Quarter of 2022, and get predictions for new customers in the 4th Quarter of 2022. This is done twice: first with the cluster assignments generated by the XGBoost model, and second with the cluster assignments generated by XGBoost with SMOTE+Tomek resampling.

In [None]:
# Import necessary libraries
import pickle
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV

## XGBoost without clustering

In [None]:
# Load the pickled training data and split it into features and target
with open("data_new.pkl", "rb") as fh:
    data_new = pickle.load(fh)

# Every column except the target "Total revenue" is used as a feature
X = data_new.drop(columns="Total revenue")
y = data_new["Total revenue"]

In [None]:
# Single XGBoost regressor for all customers (no clustering).
# The hyperparameters are hard-coded; presumably they come from an earlier
# search that is not part of this notebook - confirm before changing them.
model_xgb = XGBRegressor(
    booster='gbtree',
    objective='reg:squarederror',
    n_estimators=70,
    max_depth=6,
    eta=0.3,
    gamma=0.2,
    min_child_weight=3,
    subsample=1,
    colsample_bytree=0.8,
)

In [None]:
# Train model
# Fits the non-clustered regressor on the full training set: X holds the
# feature columns of data_new and y its "Total revenue" column.
model_xgb.fit(X, y)

In [None]:
# Load the pickled data for new customers in the 4th Quarter of 2022
with open("data_new_test.pkl", "rb") as fh:
    test_data = pickle.load(fh)

In [None]:
# Fix Quarters variables: mark every test observation as 4th Quarter.
# Presumably these are the quarter indicator columns the training data also
# contains - confirm against the training features.
test_data["Quarter_2"] = 0
test_data["Quarter_4"] = 1
# Remove observations with missing values in Default probability.
# `notna()` replaces the unary minus on a boolean mask, which only works
# because pandas special-cases it (NumPy rejects `-` on boolean arrays).
# `.copy()` makes the filtered frame independent of the loaded one, so later
# column assignments do not trigger SettingWithCopyWarning.
test_data = test_data.loc[test_data["Default probability"].notna()].copy()

In [None]:
# Split the test data into features and the "Total revenue" target
test_X = test_data.drop(columns=["Total revenue"])
test_y = test_data["Total revenue"]

In [None]:
# Get predictions
# Uses the non-clustered model to predict on the test features; `pred` is
# compared against test_y in the metrics cell.
pred = model_xgb.predict(test_X)

In [None]:
# Calculate MAE, MSE and RMSE for the non-clustered model
mae = mean_absolute_error(test_y, pred)
mse = mean_squared_error(test_y, pred)
# RMSE is the square root of the MSE. The `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so derive it from `mse` instead.
rmse = np.sqrt(mse)

## Clusters + XGBoost

In [None]:
# Reload the pickled data for new customers in the 4th Quarter of 2022,
# replacing the `test_data` frame used in the previous section
with open("data_new_test.pkl", "rb") as fh:
    test_data = pickle.load(fh)

In [None]:
# Fix Quarters variables: mark every test observation as 4th Quarter.
# Presumably these are the quarter indicator columns the training data also
# contains - confirm against the training features.
test_data["Quarter_2"] = 0
test_data["Quarter_4"] = 1
# Remove observations with missing values in Default probability.
# `notna()` replaces the unary minus on a boolean mask, which only works
# because pandas special-cases it (NumPy rejects `-` on boolean arrays).
# `.copy()` makes the filtered frame independent of the loaded one, so adding
# a "Cluster" column to it later does not trigger SettingWithCopyWarning.
test_data = test_data.loc[test_data["Default probability"].notna()].copy()

In [None]:
# Load the cluster assignments for the test customers
with open("data_new_test_clusters.pkl", "rb") as fh:
    test_clusters = pickle.load(fh)
# Attach the assignments to the test data as a new "Cluster" column.
# NOTE(review): assumes test_clusters lines up row-for-row with the filtered
# test_data (rows with missing Default probability removed) - confirm.
test_data["Cluster"] = test_clusters

In [None]:
# Parameters for random search.
# Each key lists the candidate values to sample from; the single-element
# lists keep `objective` and `booster` fixed for every candidate.
params = {
    'n_estimators': [70, 100, 200],
    'min_child_weight': [1, 2, 3],
    'gamma': [i/10.0 for i in range(0, 3)],              # 0.0, 0.1, 0.2
    'subsample': [i/10.0 for i in range(6, 11)],         # 0.6 ... 1.0
    'colsample_bytree': [i/10.0 for i in range(6, 11)],  # 0.6 ... 1.0
    'max_depth': [2, 3, 4, 5, 6, 7],
    'objective': ['reg:squarederror'],
    'booster': ['gbtree'],
    'eta': [i/10.0 for i in range(3, 6)],                # 0.3, 0.4, 0.5
}

# Define random search.
# `n_jobs` is the scikit-learn wrapper's parameter for the number of threads;
# it replaces the legacy `nthread` spelling.
reg = XGBRegressor(n_jobs=-1)
n_iter_search = 50
# 50 sampled candidates, each scored with 5-fold cross-validation on the
# negated mean absolute error.
# NOTE(review): no random_state is passed, so the sampled candidates (and
# therefore best_params_) can differ between runs.
random_search = RandomizedSearchCV(reg, param_distributions=params,
                                   n_iter=n_iter_search, cv=5, scoring='neg_mean_absolute_error')

### Cluster: Low

In [None]:
# Load the pickled training data for the low revenue cluster
with open("data_new_clusters_low.pkl", "rb") as fh:
    train_low = pickle.load(fh)

# Features: everything except the target and the cluster label
train_X_low = train_low.drop(columns=["Total revenue", "Cluster"])
train_y_low = train_low["Total revenue"]

In [None]:
# Conduct a random search for low revenue cluster
# Refits the shared `random_search` object on the low-cluster training data
# and prints the best parameter combination it found.
random_search.fit(train_X_low, train_y_low)
print(random_search.best_params_)

In [None]:
# XGBoost regressor for the low revenue cluster.
# The hyperparameters are hard-coded; presumably they were copied from the
# random-search output for this cluster - confirm before changing them.
model_xgb_low = XGBRegressor(
    booster='gbtree',
    objective='reg:squarederror',
    n_estimators=200,
    max_depth=5,
    eta=0.3,
    gamma=0.0,
    min_child_weight=3,
    subsample=0.8,
    colsample_bytree=1.0,
)

In [None]:
# Train model for low revenue cluster
# Fits on the low-cluster training features and their "Total revenue" target.
model_xgb_low.fit(train_X_low, train_y_low)

In [None]:
# Select the test customers assigned to the low revenue cluster (label 0)
in_low = test_data["Cluster"] == 0
# Target and features for that subset; the cluster label is not a feature
test_y_low = test_data.loc[in_low, "Total revenue"]
test_X_low = test_data.loc[in_low].drop(columns=["Total revenue", "Cluster"])

In [None]:
# Get predictions for low revenue cluster
# Predicts with the low-cluster model on the test rows labelled as cluster 0.
pred_low = model_xgb_low.predict(test_X_low)

### Cluster: Medium

In [None]:
# Load the pickled training data for the medium revenue cluster
with open("data_new_clusters_mid.pkl", "rb") as fh:
    train_medium = pickle.load(fh)

# Features: everything except the target and the cluster label
train_X_medium = train_medium.drop(columns=["Total revenue", "Cluster"])
train_y_medium = train_medium["Total revenue"]

In [None]:
# Conduct a random search for medium revenue cluster
# Refits the shared `random_search` object on the medium-cluster training
# data and prints the best parameter combination it found.
random_search.fit(train_X_medium, train_y_medium)
print(random_search.best_params_)

In [None]:
# XGBoost regressor for the medium revenue cluster.
# The hyperparameters are hard-coded; presumably they were copied from the
# random-search output for this cluster - confirm before changing them.
model_xgb_medium = XGBRegressor(
    booster='gbtree',
    objective='reg:squarederror',
    n_estimators=100,
    max_depth=3,
    eta=0.3,
    gamma=0,
    min_child_weight=3,
    subsample=1,
    colsample_bytree=1,
)

In [None]:
# Train model for medium revenue cluster
# Fits on the medium-cluster training features and their "Total revenue" target.
model_xgb_medium.fit(train_X_medium, train_y_medium)

In [None]:
# Select the test customers assigned to the medium revenue cluster (label 1)
in_medium = test_data["Cluster"] == 1
# Target and features for that subset; the cluster label is not a feature
test_y_medium = test_data.loc[in_medium, "Total revenue"]
test_X_medium = test_data.loc[in_medium].drop(columns=["Total revenue", "Cluster"])

In [None]:
# Get predictions for medium revenue cluster
# Predicts with the medium-cluster model on the test rows labelled as cluster 1.
pred_medium = model_xgb_medium.predict(test_X_medium)

### Cluster: High

In [None]:
# Load the pickled training data for the high revenue cluster
with open("data_new_clusters_high.pkl", "rb") as fh:
    train_high = pickle.load(fh)

# Features: everything except the target and the cluster label
train_X_high = train_high.drop(columns=["Total revenue", "Cluster"])
train_y_high = train_high["Total revenue"]

In [None]:
# Conduct a random search for high revenue cluster
# Refits the shared `random_search` object on the high-cluster training data
# and prints the best parameter combination it found.
random_search.fit(train_X_high, train_y_high)
print(random_search.best_params_)

In [None]:
# XGBoost regressor for the high revenue cluster.
# The hyperparameters are hard-coded; presumably they were copied from the
# random-search output for this cluster - confirm before changing them.
model_xgb_high = XGBRegressor(
    booster='gbtree',
    objective='reg:squarederror',
    n_estimators=70,
    max_depth=2,
    eta=0.3,
    gamma=0.1,
    min_child_weight=3,
    subsample=0.9,
    colsample_bytree=0.8,
)

In [None]:
# Train model for high revenue cluster
# Fits on the high-cluster training features and their "Total revenue" target.
model_xgb_high.fit(train_X_high, train_y_high)

In [None]:
# Select the test customers assigned to the high revenue cluster (label 2)
in_high = test_data["Cluster"] == 2
# Target and features for that subset; the cluster label is not a feature
test_y_high = test_data.loc[in_high, "Total revenue"]
test_X_high = test_data.loc[in_high].drop(columns=["Total revenue", "Cluster"])

In [None]:
# Get predictions for high revenue cluster
# Predicts with the high-cluster model on the test rows labelled as cluster 2.
pred_high = model_xgb_high.predict(test_X_high)

### Final evaluation

In [None]:
# Stack the per-cluster results in the order low, medium, high.
# Predictions and true values must use the same order so that they stay
# aligned element by element.
test_y = np.concatenate([test_y_low, test_y_medium, test_y_high])
pred = np.concatenate([pred_low, pred_medium, pred_high])

In [None]:
# Calculate MAE, MSE, RMSE over the combined per-cluster predictions
mae = mean_absolute_error(test_y, pred)
mse = mean_squared_error(test_y, pred)
# RMSE is the square root of the MSE. The `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so derive it from `mse` instead.
rmse = np.sqrt(mse)

## Clusters with SMOTE+Tomek resampling + XGBoost

In [None]:
# Load the cluster assignments produced with SMOTE+Tomek resampling
with open("data_new_test_clusters_XGBST.pkl", "rb") as fh:
    test_clusters = pickle.load(fh)
# Overwrite the "Cluster" column of the test data with these assignments.
# NOTE(review): assumes test_clusters lines up row-for-row with the filtered
# test_data from the previous section - confirm.
test_data["Cluster"] = test_clusters

### Cluster: Low

In [None]:
# Select the test customers assigned to the low revenue cluster (label 0)
in_low = test_data["Cluster"] == 0
# Target and features for that subset; the cluster label is not a feature
test_y_low = test_data.loc[in_low, "Total revenue"]
test_X_low = test_data.loc[in_low].drop(columns=["Total revenue", "Cluster"])

In [None]:
# Get predictions for low revenue cluster
# Reuses the low-cluster model trained in the previous section; only the
# test rows selected as cluster 0 differ.
pred_low = model_xgb_low.predict(test_X_low)

### Cluster: Medium

In [None]:
# Select the test customers assigned to the medium revenue cluster (label 1)
in_medium = test_data["Cluster"] == 1
# Target and features for that subset; the cluster label is not a feature
test_y_medium = test_data.loc[in_medium, "Total revenue"]
test_X_medium = test_data.loc[in_medium].drop(columns=["Total revenue", "Cluster"])

In [None]:
# Get predictions for medium revenue cluster
# Reuses the medium-cluster model trained in the previous section; only the
# test rows selected as cluster 1 differ.
pred_medium = model_xgb_medium.predict(test_X_medium)

### Cluster: High

In [None]:
# Select the test customers assigned to the high revenue cluster (label 2)
in_high = test_data["Cluster"] == 2
# Target and features for that subset; the cluster label is not a feature
test_y_high = test_data.loc[in_high, "Total revenue"]
test_X_high = test_data.loc[in_high].drop(columns=["Total revenue", "Cluster"])

In [None]:
# Get predictions for high revenue cluster
# Reuses the high-cluster model trained in the previous section; only the
# test rows selected as cluster 2 differ.
pred_high = model_xgb_high.predict(test_X_high)

### Final evaluation

In [None]:
# Stack the per-cluster results in the order low, medium, high.
# Predictions and true values must use the same order so that they stay
# aligned element by element.
test_y = np.concatenate([test_y_low, test_y_medium, test_y_high])
pred = np.concatenate([pred_low, pred_medium, pred_high])

In [None]:
# Calculate MAE, MSE, RMSE over the combined per-cluster predictions
mae = mean_absolute_error(test_y, pred)
mse = mean_squared_error(test_y, pred)
# RMSE is the square root of the MSE. The `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so derive it from `mse` instead.
rmse = np.sqrt(mse)