# Cluster assignments

Get cluster assignments for customers in the 4th quarter of 2022. That is, predict which cluster each customer in the 4th quarter of 2022 belongs to.

In [None]:
# Import necessary libraries
import pickle 
import pandas as pd
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTENC
from xgboost import XGBRegressor, XGBClassifier

## Recurring customers

For recurring customers, only XGBoost was used to get cluster assignments.

### XGBoost

In [None]:
# Import training data (recurring customers, including their "Cluster" label).
# Forward slashes keep the relative path valid on Windows as well as on
# Linux/macOS; the previous backslash separator only resolved on Windows.
# NOTE: pickle.load can execute arbitrary code - only open trusted files.
with open('Data clustering/data_recurring_clusters.pkl', 'rb') as file:
    data_recurring_clusters = pickle.load(file)

In [None]:
# Split the training data: every column except the cluster label and the
# total revenue is a feature; the cluster label is the target
X = data_recurring_clusters.drop(columns = ["Total revenue", "Cluster"])
y = data_recurring_clusters["Cluster"]

In [None]:
# Best parameters for the recurring-customer classifier, collected in one
# place and then unpacked into the estimator
xgb_params = {
    "booster": 'gbtree',
    "objective": 'multi:softmax',
    "n_estimators": 200,
    "eta": 0.3,
    "max_depth": 6,
    "min_child_weight": 3,
    "gamma": 0.2,
    "subsample": 0.9,
    "colsample_bytree": 0.6,
}
model_xgb = XGBClassifier(**xgb_params)

In [None]:
# Train the classifier on the recurring-customer features and cluster labels
model_xgb.fit(X = X, y = y)

In [None]:
# Import data for predictions (recurring customers to be assigned a cluster).
# Forward slashes keep the relative path valid on Windows as well as on
# Linux/macOS; the previous backslash separator only resolved on Windows.
# NOTE: pickle.load can execute arbitrary code - only open trusted files.
with open('Data final/data_recurring_test.pkl', 'rb') as file:
    data_recurring_test = pickle.load(file)

In [None]:
# Fix Quarter variables: every row to be scored is treated as a 4th-quarter
# observation, so the Quarter_3 flag is switched off and Quarter_4 switched on
data_recurring_test["Quarter_3"] = 0
data_recurring_test["Quarter_4"] = 1
# Remove total revenue (we don't predict clusters based on this variable)
data_recurring_test = data_recurring_test.drop(columns = ["Total revenue"])
# Remove observations with missing values in Default probability.
# .notna() replaces the unary minus on a boolean mask: NumPy rejects "-" on
# booleans and pandas only tolerates it through a special case.
data_recurring_test = data_recurring_test.loc[data_recurring_test["Default probability"].notna()]

In [None]:
# Get predictions: one cluster label per remaining recurring-customer row,
# using the classifier fitted on the recurring-customer training data
pred = model_xgb.predict(data_recurring_test)

In [None]:
# Persist the predicted cluster labels for the recurring customers
with open("data_recurring_test_clusters.pkl", mode = "wb") as out_file:
    pickle.dump(pred, out_file)

## New customers

For new customers, XGBoost and XGBoost with SMOTE+Tomek resampling were used to get cluster assignments.

### XGBoost

In [None]:
# Import training data (new customers, including their "Cluster" label).
# Forward slashes keep the relative path valid on Windows as well as on
# Linux/macOS; the previous backslash separator only resolved on Windows.
# NOTE: pickle.load can execute arbitrary code - only open trusted files.
with open('Data clustering/data_new_clusters.pkl', 'rb') as file:
    data_new_clusters = pickle.load(file)

In [None]:
# Split the training data: every column except the cluster label and the
# total revenue is a feature; the cluster label is the target
X = data_new_clusters.drop(columns = ["Total revenue", "Cluster"])
y = data_new_clusters["Cluster"]

In [None]:
# Best parameters for the new-customer classifier, collected in one place
# and then unpacked into the estimator
xgb_params = {
    "booster": 'gbtree',
    "objective": 'multi:softmax',
    "n_estimators": 200,
    "eta": 0.5,
    "max_depth": 5,
    "min_child_weight": 2,
    "gamma": 0.2,
    "subsample": 1,
    "colsample_bytree": 0.9,
}
model_xgb = XGBClassifier(**xgb_params)

In [None]:
# Train the classifier on the new-customer features and cluster labels
model_xgb.fit(X = X, y = y)

In [None]:
# Import data for predictions (new customers to be assigned a cluster).
# Forward slashes keep the relative path valid on Windows as well as on
# Linux/macOS; the previous backslash separator only resolved on Windows.
# NOTE: pickle.load can execute arbitrary code - only open trusted files.
with open('Data final/data_new_test.pkl', 'rb') as file:
    data_new_test = pickle.load(file)

In [None]:
# Fix Quarter variables: every row to be scored is treated as a 4th-quarter
# observation, so the Quarter_2 flag is switched off and Quarter_4 switched on
data_new_test["Quarter_2"] = 0
data_new_test["Quarter_4"] = 1
# Remove total revenue (we don't predict clusters based on this variable)
data_new_test = data_new_test.drop(columns = ["Total revenue"])
# Remove observations with missing values in Default probability.
# .notna() replaces the unary minus on a boolean mask: NumPy rejects "-" on
# booleans and pandas only tolerates it through a special case.
data_new_test = data_new_test.loc[data_new_test["Default probability"].notna()]

In [None]:
# Get predictions: one cluster label per remaining new-customer row,
# using the classifier fitted on the (non-resampled) new-customer training data
pred = model_xgb.predict(data_new_test)

In [None]:
# Persist the predicted cluster labels for the new customers
with open("data_new_test_clusters.pkl", mode = "wb") as out_file:
    pickle.dump(pred, out_file)

### XGBoost with SMOTE+Tomek resampling

In [None]:
# Import training data (new customers, including their "Cluster" label).
# Forward slashes keep the relative path valid on Windows as well as on
# Linux/macOS; the previous backslash separator only resolved on Windows.
# NOTE: pickle.load can execute arbitrary code - only open trusted files.
with open('Data clustering/data_new_clusters.pkl', 'rb') as file:
    data_new_clusters = pickle.load(file)

In [None]:
# Feature dataframe (all columns except the cluster label and total revenue)
# and the series of cluster labels used as the outcome
X = data_new_clusters.drop(columns = ["Total revenue", "Cluster"])
y = data_new_clusters["Cluster"]

In [None]:
# Boolean mask over the columns of X that flags the categorical variables
categorical_names = ["Insurance", "Co-applicant", "Gender", "Quarter_2", "Quarter_3", "Quarter_4"]
cat_vars = X.columns.isin(categorical_names)

In [None]:
# Resample with SMOTE-NC oversampling followed by Tomek-link cleaning.
# The seed is set on SMOTENC itself: SMOTETomek uses a user-supplied `smote`
# estimator as given and only applies its own random_state to the default
# SMOTE it would otherwise create, so seeding just the wrapper leaves the
# oversampling step non-reproducible.
smote_nc = SMOTENC(categorical_features = cat_vars, sampling_strategy = "not majority",
                   k_neighbors = 5, random_state = 3)
resample = SMOTETomek(smote = smote_nc,
                      tomek = TomekLinks(sampling_strategy = "all"),
                      random_state = 3)
# fit_resample returns the resampled features and the resampled labels
resampled_data = resample.fit_resample(X, y)
X_resampled, y_resampled = resampled_data[0], resampled_data[1]

In [None]:
# Best parameters for the classifier trained on the resampled new-customer
# data, collected in one place and then unpacked into the estimator
xgb_params = {
    "booster": 'gbtree',
    "objective": 'multi:softmax',
    "n_estimators": 200,
    "eta": 0.5,
    "max_depth": 5,
    "min_child_weight": 2,
    "gamma": 0.2,
    "subsample": 1,
    "colsample_bytree": 0.9,
}
model_xgb = XGBClassifier(**xgb_params)

In [None]:
# Train the classifier on the SMOTE+Tomek resampled features and labels
model_xgb.fit(X = X_resampled, y = y_resampled)

In [None]:
# Import data for predictions (new customers to be assigned a cluster).
# Forward slashes keep the relative path valid on Windows as well as on
# Linux/macOS; the previous backslash separator only resolved on Windows.
# NOTE: pickle.load can execute arbitrary code - only open trusted files.
with open('Data final/data_new_test.pkl', 'rb') as file:
    data_new_test = pickle.load(file)

In [None]:
# Fix Quarter variables: every row to be scored is treated as a 4th-quarter
# observation, so the Quarter_2 flag is switched off and Quarter_4 switched on
data_new_test["Quarter_2"] = 0
data_new_test["Quarter_4"] = 1
# Remove total revenue (we don't predict clusters based on this variable)
data_new_test = data_new_test.drop(columns = ["Total revenue"])
# Remove observations with missing values in Default probability.
# .notna() replaces the unary minus on a boolean mask: NumPy rejects "-" on
# booleans and pandas only tolerates it through a special case.
data_new_test = data_new_test.loc[data_new_test["Default probability"].notna()]

In [None]:
# Get predictions: one cluster label per remaining new-customer row,
# using the classifier fitted on the SMOTE+Tomek resampled training data
pred = model_xgb.predict(data_new_test)

In [None]:
# Persist the cluster labels predicted by the model trained on resampled data
with open("data_new_test_clusters_XGBST.pkl", mode = "wb") as out_file:
    pickle.dump(pred, out_file)