# Meta Model

## Imports and custom functions

First, we import the libraries, arrays and lists we are going to use in this section.

We also redefine the custom function `display_classification_reports_confusion_matrices()`, which is used after the grid search to display the classification report and confusion matrices overall, and then per customer class, so that predictive power can be assessed for each customer class. `custom_format()` is used to reformat the grid search results and improve readability.

In [1]:
import os
import time
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from itertools import cycle
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.metrics import (accuracy_score, make_scorer, f1_score, 
                             precision_score, recall_score, 
                             classification_report, ConfusionMatrixDisplay,
                             average_precision_score)
from sklearn.model_selection import GridSearchCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import StackingClassifier

import xgboost as xgb

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler                    

In [2]:
def _plot_confusion_matrix_pair(y_true, y_pred, labels, display_labels, heading):
    """
    Draw the count and row-normalized confusion matrices side by side.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted class labels, aligned row by row.
    labels : list
        Class labels indexing the rows/columns of the matrices.
    display_labels : list of str
        Tick names, one per entry of `labels` and in the same order.
    heading : str
        First line of the figure title; the weighted-average precision,
        recall and f1 of (y_true, y_pred) are appended on a second line.
    """
    fig, axes = plt.subplots(1, 2, figsize=(9, 4))
    fig.subplots_adjust(wspace=0.8)

    panels = (("Confusion Matrix (counts)", None),
              ("Confusion Matrix (ratios)", "true"))
    for ax, (panel_title, normalize) in zip(axes, panels):
        ax.set_title(panel_title)
        # `labels` is passed explicitly so the matrix always has exactly
        # len(display_labels) rows/columns, even when a class is missing from
        # y_true or only shows up in y_pred
        disp = ConfusionMatrixDisplay.from_predictions(
            y_true, y_pred, labels=labels, normalize=normalize,
            display_labels=display_labels, ax=ax)
        disp.ax_.set_xticklabels(display_labels, rotation=90)
        disp.im_.colorbar.remove()

    # Weighted-average metrics go into the figure title
    report = classification_report(y_true, y_pred, output_dict=True, zero_division=1)
    weighted = report['weighted avg']
    precision = round(weighted['precision'], 2)
    recall = round(weighted['recall'], 2)
    f1 = round(weighted['f1-score'], 2)

    fig.suptitle(f"{heading}\nprecision={precision} recall={recall} f1={f1}", y=1.005)
    plt.show()


def display_classification_reports_confusion_matrices(y_pred):
    """
    Display the overall classification report and confusion matrices, then the
    confusion matrices per customer class (one-hot encoded `rfm_label` columns).

    Reads the notebook globals `X_test`, `y_test`, `columns`, `class_labels`
    and `class_names`.

    Parameters
    ----------
    y_pred : array-like
        Predicted class labels, aligned row by row with `y_test`.

    Raises
    ------
    IndexError
        If one of the expected `rfm_label_*` columns is not found in `columns`.
    """
    y_pred = np.asarray(y_pred)

    # Overall text report, then the overall pair of matrices
    print(classification_report(y_test, y_pred, output_dict=False, zero_division=1))
    _plot_confusion_matrix_pair(
        y_test, y_pred, class_labels, class_names, "Overall Confusion Matrices")

    # OHE columns matching our rfm_labels
    labels_of_interest = ['rfm_label_Good Customers', 'rfm_label_Low Value Customers', 'rfm_label_VIP']

    # Find column indices corresponding to OHE rfm_label columns
    label_indices = [np.where(columns == label)[0][0] for label in labels_of_interest]

    for label, label_index in zip(labels_of_interest, label_indices):
        column = X_test[:, label_index]

        # X_test has been scaled, so the dummy column is still two-valued but
        # not necessarily 0/1 nor centered on 0: a fixed `>= 0` threshold can
        # select every row (e.g. RobustScaler leaves the column as 0/1 when its
        # median is 0). Rows holding the larger ("hot") value are those above
        # the midpoint of the column's range, whatever shift/scale was applied.
        # Assumes the test set contains both values of the dummy column.
        midpoint = (column.min() + column.max()) / 2
        in_class = column > midpoint

        if not in_class.any():
            # Nothing to report for this customer class
            print(f"No test rows found for {label}; skipping.")
            continue

        # Select corresponding rows of y_test and y_pred
        y_test_subset = y_test[in_class]
        y_pred_subset = y_pred[in_class]

        # Restrict classes to those present in the subset, counting predicted
        # classes too so the tick names always match the matrix size
        present = np.union1d(y_test_subset, y_pred_subset)
        subset_labels = [cls for cls in present if cls in class_labels]
        subset_names = [class_names[cls] for cls in subset_labels]

        _plot_confusion_matrix_pair(
            y_test_subset, y_pred_subset, subset_labels, subset_names, label)

In [3]:
# Custom formatting for cv_results: show every column when displaying it
pd.options.display.max_columns = None


def custom_format(value, col_name):
    """
    Return `value` rounded according to the cv_results column it comes from.

    Columns starting with 'param_' are returned untouched. Otherwise the first
    matching keyword in the column name decides the number of decimals:
    'time' -> 0, 'mean' -> 3, 'std' -> 4. Any other column is returned as is.
    """
    # Hyper-parameter columns are never rounded
    if col_name.startswith('param_'):
        return value

    # Checked in order, so e.g. 'mean_fit_time' is handled as a time column
    for keyword, digits in (('time', 0), ('mean', 3), ('std', 4)):
        if keyword in col_name:
            return round(value, digits)

    return value

Now, we load the train and test sets, and also the outlier arrays that can be used for outlier removal.

In [4]:
# Readable class names; class_names is indexed by the integer class label
class_names = [
    'Sleeping Dog',
    'Low Value Customers',
    'Sleeping Beauty',
    'Good Customers',
    'VIP',
]
class_labels = list(range(len(class_names)))
# One color per class (presumably for plots; not used in this section)
colors = ['red', 'orange', 'purple', 'blue', 'green']

# Test set, stored as a pickled dict
with open('testset.pkl', 'rb') as f:
    testset = pickle.load(f)
X_test = testset['X_test']
y_test = testset['y_test']

# Train set for the meta model, stored the same way
with open('trainset_meta.pkl', 'rb') as f:
    trainset = pickle.load(f)
X_train = trainset['X_train']
y_train = trainset['y_train']

# Feature column names, used to locate the one-hot rfm_label columns
with open('columns.pkl', 'rb') as f:
    columns = pickle.load(f)

In [5]:
# File name under which the grid-search results are saved (inside 'gs_results')
model_file_name = 'MetaModel.csv'

In [6]:
# Check the dimensions of the training feature matrix
X_train.shape

(27300, 83)

Here, we use `RobustScaler` to scale our features before running the grid search.

In [7]:
# Fit the scaler on the training features only, then apply that same
# transformation to both the train and the test features
scaler = RobustScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [8]:
# Base estimators of the stack; each entry carries a display name, the
# configured estimator and the file name associated with that model
models = [
    dict(
        model_name="LogisticRegression",
        model=LogisticRegression(
            C=0.5,
            multi_class='multinomial',
            penalty='l2',
            solver='saga',
            max_iter=1000,
            tol=1e-3,
        ),
        model_filename="LogisticRegression.csv",
    ),
    dict(
        model_name="RandomForest",
        model=RandomForestClassifier(
            n_estimators=300,
            max_features=30,
            criterion='gini',
            max_depth=10,
        ),
        model_filename="RandomForest.csv",
    ),
    dict(
        model_name="KNeighbors",
        model=KNeighborsClassifier(n_neighbors=10),
        model_filename="KNeighbors.csv",
    ),
    dict(
        model_name="XGBClassifier",
        model=xgb.XGBClassifier(
            objective='multi:softmax',
            eval_metric='mlogloss',
            max_depth=4,
            n_estimators=300,
            learning_rate=0.2,
            colsample_bytree=0.8,
        ),
        model_filename="XGBClassifier.csv",
    ),
    dict(
        model_name="MLPClassifier",
        model=MLPClassifier(
            tol=1e-3,
            hidden_layer_sizes=(100,),
            activation='logistic',
            solver='adam',
            alpha=0.01,
            learning_rate='constant',
            max_iter=350,
        ),
        model_filename="MLPClassifier.csv",
    ),
]

In [9]:
# Base estimators of the stack, as (name, estimator) pairs
base_models = [(entry["model_name"], entry["model"]) for entry in models]

# Meta-model fitted on top of the base estimators' outputs
meta_model = LogisticRegression(max_iter=1000, tol=1e-3)

# Stacking classifier using 3-fold CV to build the meta-model's training data
stacking_classifier = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=3,
)

# Same C / penalty grid for each compatible (solver, multi_class) pairing
param_grid = [
    {
        'final_estimator__C': np.logspace(-2, 2, num=4),
        'final_estimator__penalty': ['l1', 'l2'],
        'final_estimator__solver': [solver],
        'final_estimator__multi_class': [multi_class],
    }
    for solver, multi_class in (('liblinear', 'ovr'), ('saga', 'multinomial'))
]

# Grid search over the meta-model's hyper-parameters; three weighted metrics
# are tracked and the best weighted-f1 candidate is refitted on all train data
grid_search = GridSearchCV(
    estimator=stacking_classifier,
    param_grid=param_grid,
    scoring={
        metric_name: make_scorer(metric, average='weighted')
        for metric_name, metric in (
            ('f1', f1_score),
            ('precision', precision_score),
            ('recall', recall_score),
        )
    },
    refit='f1',
    cv=3,
    n_jobs=6,
    verbose=1,
    return_train_score=True,
)

# Time the whole search
start_time = time.time()
grid_search.fit(X_train, y_train)
end_time = time.time()
elapsed_time = end_time - start_time

print("Time taken to fit the model: {:.2f} seconds".format(elapsed_time))
print("Best parameters found by GridSearchCV:", grid_search.best_params_)

# Accuracy of the refitted best candidate on the held-out test set
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print("Accuracy on unseen test data: {:.2f}%".format(test_score * 100))

Fitting 3 folds for each of 16 candidates, totalling 48 fits




Time taken to fit the model: 3317.14 seconds
Best parameters found by GridSearchCV: {'final_estimator__C': 4.6415888336127775, 'final_estimator__multi_class': 'ovr', 'final_estimator__penalty': 'l1', 'final_estimator__solver': 'liblinear'}
Accuracy on unseen test data: 61.57%


In [10]:
# Collect the grid-search results, best mean weighted-f1 first
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results = cv_results.sort_values("mean_test_f1", ascending=False)

# Save the raw (unrounded) results. The output folder is created first so that
# a missing directory cannot make to_csv fail after the long grid search.
os.makedirs('gs_results', exist_ok=True)
file_path = os.path.join('gs_results', model_file_name)
cv_results.to_csv(file_path, index=False)

# Get columns starting with 'param_'
param_columns = [col for col in cv_results.columns if col.startswith('param_')]

# Columns to display, in reading order
desired_columns = [
    'mean_fit_time',
    'mean_test_f1', 'mean_train_f1',
    *param_columns,
    'std_test_f1', 'std_train_f1',
    'mean_test_precision', 'mean_train_precision',
    'std_test_precision', 'std_train_precision',
    'mean_test_recall', 'mean_train_recall',
    'std_test_recall', 'std_train_recall',
]

# Keep only the displayed columns, then round each value according to its
# column name (selecting first avoids formatting columns that are dropped)
cv_results = cv_results[desired_columns]
cv_results = cv_results.apply(lambda col: col.apply(lambda value: custom_format(value, col.name)))
cv_results.head(10)

Unnamed: 0,mean_fit_time,mean_test_f1,mean_train_f1,param_final_estimator__C,param_final_estimator__multi_class,param_final_estimator__penalty,param_final_estimator__solver,std_test_f1,std_train_f1,mean_test_precision,mean_train_precision,std_test_precision,std_train_precision,mean_test_recall,mean_train_recall,std_test_recall,std_train_recall
4,365.0,0.586,0.824,4.641589,ovr,l1,liblinear,0.0038,0.0025,0.586,0.828,0.0047,0.0025,0.594,0.826,0.0033,0.0024
5,360.0,0.586,0.822,4.641589,ovr,l2,liblinear,0.0039,0.0029,0.587,0.827,0.0049,0.0029,0.594,0.824,0.0038,0.0028
7,360.0,0.585,0.822,100.0,ovr,l2,liblinear,0.0043,0.0027,0.586,0.827,0.0054,0.0025,0.594,0.824,0.004,0.0026
10,361.0,0.585,0.813,0.215443,multinomial,l1,saga,0.0033,0.0033,0.585,0.816,0.0038,0.0033,0.591,0.815,0.0034,0.0032
14,340.0,0.584,0.814,100.0,multinomial,l1,saga,0.0032,0.0029,0.585,0.818,0.0042,0.003,0.591,0.816,0.0029,0.0028
3,362.0,0.584,0.819,0.215443,ovr,l2,liblinear,0.0033,0.0026,0.585,0.823,0.0042,0.0024,0.593,0.821,0.0032,0.0025
12,360.0,0.584,0.814,4.641589,multinomial,l1,saga,0.0027,0.0024,0.584,0.818,0.0033,0.0026,0.591,0.816,0.0029,0.0023
13,355.0,0.584,0.815,4.641589,multinomial,l2,saga,0.0038,0.0036,0.584,0.819,0.0047,0.0037,0.59,0.817,0.0037,0.0035
6,365.0,0.584,0.823,100.0,ovr,l1,liblinear,0.0043,0.0023,0.584,0.827,0.0054,0.0025,0.593,0.825,0.0038,0.0022
15,339.0,0.584,0.817,100.0,multinomial,l2,saga,0.0055,0.0032,0.584,0.82,0.0061,0.0032,0.59,0.818,0.0051,0.0031


## Conclusion on Meta Model

After evaluating the performance of the meta model in comparison to its base models, we have decided not to continue with it, as it did not demonstrate a significant improvement in the weighted F1 score. While a meta model can have advantages such as improving interpretability or reducing overfitting, our primary goal was to improve the weighted F1 score, and we did not find that to be the case with the meta model.