# Introduction

The notebook is intended to perform a binary classification over the 'Response' label.

In [1]:
# Import Standard Modules
import pandas as pd

from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

import xgboost as xgb

# Lift the pandas display limits so frames are shown without truncation
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [2]:
# Notebook-level switches (each currently has a single supported value)
scaler_type = "StandardScaler"  # supported: "StandardScaler"
model_type = "XGBoost"          # supported: "XGBoost"

# Read Data

In [3]:
# Load the prepared campaign dataset (semicolon-separated, latin1-encoded)
data = pd.read_csv(
    './../data/marketing_campaign_prepared.csv',
    sep=';',
    encoding='latin1',
)

In [4]:
# Summarize the loaded frame: column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1533 entries, 0 to 1532
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ID                     1533 non-null   int64  
 1   Year_Birth             1533 non-null   int64  
 2   Education              1533 non-null   object 
 3   Marital_Status         1533 non-null   object 
 4   Income                 1533 non-null   float64
 5   Kidhome                1533 non-null   int64  
 6   Teenhome               1533 non-null   int64  
 7   Recency                1533 non-null   int64  
 8   MntWines               1533 non-null   int64  
 9   MntFruits              1533 non-null   int64  
 10  MntMeatProducts        1533 non-null   int64  
 11  MntFishProducts        1533 non-null   int64  
 12  MntSweetProducts       1533 non-null   int64  
 13  MntGoldProds           1533 non-null   int64  
 14  NumDealsPurchases      1533 non-null   int64  
 15  NumW

# Data Pre-processing

## Features & Label Definition

The 'ID' column does not bring any useful information.

In [5]:
# Separate the target from the predictors ('ID' is dropped together with the label)
y = data['Response']
X = data.drop(columns=['ID', 'Response'])

## Train & Test Split

Since the class distribution of the label is strongly imbalanced, we need to address it carefully:
1. Ensure that the training and test sets have the same proportions of the two classes
2. Oversample the minor class (i.e., randomly duplicate examples)
3. Undersample the major class (i.e., randomly delete examples)
4. Use several metrics (e.g., Accuracy, Precision, Recall, AUC)

Use StratifiedShuffleSplit. This cross-validation object is a merge of StratifiedKFold and ShuffleSplit, which returns stratified randomized folds. The folds are made by preserving the percentage of samples for each class.

Note: like the ShuffleSplit strategy, stratified random splits do not guarantee that all folds will be different, although this is still very likely for sizeable datasets.

In [6]:
# Stratified shuffle splitter: 10 random splits, 30% of the rows held out in each
stratified_kfold = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=0)

## Feature Selection

In [7]:
# Numerical features (scaled by the column transformer)
numerical_features = [
    'Year_Birth', 'Income', 'Recency',
    # Mnt* columns
    'MntWines', 'MntFruits', 'MntMeatProducts',
    'MntFishProducts', 'MntSweetProducts', 'MntGoldProds',
    # Num* columns
    'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',
    'NumStorePurchases', 'NumWebVisitsMonth',
]

# Categorical features stored as text (ordinal-encoded by the column transformer)
categorical_text_features = ['Education', 'Marital_Status']

# Categorical features already stored as numbers (passed through unchanged)
categorical_numerical_features = [
    'Kidhome',
    'Teenhome',
    *[f'AcceptedCmp{campaign}' for campaign in range(1, 6)],
    'Complain',
    'Dt_Customer_month',
    'Dt_Customer_dayofweek',
]

## Data Standardization

Transform the individual features to look more or less like standard normally distributed data: Gaussian with zero mean and unit variance.

Keep in mind that tree-based methods are scale-invariant, so data standardization is not required.

Standardization must happen after the train-test split: fitting the scaler on the whole dataset and splitting afterwards would leak information about the mean and standard deviation of the test set into the training set. The test set must then be transformed with the same scaler that was fitted on the training set. This is addressed by building a pipeline with the scaler as a step.

In [8]:
# Per-column-group pre-processing:
#   - numerical columns        -> StandardScaler
#   - text categorical columns -> OrdinalEncoder
#   - numeric categoricals     -> passed through untouched
column_transformer = ColumnTransformer(
    transformers=[
        (
            'numerical_transformation',
            StandardScaler(),
            numerical_features,
        ),
        (
            'categorical_text_transformation',
            OrdinalEncoder(),
            categorical_text_features,
        ),
        (
            'categorical_numerical_transformation',
            'passthrough',
            categorical_numerical_features,
        ),
    ]
)

# Model Definition

## Bayesian Optimization with HYPEROPT

In [9]:
# Define the hyperparameter search space for Hyperopt
hyperopt_parameters_space = dict(
    max_depth=hp.quniform("max_depth", 3, 40, 2),
    gamma=hp.uniform('gamma', 1, 15),
    reg_alpha=hp.quniform('reg_alpha', 40, 180, 1),
    reg_lambda=hp.uniform('reg_lambda', 0, 1),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
    min_child_weight=hp.quniform('min_child_weight', 0, 60, 3),
    n_estimators=hp.quniform('n_estimators', 100, 3000, 10),
    seed=0,  # fixed value, not sampled
)

In [16]:
# Define the Objective Function
def objective(space, column_transformer=column_transformer, cv=stratified_kfold, X=X, y=y, scoring='roc_auc'):
    """Hyperopt objective: cross-validated loss of an XGBoost pipeline.

    Builds an ``XGBClassifier`` from the sampled hyper-parameters, chains it
    after ``column_transformer`` in a ``Pipeline`` and evaluates the pipeline
    with ``cross_val_score``.

    Args:
        space: Mapping of sampled hyper-parameters. Must contain
            ``n_estimators``, ``max_depth``, ``gamma``, ``reg_alpha``,
            ``min_child_weight`` and ``colsample_bytree``; ``reg_lambda``
            and ``seed`` are used when present.
        column_transformer: Pre-processing step placed before the classifier.
        cv: Cross-validation splitter passed to ``cross_val_score``.
        X: Feature matrix.
        y: Target labels.
        scoring: Scorer name passed to ``cross_val_score``. It must be a
            "greater is better" metric because the score is negated below.

    Returns:
        float: The negated mean cross-validation score. ``fmin`` minimizes
        the objective, so negating makes the search maximize ``scoring``.
    """
    # Create the estimator
    clf = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='auc',
        # quniform samples are floats; these parameters are cast to integers
        n_estimators=int(space['n_estimators']),
        max_depth=int(space['max_depth']),
        gamma=space['gamma'],
        reg_alpha=int(space['reg_alpha']),
        # Sampled in the search space but previously never forwarded.
        # 1.0 is XGBoost's documented default for lambda.
        reg_lambda=space.get('reg_lambda', 1.0),
        min_child_weight=int(space['min_child_weight']),
        # Must stay a float in (0, 1]: int() truncated every sample below 1
        # down to 0, which is outside the valid range.
        colsample_bytree=space['colsample_bytree'],
        # Forward the fixed seed from the space so trials are comparable
        random_state=space.get('seed'),
        use_label_encoder=False,
    )

    # Define the Pipeline
    pipeline = Pipeline(steps=[
        ('feature_transformation', column_transformer),
        ('classifier', clf)
    ])

    # Calculate the score. cross_val_score fits a clone of the pipeline on
    # every split, so no separate fit on the full data is needed beforehand.
    score = cross_val_score(pipeline, X, y, cv=cv, scoring=scoring, n_jobs=1).mean()

    print(score)

    # Hyperopt minimizes the returned value: hand back the negated score
    return -score

In [None]:
# Run the TPE search: 50 evaluations of the objective over the search space
parameters = fmin(
    objective,
    hyperopt_parameters_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials(),
)

  0%|                                                                            | 0/50 [00:00<?, ?trial/s, best loss=?]