# Introduction

The notebook is intended to perform a binary classification over the 'Response' label.

In [1]:
# Import Standard Modules
import pandas as pd

from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import precision_score, accuracy_score
from sklearn.linear_model import LogisticRegression

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

import xgboost as xgb

# Set Pandas Options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Read Data

In [2]:
# Read data
data = pd.read_csv('./../data/marketing_campaign_prepared.csv', encoding='latin1', sep=';')

# Data Pre-processing

## Features & Label Definition

The 'ID' column does not bring any useful information.

In [3]:
# Define x and y
X = data.drop(['ID', 'Response'], axis=1)
y = data['Response']

## Train & Test Split

Since the label is characterized by a strong imbalancing in the class distribution, we need to address it carefully:
1. Ensure that the training and test sets have the same proportions of the two classes
2. Oversample the minor class (i.e., randomly duplicate examples)
3. Undersample the major class (i.e., randomly delete examples)
4. Use several metrics (e.g., Accuracy, Precision, Recall, AUC)

Use StratifiedShuffleSplit. This cross-validation object is a merge of StratifiedKFold and ShuffleSplit, which returns stratified randomized folds. The folds are made by preserving the percentage of samples for each class.

Note: like the ShuffleSplit strategy, stratified random splits do not guarantee that all folds will be different, although this is still very likely for sizeable datasets.

In [4]:
# Standard train & test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [5]:
# Define the a Stratified K-fold Shuffle Splitter
stratified_kfold = StratifiedShuffleSplit(n_splits=10,
                                          test_size=.3, 
                                          random_state=0)

## Feature Selection

In [6]:
# Numerical Features
numerical_features = ['Year_Birth', 
                      'Income', 
                      'Recency', 
                      'MntWines', 
                      'MntFruits', 
                      'MntMeatProducts', 
                      'MntFishProducts', 
                      'MntSweetProducts', 
                      'MntGoldProds', 
                      'NumDealsPurchases', 
                      'NumWebPurchases', 
                      'NumCatalogPurchases', 
                      'NumStorePurchases', 
                      'NumWebVisitsMonth']

# Categorical Text Features
categorical_text_features = ['Education', 
                             'Marital_Status']

# Categorical Numerical Features
categorical_numerical_features = ['Kidhome', 
                                  'Teenhome', 
                                  'AcceptedCmp1', 
                                  'AcceptedCmp2', 
                                  'AcceptedCmp3', 
                                  'AcceptedCmp4', 
                                  'AcceptedCmp5', 
                                  'Complain', 
                                  'Dt_Customer_month', 
                                  'Dt_Customer_dayofweek']

## Data Standardization

Transform the individual features to look more or less like standard normally distributed data: Gaussian with zero mean and unit variance.

Keep in mind that tree-based methods are scale-invariant, so data standardization is not required.

Standardization has to go after training-test split. That's because, standardizing the whole dataset and then split, would introduce into the training set some information about the mean and std of the test set. Remember to standardize the test set with the same scaler trained on the training set. This would be addressed by constructing a pipeline with the scaler as a step.

In [7]:
# Define the ColumnTransformer
column_transformer = ColumnTransformer([
    ('numerical', StandardScaler(), numerical_features),
    ('categorical_text', OneHotEncoder(), categorical_text_features),
    ('categorical_numerical', 'passthrough', categorical_numerical_features)
], verbose_feature_names_out=True)

In [8]:
# Fit the ColumnTransformer
_ = column_transformer.fit(X_train)

In [9]:
# Transform the columns
X_train_transformed = pd.DataFrame(column_transformer.transform(X_train), columns=column_transformer.get_feature_names_out())
X_test_transformed = pd.DataFrame(column_transformer.transform(X_test), columns=column_transformer.get_feature_names_out())

# Model Definition

## Logistic Regression

First benchmark model. Use standard train & test split and fit the logistic regression.

In [10]:
# Define the model
model_lr = LogisticRegression(max_iter=500)

# Train the model
model_lr.fit(X_train_transformed, y_train)

# Predictions
predictions_lr = model_lr.predict(X_test_transformed)

In [11]:
# Model evaluation
accuracy = round(accuracy_score(y_test, predictions_lr) * 100, 2)
precision = round(precision_score(y_test, predictions_lr) * 100, 2)

print('Model accuracy: {}%'.format(accuracy))
print('Model precision: {}%'.format(precision))

Model accuracy: 90.12%
Model precision: 53.85%


## Logistic Regression - Pipeline

Use the same model as before, but within a pipeline (experimental purposes only).

In [12]:
# Define the model
model_lr_pipe = LogisticRegression(max_iter=500)

# Define the pipeline
lr_pipe = Pipeline([
    ('feature_transformation', column_transformer),
    ('logistic_regression', model_lr_pipe)
])

# Train the pipeline
lr_pipe.fit(X_train, y_train)

# Predictions
predictions_lr_pipe = lr_pipe.predict(X_test)

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

## Bayesian Optimization with HYPEROPT

In [None]:
# Define the Hyperparamters space for Hyperopt
hyperopt_parameters_space = {
    'max_depth': hp.quniform("max_depth", 3, 40, 2),
    'gamma': hp.uniform ('gamma', 1, 15),
    'reg_alpha' : hp.quniform('reg_alpha', 40, 180, 1),
    'reg_lambda' : hp.uniform('reg_lambda', 0, 1),
    'colsample_bytree' : hp.uniform('colsample_bytree', 0.5, 1),
    'min_child_weight' : hp.quniform('min_child_weight', 0, 60, 3),
    'n_estimators': hp.quniform('n_estimators', 100, 3000, 10),
    'seed': 0
}

In [None]:
# Define the Objective Function
def objective(space, column_transformer=column_transformer, cv=stratified_kfold, X=X, y=y, scoring='roc_auc'):
    
    # Create the estimator
    clf=xgb.XGBClassifier(objective='binary:logistic',
                          eval_metric='auc',
                          n_estimators=int(space['n_estimators']), 
                          max_depth=int(space['max_depth']), 
                          gamma=space['gamma'],
                          reg_alpha=int(space['reg_alpha']),
                          min_child_weight=int(space['min_child_weight']),
                          colsample_bytree=int(space['colsample_bytree']),
                          use_label_encoder=False)
    
    # Define the Pipeline
    pipeline = Pipeline(steps=[
        ('feature_transformation', column_transformer), 
        ('classifier', clf)
    ])
    
    # Init accuracy and precision list for K-fold
    accuracy = []
    precision = []
    
    # Train the model with K-fold
    for train_index, test_index in cv.split(X, y):
    
        # Train the model
        pipeline.fit(X[train_index], y[train_index])

        # Get predicitons
        predictions = pipeline.predict(X[test_index])
        
        # Calculate accuracy
        accuracy.append(accuracy_score(y[test_index], predictions))
        
        # Calculate precision
        precision.append(precision_score(y[test_index], predictions))
                                  
        
    # Calculate the score
    cv_score = cross_val_score(pipeline, X, y, cv=cv, scoring=scoring, n_jobs=1).mean()
    accuracy_mean = accuracy.mean()
    precision_score = precision.mean()
    
    print(cv_score)
    print(accuracy_mean)
    print(precision_score)
                
    return cv_score

In [None]:
parameters = fmin(fn=objective,
                  space=hyperopt_parameters_space,
                  algo=tpe.suggest,
                  max_evals=50,
                  trials=Trials())