# Customer Propensity Modelling

In [32]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

import seaborn as sns

In [3]:
marketing_df = pd.read_csv('./Data/cleaned_marketing_engineered.csv')
marketing_df.head()

Unnamed: 0,Education,Marital_Status,Income,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,...,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Response,Minorshome,TotalMnt,TotalPurchases,TotalCampPar,Age
0,Graduation,Single,58138.0,2012-09-04,58,635.0,88.0,546.0,172.0,88.0,...,0,0,0,0,1,0,1617.0,22,0,55
1,Graduation,Single,46344.0,2014-03-08,38,11.0,1.0,6.0,2.0,1.0,...,0,0,0,0,0,2,27.0,4,0,60
2,Graduation,Married,71613.0,2013-08-21,26,426.0,49.0,127.0,111.0,21.0,...,0,0,0,0,0,0,776.0,20,0,48
3,Graduation,Married,26646.0,2014-02-10,26,11.0,4.0,20.0,10.0,3.0,...,0,0,0,0,0,1,53.0,6,0,30
4,PhD,Married,58293.0,2014-01-19,94,173.0,43.0,118.0,46.0,27.0,...,0,0,0,0,0,1,422.0,14,0,33


In [4]:
marketing_df.Dt_Customer = marketing_df.Dt_Customer.astype('datetime64[ns]')

## Encoding Categorical Variables

In [5]:
marketing_df_modelling = marketing_df.copy()

In [6]:
def ordinal_encoding(categorical_columns, df):
    enc = OrdinalEncoder()
    for column in categorical_columns:    
        required_values = marketing_df_modelling.loc[:, [column]]
        df[column] = enc.fit_transform(required_values)
    return df

In [7]:
marketing_df_modelling = ordinal_encoding(['Education', 'Marital_Status'], marketing_df)

## Imbalanced Data

In [8]:
marketing_df_modelling.Response.value_counts()

0    1906
1     334
Name: Response, dtype: int64

In [9]:
marketing_df_modelling.drop(columns = ['Dt_Customer'], inplace = True)
X = marketing_df_modelling.drop(['Response'], axis = 1)
y = marketing_df_modelling.Response

In [10]:
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)

In [11]:
sampled_marketing_df_modelling = pd.DataFrame(data = X, columns = X.columns.tolist())
sampled_marketing_df_modelling['Response'] = y

## Model Fitting and Prediction Workflow

In [12]:
X = sampled_marketing_df_modelling.drop(['Response'], axis = 1)
y = sampled_marketing_df_modelling.Response

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 24, test_size=0.2)

## Random Forest

### Hyperparamter Tunning

In [13]:
# Number of trees in Random Forest

rf_n_estimators = [int(x) for x in np.linspace(200, 1000, 5)]
rf_n_estimators.append(1500)
rf_n_estimators.append(2000)

# Maximum number of levels in tree
rf_max_depth = [int(x) for x in np.linspace(5, 55, 11)]

# Add the default as a possible value
rf_max_depth.append(None)

# Number of features to consider at every split
rf_max_features = ['auto', 'sqrt', 'log2']

# Criterion to split on
rf_criterion = ['gini', 'entropy', 'log_loss']

# Minimum number of samples required to split a node
rf_min_samples_split = [int(x) for x in np.linspace(2, 10, 9)]

# Minimum decrease in impurity required for split to happen
rf_min_impurity_decrease = [0.0, 0.05, 0.1]

# Method of selecting samples for training each tree
rf_bootstrap = [True, False]

# Create the grid
rf_grid = {'n_estimators': rf_n_estimators,
               'max_depth': rf_max_depth,
               'max_features': rf_max_features,
               'criterion': rf_criterion,
               'min_samples_split': rf_min_samples_split,
               'min_impurity_decrease': rf_min_impurity_decrease,
               'bootstrap': rf_bootstrap}

In [14]:
rf_grid

{'n_estimators': [200, 400, 600, 800, 1000, 1500, 2000],
 'max_depth': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, None],
 'max_features': ['auto', 'sqrt', 'log2'],
 'criterion': ['gini', 'entropy', 'log_loss'],
 'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10],
 'min_impurity_decrease': [0.0, 0.05, 0.1],
 'bootstrap': [True, False]}

In [15]:
rf = RandomForestClassifier()

# Create the random search Random Forest
rf_search = RandomizedSearchCV(estimator = rf, param_distributions = rf_grid, 
                               n_iter = 200, cv = 3, verbose = 2, random_state = 42, 
                               n_jobs = -1)

In [16]:
# Fit the random search model
rf_search.fit(X_train, y_train)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


In [18]:
rf_search.best_params_

{'n_estimators': 1500,
 'min_samples_split': 4,
 'min_impurity_decrease': 0.0,
 'max_features': 'log2',
 'max_depth': 35,
 'criterion': 'log_loss',
 'bootstrap': False}

### Prediction

In [24]:
rf = RandomForestClassifier(n_estimators = 1500, min_samples_split = 4, min_impurity_decrease = 0.0, max_features = 'log2', max_depth = 35, criterion = 'log_loss', bootstrap = False)

In [25]:
rf.fit(X_train, y_train)

In [26]:
y_pred = rf.predict(X_test)

In [28]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.93      0.93       364
           1       0.93      0.93      0.93       399

    accuracy                           0.93       763
   macro avg       0.93      0.93      0.93       763
weighted avg       0.93      0.93      0.93       763



## KNN

### Hyperparamter Tunning

In [31]:
leaf_size = list(range(1,50))

n_neighbors = list(range(1,30))

p = [1,2]

weights = ['uniform', 'distance'] 

algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']

knn_grid = {
    'n_neighbors' : n_neighbors, 
    'leaf_size' : leaf_size, 
    'algorithm' : algorithm, 
    'p': p, 
    'weights' : weights
}

In [33]:
knn = KNeighborsClassifier()

knn_search = RandomizedSearchCV(estimator = knn, param_distributions = knn_grid, 
                               n_iter = 200, cv = 3, verbose = 2, random_state = 42, 
                               n_jobs = -1)

In [34]:
knn_search.fit(X_train, y_train)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


In [36]:
knn_search.best_params_

{'weights': 'distance',
 'p': 1,
 'n_neighbors': 1,
 'leaf_size': 6,
 'algorithm': 'ball_tree'}

### Prediction

In [38]:
knn = KNeighborsClassifier(weights = 'distance', p = 1, n_neighbors = 1, leaf_size=6, algorithm='ball_tree')

In [39]:
knn.fit(X_train, y_train)

In [40]:
y_pred = knn.predict(X_test)

In [41]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.80      0.85       364
           1       0.83      0.91      0.87       399

    accuracy                           0.86       763
   macro avg       0.86      0.86      0.86       763
weighted avg       0.86      0.86      0.86       763



> Random Forest seems to perform better than KNN and hence, random forest will be used to predict the response for the present dataset