# Data Cleaning And Preprocessing

In [2]:
#Data Preprocessing
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import preprocessing
from datetime import datetime, date, timedelta
import seaborn as sns
import plotly.express as px
from sklearn import preprocessing

#Model Application
from sklearn.model_selection import train_test_split
from xgboost import XGBRFClassifier

#Model Evaluation
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from mlxtend.evaluate import bias_variance_decomp
from sklearn import metrics

#Parameter Tuning
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

# Silence every warning for the rest of the session.
import warnings
warnings.filterwarnings(action='ignore')
# Explicit DeprecationWarning filter; the blanket filter above already covers it.
warnings.filterwarnings('ignore', category=DeprecationWarning)
# Remove pandas' display limits so frames are rendered without truncation.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [3]:
# Seed NumPy's global random number generator.
SEED = 42
np.random.seed(SEED)

In [4]:
# Load the campaign data set from the current working directory.
DATA_FILE = 'marketing_campaign.csv'
marketing_data = pd.read_csv(DATA_FILE)

### Check Null Values

In [5]:
# Average income; pandas ignores missing entries when computing the mean.
income_values = marketing_data['Income']
mean_income = income_values.mean()
print("%.2f" % mean_income)

52247.25


In [6]:
# Impute missing incomes with the column mean computed in the previous cell.
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on a column selection is chained assignment, which operates on a temporary
# object and does not update the frame under pandas Copy-on-Write.
marketing_data['Income'] = marketing_data['Income'].fillna(value=mean_income)
# True if any missing value remains anywhere in the frame.
marketing_data.isnull().values.any()

False

### Remove Unwanted Columns 

In [7]:
# Columns that are not carried into the rest of the analysis.
remove_cols = ['Z_CostContact', 'Z_Revenue']
marketing_data = marketing_data.drop(columns=remove_cols)

### Alter Data Values

In [8]:
# Marital Status: collapse the raw labels into coarser groups.
# Any label that is not a key of the mapping is left unchanged.
marital_status_groups = {
    'Together': 'Couple',
    'Married': 'Couple',
    'Divorced': 'Single',
    'Widow': 'Single',
    'Alone': 'Single',
    'Absurd': 'Unspecified',
    'YOLO': 'Unspecified',
}
marketing_data['Marital_Status'] = marketing_data['Marital_Status'].replace(marital_status_groups)

In [9]:
# Education: relabel the raw education levels.
# Any label that is not a key of the mapping is left unchanged.
education_groups = {
    'Graduation': 'Graduate',
    'Basic': 'High School',
    '2n Cycle': 'Unspecified',
    'PhD': 'Post-Graduate',
    'Master': 'Post-Graduate',
}
marketing_data['Education'] = marketing_data['Education'].replace(education_groups)

### Alter Data Types

In [10]:
# Year Birth: parse the year values into datetimes, reporting the dtype before and after.
birth_year_raw = marketing_data['Year_Birth']
print("Current Year_Birth Format = ", birth_year_raw.dtype)
marketing_data['Year_Birth'] = pd.to_datetime(birth_year_raw, format='%Y')
print("Altered Year_Birth Format = ", marketing_data['Year_Birth'].dtype)

Current Year_Birth Format =  int64
Altered Year_Birth Format =  datetime64[ns]


In [11]:
# Joined Date: parse the year-month-day strings into datetimes, reporting the dtype before and after.
joined_raw = marketing_data['Dt_Customer']
print("Current Dt_Customer Format = ", joined_raw.dtype)
marketing_data['Dt_Customer'] = pd.to_datetime(joined_raw, format='%Y-%m-%d')
print("Altered Dt_Customer Format = ", marketing_data['Dt_Customer'].dtype)

Current Dt_Customer Format =  object
Altered Dt_Customer Format =  datetime64[ns]


In [12]:
# Currency Values to Float
# Cast every per-category spend column in a single astype call.
spend_columns = [
    'MntWines',
    'MntFruits',
    'MntMeatProducts',
    'MntFishProducts',
    'MntSweetProducts',
    'MntGoldProds',
]
marketing_data = marketing_data.astype({column: float for column in spend_columns})

# Feature Engineering

### Age

In [13]:
#function to calculate age
def calculateAge(birthDate, today=None):
    """Return the number of whole years elapsed between birthDate and today.

    Parameters
    ----------
    birthDate : object with ``year``, ``month`` and ``day`` attributes
        The starting date (e.g. ``datetime.date`` or ``pandas.Timestamp``).
    today : object with ``year``, ``month`` and ``day`` attributes, optional
        Reference date. Defaults to the current system date, which makes the
        result depend on when the notebook is run; pass a fixed date for
        reproducible output.

    Returns
    -------
    int
        Completed years; one less than the year difference when the
        anniversary has not yet occurred in the reference year.
    """
    if today is None:
        today = date.today()
    # True (== 1) when this year's anniversary is still ahead of the reference date.
    anniversary_pending = (today.month, today.day) < (birthDate.month, birthDate.day)
    return today.year - birthDate.year - anniversary_pending

#apply function to all rows
# The birth-date column is overwritten with the computed ages.
marketing_data['Year_Birth'] = marketing_data['Year_Birth'].map(calculateAge)

In [14]:
# The column now holds ages, so give it a matching name.
marketing_data = marketing_data.rename(columns={'Year_Birth': 'Age'})
print(marketing_data['Age'].head())

0    66
1    69
2    58
3    39
4    42
Name: Age, dtype: int64


### Overall Acceptance Rate

In [15]:
# Number of campaigns (1-5) accepted by each customer: the flags are added in order.
campaign_flags = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5']
marketing_data['Overall_Acceptance_Rate'] = sum(marketing_data[flag] for flag in campaign_flags)
print(marketing_data['Overall_Acceptance_Rate'].unique())

[0 1 2 3 4]


### Membership Duration

In [16]:
# Whole years since each customer joined, reusing the age helper on the join date.
joined_on = marketing_data['Dt_Customer']
marketing_data['Membership_Duration'] = joined_on.map(calculateAge)
# Latest and earliest join dates in the data.
print(joined_on.max())
print(joined_on.min())

2014-06-29 00:00:00
2012-07-30 00:00:00


### Total Amount Spent

In [17]:
# Total spend per customer: the six product-category amounts added in order.
spend_categories = [
    'MntWines',
    'MntFruits',
    'MntMeatProducts',
    'MntFishProducts',
    'MntSweetProducts',
    'MntGoldProds',
]
marketing_data['Total_Amt_Spent'] = sum(marketing_data[category] for category in spend_categories)
print(marketing_data['Total_Amt_Spent'].head())

0    1617.0
1      27.0
2     776.0
3      53.0
4     422.0
Name: Total_Amt_Spent, dtype: float64


### Total People In The Household

In [18]:
# Adults per household, one entry per row: 2 for a 'Couple', otherwise 1.
# Built as a list comprehension; the previous loop stored the None returned by
# list.append in an otherwise unused variable.
total_count = [2 if status == 'Couple' else 1 for status in marketing_data['Marital_Status']]
 

In [19]:
# Household size = adults (from total_count) + children and teenagers at home.
dependants = marketing_data['Kidhome'] + marketing_data['Teenhome']
marketing_data['Total_People'] = total_count + dependants
print(marketing_data['Total_People'].head())

0    1
1    3
2    2
3    3
4    3
Name: Total_People, dtype: int64


### Outlier Detection

In [20]:
# Series.mode() returns a Series that holds every tied mode; take the first
# one as a scalar so np.where below works regardless of how many modes exist.
mode_age = marketing_data['Age'].mode().iloc[0]
# NOTE(review): this mean is computed before the 666666 income is replaced
# below, so the replacement value still includes that outlier - confirm intended.
mean_income = marketing_data['Income'].mean()

# Ages above 83 are replaced by the most common age.
marketing_data['Age'] = np.where(marketing_data['Age'] > 83, mode_age, marketing_data['Age'])
# The single extreme income value is replaced by the mean income.
marketing_data['Income'] = marketing_data['Income'].replace([666666.00], mean_income)

print(mean_income)

52247.25135379059


### Encoding

In [21]:
# Integer-encode the two categorical columns on a copy, leaving marketing_data as is.
unencoded_cols = ['Education', 'Marital_Status']
le = preprocessing.LabelEncoder()

# The same encoder object is refit for each column in turn.
encoded_columns = {col: le.fit_transform(marketing_data[col]) for col in unencoded_cols}
enc_marketing_data = marketing_data.assign(**encoded_columns)

enc_marketing_data.head()

Unnamed: 0,ID,Age,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Response,Overall_Acceptance_Rate,Membership_Duration,Total_Amt_Spent,Total_People
0,5524,66,0,1,58138.0,0,0,2012-09-04,58,635.0,88.0,546.0,172.0,88.0,88.0,3,8,10,4,7,0,0,0,0,0,0,1,0,11,1617.0,1
1,2174,69,0,1,46344.0,1,1,2014-03-08,38,11.0,1.0,6.0,2.0,1.0,6.0,2,1,1,2,5,0,0,0,0,0,0,0,0,9,27.0,3
2,4141,58,0,0,71613.0,0,0,2013-08-21,26,426.0,49.0,127.0,111.0,21.0,42.0,1,8,2,10,4,0,0,0,0,0,0,0,0,10,776.0,2
3,6182,39,0,0,26646.0,1,0,2014-02-10,26,11.0,4.0,20.0,10.0,3.0,5.0,2,2,0,4,6,0,0,0,0,0,0,0,0,9,53.0,3
4,5324,42,2,0,58293.0,1,0,2014-01-19,94,173.0,43.0,118.0,46.0,27.0,15.0,5,5,3,6,5,0,0,0,0,0,0,0,0,9,422.0,3


### Log Transformation

In [23]:
# Columns that receive a log(1 + x) transform.
cols = ['Income','MntWines','MntFruits','MntMeatProducts','MntFishProducts','MntSweetProducts','MntGoldProds','NumDealsPurchases','NumCatalogPurchases','NumStorePurchases','NumWebVisitsMonth','Total_Amt_Spent']

# np.log1p computes log(1 + x) directly and stays accurate for x close to 0.
# NOTE: the columns are overwritten, so re-running this cell transforms them again.
for chosen_col in cols:
    enc_marketing_data[chosen_col] = np.log1p(enc_marketing_data[chosen_col])


# Feature Selection

In [24]:
# Remove the identifier and the raw join date from the encoded frame.
cols_to_remove = ['ID', 'Dt_Customer']
enc_marketing_data = enc_marketing_data.drop(columns=cols_to_remove)

In [25]:
# Remove two more columns, then display the resulting (rows, columns) shape.
cols_to_remove = ['Complain', 'Overall_Acceptance_Rate']
enc_marketing_data = enc_marketing_data.drop(columns=cols_to_remove)
enc_marketing_data.shape

(2240, 27)

# Model Application And Evaluation

### Base Model

##### Application

In [26]:
# Split the data into features (X) and target (y)
# X is built with drop() rather than by popping 'Response' out of
# enc_marketing_data: the source frame is left intact, so this cell can be
# re-run without raising KeyError on the already-removed column.
y = enc_marketing_data['Response']
X = enc_marketing_data.drop(columns='Response')

In [27]:
# Train test split
# 20% of the rows are held out for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NumPy array copies of each partition.
X_train_arr, y_train_arr, X_test_arr, y_test_arr = (
    np.array(partition) for partition in (X_train, y_train, X_test, y_test)
)

# Report the shape of every partition.
for label, partition in (
    ("Train size X : ", X_train),
    ("Train size y : ", y_train),
    ("\nTest size X : ", X_test),
    ("Test size y : ", y_test),
):
    print(label, partition.shape)

Train size X :  (1792, 26)
Train size y :  (1792,)

Test size X :  (448, 26)
Test size y :  (448,)


In [28]:
#Training the model
# Baseline: the classifier is built with no arguments, i.e. library defaults.
xgbrf_basemodel = XGBRFClassifier()
xgbrf_basemodel.fit(X=X_train, y=y_train)

XGBRFClassifier(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=None, colsample_bytree=None,
                early_stopping_rounds=None, enable_categorical=False,
                eval_metric=None, feature_types=None, gamma=None, gpu_id=None,
                grow_policy=None, importance_type=None,
                interaction_constraints=None, max_bin=None,
                max_cat_threshold=None, max_cat_to_onehot=None,
                max_delta_step=None, max_depth=None, max_leaves=None,
                min_child_weight=None, missing=nan, monotone_constraints=None,
                n_estimators=100, n_jobs=None, num_parallel_tree=None,
                objective='binary:logistic', predictor=None, random_state=None,
                reg_alpha=None, ...)

##### Evaluation

In [29]:
# Predict on the held-out rows and print each metric to three decimals.
y_pred_base = xgbrf_basemodel.predict(X_test)
base_metric_reports = (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
)
for report_format, metric in base_metric_reports:
    print(report_format % metric(y_test, y_pred_base))

Accuracy: 0.862
Precision: 0.652
Recall: 0.217
F1 Score: 0.326


### Tuned Model

##### Application

In [31]:
#define model
xgbrf = XGBRFClassifier()

#specifying all hyperparameters with possible values
# 6 * 4 * 6 * 4 = 576 combinations are evaluated.
param = {
    'max_depth': [3, 5, 6, 10, 15, 20],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'n_estimators': [10,25,30,50,100,200],
    'colsample_bynode': [0.2, 0.4, 0.6, 0.8],
}

# define grid search
# 10 stratified folds repeated 3 times -> 30 fits per combination.
# NOTE(review): no random_state is passed to the splitter; presumably the fold
# shuffling then follows NumPy's global RNG seeded earlier - confirm, or pass
# random_state explicitly.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3)
# error_score=0 records a failed fit as accuracy 0 instead of raising.
grid_search = GridSearchCV(estimator=xgbrf, param_grid=param, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
# The loop variable is deliberately not named `param`, so the hyperparameter
# grid defined above is not overwritten by the last reported combination.
for mean, stdev, candidate in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, candidate))

Best: 0.893968 using {'colsample_bynode': 0.4, 'learning_rate': 0.01, 'max_depth': 20, 'n_estimators': 25}
0.869239 (0.010195) with: {'colsample_bynode': 0.2, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 10}
0.870721 (0.010208) with: {'colsample_bynode': 0.2, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 25}
0.870908 (0.009302) with: {'colsample_bynode': 0.2, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 30}
0.873138 (0.009791) with: {'colsample_bynode': 0.2, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50}
0.873328 (0.010677) with: {'colsample_bynode': 0.2, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
0.873325 (0.010905) with: {'colsample_bynode': 0.2, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200}
0.874632 (0.014146) with: {'colsample_bynode': 0.2, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 10}
0.874256 (0.013344) with: {'colsample_bynode': 0.2, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 25}
0.8

In [30]:
# Hyperparameters hard-coded to the best combination printed by the grid search.
tuned_params = {
    'learning_rate': 0.01,
    'max_depth': 20,
    'n_estimators': 25,
    'colsample_bynode': 0.4,
}
xgbrf_tunedmodel = XGBRFClassifier(**tuned_params)
xgbrf_tunedmodel.fit(X=X_train, y=y_train)

XGBRFClassifier(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=None, colsample_bynode=0.4,
                colsample_bytree=None, early_stopping_rounds=None,
                enable_categorical=False, eval_metric=None, feature_types=None,
                gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
                interaction_constraints=None, learning_rate=0.01, max_bin=None,
                max_cat_threshold=None, max_cat_to_onehot=None,
                max_delta_step=None, max_depth=20, max_leaves=None,
                min_child_weight=None, missing=nan, monotone_constraints=None,
                n_estimators=25, n_jobs=None, num_parallel_tree=None,
                objective='binary:logistic', predictor=None, ...)

##### Evaluation

In [31]:
# Predict on the held-out rows and print each metric to three decimals.
y_pred_tuned = xgbrf_tunedmodel.predict(X_test)
tuned_metric_reports = (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
)
for report_format, metric in tuned_metric_reports:
    print(report_format % metric(y_test, y_pred_tuned))

Accuracy: 0.871
Precision: 0.677
Recall: 0.304
F1 Score: 0.420
