In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datetime
from datetime import date
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler, normalize
from sklearn import metrics
from sklearn.mixture import GaussianMixture
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import warnings
warnings.filterwarnings('ignore')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
%config Completer.use_jedi = False #AutoCompletion

In [3]:
# Load the marketing-campaign CSV; the file is tab-separated, hence sep='\t'.
data = pd.read_csv("/kaggle/input/customer-personality-analysis/marketing_campaign.csv", sep='\t')
# Preview the first five rows.
data.head()

In [4]:
# Number of columns, followed by the column labels themselves.
print(data.shape[1])
data.columns

In [5]:
# Per-column dtype and non-null count summary of the raw frame.
data.info()

In [6]:
# Convert the Dt_Customer strings to datetimes.
# NOTE(review): no `format=` / `dayfirst=` is given, so pandas infers the date
# layout; if the file stores dates day-first this may mis-parse or raise —
# confirm against the CSV.
data['Dt_Customer'] =pd.to_datetime(data['Dt_Customer'])

In [7]:
# Summary statistics, transposed so that each feature is one row.
data.describe().T

* The Income column contains 24 null-valued rows; we can remove these rows.
* Kidhome and Teenhome can be combined into a single feature.
* The Z_CostContact and Z_Revenue columns do not contain any variability, hence we can remove these columns.
* An Age feature can be added by calculating the difference between the year and Year_Birth.

In [8]:
print(data.shape)
print("After performing some transformations")

# Work on a copy of the raw frame and keep only rows that have an Income value.
tran_data = data.copy()
tran_data = tran_data.loc[tran_data['Income'].notna()]

# Enrolment year (this helper column is removed again in the drop below).
tran_data['Year'] = tran_data['Dt_Customer'].dt.year

# Total number of children at home.
tran_data['Children'] = tran_data['Kidhome'].add(tran_data['Teenhome'])

# Age is measured against the year in which this cell is executed, so the
# values shift whenever the notebook is re-run in a later year.
current_year = pd.Timestamp('now').year
tran_data['Age'] = current_year - tran_data['Year_Birth']

# Remove identifiers, constant columns and the columns that were folded into
# the derived features above.
tran_data = tran_data.drop(
    columns=[
        'ID',
        'Z_CostContact',
        'Z_Revenue',
        'Kidhome',
        'Teenhome',
        'Dt_Customer',
        'Year',
        'Year_Birth',
    ]
)

print(tran_data.shape)

In [9]:
# Number of customers per Education value.
tran_data['Education'].value_counts()

In [10]:
# Collapse the education levels into two groups: fully graduated and under graduated.
education_groups = {
    'PhD': 'fully_Graduated',
    'Graduation': 'fully_Graduated',
    'Master': 'fully_Graduated',
    'Basic': 'under_Graduated',
    '2n Cycle': 'under_Graduated',
}
tran_data['Education'] = tran_data['Education'].replace(education_groups)
tran_data['Education'].value_counts()

In [11]:
# Number of customers per Marital_Status value.
tran_data['Marital_Status'].value_counts()

In [12]:
# Reduce Marital_Status to two values: 'Partner' and 'Single'.
marital_groups = {
    'Married': 'Partner',
    'Together': 'Partner',
    'Single': 'Single',
    'Divorced': 'Single',
    'Widow': 'Single',
    'Alone': 'Single',
    'Absurd': 'Single',
    'YOLO': 'Single',
}
tran_data['Marital_Status'] = tran_data['Marital_Status'].replace(marital_groups)
tran_data['Marital_Status'].value_counts()

In [13]:
# Box plot of the Income column (the next cell keeps only Income below 200000).
sns.boxplot(tran_data['Income'])


In [14]:
# Keep only the customers whose Income is below 200000.
income_below_cap = tran_data['Income'] < 200000
tran_data = tran_data.loc[income_below_cap]
tran_data.shape

In [15]:
# Treat the amounts spent on all commodities as a single 'Expense' feature.
cols = list(filter(lambda name: str(name).startswith('Mnt'), tran_data.columns))
print(cols)

spend_columns = [
    'MntWines',
    'MntFruits',
    'MntMeatProducts',
    'MntFishProducts',
    'MntSweetProducts',
    'MntGoldProds',
]
tran_data['Expense'] = tran_data.loc[:, spend_columns].sum(axis=1)

# The individual 'Mnt*' columns are no longer needed once they are summed.
tran_data = tran_data.drop(columns=cols)
tran_data['Expense']

In [16]:
# Snapshot for the classifier: taken before the campaign columns are collapsed.
xgb_data = tran_data.copy()

# Every 'Accept*' column plus 'Response', summed row-wise into 'Responses'.
cols_response = [name for name in tran_data.columns if str(name).startswith('Accept')]
cols_response.append('Response')
tran_data['Responses'] = tran_data.loc[:, cols_response].sum(axis=1)

# if there is any responses it would be 1 otherwise it would be 0
def mapp(num):
    """Return 1 when ``num`` is at least 1, otherwise 0."""
    return 1 if num >= 1 else 0


# Turn the response count into a 0/1 flag and remove the per-campaign columns.
tran_data['Responses'] = tran_data['Responses'].map(mapp)
tran_data = tran_data.drop(columns=cols_response)
tran_data['Responses'].value_counts()

In [17]:
# Transposed preview of the first five rows (one line per column).
tran_data.head().T

In [18]:
# Replacing the age with the age groups

def age_category(age):
    """Map an age in years to an ordinal age-group code.

    Codes:
        0: up to 35      1: (35, 45]     2: (45, 55]
        3: (55, 65]      4: (65, 75]     5: above 75

    Ages below 25 are placed in group 0. Previously they matched no branch
    and were returned unchanged, so a raw age could be mistaken for a group
    code (e.g. an age of 3 looked like group 3).

    Parameters
    ----------
    age : int or float
        Age in years.

    Returns
    -------
    int
        The group code. A NaN input matches no comparison and is returned
        unchanged, as before.
    """
    upper_bounds = (35, 45, 55, 65, 75)
    for code, bound in enumerate(upper_bounds):
        if age <= bound:
            return code
    if age > upper_bounds[-1]:
        return len(upper_bounds)
    # Only reached for values that compare False to everything (NaN).
    return age

# Replace the raw ages with their age-group codes in both frames.
tran_data['Age'] = tran_data['Age'].map(age_category)
tran_data['Age'].unique()

xgb_data['Age'] = xgb_data['Age'].map(age_category)
xgb_data['Age'].unique()

In [19]:
# Remove the purchase-channel and web-visit counters from both frames.
drop_cols = [
    'NumDealsPurchases',
    'NumWebPurchases',
    'NumCatalogPurchases',
    'NumStorePurchases',
    'NumWebVisitsMonth',
]
tran_data = tran_data.drop(columns=drop_cols)
xgb_data = xgb_data.drop(columns=drop_cols)

In [20]:
# One-hot encode the object-dtype columns of both frames.
cat_cols = tran_data.select_dtypes(include='object').columns
print(cat_cols)
tran_data = pd.get_dummies(tran_data, columns=cat_cols)
xgb_data = pd.get_dummies(xgb_data, columns=cat_cols)



In [21]:
# Annotated correlation heat map of every column in the encoded frame.
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(tran_data.corr(), annot=True, ax=ax)

Looking at the heat map, it is clear that the expense-related columns have a higher correlation with Income.

In [22]:
# Number of distinct values in each column.
tran_data.nunique()

In [44]:
from sklearn.preprocessing import StandardScaler
# Standardise the three continuous features; the result is a NumPy array.
to_scale = ['Income', 'Recency', 'Expense']
scaler = StandardScaler()
X_Scaled = scaler.fit_transform(tran_data[to_scale])
X_Scaled[:5, :]

In [24]:
# Wrap the scaled values in a DataFrame that carries tran_data's own row labels.
# tran_data has had rows filtered out, so its index has gaps; with a default
# 0..n-1 index the column-wise concat below would align on mismatching labels,
# pairing scaled values with the wrong customers and creating NaN rows.
X_Scaled = pd.DataFrame(X_Scaled, columns=to_scale, index=tran_data.index)

# Unscaled columns + scaled columns, aligned row by row.
final_data = pd.concat((tran_data.drop(to_scale, axis=1), X_Scaled), axis=1)
final_data.head()


In [25]:
# dtypes and non-null counts of the encoded frame (before scaling).
tran_data.info()

In [26]:
# Remove, in place, every row that contains at least one missing value.
final_data.dropna(inplace=True)

# Build unsupervised Algorithm to cluster customer data

In [46]:
# NOTE(review): this list is not referenced by any other cell visible in this
# notebook; the clustering cells select ['Income', 'Expense'] directly.
features = ['Income', 'Age', 'Expense']

In [57]:
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans

# Elbow analysis on the scaled Income / Expense features.
# The seed is set on the estimator itself: keyword arguments given to
# KElbowVisualizer go to the visualizer base class, not to KMeans, so passing
# random_state there left every K-Means fit unseeded and the curve
# non-reproducible.
model = KMeans(init='k-means++', random_state=42)
visualizer = KElbowVisualizer(model, k=10)

visualizer.fit(final_data[['Income', 'Expense']])
visualizer.show()

In [65]:
# Fit K-Means with four clusters on the Income / Expense features and label
# every customer with its cluster.
cluster_features = final_data[['Income', 'Expense']]
model = KMeans(n_clusters=4, init='k-means++', random_state=42).fit(cluster_features)
preds = model.predict(cluster_features)

# Take an explicit copy before adding the column: assigning onto a slice of
# final_data is a chained assignment (SettingWithCopyWarning) whose effect on
# the parent frame is not well defined.
dt_Kmeans = cluster_features.copy()
dt_Kmeans['Cluster'] = preds

The optimal number of clusters is 4.

In [66]:
# Income distribution per cluster.
fig, ax = plt.subplots(figsize=(10, 7))
sns.boxplot(data=dt_Kmeans, x='Cluster', y='Income', ax=ax);
ax.set_xlabel('Cluster', fontsize=20, labelpad=20)
ax.set_ylabel('Income', fontsize=20, labelpad=20)
ax.set_title("PLot Showing Income distribution for different clusters")

In [67]:
# Expense distribution per cluster.
fig, ax = plt.subplots(figsize=(10, 7))
sns.boxplot(data=dt_Kmeans, x='Cluster', y='Expense', ax=ax);
ax.set_xlabel('Cluster', fontsize=20, labelpad=20)
ax.set_ylabel('Expense', fontsize=20, labelpad=20)
ax.set_title("PLot Showing Expense distribution for different clusters")

In [68]:
# Income against Expense, coloured by cluster.
fig, ax = plt.subplots(figsize=(10, 7))
sns.scatterplot(data=dt_Kmeans, x='Income', y='Expense', hue='Cluster', ax=ax);
ax.set_xlabel('Income', fontsize=20, labelpad=20)
ax.set_ylabel('Total Expense', fontsize=20, labelpad=20);

In the plot above, the clusters are formed mostly on the basis of the customers' income and expense: high-income and high-expense customers, high-income and low-spending customers, low-income and low-spending customers, and a very small number of customers
who have a low income but are high spenders.
This gives us several customer segments based on income and spending.

# 2 Build a classification model (binomial or multinomial - recommended: xgboost python implementation) on the promotion done by the company.

Build a classification model (binomial or multinomial - recommended: xgboost python implementation) on the promotion done by the company. Run SHAP analysis on the model results, and write a short text of what would be your recommendation to business for the next round of campaigns.

In [69]:
import xgboost as xgb

In [70]:
# Preview the first five rows of the frame used for classification.
xgb_data.head()

In [71]:
# One-hot encode any object-dtype columns that are still present.
cat_cols = xgb_data.select_dtypes('object').columns
xgb_data = pd.get_dummies(columns=cat_cols, data=xgb_data)

# Standardise the continuous features.
to_scale = ['Income', 'Recency', 'Expense']
X_Scaled = StandardScaler().fit_transform(xgb_data[to_scale])

# Keep xgb_data's own row labels on the scaled frame. xgb_data has had rows
# filtered out, so its index has gaps; with a default 0..n-1 index the
# column-wise concat would attach each customer's labels to another customer's
# scaled Income / Recency / Expense and create NaN rows.
X_Scaled = pd.DataFrame(X_Scaled, columns=to_scale, index=xgb_data.index)
final_xgb_data = pd.concat((xgb_data.drop(to_scale, axis=1), X_Scaled), axis=1)
final_xgb_data.head()

In [72]:
# Remove, in place, every row that contains at least one missing value.
final_xgb_data.dropna(inplace=True)

In [73]:
# Separate the six campaign-response label columns (yi) from the features (xi).
label_cols = [
    'AcceptedCmp1',
    'AcceptedCmp2',
    'AcceptedCmp3',
    'AcceptedCmp4',
    'AcceptedCmp5',
    'Response',
]
xi = final_xgb_data.drop(columns=label_cols)
yi = final_xgb_data[label_cols]

In [77]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score

In [90]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows, using the first label column as the target.
first_label = yi.iloc[:, 0]
x_train, x_test, y_train, y_test = train_test_split(
    xi, first_label, test_size=0.2, random_state=7
)
xgb_train = xgb.DMatrix(data=x_train, label=y_train)
xgb_test = xgb.DMatrix(data=x_test, label=y_test)

In [91]:
# One Series per label column of yi, in column order.
y1, y2, y3, y4, y5, y6 = (yi.iloc[:, position] for position in range(6))
list_prom = [y1, y2, y3, y4, y5, y6]

In [95]:
import shap

# Train one booster per campaign label and collect its SHAP values.
#
# The section heading asks for a binomial classifier, so the objective is
# binary logistic regression. The previous "survival:cox" objective treats the
# label as a survival time, which is not a classification loss. With
# "binary:logistic" the SHAP values are expressed in log-odds.
params = {
    "eta": 0.002,
    "max_depth": 3,
    "objective": "binary:logistic",
    "subsample": 0.5,
}

shap_values_lis = []
for target in list_prom:
    # Same 80/20 split (fixed seed) for every label.
    X_train, X_test, y_train, y_test = train_test_split(xi, target, test_size=0.2, random_state=7)
    xgb_train = xgb.DMatrix(X_train, label=y_train)
    xgb_test = xgb.DMatrix(X_test, label=y_test)
    model_train = xgb.train(params, xgb_train, 10000, evals=[(xgb_test, "test")], verbose_eval=1000)
    # SHAP values are computed on the full feature matrix, one array per label.
    shap_values = shap.TreeExplainer(model_train).shap_values(xi)
    shap_values_lis.append(shap_values)

In [97]:
# One SHAP summary plot per label.
# summary_plot expects (shap_values, features): each label's own SHAP array is
# passed together with the feature matrix xi. Previously the SHAP values left
# over from the last training iteration were plotted every time, with the
# per-label SHAP arrays passed in the features slot.
for i, label_shap_values in enumerate(shap_values_lis):
    # Use the real label name; the positional counter printed AcceptedCmp0..5.
    print("Shap values for {} ".format(yi.columns[i]))
    shap.summary_plot(label_shap_values, xi, feature_names=xi.columns)

From the SHAP values for each label, with respect to the modelled features, we can see that:
1. Education, Complain and Marital_Status_Single have a low impact on the model.
2. Income, Recency and Expense have a huge positive impact on the model.