In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np
from scipy.stats import iqr
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [2]:
# Configure how matplotlib figures are rendered in this notebook.
import matplotlib
# Explicitly select the 'nbagg' backend.
matplotlib.use('nbagg')
# NOTE(review): the inline magic below also selects a backend and presumably
# supersedes the 'nbagg' choice above — confirm which of the two is intended.
%matplotlib inline


In [3]:
# Load the customer dataset from a CSV file located in the working directory.
df = pd.read_csv('OVcustomers.csv')
# Bare expression: shows the loaded frame as this cell's output.
df

Unnamed: 0,custid,av_on_web,av_purch_price,days_since_purch,Loyalty Member,av_purch_time
0,321,49.40,549.42,504,1,20.97
1,322,46.51,560.48,360,0,23.00
2,323,69.04,804.07,421,0,13.02
3,324,66.55,806.24,5,0,23.25
4,325,65.26,882.50,182,1,12.53
...,...,...,...,...,...,...
1157,1478,34.00,895.00,680,0,1.43
1158,1479,76.00,569.00,268,1,13.20
1159,1480,13.00,473.00,23,1,4.97
1160,1481,67.00,156.00,101,1,17.28


In [4]:
# Check whether any cell in the whole frame is null: evaluates to True if at
# least one missing value exists in any column, False otherwise.

df.isnull().values.any()

False

In [5]:
# Split the customers on the "Loyalty Member" flag:
# df1 keeps the rows where the flag equals 1, df0 the rows where it equals 0.
df1 = df.loc[df["Loyalty Member"].eq(1)]
df0 = df.loc[df["Loyalty Member"].eq(0)]

In [6]:
# Pairwise correlation between every pair of columns in the full customer
# frame (DataFrame.corr uses the Pearson coefficient by default).
df.corr()

Unnamed: 0,custid,av_on_web,av_purch_price,days_since_purch,Loyalty Member,av_purch_time
custid,1.0,-0.100192,-0.163852,-0.003573,-0.020787,-0.006179
av_on_web,-0.100192,1.0,0.737694,0.011154,-0.021718,-0.000767
av_purch_price,-0.163852,0.737694,1.0,0.025476,-0.028548,0.028317
days_since_purch,-0.003573,0.011154,0.025476,1.0,-0.04844,0.002336
Loyalty Member,-0.020787,-0.021718,-0.028548,-0.04844,1.0,-0.010406
av_purch_time,-0.006179,-0.000767,0.028317,0.002336,-0.010406,1.0


In [7]:
# Pairwise correlation restricted to loyalty-group customers (df1).
# "Loyalty Member" is constant within df1 (every row has the value 1), so it
# has no variance and its row/column in the result is empty.
df1.corr()

Unnamed: 0,custid,av_on_web,av_purch_price,days_since_purch,Loyalty Member,av_purch_time
custid,1.0,-0.084574,-0.185092,0.053838,,0.016297
av_on_web,-0.084574,1.0,0.715146,0.002063,,0.004721
av_purch_price,-0.185092,0.715146,1.0,-0.021547,,0.053927
days_since_purch,0.053838,0.002063,-0.021547,1.0,,0.039024
Loyalty Member,,,,,,
av_purch_time,0.016297,0.004721,0.053927,0.039024,,1.0


In [8]:
# Pairwise correlation restricted to non-loyalty customers (df0).
# "Loyalty Member" is constant within df0 (every row has the value 0), so it
# has no variance and its row/column in the result is empty.
df0.corr()

Unnamed: 0,custid,av_on_web,av_purch_price,days_since_purch,Loyalty Member,av_purch_time
custid,1.0,-0.115446,-0.144793,-0.060609,,-0.027475
av_on_web,-0.115446,1.0,0.757664,0.016887,,-0.005636
av_purch_price,-0.144793,0.757664,1.0,0.065781,,0.005187
days_since_purch,-0.060609,0.016887,0.065781,1.0,,-0.030727
Loyalty Member,,,,,,
av_purch_time,-0.027475,-0.005636,0.005187,-0.030727,,1.0


The only notable correlation is between av_on_web and av_purch_price, which show a strong positive relationship. The remaining pairs of columns show no meaningful correlation.

In [None]:
# Seaborn pair plot: grid of pairwise relationships between all columns of the
# full customer frame.
# https://seaborn.pydata.org/generated/seaborn.pairplot.html
# (seaborn is already imported as `sns` in the notebook's import cell, so it is
# not re-imported here.)

sns.pairplot(df)

<seaborn.axisgrid.PairGrid at 0x1cac508e430>

In [None]:
# Seaborn pair plot: grid of pairwise relationships between all columns,
# restricted to the loyalty group (df1).
# https://seaborn.pydata.org/generated/seaborn.pairplot.html
# (seaborn is already imported as `sns` in the notebook's import cell, so it is
# not re-imported here.)

sns.pairplot(df1)

In [None]:
# Seaborn pair plot: grid of pairwise relationships between all columns,
# restricted to the non-loyalty group (df0).
# https://seaborn.pydata.org/generated/seaborn.pairplot.html
# (seaborn is already imported as `sns` in the notebook's import cell, so it is
# not re-imported here.)

sns.pairplot(df0)

In [None]:
# Exploratory plot for the strongly correlated pair (av_on_web vs
# av_purch_price): bucket av_on_web into labelled bands.

def group(av_on_web):
    """Return the band label for a single av_on_web value.

    Bands are checked in ascending order, so each value lands in the first
    band whose upper bound it falls under:

        value <= 8        -> "0-8"
        8  < value < 17   -> "9-16"
        17 <= value < 25  -> "17-24"
        ...               (8-wide steps)
        81 <= value < 89  -> "81-88"
        value >= 89       -> ">89"

    The boundary values 8 and 89 are assigned to "0-8" and ">89"
    respectively; previously they matched no branch and produced None, which
    silently removed those customers from any groupby on the band column.
    Every other value keeps the label it had before.

    Parameters
    ----------
    av_on_web : int or float
        A single av_on_web value.

    Returns
    -------
    str or None
        The band label, or None when the value is NaN.
    """
    # NaN is the only value that is unequal to itself; keep returning None
    # for it instead of letting it fall through to the last band.
    if av_on_web != av_on_web:
        return None

    # The lowest band includes its upper edge, matching its "0-8" label.
    if av_on_web <= 8:
        return "0-8"

    # (exclusive upper bound, label) in ascending order.
    bands = (
        (17, "9-16"),
        (25, "17-24"),
        (33, "25-32"),
        (41, "33-40"),
        (49, "41-48"),
        (57, "49-56"),
        (65, "57-64"),
        (73, "65-72"),
        (81, "73-80"),
        (89, "81-88"),
    )
    for upper, label in bands:
        if av_on_web < upper:
            return label

    # Everything from 89 upwards.
    return ">89"

# Label every customer with the av_on_web band returned by group().
df["av_on_web Group"] = df["av_on_web"].apply(group)

# Display order of the band labels on the plotly x axis.
order = [
    "0-8", "9-16", "17-24", "25-32", "33-40", "41-48",
    "49-56", "57-64", "65-72", "73-80", "81-88", ">89",
]

# Median purchase price per band, flattened back to a two-column frame.
mask = (
    df.groupby("av_on_web Group")["av_purch_price"]
    .median()
    .reset_index()
)
fig = px.bar(data_frame=mask, x="av_on_web Group", y="av_purch_price", height=500)

# One text label per bar, placed 30 units above the bar top and showing the
# median rounded to two decimals followed by '$'.
annotation = [
    dict(
        x=band,
        y=median_price + 30,
        text=str(round(median_price, 2)) + '$',
        font=dict(family='Arial', size=14, color='rgb(66, 99, 236)'),
        showarrow=False,
    )
    for band, median_price in zip(mask["av_on_web Group"], mask["av_purch_price"])
]

fig.update_xaxes(categoryorder='array', categoryarray=order)
fig.update_layout(annotations=annotation)
fig.show()

In [None]:
# K-means clustering on (av_on_web, av_purch_price): scan k = 1..10 and plot
# the inertia curve to choose the number of customer groups.
# 3 is optimal from the Elbow plot.
# StandardScaler and KMeans are already imported in the notebook's import cell,
# so they are not re-imported here.

data = df[["av_on_web", "av_purch_price"]]

# Log-transform both features, then standardise them before clustering.
# NOTE(review): np.log gives -inf for zeros; this assumes both columns are
# strictly positive — confirm against the data.
df_log = np.log(data)

std_scaler = StandardScaler()
df_scaled = std_scaler.fit_transform(df_log)

# Collect the fitted model's inertia_ for each candidate k; this is the value
# plotted on the 'SSE' axis below.
errors = []
for k in range(1, 11):
    model = KMeans(n_clusters=k, random_state=42)
    model.fit(df_scaled)
    errors.append(model.inertia_)

# Elbow plot: inertia against k, also written to disk as "Elbow.png".
plt.title('The Elbow Method')
plt.xlabel('k')
plt.ylabel('SSE')
sns.pointplot(x=list(range(1, 11)), y=errors)
plt.savefig("Elbow.png")



In [None]:
# Fit the final K-means model on the scaled features with 3 clusters (the k
# chosen from the elbow plot); random_state=42 fixes the random seed.
model = KMeans(n_clusters=3, random_state=42)
model.fit(df_scaled)


In [None]:
# Attach each row's fitted cluster id as a "ClusterLabel" column.
data = data.assign(ClusterLabel= model.labels_)

# Median of both clustered features within each cluster (cell output).
data.groupby("ClusterLabel")[["av_on_web", "av_purch_price"]].median()

In [None]:
# Scatter of the two clustered features, one point per customer, coloured by
# K-means cluster id to show the three customer segments.

fig = px.scatter(
    data,
    x="av_on_web",
    y="av_purch_price",
    color="ClusterLabel",
    title="Relationship between av_on_web VS av_purch_price",
    height=500,
)
fig.show()

From av_on_web and av_purch_price attributes, there are three resulting customer segments with median values in table above.

Group 0 - Customers who spend medium time on web & medium average purchase price.

Group 1 - Customers who spend the highest time on web & high average purchase price.

Group 2 - Customers who spend the least time on web & spend the least on purchases.

Based on this grouping, adverts for the highest-priced items should target customers with the highest average time on web,
adverts for medium-priced items should target customers with medium average time on web, and adverts for the lowest-priced items should target customers with the least average time on web.

In [None]:
# Exploratory plot for av_purch_time vs av_purch_price, whose correlation
# (0.028317) is effectively zero.
# Bucket av_purch_time into 3-hour windows across the 24 hours of the day.
# Note: this redefines the `group` helper used earlier for av_on_web.

def group(av_purch_time):
    """Return the 3-hour window label for a single av_purch_time value.

    Windows are tested in ascending order and the first one whose upper
    bound exceeds the value wins: below 3 -> "0-2.98", below 6 -> "3-5.98",
    ... below 21 -> "18-20.98", and 21 or more -> ">21". A value that
    satisfies none of the comparisons (NaN) yields None.
    """
    # (exclusive upper bound, label), lowest window first.
    windows = (
        (3, "0-2.98"),
        (6, "3-5.98"),
        (9, "6-8.98"),
        (12, "9-11.98"),
        (15, "12-14.98"),
        (18, "15-17.98"),
        (21, "18-20.98"),
    )
    for upper, label in windows:
        if av_purch_time < upper:
            return label

    if av_purch_time > 20.98:
        return ">21"
    return None



# Label every customer with the av_purch_time window returned by group().
df["av_purch_time Group"] = df["av_purch_time"].apply(group)

# Display order of the window labels on the plotly x axis.
order = [
    "0-2.98", "3-5.98", "6-8.98", "9-11.98",
    "12-14.98", "15-17.98", "18-20.98", ">21",
]

# Median purchase price per time window, flattened back to a two-column frame.
mask = (
    df.groupby("av_purch_time Group")["av_purch_price"]
    .median()
    .reset_index()
)
fig = px.bar(data_frame=mask, x="av_purch_time Group", y="av_purch_price", height=500)

# One text label per bar, placed 30 units above the bar top and showing the
# median rounded to two decimals followed by '$'.
annotation = [
    dict(
        x=window,
        y=median_price + 30,
        text=str(round(median_price, 2)) + '$',
        font=dict(family='Arial', size=14, color='rgb(66, 99, 236)'),
        showarrow=False,
    )
    for window, median_price in zip(mask["av_purch_time Group"], mask["av_purch_price"])
]

fig.update_xaxes(categoryorder='array', categoryarray=order)
fig.update_layout(annotations=annotation)
fig.show()

From the plot above, no purchase-time window within the 24-hour day stands out: the median purchase price is similar across all windows. Consistent with the near-zero correlation, this attribute comparison yields no customer segments.

In [None]:
# K-means clustering on (av_on_web, days_since_purch): scan k = 1..10 and plot
# the inertia curve to choose the number of customer groups.
# 4 is optimal from the Elbow plot for this attribute comparison.
# StandardScaler and KMeans are already imported in the notebook's import cell,
# so they are not re-imported here.

data = df[["av_on_web", "days_since_purch"]]

# Log-transform both features, then standardise them before clustering.
# NOTE(review): np.log gives -inf for zeros; this assumes both columns are
# strictly positive — confirm against the data.
df_log = np.log(data)

std_scaler = StandardScaler()
df_scaled = std_scaler.fit_transform(df_log)

# Collect the fitted model's inertia_ for each candidate k; this is the value
# plotted on the 'SSE' axis below.
errors = []
for k in range(1, 11):
    model = KMeans(n_clusters=k, random_state=42)
    model.fit(df_scaled)
    errors.append(model.inertia_)

# Elbow plot: inertia against k.
# NOTE(review): this writes to the same "Elbow.png" path as the earlier elbow
# cell, so the earlier image is overwritten — confirm that is intended.
plt.title('The Elbow Method')
plt.xlabel('k')
plt.ylabel('SSE')
sns.pointplot(x=list(range(1, 11)), y=errors)
plt.savefig("Elbow.png")



In [None]:
# Fit the final K-means model on the scaled features with 4 clusters (the k
# chosen from the elbow plot); random_state=42 fixes the random seed.
model = KMeans(n_clusters=4, random_state=42)
model.fit(df_scaled)

In [None]:
# Attach each row's fitted cluster id as a "ClusterLabel" column.
data = data.assign(ClusterLabel= model.labels_)
# Median of both clustered features within each cluster (cell output).
data.groupby("ClusterLabel")[["av_on_web", "days_since_purch"]].median()

In [None]:
# Scatter of the two clustered features, one point per customer, coloured by
# K-means cluster id to show the four customer segments.

fig = px.scatter(
    data,
    x="av_on_web",
    y="days_since_purch",
    color="ClusterLabel",
    title="Relationship between av_on_web VS days_since_purch",
    height=500,
)
fig.show()

From av_on_web and days_since_purch attributes, there are four resulting customer segments with median values in table above.

Group 0 - Customers who spend medium to the highest time on web & 122 days_since_purch(4 months)

Group 1 - Customers who spend the least time on web & 392 days_since_purch(13 months)

Group 2 - Customers who spend least to the highest time on web & 24 days_since_purch(less than a month)

Group 3 - Customers who spend medium to the highest time on web & 456 days_since_purch (about 15 months)

From these customer segments, customers who spend the most time on the web, and who correspondingly have a high average purchase price, take between 4 and 15 months before they make another purchase; the marketing team should therefore target more adverts at that group.