In [1]:
# import required libraries for dataframe and visualization

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

# import required libraries for clustering
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids
from sklearn.metrics import silhouette_score
from kmodes.kprototypes import KPrototypes

ModuleNotFoundError: No module named 'sklearn_extra'

In [None]:
# Read in all orders.
orders = pd.read_csv('superstore_dataset2011-2015.csv', header=0, encoding='ISO-8859-1')

In [None]:
# Display dataset's data types.
orders.info()

In [None]:
# Dsiplay the dataset's number of rows and columns pre-formatting
orders.shape

In [None]:
# Display the dataset's basic stat.
orders.describe()

In [None]:
# Checking for null fields
df_null = round(100*(orders.isnull().sum())/len(orders), 2)
df_null

In [None]:
# Drop Postal Code, since 80% of this column's data is missing
orders = orders.drop(columns=['Postal Code'])

In [None]:
orders = orders.dropna()

# Check the shape after dropping the rows with missing values
orders.shape

In [None]:
# Monetary column: total Sales per customer.
rfm_m = (
    orders.groupby('Customer ID')['Sales']
    .sum()
    .reset_index()
)
rfm_m.columns = ['Customer ID', 'Sales']
rfm_m.head()

In [None]:
# Frequency column: number of distinct orders placed by each customer.
# `count()` would tally rows; if an order spans several rows (one per product
# line - presumably the case for this export, verify against the data), that
# inflates frequency with basket size. `nunique()` counts each Order ID once.
rfm_f = orders.groupby('Customer ID')['Order ID'].nunique()
rfm_f = rfm_f.reset_index()
rfm_f.columns = ['Customer ID', 'Frequency']
rfm_f.head()

In [None]:
# Recency column
orders['Order Date'] = orders['Order Date'].str.replace('/', '-')
orders['Order Date']


In [None]:
orders['Order Date'] = pd.to_datetime(orders['Order Date'],format='%d-%m-%Y')
orders['Order Date']

In [None]:
max_date = max(orders['Order Date'])
max_date

In [None]:
orders['Diff'] = max_date - orders['Order Date']
rfm_r = orders.groupby('Customer ID')['Diff'].min()
rfm_r = rfm_r.reset_index().sort_values(by='Diff', ascending=False)
rfm_r['Diff'] = rfm_r['Diff'].dt.days
rfm_r

In [None]:
# Combine the monetary, frequency and recency tables on Customer ID.
rfm = (
    rfm_m
    .merge(rfm_f, on='Customer ID', how='inner')
    .merge(rfm_r, on='Customer ID', how='inner')
)
rfm.columns = ['Customer ID', 'Amount', 'Frequency', 'Recency']
rfm.head()

In [None]:
# Distribution for Recency
sns.displot(rfm['Recency'])

In [None]:
# Distribution for Frequency
sns.displot(rfm['Frequency'])

In [None]:
# Distribution for Amount
sns.displot(rfm['Amount'])

In [None]:
# Box plots of the three RFM attributes to spot outliers visually.

attributes = ['Amount', 'Frequency', 'Recency']
plt.rc('figure', figsize=[10, 8])
sns.boxplot(
    data=rfm.loc[:, attributes],
    orient="v",
    palette="Set2",
    whis=1.5,
    saturation=1,
    width=0.7,
)
plt.title("Outliers Variable Distribution", fontdict={'fontsize': 14, 'fontweight': 'bold'})
plt.ylabel("Range", fontdict={'fontweight': 'bold'})
plt.xlabel("Attributes", fontdict={'fontweight': 'bold'})

In [None]:
def _drop_iqr_outliers(frame, column, whisker=1.5):
    """Return a copy of ``frame`` without rows whose ``column`` is an IQR outlier.

    A row is kept when its value lies inside the inclusive interval
    ``[Q1 - whisker*IQR, Q3 + whisker*IQR]``, where Q1/Q3 are the 25th/75th
    percentiles of ``frame[column]``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to filter; it is not modified.
    column : str
        Name of the numeric column the fences are computed on.
    whisker : float, default 1.5
        Multiplier applied to the IQR to position the fences.

    Returns
    -------
    pandas.DataFrame
        An independent copy, so adding columns to it later does not raise
        SettingWithCopyWarning.
    """
    q1 = frame[column].quantile(0.25)
    q3 = frame[column].quantile(0.75)
    iqr = q3 - q1
    keep = frame[column].between(q1 - whisker * iqr, q3 + whisker * iqr)
    return frame.loc[keep].copy()


# Removing (statistical) outliers for Amount, then Recency, then Frequency.
# The order matters: each column's quartiles are computed on the rows that
# survived the previous filter.
for column in ['Amount', 'Recency', 'Frequency']:
    rfm = _drop_iqr_outliers(rfm, column)

In [None]:
attributes = ['Amount','Frequency','Recency']
plt.rcParams['figure.figsize'] = [10,8]
sns.boxplot(data = rfm[attributes], orient="v", palette="Set2" ,whis=1.5,saturation=1, width=0.7)
plt.title("Outliers Variable Distribution", fontsize = 14, fontweight = 'bold')
plt.ylabel("Range", fontweight = 'bold')
plt.xlabel("Attributes", fontweight = 'bold')

In [None]:
# Distribution for Recency
sns.displot(rfm['Recency'])

In [None]:
# Distribution for Frequency
sns.displot(rfm['Frequency'])

In [None]:
# Distribution for Amount
sns.displot(rfm['Amount'])

In [None]:
# Rescaling the attributes

rfm_df = rfm[['Amount', 'Frequency', 'Recency']]

# Instantiate
scaler = MinMaxScaler()

# fit_transform
rfm_df_scaled = scaler.fit_transform(rfm_df)
rfm_df_scaled.shape

In [None]:
rfm_df_scaled = pd.DataFrame(rfm_df_scaled)
rfm_df_scaled.columns = ['Amount', 'Frequency', 'Recency']
rfm_df_scaled.head()

In [None]:
kmeans = KMeans(n_clusters=4, max_iter=50)
kmeans.fit(rfm_df_scaled)

In [None]:
kmeans.labels_

In [None]:
# Elbow-curve/SSD (Sum of Squares Distance)

ssd = []
range_n_clusters = [2, 3, 4, 5, 6, 7, 8]
for num_clusters in range_n_clusters:
    # Seeded so the curve is the same on every run.
    kmeans = KMeans(n_clusters=num_clusters, max_iter=50, random_state=42)
    kmeans.fit(rfm_df_scaled)

    ssd.append(kmeans.inertia_)

# Plot SSD against the actual number of clusters; plotting `ssd` alone would
# label the x-axis with list positions 0..6 instead of k.
plt.plot(range_n_clusters, ssd, marker='o')
plt.xlabel("Number of clusters")
plt.ylabel("SSD")
# Choose a balance point to avoid over-fitting.

In [None]:
# Silhouette analysis
range_n_clusters = [2, 3, 4, 5, 6, 7, 8]

for num_clusters in range_n_clusters:

    # initialise and fit K-Means; seeded so the scores are reproducible
    kmeans = KMeans(n_clusters=num_clusters, max_iter=50, random_state=42)
    kmeans.fit(rfm_df_scaled)

    cluster_labels = kmeans.labels_

    # mean silhouette coefficient over all samples
    silhouette_avg = silhouette_score(rfm_df_scaled, cluster_labels)
    print("For n_clusters={0}, the silhouette score is {1}".format(num_clusters, silhouette_avg))

# The higher the score, the more similar items are within a cluster and
# the more dissimilar items are between different clusters.
# Choose a balanced number of clusters to avoid over-fitting.

In [None]:
# Final model with k=4
kmeans = KMeans(n_clusters=4, max_iter=50)
kmeans.fit(rfm_df_scaled)

In [None]:
kmeans.labels_

In [None]:
# Attach the K-Means label of each customer (labels are in row order).
# `assign` builds a new frame, so this does not write into the filtered slice
# that `rfm` came from and does not raise SettingWithCopyWarning.
rfm = rfm.assign(Cluster_Id=kmeans.labels_)
rfm.head()

In [None]:
# Box plot to visualize Cluster Id vs Amount

sns.boxplot(x='Cluster_Id', y='Amount', data=rfm)

In [None]:
# Box plot to visualize Cluster Id vs Recency

sns.boxplot(x='Cluster_Id', y='Recency', data=rfm)

In [None]:
# Box plot to visualize Cluster Id vs Frequency

sns.boxplot(x='Cluster_Id', y='Frequency', data=rfm)

In [None]:
# Elbow curve for K-Medoids, using the model's inertia_ for each k

ssd = []
range_n_clusters = [2, 3, 4, 5, 6, 7, 8]
for num_clusters in range_n_clusters:
    kmedoids = KMedoids(n_clusters=num_clusters, max_iter=50)
    kmedoids.fit(rfm_df_scaled)

    ssd.append(kmedoids.inertia_)

# Plot inertia against the actual number of clusters; plotting `ssd` alone
# would label the x-axis with list positions 0..6 instead of k.
plt.plot(range_n_clusters, ssd, marker='o')
plt.xlabel("Number of clusters")
plt.ylabel("Inertia")
# Choose a balance point to avoid over-fitting.

In [None]:
# Silhouette analysis for K-Medoids on the scaled RFM attributes
range_n_clusters = [2, 3, 4, 5, 6, 7, 8]

for num_clusters in range_n_clusters:
    # fit K-Medoids for this candidate number of clusters
    kmedoids = KMedoids(n_clusters=num_clusters, max_iter=50)
    kmedoids = kmedoids.fit(rfm_df_scaled)
    cluster_labels = kmedoids.labels_

    # mean silhouette coefficient over all samples
    silhouette_avg = silhouette_score(rfm_df_scaled, cluster_labels)
    print("For n_clusters={0}, the silhouette score is {1}".format(num_clusters, silhouette_avg))

# The higher the score, the more similar items are within a cluster and
# the more dissimilar items are between different clusters.
# Choose a balanced number of clusters to avoid over-fitting.

In [None]:
# Final model with k=4
kmedoids = KMedoids(n_clusters=4, max_iter=50)
kmedoids.fit(rfm_df_scaled)

In [None]:
# assign the label
rfm['Cluster_Id'] = kmedoids.labels_
rfm.head()

In [None]:
# Box plot to visualize Cluster Id vs Amount

sns.boxplot(x='Cluster_Id', y='Amount', data=rfm)

In [None]:
# Box plot to visualize Cluster Id vs Recency

sns.boxplot(x='Cluster_Id', y='Recency', data=rfm)

In [None]:
# Box plot to visualize Cluster Id vs Frequency

sns.boxplot(x='Cluster_Id', y='Frequency', data=rfm)

In [None]:
# Profit column
profit_column = orders.groupby('City')['Profit'].sum()
profit_column = profit_column.reset_index()
profit_column.columns = ['City', 'Profit']
profit_column.head()

In [None]:
# Sales column
sales_column = orders.groupby('City')['Sales'].sum()
sales_column = sales_column.reset_index()
sales_column.columns = ['City', 'Sales']
sales_column.head()

In [None]:
# Shipping Cost column
shippingCost_column = orders.groupby('City')['Shipping Cost'].sum()
shippingCost_column = shippingCost_column.reset_index()
shippingCost_column.columns = ['City', 'ShippingCost']
shippingCost_column.head()

In [None]:
# Merging dataframes
profit_sales = pd.merge(profit_column, sales_column, on='City', how='inner')
profit_sales_shippingCost = pd.merge(profit_sales, shippingCost_column, on='City', how='inner')
profit_sales_shippingCost.columns = ['City', 'Profit', 'Sales', 'ShippingCost']
profit_sales_shippingCost.head()

In [None]:
# Detect outliers

attributes = ['Profit','Sales','ShippingCost']
plt.rcParams['figure.figsize'] = [10,8]
sns.boxplot(data = profit_sales_shippingCost[attributes], orient="v", palette="Set2" ,whis=1.5,saturation=1, width=0.7)
plt.title("Outliers Variable Distribution", fontsize = 14, fontweight = 'bold')
plt.ylabel("Range", fontweight = 'bold')
plt.xlabel("Attributes", fontweight = 'bold')

In [None]:
sns.displot(profit_sales_shippingCost['Profit'])

In [None]:
sns.displot(profit_sales_shippingCost['Sales'])

In [None]:
sns.displot(profit_sales_shippingCost['ShippingCost'])

In [None]:
# Filter the per-city table itself. Writing the result to `rfm` would
# overwrite the customer RFM table and leave `profit_sales_shippingCost`
# untouched, so the plots and clustering that follow would still see the
# outliers.

# Removing (statistical) outliers for Sales
Q1 = profit_sales_shippingCost.Sales.quantile(0.25)
Q3 = profit_sales_shippingCost.Sales.quantile(0.75)
IQR = Q3 - Q1
profit_sales_shippingCost = profit_sales_shippingCost[
    profit_sales_shippingCost.Sales.between(Q1 - 1.5*IQR, Q3 + 1.5*IQR)
]

# Removing (statistical) outliers for Shipping Cost.
# Quartiles are recomputed on the rows that survived the Sales filter, and the
# filter is applied on top of it rather than replacing it.
Q1 = profit_sales_shippingCost.ShippingCost.quantile(0.25)
Q3 = profit_sales_shippingCost.ShippingCost.quantile(0.75)
IQR = Q3 - Q1
# .copy() yields an independent frame rather than a slice of the original.
profit_sales_shippingCost = profit_sales_shippingCost[
    profit_sales_shippingCost.ShippingCost.between(Q1 - 1.5*IQR, Q3 + 1.5*IQR)
].copy()

In [None]:
sns.displot(profit_sales_shippingCost['Profit'])

In [None]:
sns.displot(profit_sales_shippingCost['Sales'])

In [None]:
sns.displot(profit_sales_shippingCost['ShippingCost'])

In [None]:
# Rescaling the attributes

psc = profit_sales_shippingCost[['Profit', 'Sales', 'ShippingCost']]

# Instantiate
scaler = MinMaxScaler()

# fit_transform
psc_scaled = scaler.fit_transform(psc)
psc_scaled.shape

In [None]:
psc_scaled = pd.DataFrame(psc_scaled)
psc_scaled.columns = ['Profit', 'Sales', 'ShippingCost']
psc_scaled.head()

In [None]:
# Elbow-curve/SSD

ssd = []
range_n_clusters = [2, 3, 4, 5, 6, 7, 8]
for num_clusters in range_n_clusters:
    # Seeded so the curve is the same on every run.
    kmeans = KMeans(n_clusters=num_clusters, max_iter=50, random_state=42)
    kmeans.fit(psc_scaled)

    ssd.append(kmeans.inertia_)

# Plot SSD against the actual number of clusters; plotting `ssd` alone would
# label the x-axis with list positions 0..6 instead of k.
plt.plot(range_n_clusters, ssd, marker='o')
plt.xlabel("Number of clusters")
plt.ylabel("SSD")
# Choose a balance point to avoid over-fitting.

In [None]:
# Silhouette analysis
range_n_clusters = [2, 3, 4, 5, 6, 7, 8]

for num_clusters in range_n_clusters:

    # initialise and fit K-Means; seeded so the scores are reproducible
    kmeans = KMeans(n_clusters=num_clusters, max_iter=50, random_state=42)
    kmeans.fit(psc_scaled)

    cluster_labels = kmeans.labels_

    # mean silhouette coefficient over all samples
    silhouette_avg = silhouette_score(psc_scaled, cluster_labels)
    print("For n_clusters={0}, the silhouette score is {1}".format(num_clusters, silhouette_avg))

# The higher the score, the more similar items are within a cluster and
# the more dissimilar items are between different clusters.
# Choose a balanced number of clusters to avoid over-fitting.

In [None]:
# Final model with k=4
kmeans = KMeans(n_clusters=4, max_iter=50)
kmeans.fit(psc_scaled)

In [None]:
 kmeans.labels_

In [None]:
# assign the label
psc['Cluster_Id'] = kmeans.labels_
psc.head()

In [None]:
# Box plot to visualize Cluster Id vs Profit

sns.boxplot(x='Cluster_Id', y='Profit', data=psc)

In [None]:
# Box plot to visualize Cluster Id vs Sales

sns.boxplot(x='Cluster_Id', y='Sales', data=psc)

In [None]:
# Box plot to visualize Cluster Id vs Shipping Cost

sns.boxplot(x='Cluster_Id', y='ShippingCost', data=psc)

In [None]:
# Elbow curve for K-Medoids, using the model's inertia_ for each k

ssd = []
range_n_clusters = [2, 3, 4, 5, 6, 7, 8]
for num_clusters in range_n_clusters:
    kmedoids = KMedoids(n_clusters=num_clusters, max_iter=50)
    kmedoids.fit(psc_scaled)

    ssd.append(kmedoids.inertia_)

# Plot inertia against the actual number of clusters; plotting `ssd` alone
# would label the x-axis with list positions 0..6 instead of k.
plt.plot(range_n_clusters, ssd, marker='o')
plt.xlabel("Number of clusters")
plt.ylabel("Inertia")
# Choose a balance point to avoid over-fitting.

In [None]:
# Silhouette analysis for K-Medoids on the scaled per-city attributes
range_n_clusters = [2, 3, 4, 5, 6, 7, 8]

for num_clusters in range_n_clusters:
    # fit K-Medoids for this candidate number of clusters
    kmedoids = KMedoids(n_clusters=num_clusters, max_iter=50)
    kmedoids = kmedoids.fit(psc_scaled)
    cluster_labels = kmedoids.labels_

    # mean silhouette coefficient over all samples
    silhouette_avg = silhouette_score(psc_scaled, cluster_labels)
    print("For n_clusters={0}, the silhouette score is {1}".format(num_clusters, silhouette_avg))

# The higher the score, the more similar items are within a cluster and
# the more dissimilar items are between different clusters.
# Choose a balanced number of clusters to avoid over-fitting.

In [None]:
# Final model with k=4
kmedoids = KMedoids(n_clusters=4, max_iter=50, init='build', method='pam')
kmedoids.fit(psc_scaled)

In [None]:
# assign the label
psc['Cluster_Id'] = kmedoids.labels_
psc.head()

In [None]:
# Box plot to visualize Cluster Id vs Profit

sns.boxplot(x='Cluster_Id', y='Profit', data=psc)

In [None]:
# Box plot to visualize Cluster Id vs Sales

sns.boxplot(x='Cluster_Id', y='Sales', data=psc)

In [None]:
# Box plot to visualize Cluster Id vs Shipping Cost

sns.boxplot(x='Cluster_Id', y='ShippingCost', data=psc)

In [None]:
categoricals = orders.loc[:, ['City', 'Category', 'Profit']].copy() 

In [None]:
categoricals

In [None]:
# Choose optimal K using Elbow method
ssd = []
fitted_k = []  # the k values that actually produced a cost, for the x-axis
for cluster in range(1, 10):
    try:
        kprototype = KPrototypes(n_clusters = cluster, init = 'Huang', random_state = 0, max_iter=50)
        # Columns 0 (City) and 1 (Category) are categorical; Profit is numeric.
        kprototype.fit_predict(categoricals, categorical = [0, 1])
        # kmodes exposes the clustering cost as `cost_`; it has no `inertia_`.
        ssd.append(kprototype.cost_)
        fitted_k.append(cluster)
        print('Cluster initiation: {}'.format(cluster))
    except Exception as exc:
        # Stop at the first k that cannot be fitted, but say why instead of
        # silently leaving the curve short or empty.
        print('Stopping at n_clusters={}: {!r}'.format(cluster, exc))
        break
# Plot the clustering cost against the number of clusters
plt.plot(fitted_k, ssd, marker='o')
plt.xlabel("Number of clusters")
plt.ylabel("Cost")