# Customer
Grouping customers in various ways to understand their behavior and preferences.

## Importing Libraries

In [112]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from common_functions import get_engine, read_data_return_df, drop_sk_datetime_added_columns as drop_columns
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances, silhouette_score
from sklearn.decomposition import PCA

## Connecting to the Database

In [113]:
# Create the database handle that is passed to every read_data_return_df call
# in the data-loading cells of this notebook.
# NOTE(review): get_engine is imported from the project-local common_functions
# module; where it reads its connection settings from is not visible here.
engine = get_engine()

## Reading in the Data

In [114]:
# Load every column of the Customer table into a DataFrame.
query = "SELECT * FROM Customer"
customers = read_data_return_df(query, engine)

# Preview only the first rows (like the other load cells) instead of rendering
# the whole customer table into the notebook output.
customers.head()

In [115]:
# Load every column of the BusinessEntity table into a DataFrame.
query = 'SELECT * FROM BusinessEntity'
business_entity = read_data_return_df(query, engine)

# Preview the first rows to sanity-check the load.
business_entity.head()

In [116]:
# Load every column of the BusinessEntityAddress table into a DataFrame.
query = 'SELECT * FROM BusinessEntityAddress'
business_entity_address = read_data_return_df(query, engine)

# Preview the first rows to sanity-check the load.
business_entity_address.head()

In [117]:
# Load every column of the Territory table into a DataFrame.
query = 'SELECT * FROM Territory'
territory = read_data_return_df(query, engine)

# Preview the first rows to sanity-check the load.
territory.head()

## Data Cleaning

In [118]:
# Join customers -> business entities -> addresses -> territories in one chain.
# Each step uses pandas' default inner join, so rows without a match on the
# join keys are dropped at that step.
# NOTE(review): if an entity can have several address rows, the second merge
# will repeat that customer once per address — confirm this is intended.
combined_df = (
    customers
    .merge(
        business_entity,
        left_on='CUSTOMER_CUSTOMER_CustomerID',
        right_on='BUSINESSENTITY_BUSINESSENTITY_BusinessEntityID',
        suffixes=('_customer', '_business_entity'),
    )
    .merge(
        business_entity_address,
        left_on='BUSINESSENTITY_sk',
        right_on='BUSINESSENTITYADDRESS_BUSINESSENTITYADDRESS_BusinessEntityID',
        suffixes=('', '_business_entity_address'),
    )
    .merge(
        territory,
        left_on='CUSTOMER_CUSTOMER_TerritoryID',
        right_on='TERRITORY_sk',
        suffixes=('', '_territory'),
    )
)

# Remove the datetime_added and sk bookkeeping columns.
# NOTE(review): the return value is not assigned, so this only changes
# combined_df if the helper mutates its argument in place — confirm.
drop_columns(combined_df)

In [119]:
# Columns retained as clustering features; every other column is discarded below.
columns_to_keep = [
    'CUSTOMER_CUSTOMER_CustomerID',
    'CUSTOMER_CUSTOMER_StoreID',
    'CUSTOMER_CUSTOMER_TerritoryID',
    'CUSTOMER_STORE_SalesPersonID',
    'CUSTOMER_SALESTERRITORY_CountryRegionCode',
    'CUSTOMER_SALESTERRITORY_SalesYTD',
    'CUSTOMER_SALESTERRITORY_SalesLastYear',
    'BUSINESSENTITY_CONTACTTYPE_ContactTypeID',
    'BUSINESSENTITYADDRESS_ADDRESSTYPE_AddressTypeID',
    'TERRITORY_TERRITORIES_TerritoryID',
    'TERRITORY_STATEPROVINCE_StateProvinceID',
    'TERRITORY_STATEPROVINCE_IsOnlyStateProvinceFlag',
    'TERRITORY_SALESTAXRATE_SalesTaxRateID',
    'BUSINESSENTITYADDRESS_ADDRESS_City',
    'BUSINESSENTITYADDRESS_ADDRESS_POSTALCODE',
]

# Replace the text columns with their pandas category codes so that they are
# numeric (missing values are encoded as -1 by `.cat.codes`).
# NOTE(review): the codes are arbitrary ordinals, so numeric distances between
# two cities or postal codes carry no real meaning for the clustering.
text_columns = [
    'BUSINESSENTITYADDRESS_ADDRESS_City',
    'BUSINESSENTITYADDRESS_ADDRESS_POSTALCODE',
]
for column in text_columns:
    combined_df[column] = combined_df[column].astype('category').cat.codes

# Keep only the selected feature columns, in the order listed above.
combined_df = combined_df[columns_to_keep]

combined_df.head()

## Clustering
Using K-means (scikit-learn's `KMeans`).

In [120]:
# Candidate cluster counts: 2 .. MAX_K inclusive.
# The default keeps the exhaustive sweep up to (number of rows - 1), which is
# the largest k for which a silhouette score is defined. Every k costs one
# KMeans fit plus one silhouette computation, so lower MAX_K to bound runtime.
MAX_K = combined_df.shape[0] - 1
ks = range(2, MAX_K + 1)

# Per-k metrics, appended in the same order as ks
intra_distances = []
inter_distances = []
silhouette_scores = []

for k in ks:
    # Fixed seed and explicit n_init so that re-running the notebook (or running
    # it on an sklearn version with a different n_init default) reproduces the
    # same clustering for each k.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(combined_df)

    # Intra-cluster measure: inertia_, the sum of *squared* distances of the
    # samples to their closest cluster center.
    intra_distance = kmeans.inertia_
    intra_distances.append(intra_distance)

    # Inter-cluster measure: mean distance between distinct pairs of centers.
    # Only the strict upper triangle of the distance matrix is averaged; the
    # full matrix also contains the zero diagonal (each center to itself),
    # which would understate the mean by a factor of (k - 1) / k.
    cluster_centers = kmeans.cluster_centers_
    center_distances = pairwise_distances(cluster_centers)
    pair_index = np.triu_indices_from(center_distances, k=1)
    inter_distance = center_distances[pair_index].mean()
    inter_distances.append(inter_distance)

    # Silhouette score of this clustering over all samples
    silhouette_scores.append(silhouette_score(combined_df, kmeans.labels_))

# One row per candidate k with its three metrics
k_df = pd.DataFrame({
    'k': list(ks),
    'intra_distance': intra_distances,
    'inter_distance': inter_distances,
    'silhouette_score': silhouette_scores,
})
k_df

### Plotting the intra- and inter-cluster distances and the silhouette scores

In [121]:
# One line chart per metric collected during the k sweep, drawn in this order:
# intra-cluster distance, inter-cluster distance, silhouette score.
# Each entry is (values, figure title, y-axis label).
metric_plots = [
    (
        intra_distances,
        'Intra-cluster distances for different values of k',
        'Intra-cluster distance (inertia)',
    ),
    (
        inter_distances,
        'Inter-cluster distances for different values of k',
        'Inter-cluster distance',
    ),
    (
        silhouette_scores,
        'Silhouette scores for different values of k',
        'Silhouette score',
    ),
]

for values, title, y_label in metric_plots:
    plt.figure(figsize=(20, 10))
    plt.plot(ks, values, marker='o')
    plt.title(title)
    plt.xlabel('Number of clusters (k)')
    plt.ylabel(y_label)
    plt.grid(True)
    plt.show()

### Finding the optimal number of clusters

In [122]:
# Pick the k with the highest silhouette score.
# The winning position is looked up in ks itself rather than adding a fixed
# offset, so the result stays correct if the candidate range is ever changed
# to start at a value other than 2.
best_index = int(np.argmax(silhouette_scores))
optimal_k = ks[best_index]

print(f"Optimal number of clusters based on silhouette score: k={optimal_k}")

In [123]:
# Train a KMeans model with the optimal number of clusters.
# random_state and n_init are pinned so the cluster labels (and therefore the
# plots below) are identical on every Restart & Run All, independent of the
# installed sklearn version's n_init default.
kmeans_optimal = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
kmeans_optimal.fit(combined_df)

### Plotting the clusters

In [124]:
# Colors used for the clusters; a cluster label is mapped to an entry by
# `label % len(colors)`, so colors repeat when there are more than 10 clusters.
# 'purple' replaces 'azure' (#F0FFFF), which is almost white and therefore
# practically invisible on the default white plot background.
colors = ['deeppink', 'blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'purple', 'orange', 'brown']

In [125]:
# Cluster label assigned to every row by the fitted model, plus the distinct labels
cluster_assignments = kmeans_optimal.labels_
cluster_ids = np.unique(cluster_assignments)

# Project the features onto their first two principal components so that the
# clusters can be drawn in a 2D scatter plot
pca = PCA(n_components=2)
df_pca = pca.fit_transform(combined_df)

# One color per cluster label (colors repeat when there are more labels than
# entries in the colors list), then the color of every individual row
color_map = {label: colors[label % len(colors)] for label in cluster_ids}
color_assignments = [color_map[label] for label in cluster_assignments]

# Scatter plot of the projected data, colored by cluster assignment.
# The axes are principal components, so they only get generic names.
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title('Visualization of clusters')
ax.set_xlabel('PCA X')
ax.set_ylabel('PCA Y')
scatter = ax.scatter(df_pca[:, 0], df_pca[:, 1], c=np.array(color_assignments))

# Legend built from one proxy marker per cluster label
handles = [
    plt.Line2D([], [], marker="o", ls="", color=color_map[label])
    for label in cluster_ids
]
ax.legend(handles, cluster_ids, title="Clusters", bbox_to_anchor=(1, 1))

plt.show()

In [126]:
# Get the unique cluster labels
unique_clusters = np.unique(cluster_assignments)

# One subplot row per cluster. squeeze=False makes plt.subplots always return a
# 2-D array of Axes, so the loop below also works when there is only a single
# cluster label (otherwise a bare Axes object would be returned).
fig, axs = plt.subplots(
    len(unique_clusters),
    figsize=(10, 5 * len(unique_clusters)),
    squeeze=False,
)
axs = axs.ravel()

for ax, cluster in zip(axs, unique_clusters):
    # Projected data points that belong to this cluster
    cluster_data = df_pca[cluster_assignments == cluster]

    # Color is chosen by cluster label (not by loop position), which is the
    # same label-to-color rule the combined scatter plot uses; the two would
    # differ if the labels were not exactly 0..k-1.
    ax.scatter(cluster_data[:, 0], cluster_data[:, 1], color=colors[cluster % len(colors)])
    ax.set_title(f'Cluster {cluster}')
    ax.set_xlabel('PCA X')
    ax.set_ylabel('PCA Y')

# Display the plots
plt.tight_layout()
plt.show()