# Clustering

Find customer clusters based on revenues from the 1st quarter of 2019 through the 3rd quarter of 2022. The first part uses the elbow method to find the ideal number of clusters; the second part fits k-means with that number of clusters to obtain the actual clusters.

In [None]:
# Import necessary libraries
import pickle
import numpy as np
import matplotlib.pyplot as plt
from kneed import KneeLocator
from sklearn.cluster import KMeans

## Elbow method

### Recurring customers

In [None]:
# Load the recurring-customer data set from its pickle file.
# A forward slash is used as the separator so the path resolves on Windows,
# Linux and macOS alike (a backslash is only a separator on Windows).
# NOTE(review): pickle.load can execute arbitrary code - only load files
# produced by this project's own pipeline.
with open('Data final/data_recurring_imputed.pkl', 'rb') as file:
    data_recurring = pickle.load(file)

In [None]:
# Keyword arguments shared by every KMeans instance built during the elbow search.
# The fixed random_state keeps repeated runs of the notebook comparable.
kmeans_kwargs = dict(
    init="random",
    n_init=10,
    max_iter=300,
    random_state=42,
)

In [None]:
# Fit k-means once for every candidate number of clusters k = 1..10 and
# record the resulting SSE (inertia) for the elbow curve.
# The revenue values are turned into a single-feature column once, outside
# the loop, because the input is identical for every k.
revenue_recurring = np.array(data_recurring["Total_revenue"]).reshape(-1, 1)
sse = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    kmeans.fit(revenue_recurring)
    sse.append(kmeans.inertia_)

In [None]:
# Draw the elbow curve: SSE as a function of the number of clusters
candidate_ks = range(1, 11)
plt.style.use("fivethirtyeight")
plt.plot(candidate_ks, sse)
plt.xticks(candidate_ks)
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")

# Write the figure to disk, then display it
plt.savefig("Elbow method for recurring.jpeg", bbox_inches='tight')

plt.show()

In [None]:
# Locate the elbow of the decreasing, convex SSE curve; the last expression
# displays the suggested number of clusters as the cell output
candidate_ks = range(1, 11)
kl = KneeLocator(candidate_ks, sse, curve="convex", direction="decreasing")
kl.elbow

### New customers

In [None]:
# Load the new-customer data set from its pickle file.
# A forward slash is used as the separator so the path resolves on Windows,
# Linux and macOS alike (a backslash is only a separator on Windows).
# NOTE(review): pickle.load can execute arbitrary code - only load files
# produced by this project's own pipeline.
with open('Data final/data_new_imputed.pkl', 'rb') as file:
    data_new = pickle.load(file)

In [None]:
# Keyword arguments shared by every KMeans instance built during the elbow search
# for new customers (same settings as used for recurring customers).
kmeans_kwargs = dict(
    init="random",
    n_init=10,
    max_iter=300,
    random_state=42,
)

In [None]:
# Fit k-means once for every candidate number of clusters k = 1..10 and
# record the resulting SSE (inertia) for the elbow curve.
# The revenue values are turned into a single-feature column once, outside
# the loop, because the input is identical for every k.
revenue_new = np.array(data_new["Total_revenue"]).reshape(-1, 1)
sse = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    kmeans.fit(revenue_new)
    sse.append(kmeans.inertia_)

In [None]:
# Draw the elbow curve: SSE as a function of the number of clusters
candidate_ks = range(1, 11)
plt.style.use("fivethirtyeight")
plt.plot(candidate_ks, sse)
plt.xticks(candidate_ks)
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")

# Write the figure to disk, then display it
plt.savefig("Elbow method for new.jpeg", bbox_inches='tight')

plt.show()

In [None]:
# Locate the elbow of the decreasing, convex SSE curve; the last expression
# displays the suggested number of clusters as the cell output
candidate_ks = range(1, 11)
kl = KneeLocator(candidate_ks, sse, curve="convex", direction="decreasing")
kl.elbow

## Learn the clusters on data from Q1'19 - Q3'22

### Recurring customers 

In [None]:
# Configure the final k-means model for recurring customers
kmeans_recurring = KMeans(
    n_clusters=3,  # best value selected using the Elbow method
    init="random",
    n_init=10,
    max_iter=300,
    random_state=42,
)

In [None]:
# Fit the model on total revenue, reshaped into a single-feature column
revenue_column = np.array(data_recurring["Total_revenue"]).reshape(-1, 1)
kmeans_recurring.fit(revenue_column)

In [None]:
# Show where each cluster centroid ended up after fitting
recurring_centroids = kmeans_recurring.cluster_centers_
print(recurring_centroids)

In [None]:
# Cluster id assigned by k-means to each observation
recurring_labels = kmeans_recurring.labels_
print(recurring_labels)

# Attach the cluster ids to a copy of the data so the original stays untouched
data_recurring_clusters = data_recurring.copy()
data_recurring_clusters["Cluster"] = recurring_labels

In [None]:
# Relabel the clusters so that 0 = lowest, 1 = middle, 2 = highest revenue centroid.
# The ids k-means assigns are arbitrary, so the mapping is derived from the
# fitted centroids instead of being hard-coded from one particular run.
# np.argsort lists the raw labels from lowest to highest centroid; the position
# of a raw label in that list is its new, revenue-ordered label.
centroid_order = np.argsort(kmeans_recurring.cluster_centers_.ravel())
mapping = {int(raw_label): rank for rank, raw_label in enumerate(centroid_order)}
# Map from the model's raw labels (not from the already-written column) so that
# re-running this cell does not apply the permutation a second time.
data_recurring_clusters["Cluster"] = [mapping[raw_label] for raw_label in kmeans_recurring.labels_]

In [None]:
# Split the labelled data into one subset per cluster id
recurring_cluster_ids = data_recurring_clusters["Cluster"]
# Cluster 0
data_recurring_clusters_low = data_recurring_clusters.loc[recurring_cluster_ids == 0]
# Cluster 1
data_recurring_clusters_mid = data_recurring_clusters.loc[recurring_cluster_ids == 1]
# Cluster 2
data_recurring_clusters_high = data_recurring_clusters.loc[recurring_cluster_ids == 2]

In [None]:
# Save one pickle file per cluster subset.
# Forward slashes keep the paths valid on Windows, Linux and macOS alike
# (a backslash is only a path separator on Windows).
recurring_outputs = [
    ('Data clustering/data_recurring_clusters_high.pkl', data_recurring_clusters_high),
    ('Data clustering/data_recurring_clusters_mid.pkl', data_recurring_clusters_mid),
    ('Data clustering/data_recurring_clusters_low.pkl', data_recurring_clusters_low),
]
for output_path, cluster_data in recurring_outputs:
    with open(output_path, 'wb') as file:
        pickle.dump(cluster_data, file)

In [None]:
# Save the complete data set together with its cluster assignments.
# A forward slash keeps the path valid on Windows, Linux and macOS alike
# (a backslash is only a path separator on Windows).
with open('Data clustering/data_recurring_clusters.pkl', 'wb') as file:
    pickle.dump(data_recurring_clusters, file)

### New customers 

In [None]:
# Configure the final k-means model for new customers
kmeans_new = KMeans(
    n_clusters=3,  # best value selected using the Elbow method
    init="random",
    n_init=10,
    max_iter=300,
    random_state=42,
)

In [None]:
# Fit the model on total revenue, reshaped into a single-feature column
revenue_column = np.array(data_new["Total_revenue"]).reshape(-1, 1)
kmeans_new.fit(revenue_column)

In [None]:
# Show where each cluster centroid ended up after fitting
new_centroids = kmeans_new.cluster_centers_
print(new_centroids)

In [None]:
# Cluster id assigned by k-means to each observation
new_labels = kmeans_new.labels_
print(new_labels)

# Attach the cluster ids to a copy of the data so the original stays untouched
data_new_clusters = data_new.copy()
data_new_clusters["Cluster"] = new_labels

In [None]:
# Relabel the clusters so that 0 = lowest, 1 = middle, 2 = highest revenue centroid.
# The ids k-means assigns are arbitrary, so the mapping is derived from the
# fitted centroids instead of being hard-coded from one particular run.
# np.argsort lists the raw labels from lowest to highest centroid; the position
# of a raw label in that list is its new, revenue-ordered label.
centroid_order = np.argsort(kmeans_new.cluster_centers_.ravel())
mapping = {int(raw_label): rank for rank, raw_label in enumerate(centroid_order)}
# Map from the model's raw labels (not from the already-written column) so that
# re-running this cell does not apply the permutation a second time.
data_new_clusters["Cluster"] = [mapping[raw_label] for raw_label in kmeans_new.labels_]

In [None]:
# Split the labelled data into one subset per cluster id
new_cluster_ids = data_new_clusters["Cluster"]
# Cluster 0
data_new_clusters_low = data_new_clusters.loc[new_cluster_ids == 0]
# Cluster 1
data_new_clusters_mid = data_new_clusters.loc[new_cluster_ids == 1]
# Cluster 2
data_new_clusters_high = data_new_clusters.loc[new_cluster_ids == 2]

In [None]:
# Save one pickle file per cluster subset.
# Forward slashes keep the paths valid on Windows, Linux and macOS alike
# (a backslash is only a path separator on Windows).
new_outputs = [
    ('Data clustering/data_new_clusters_high.pkl', data_new_clusters_high),
    ('Data clustering/data_new_clusters_mid.pkl', data_new_clusters_mid),
    ('Data clustering/data_new_clusters_low.pkl', data_new_clusters_low),
]
for output_path, cluster_data in new_outputs:
    with open(output_path, 'wb') as file:
        pickle.dump(cluster_data, file)

In [None]:
# Save the complete data set together with its cluster assignments.
# A forward slash keeps the path valid on Windows, Linux and macOS alike
# (a backslash is only a path separator on Windows).
with open('Data clustering/data_new_clusters.pkl', 'wb') as file:
    pickle.dump(data_new_clusters, file)