In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import os
import sys
import warnings

import plotly.io as pio
import plotly.express as px
import scipy.cluster.hierarchy as hierarchy

from collections import namedtuple
from matplotlib import pyplot as plt
from dotenv import load_dotenv
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler

sys.path.append("../")

from scripts.order_operations import get_min_max_dt, get_order_details  # Shows error but its ok, trust me
from scripts.optimizers_mp import k_means_optimizer # Same
from scripts.optimizers import dbscan_optimizer  # Same Same

# Relative path to the data folder.
# NOTE(review): the CSV paths defined further down are hard-coded and do not use it.
data_path = "../data"

# Default renderer used by every plotly fig.show() in this notebook
pio.renderers.default = "notebook_connected"

# Reads a local .env file, if any (the DPI variable is looked up right after)
load_dotenv()
# NOTE(review): the value returned by color_palette() is discarded here, so this line
# presumably has no effect; sns.set_palette('colorblind') may be what was meant - confirm.
sns.color_palette('colorblind')
plt.style.use('Solarize_Light2')

# Default DPI of every matplotlib figure: taken from the DPI environment
# variable (possibly provided by the .env file), 100 when it is missing or
# when it does not hold a valid integer.

try:
    pc_dpi = int(os.getenv('DPI'))
except (TypeError, ValueError):
    # TypeError  -> DPI is not set (os.getenv returned None)
    # ValueError -> DPI is set but is not an integer literal
    pc_dpi = 100


# 1 : Introduction

# 2 : DataFrame creation, RFM logic

# 3 : Early visualisations

# 4 : Applying classification algorithms (K-Means, DBSCAN, Agglomerative clustering)

# 5 : Conclusions and potential improvements

# <u>1 : Introduction :</u>

&emsp;This notebook focuses on a widespread approach : RFM (Recency, Frequency, Monetary). Usual classifications using traditional (and fundamentally flawed) approaches, like the Pareto distribution, will likely be attempted.<br>

&emsp;Using the RFM approach, we will first attempt to distinguish classes with the data AS IS - this is expected to produce poor results. We will then move on to machine learning clustering algorithms, which use more resources but produce, in general, better results.<br>

&emsp;It is expected that the data provided by this approach will be insufficient but will provide a good raw material to improve the models by improving the data quality and the algorithms.

# <u>2 : DataFrame creation, RFM logic</u>

&emsp;The usage of RFM presupposes that we have the data to calculate the 3 indicators (R F & M). Not all datasets will be required for this step and there is no need to clog up the memory as the calculation time/resource is expected to go up.<br>
&emsp;Will be required : <br>
- olist_customers : will provide the link between the unique customer and its aliases (and orders) - The goal is to eliminate the useless repetition of the aliases.
- olist_orders : will provide the link between the customer (referred sometimes as cx.) and the order itself - The date the order was placed and which alias placed the order.
- olist_order_items : will provide what items were ordered at what price, determining the monetary component of each order

<hr>

<i>
&emsp;The Recency will be defined by the time separating the most recent update from a Cx and the most recent update known - In this approach, we assume that the latest order placed is the most recent order (instead of using t0 = today).<br>
&emsp;The Frequency will be : number_of_orders/membership_time. membership_time is : time elapsed between first order of account and last general order(not last order of account).<br>
&emsp;Monetary will be the total of all item prices of all orders placed by a customer, Kaggle shows that freight is always paid (case where Cx orders 10 times the same item from the same seller --> Cx will pay 10 times the freight price).
</i>

In [None]:
# Paths of the three pre-processed CSV exports used to build the RFM table
olist_customers_file, olist_orders_file, olist_order_items_file = (
    f"../data/optimized/{table_name}.csv"
    for table_name in ("olist_customers", "olist_orders", "olist_order_items")
)


In [None]:
# Load customers, orders and order items, in that order
df_customers, df_orders, df_orders_items = (
    pd.read_csv(filepath_or_buffer=csv_path)
    for csv_path in (olist_customers_file, olist_orders_file, olist_order_items_file)
)


In [None]:
# CSV files do not carry dtypes (pickles would): the identifier columns are
# cast back to unsigned 32-bit integers after loading.

for id_col in ("customer_id", "customer_unique_id"):
    df_customers[id_col] = df_customers[id_col].astype(np.uint32)


In [None]:
df_customers.info()


In [None]:
df_customers.head()


In [None]:
# Timestamp columns of the orders table
date_cols = [
    "order_purchase_dt",
    "order_approved_at",
    "order_delivered_carrier_date",
    "order_delivered_customer_date",
    "order_estimated_delivery_date",
]

# Parse every timestamp column into datetimes
for dt_col in date_cols:
    df_orders[dt_col] = pd.to_datetime(df_orders[dt_col])

# Identifier columns as unsigned 32-bit integers
for id_col in ("order_id", "customer_id"):
    df_orders[id_col] = df_orders[id_col].astype(np.uint32)


In [None]:
df_orders.info()


In [None]:
df_orders.head()


In [None]:
# Identifier columns as unsigned 32-bit integers
for id_col in ("order_id", "order_item_id", "product_id", "seller_id"):
    df_orders_items[id_col] = df_orders_items[id_col].astype(np.uint32)

# Shipping deadline parsed into datetimes
shipping_limit = pd.to_datetime(df_orders_items["shipping_limit_date"])
df_orders_items["shipping_limit_date"] = shipping_limit


In [None]:
df_orders_items.info()


In [None]:
df_orders_items.head()


In [None]:
# Starting point can be orders df, we will remove infos as we need

df_orders.head()


In [None]:
# Cx id, order_id, order_purchase_dt are useful, the rest can go.
# .copy() turns the selection into an independent frame: without it the column
# assignments below are made on a slice of the original df_orders and trigger
# pandas' SettingWithCopyWarning.

df_orders = df_orders[["order_id", "customer_id", "order_purchase_dt"]].copy()

# Placeholder columns, filled in by the next two cells
df_orders["customer_uid"] = np.uint32(0)
df_orders["sum_total"] = np.nan

df_orders.head()


In [None]:
# Total paid for each order = sum of its item prices + sum of its freight values.
# A single groupby replaces the former loop, which filtered both frames once per
# order (quadratic in the number of orders).
order_ids = df_orders["order_id"].unique()

order_totals = (
    df_orders_items
    .groupby("order_id")[["price", "freight_value"]]
    .sum()
    .sum(axis=1)
)

# Orders without any item line get 0.0, which is what summing an empty selection gave before
df_orders["sum_total"] = df_orders["order_id"].map(order_totals).fillna(0.0)


In [None]:
# Setting customers unique ids instead of cx_ids.
# The former loop scanned both frames once per alias and handed a one-element
# Series (not a scalar) to .at; a lookup table plus Series.map does the same
# job in one pass.

cx_ids = df_orders["customer_id"]

# alias (customer_id) -> unique customer id
uid_by_alias = (
    df_customers
    .drop_duplicates(subset="customer_id")
    .set_index("customer_id")["customer_unique_id"]
)
mapped_uids = cx_ids.map(uid_by_alias)

# An alias missing from df_customers would otherwise end up as a NaN uid
n_missing = int(mapped_uids.isna().sum())
if n_missing:
    raise ValueError(f"{n_missing} order(s) reference a customer_id that is absent from df_customers")

df_orders["customer_uid"] = mapped_uids.astype(np.uint32)


In [None]:
df_orders.head()


In [None]:
# Empty RFM table: one row per unique customer will be added below
rfm_cols = [
    "customer_uid",
    "order_id_list",
    "most_ancient_order_dt",
    "most_recent_order_dt",
    "recency",
    "frequency",
    "monetary",
]
df_rfm = pd.DataFrame(data=None, columns=rfm_cols)


In [None]:
# Sorted unique customer ids become the rows of the RFM table
uniques = np.sort(df_orders["customer_uid"].unique())

df_rfm["customer_uid"] = uniques


In [None]:
# One row per unique customer: walking the index and the customer_uid column side
# by side gives each customer's row label directly, instead of re-scanning the whole
# frame with a boolean mask on every iteration (quadratic in the number of customers).
for index, uid in zip(df_rfm.index, df_rfm["customer_uid"].to_numpy()):
    # Project helper; its result is read through the "order_list" and "total_spent" keys
    details = get_order_details(cx_uid=uid, uid_col="customer_uid", from_frame=df_orders)
    df_rfm.at[index, "order_id_list"] = details["order_list"]
    df_rfm.at[index, "monetary"] = details["total_spent"]


In [None]:
# First and last purchase timestamp of every customer.
# The loop variable is named `row`: the former name `tuple` shadowed the builtin
# for the rest of the notebook.
for row in df_rfm.itertuples():
    # Project helper; its result is read through the "min" and "max" keys
    min_max_dt = get_min_max_dt(order_list=row.order_id_list, from_frame=df_orders, dt_col="order_purchase_dt")
    df_rfm.at[row.Index, "most_ancient_order_dt"] = min_max_dt["min"]
    df_rfm.at[row.Index, "most_recent_order_dt"] = min_max_dt["max"]


In [None]:
# Latest purchase timestamp over all customers: the reference point ("now") that
# get_recency and get_frequency measure elapsed time against.
most_recent_global = df_rfm["most_recent_order_dt"].max()


In [None]:
def get_recency(row, most_recent_global) -> float:
    """
    Returns the time elapsed, in seconds, between the customer's most recent
    purchase and the most recent purchase of the whole dataset.

    row: one df_rfm row (anything indexable by "most_recent_order_dt")
    most_recent_global: timestamp of the latest purchase, all customers included
    """
    elapsed = most_recent_global - row["most_recent_order_dt"]
    return elapsed.total_seconds()


def get_frequency(row, most_recent_global):
    """
    Returns the average number of purchases per month of account activity.

    Activity runs from the customer's first purchase to the most recent purchase
    of the whole dataset; a month is counted as 2419200 seconds (28 days).
    Returns 0 when that span is empty.
    """
    account_timespan = most_recent_global - row["most_ancient_order_dt"]
    n_orders = len(row["order_id_list"])
    active_months = account_timespan.total_seconds() / 2419200
    if active_months == 0:
        # First purchase coincides with the global latest purchase
        return 0
    return n_orders / active_months


def get_num_order(row):
    """
    Returns the number of orders listed in the row's "order_id_list"
    """
    order_ids_of_customer = row["order_id_list"]
    return len(order_ids_of_customer)


In [None]:
# Row-wise recency; the reference timestamp is forwarded to get_recency by keyword
df_rfm["recency"] = df_rfm.apply(get_recency, axis=1, most_recent_global=most_recent_global)


In [None]:
# Row-wise frequency; the reference timestamp is forwarded to get_frequency by keyword
df_rfm["frequency"] = df_rfm.apply(get_frequency, axis=1, most_recent_global=most_recent_global)


In [None]:
# Number of orders of each customer, one get_num_order call per row
orders_count = df_rfm.apply(get_num_order, axis=1)
df_rfm["num_orders"] = orders_count


In [None]:
df_rfm.head()


#### Part 2 - conclusion :

- Many cells are not useful and can be safely deleted (all cells displaying dfs and head() / info())
- DataFrame created containing RFM values : 
    - Recency is delta T in seconds between Cx most recent order and Global most recent order
    - Frequency is the number of purchases divided by the time elapsed between the Cx first order and the Global most recent order | In purchases per month (a month being counted as 28 days)
    - Monetary is total spent on all order, including freight which is -always- paid


# <u>3 : Early visualisations</u>

Goals : 
- Identify possible clusters using only RFM and scaling (most likely min-max scaler)
- Apply Pareto Principle (bs) : 20% of customers generate 80% of traffic
- Identify possibly lacking variables overlooked by RFM method (to be confirmed by part 4)
- Plot RFM two by two, then 3D attempt and/or radar

## <u> 1 : Intuition : Most clients make just one purchase</u>

In [None]:
df_rfm.sort_values("num_orders", ascending=False)

In [None]:
# {number of orders: number of customers who placed that many orders}
orders_per_customer = df_rfm["num_orders"]
number_orders = orders_per_customer.value_counts().to_dict()

number_orders

In [None]:
fig, (ax1) = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(12, 6),
    dpi=pc_dpi,
)

# One bar per order count, as tall as the number of customers with that count
ax1.bar(x=list(number_orders.keys()), height=list(number_orders.values()), width=1, color="navy", edgecolor="black")

###
# Titles/Lables
ax1.set_xlabel("Number of orders")
# range() stops before its upper bound: + 1 so that the largest order count gets a tick too
ax1.set_xticks(range(0, max(number_orders.keys()) + 1))
ax1.set_ylabel("Number of customers")
fig.suptitle("Number of orders distribution in dataset")
#
###

fig.tight_layout()
plt.show()


In [None]:
# Excluding customers with a single order (and the 0 bucket, should it exist).
# The former version only removed the key 0, which is not expected to exist (every customer
# in df_rfm comes from at least one order), so the figure was a copy of the previous one
# while its title announced that unique orders were excluded.
fig, (ax1) = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(12, 6),
    dpi=pc_dpi,
)

# Filtered into a new dict: number_orders stays intact and the cell can be re-run
repeat_orders = {n_orders: n_cx for n_orders, n_cx in number_orders.items() if n_orders > 1}

ax1.bar(x=list(repeat_orders.keys()), height=list(repeat_orders.values()), width=1, color="navy", edgecolor="black")

###
# Titles/Lables
ax1.set_xlabel("Number of orders")
ax1.set_ylabel("Number of customers")
if repeat_orders:
    # + 1 because range() stops before its upper bound
    ax1.set_xticks(range(min(repeat_orders), max(repeat_orders) + 1))
fig.suptitle("Number of orders per cx, exclusion of unique orders")
#
###

fig.tight_layout()
plt.show()


### Observations : 

&emsp;Clearly, either Olist has problems keeping its customers loyal or it is easier to create a new account on each visit. In any case, the number of customers who made exactly one purchase (93099 customers) is crushingly larger than all other customers combined, regardless of frequency.<br>
&emsp;Excluding customers who made exactly one purchase, the majority of the remaining customers order on Olist between .1 and .4 times a month (between 1.1 and 4.8 times a year). There are extremes ordering up to 1.7 times a month, but these customers are extremely rare.

- Operating under the assumption that Olist did not provide its whole database dump, early conclusions cannot be drawn.

## <u>2 : How much have customers spent ?</u>


In [None]:
fig, ax1 = plt.subplots(figsize=(12, 5), dpi=pc_dpi)

# Small "+" markers keep the many outliers readable
flierprops = dict(marker="+", markersize=2, markerfacecolor="navy")

# Horizontal boxplot of the total spent per customer, mean shown
ax1.boxplot(x="monetary", data=df_rfm, vert=False, showmeans=True, flierprops=flierprops)

###
# Titles/Lables
fig.suptitle("Customer repartition by total spent on Olist")
#
###

fig.tight_layout()
plt.show()


In [None]:
df_rfm["monetary"].value_counts()


Lots of zeroes : either the purchase was cancelled or it was refunded. - Let's drop those and zoom on 0 -> 1K

In [None]:
fig, ax1 = plt.subplots(figsize=(12, 5), dpi=pc_dpi)

flierprops = dict(marker="+", markersize=2, markerfacecolor="navy")

# Customers whose total spent is not 0
paying_customers = df_rfm[df_rfm["monetary"] != 0.0]

ax1.boxplot(x="monetary", data=paying_customers, vert=False, showmeans=True, flierprops=flierprops)

###
# Titles/Lables
# Zoom on the 0 -> 1K range
ax1.set_xlim(0, 1000)
fig.suptitle("Customer repartition by total spent on Olist | Transaction = 0 removed & Zoom on 0-1K")
#
###

fig.tight_layout()
plt.show()


In [None]:
# "monetary" was written cell by cell into a frame created without dtypes, so it
# may be stored with the generic object dtype - presumably why .describe() did not
# give numeric statistics here. Casting to a numeric dtype first makes every
# statistic below well defined.
monetary = pd.to_numeric(df_rfm["monetary"])

print("Q1 :", monetary.quantile(.25))
print("Q3 :", monetary.quantile(.75))
print("Median :", monetary.median())
print("Avg. :", np.average(monetary.values))


In [None]:
# Customers who spent more than the third quartile (the count quoted in the observations).
# The threshold is computed from the data instead of being copied from a previous output,
# and the former `< 62.39` selection is gone: it was overwritten before ever being used.
monetary = pd.to_numeric(df_rfm["monetary"])
q3 = monetary.quantile(.75)

outliers = df_rfm[monetary > q3]
print("Max :", monetary.max())
print(len(outliers))
del outliers


### Observations :

&emsp;Most clients spend between 62.39 & 182.24 R$ (assuming the amounts are in Reais and not in Dollars : it is not specified in the dataset, but website screenshots show that R$ is used), with the median at 107.28 and an average of 164.88 R$.<br>
&emsp;There is a wide range of outliers (24024) with spendings going from +Q3(182.2375 R$) to 13664.08 R$, but, mostly, outliers spent 4K max with a very small number of customers crossing this line.

- Operating under the assumption that Olist did not provide its whole database dump, early conclusions cannot be drawn.

## <u> 3 : Time Delta between global last order and customer last order</u>

In [None]:
df_rfm.describe()["recency"]


In [None]:
fig, ax1 = plt.subplots(figsize=(12, 6), dpi=pc_dpi)

# Horizontal boxplot of recency (seconds), mean shown
ax1.boxplot(x="recency", data=df_rfm, showmeans=True, vert=False)

###
# Titles/Lables
fig.suptitle("Distribution of customers based on their last order time:")
ax1.set_xlabel("Time Delta (seconds)")
#
###

fig.tight_layout()
plt.show()


In [None]:
# Seconds are hard to read: same boxplot with recency expressed in days
fig, ax1 = plt.subplots(figsize=(12, 6), dpi=pc_dpi)

deltas = df_rfm["recency"]
deltas_days = deltas / 86400  # 86400 seconds in a day

ax1.boxplot(x=deltas_days, showmeans=True, vert=False)

###
# Titles/Lables
fig.suptitle("Distribution of customers based on their last order time :")
ax1.set_xlabel("Time Delta (days)")
#
###

fig.tight_layout()
plt.show()


In [None]:
deltas_days.describe()


### The data shows that most customers haven't ordered in a while

- 75% of the customers in the database haven't ordered anything in at least the last 164 days (rounded up).
- Operating under the assumption that Olist did not provide its whole database dump, early conclusions cannot be drawn.

In [None]:
grid = sns.pairplot(df_rfm[["recency", "frequency", "monetary"]])

# The former `grid.figure.figsize = (4, 4)` was removed: it only attached an unused
# attribute to the figure and never resized it. If the facets need resizing, that is
# done through pairplot's `height` / `aspect` arguments.
grid.figure.dpi = pc_dpi

###
# Titles/Lables
grid.figure.suptitle("Pairplot between RFM variables")
#
###
grid.figure.tight_layout()
plt.show()


### Conclusion

&emsp;There are no obvious clusters distinguishable using simply RFM variables and pairploting. 3D Plotting of RFM can be attempted but it seems the data of the 3 variables will not be sufficient to offer what the customer (Olist) is looking for.
&emsp;The Wikipedia page regarding Olist mentions that there are 2M+ unique active customers, so it looks like we only have a small fragment of their database : we know we have 96K accounts and that 2M+ accounts exist, but we are not aware of other useful statistics, like the number of orders, which would explain why our frequency is = 0 for close to 99% of the customers.

## 4 : 3D Visualisation of RFM variables

We hope to obtain visual clusters to identify and classify customers using the MK1 eyeball without any clustering algorithms or data engineering

In [None]:
# Axis titles shown instead of the raw column names
# ("Frequncy" typo fixed, in line with the labels of the later 3D plots)
labels_dict ={
        "recency": "Recency (in seconds)",
        "frequency": "Frequency of purchase(s)",
        "monetary": "Total Spent on Olist"
    }

marker_style = {
    "color": 'navy',
    "size": 5,
    }

# One point per customer in the (recency, frequency, monetary) space
fig = px.scatter_3d(
    data_frame=df_rfm, x="recency",
    y="frequency", z="monetary",
    width=5 * pc_dpi, height=3 * pc_dpi,
    labels=labels_dict,
    )


fig.update_layout(
    margin=dict(l=40, r=40, t=40, b=40),
    title="3D Representation of customers, RFM approach",
)

fig.update_traces(marker=marker_style)

fig.show()


### Unclear

- The most recent "order" is not really an order, as its sum total is 0. Maybe it was too recent for the payment to have been processed; awaiting further directions on what to do with total_spent = 0
- We can clearly see the extremes and what looks like a zone where most Cxs are placed, but the clustering is still unclear.

# <u>4 : Applying classification algorithms (K-Means, DBSCAN, Agglomerative clustering)</u>

> Following the inconclusive results (as expected) of part #3, Machine Learning classification algorithms can be applied to determine if clusters can be found.

## 4.0 : Min Max Scaling the dataset

> 0 -> 10 or 0 -> 5 will be better suited for understanding data like reviews and ratings

In [None]:
mms = MinMaxScaler(feature_range=(0, 10))

dropcols = ["order_id_list", "most_ancient_order_dt", "most_recent_order_dt", "num_orders"]

# Only recency / frequency / monetary remain as columns, customer id moved to the index
rfm_only = df_rfm.drop(columns=dropcols).set_index("customer_uid")

keepcols = rfm_only.columns

# fit_transform returns a bare array: wrap it back into a frame with the same
# column names (rows keep df_rfm's order, under a fresh default index)
df_rfm_mms = pd.DataFrame(mms.fit_transform(rfm_only.to_numpy()), columns=keepcols)

df_rfm_mms.head()


## 4.1 : Using and optimizing k-means clustering

> Trial and error will be used to determine the best k-range to pass to the optimizer


In [None]:
# Candidate numbers of clusters handed to the optimizer: k = 2 .. 16
k_range = range(2, 17)

# NOTE(review): project helper from scripts.optimizers_mp; the observations below mention
# inertia and silhouette score, so it presumably evaluates both for each k - confirm.
k_means_optimizer(data=df_rfm_mms, k_range=k_range)


#### Observations :

&emsp; 4 and 5 both seem to be optimal numbers of clusters : the error keeps diminishing as the number of clusters increases, but the silhouette score remains acceptable. We perform k-means with k=4 (k=5 doesn't look as good) and plot the results.
<br>
><i>It seems the silhouette score calculation is what makes the optimizer take its sweet, sweet time; if it is not needed for comparison, relying solely on inertia can be considered</i>

In [None]:
# Fit on the three scaled RFM columns only: re-running this cell once "cluster_4"
# (or any other label column) has been added no longer feeds labels back in as features.
rfm_features = df_rfm_mms[["recency", "frequency", "monetary"]]

# Fixed seed so that clusters and their numbering are identical on every run
km = KMeans(n_clusters=4, random_state=42)
y_predicted = km.fit_predict(rfm_features)

df_rfm_mms["cluster_4"] = y_predicted


In [None]:
# Axis titles shown instead of the raw column names
labels_dict = dict(
    recency="Recency",
    frequency="Frequency of purchase(s)",
    monetary="Total Spent on Olist",
)
marker_style = dict(size=5)

# Scaled RFM space, one colour per k-means cluster
fig = px.scatter_3d(
    data_frame=df_rfm_mms,
    x="recency", y="frequency", z="monetary",
    color="cluster_4",
    labels=labels_dict,
    width=4 * pc_dpi, height=3 * pc_dpi,
)

fig.update_layout(
    title="3D Representation of customers, RFM approach, k-means clustering, k=4",
    margin={"l": 40, "r": 40, "t": 40, "b": 40},
)
fig.update_traces(marker=marker_style)

fig.show()


#### Observation :

&emsp;Even if the clustering is not crystal clear, we can see a somewhat clear pattern, to be expected. :

- Cluster 4 (farthest cluster from the origin, cluster# changes) represents people who are dissatisfied overall; it might not be possible to change this group's mind about the goals and ideas of Olist : we can theorize (and calculate later) that this group purchased mainly once, for a moderate sum of Reais, and a long time ago.
- Cluster 2 and 3 (Center groups, cluster# changes) represents people who like Olist, they might not be huge enthusiasts pressing refresh every 10seconds each time Olist releases a new feature but they might be the most important customers, those on which Olist might need to focus the most. Frequency is not really reliable with the data at hand but this group shows a net increase in this statistic, meanwhile their purchases have been more recent and mostly more profitable than Cluster 4
- Cluster 1 (closest cluster to origin, cluster# changes) represents the people who believe strongly in the services Olist provides, it is very clear that this group contains the most repeating customers, who consequently put on their last order the most recently and who are ready to pay a higher price for Olist's services.

&emsp;Even if it is too early to tell, we can speculate and maybe try to apply the "Law of Diffusion of Innovation", a theory published by Everett Rogers in 1962 and reapplied to business models by the now renowned business writer and public speaker Simon Sinek. <br><br>
&emsp;This theory states that :
- The first 2.5% of Investors are the innovators, believing strongly in the company's ideas and willing to take risks on said ideas
- The next 13.5% are the Early adopters, willing to take risks to try a new model/piece of technology etc.
- The next 34% are the Early Majority and the following 34% are the Late Majority, investing or using the product/service because it is trending and tested by the first two groups
- The last 16% are the laggards, who wont willingly make any effort or take any risk for an innovation, newer product or service, this group is not swayed by advertising, word of mouth etc. and represent poor investment in marketing

&emsp;This Law also theorizes that, for a new product/service to reach the tipping point and really take off, it needs to win over the first two groups (about 16% of these people), and that the rest, where the majority of consumers are, will then follow

&emsp;We can quantify our clusters and see if we can indeed see what looks like this distribution : group 1 & 2 believing strongly in the idea (cluster 1), group 3 & 4 being the majority (cluster 0 and 2) and group 5 (cluster 3) being the laggards.

In [None]:
# The reader types, for each class, the cluster number read on the 3D plot above.
# NOTE(review): input() blocks an unattended "Restart & Run All" - confirm this is intended.
inv, maj1, maj2, lagg = (
    int(input(prompt))
    for prompt in ("Investors : ", "Early Majority", "Late Majority", "Laggards")
)


In [None]:
fig, ax1 = plt.subplots(figsize=(12, 6), dpi=pc_dpi)

# Size of every k-means cluster, then of every Law-of-Diffusion class
amounts = df_rfm_mms["cluster_4"].value_counts()
total = len(df_rfm_mms)

amounts_inv = amounts[inv]
amount_maj_1, amount_maj_2 = amounts[maj1], amounts[maj2]
amounts_maj = amount_maj_1 + amount_maj_2  # early and late majority merged
amount_lag = amounts[lagg]

# Bar label = class name + its share of all customers, bar height = class size
cluster_dict = {}
for class_name, class_size in (
    ("Investors", amounts_inv),
    ("Majority", amounts_maj),
    ("Laggards", amount_lag),
):
    share = round((class_size / total) * 100, ndigits=2)
    cluster_dict[f"{class_name}-{share}%"] = class_size

my_colors = ["royalblue", "#003153", "red"]

ax1.bar(
    x=list(cluster_dict.keys()),
    height=list(cluster_dict.values()),
    color=my_colors,
)

###
# Titles/Lables
ax1.set_xlabel("Law of Diffusion Classes (with respective percentages)")
ax1.set_ylabel("Customer per Classes")
fig.suptitle("Expression of Customers theorizing the Law of Diffusion of Innovation")
#
###

fig.tight_layout()
plt.show()


#### Observation :

&emsp;The above graph seems to confirm the intuition that Olist's customers do indeed follow the Law of Diffusion of Innovation (or of consumption here). It is very interesting to see that the cuts between the classes are clear and provide a potential course of action for Olist.<br>
&emsp;Indeed, according to the Law of Diffusion of Innovation : 
- Our "Investors" (here around 20%, depending on the run) are already satisfied with Olist, its services and its methods. They are happy with Olist as is and require little to no effort to keep loyal.
- On the other hand, the "Laggards" (here around 18%) could be interpreted as customers who are either not interested in the services Olist provides, do not wish to change their consumption habits or are generally disappointed by the services ; this group -cannot- be swayed, or only at a high cost.
- The Majority (Early and Late are merged on the above graph) is the group on which Olist should concentrate its marketing efforts : they are often the followers of the first group. From a purely economic standpoint, this is the most important group to satisfy while keeping group 1 happy.

## 4.2 : DBSCAN Clustering :

&emsp;DBSCAN is another algorithm testable on this dataset. We first need to determine the best max intra cluster distance (epsilon) and the best minimal amount of points it takes to make a cluster, this is often dimension + 1 or dimension * 2
&emsp;Epsilon can be determined using kneighbors. Using a graph to represent the avg distance between a point and its k-neighbors (here 4 : dimension + 1). Zooming in and using the elbow method help us to focus on the best potential epsilon.

In [None]:
neighbors_matrix = df_rfm_mms[["recency", "frequency", "monetary"]].to_numpy()
nneighbors = NearestNeighbors(n_neighbors=4, n_jobs=-1)  # dataset dim + 1

nneighbors.fit(X=neighbors_matrix)

# kneighbors returns (distances, indices): despite its name, `potential_eps`
# holds the row indices of the neighbours, not eps candidates
distances, potential_eps = nneighbors.kneighbors(neighbors_matrix)

distances = np.sort(distances, axis=0)
# The fitted points are queried against themselves, so column 0 is each point's
# distance to itself (0) and column 1 only its closest neighbour. The k-distance
# curve used to choose eps needs the farthest of the k neighbours: the last column.
# The eps value and the zoom window chosen below should be re-read on the new curve.
distances_plot = distances[:, -1]


In [None]:
fig, ax1 = plt.subplots(figsize=(16, 8), dpi=pc_dpi)

# Sorted neighbour distances, one value per point
ax1.plot(distances_plot)

###
# Titles/Lables
ax1.set_xlabel("Object")
ax1.set_ylabel("k distance")
fig.suptitle("Points sorted by distance - Neighbors = 4")
#
###

fig.tight_layout()
plt.show()


In [None]:
# Same curve, zoomed on its tail until the "elbow" becomes visible

fig, ax1 = plt.subplots(figsize=(16, 8), dpi=pc_dpi)

ax1.plot(distances_plot)

###
# Titles/Lables
ax1.set_xlabel("Object")
ax1.set_ylabel("k distance")
# Window on the last points of the curve, with a fine grid to read the distance off
ax1.set_xlim((96020, 96100))
ax1.set_yticks(np.arange(0, 5, 0.1))
ax1.grid(visible=True, axis="both")
fig.suptitle("Points sorted by distance - Neighbors = 4")
#
###

fig.tight_layout()
plt.show()


In [None]:
# Looks like .8 is the best candidate

# Label columns added by earlier (or previous runs of) clustering cells are left out
label_cols = ["cluster", "cluster_4", "cluster_5", "cluster_DBSCAN"]
dbscan_input = df_rfm_mms.drop(columns=label_cols, errors="ignore")

best_dbs = DBSCAN(eps=0.8, min_samples=4, n_jobs=-1)
y_predict = best_dbs.fit_predict(dbscan_input)

# Warnings raised by the column assignment are silenced
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    df_rfm_mms.loc[:, "cluster_DBSCAN"] = y_predict


In [None]:
# Axis titles shown instead of the raw column names
labels_dict = dict(
    recency="Recency",
    frequency="Frequency of purchase(s)",
    monetary="Total Spent on Olist",
)
marker_style = dict(size=5)

# Scaled RFM space, one colour per DBSCAN label
fig = px.scatter_3d(
    data_frame=df_rfm_mms,
    x="recency", y="frequency", z="monetary",
    color="cluster_DBSCAN",
    labels=labels_dict,
    width=4 * pc_dpi, height=3 * pc_dpi,
)

fig.update_layout(
    title="3D Representation of customers, RFM approach, DBSCAN clustering",
    margin={"l": 40, "r": 40, "t": 40, "b": 40},
)
fig.update_traces(marker=marker_style)

fig.show()


#### Observation :

&emsp;DBSCAN clustering is disappointing here. It identifies outliers rather than smaller groups, and it doesn't help to narrow down the potential targets for any campaign. It is unclear whether my parameters are incorrect or whether the algorithm is simply not helpful in this case.<br>
&emsp;A final clustering attempt can be made using Agglomerative Clustering.

## 4.3 : Agglomerative Clustering

&emsp;Agglomerative Clustering might help identify pre-determined clusters with the help of the linkage distance, which we can use from scipy
&emsp;The objective is to produce at least a similar result to K-Means, if not, it is not worth using and updating at this stage.

### 4.3.1 : Determining the correct amount of clusters :

&emsp;Quoting sources : `If you want to create flat clusters we can analyze the [...] dendrogram to determine no. of clusters. We first assume that the horizontal lines are extended on both sides, and as such, they would also cross the vertical lines. Now we have to identify the tallest vertical line that does not have any horizontal line crossing through it.`


In [None]:
## Hierarchical clustering on the full dataset is too heavy for the machine used here,
## so a sample of about 2K customers is drawn, each k-means cluster being represented
## in proportion to its size.
def _cluster_sample(cluster_label, cluster_size, sample_size=2000, seed=42):
    """
    Returns a random sample of the rows of df_rfm_mms belonging to `cluster_label`.

    The number of rows drawn is the cluster's share of all customers applied to
    `sample_size`; `seed` makes the draw identical on every run.
    """
    n_rows = round((cluster_size / len(df_rfm_mms)) * sample_size)
    cluster_rows = df_rfm_mms[df_rfm_mms["cluster_4"] == cluster_label]
    return cluster_rows.sample(n_rows, random_state=seed)


sample_inv = _cluster_sample(inv, amounts_inv)
sample_maj1 = _cluster_sample(maj1, amount_maj_1)
sample_maj2 = _cluster_sample(maj2, amount_maj_2)
sample_lagg = _cluster_sample(lagg, amount_lag)

df_rfm_mms_2ksample = pd.concat([sample_inv, sample_maj1, sample_maj2, sample_lagg]).sort_index()

len(df_rfm_mms_2ksample)


In [None]:
fig, (ax1) = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(8, 4),
    dpi=pc_dpi,
)

# Ward linkage computed on the sample, label columns excluded
linkage_method = hierarchy.linkage(
        df_rfm_mms_2ksample.drop(columns=["cluster", "cluster_4", "cluster_5", "cluster_DBSCAN"], errors="ignore"),
        method ="ward",
        metric="euclidean",
    )
dendrogram_plot = hierarchy.dendrogram(linkage_method, ax=ax1, no_labels=True)

###
# Titles/Lables
# (this section was empty: the figure had neither a title nor an axis label)
ax1.set_ylabel("Linkage distance")
fig.suptitle("Dendrogram of the 2K sample - Ward linkage")
#
###

fig.tight_layout()
plt.show()


In [None]:
# Lets say its 3 ?

agg = AgglomerativeClustering(n_clusters=3)

#Let's try the small one first
# The three RFM columns are selected explicitly: the former drop list left
# "cluster_agg" in place, so re-running the cell clustered on its own previous labels.
agg_features = df_rfm_mms_2ksample[["recency", "frequency", "monetary"]]
df_rfm_mms_2ksample["cluster_agg"] = agg.fit_predict(agg_features)



In [None]:
df_rfm_mms_2ksample.head()

In [None]:
# Axis titles shown instead of the raw column names
labels_dict = dict(
    recency="Recency",
    frequency="Frequency of purchase(s)",
    monetary="Total Spent on Olist",
)
marker_style = dict(size=5)

# Sampled customers in the scaled RFM space, one colour per agglomerative cluster
fig = px.scatter_3d(
    data_frame=df_rfm_mms_2ksample,
    x="recency", y="frequency", z="monetary",
    color="cluster_agg",
    labels=labels_dict,
    width=4 * pc_dpi, height=3 * pc_dpi,
)

fig.update_layout(
    title="3D Representation of customers, RFM approach, Agglomerative clustering - 2K sample",
    margin={"l": 40, "r": 40, "t": 40, "b": 40},
)
fig.update_traces(marker=marker_style)

fig.show()


### Observation :
This clustering method could be fine, and sure, it works on a 2K sample, but an i7 at 4.3 GHz with 32 GB of RAM could not complete it within an hour for 100K customers. So it is not in the interest of Olist to use such a demanding algorithm for their 2M unique customers.

Test has been done on Desktop using :
- i7-9700K
- 32Gb RAM 2133 MhZ
- NVidia 3080 4GB integrated RAMDAC
- Windows 10
- Visual Studio Code with Python 3.10.5
<br><hr>
Test was stopped at the one hour mark on the 96 000 cx sample because it was simply not feasible, much less scalable

# <u>5 : Conclusion and potential improvements :</u>

- K-Means clustering shows a real way of clustering the customers, the theory is backed up by a proven by time "law" : the diffusion of innovation, this is only a primary result but it narrows down considerably Olist's targets.
- Further improvements could be, and will be made, using and testing metrics like :
    - Geolocation : how the customers are clustered and where a campaign can have the maximum impact
    - Delta Dist between buyers and sellers : Do customers consume close to home ? Does this matter ?
    - Reviews : Are consumers driven to a product because of its previous ratings, and do potentially loyal customers leave reviews more often ?
    - Categories : Are some categories more successful than others ? Are giants like Amazon and Ebay swallowing a given category while leaving others to local businesses, favored by Olist ?

Not all of these points might be relevant, but including them in the model could make a difference.

<hr>

##### <b>Export :</b>
<u>Will be exported :</u>
- customers unique ids
- the list of their order(s)
- the RFM data
- the initial clusters found by k-means --> Unexpectedly positive results could mean improvements on further models