In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import os
import sys
import warnings
import datetime
import math

from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

from collections import namedtuple
from matplotlib import pyplot as plt
from dotenv import load_dotenv

# Make the parent directory importable from this notebook.
sys.path.append("../")

# Load variables from a .env file (if any) into the environment.
load_dotenv()

# Apply the style sheet first, then the palette: `sns.color_palette(...)` only
# *returns* a palette without applying it, and a style sheet applied afterwards
# would overwrite the colour cycle again.
plt.style.use('Solarize_Light2')
sns.set_palette('colorblind')

# Default figure DPI: taken from the DPI environment variable when it holds a
# positive integer, 100 otherwise.

def read_dpi_from_env(default=100):
    """
    Return the figure DPI configured through the ``DPI`` environment variable.

    Falls back to ``default`` when the variable is unset (``int(None)`` raises
    TypeError), is not a valid integer (ValueError) or is not strictly positive.
    """
    try:
        dpi = int(os.getenv('DPI'))
    except (TypeError, ValueError):
        return default
    return dpi if dpi > 0 else default


pc_dpi = read_dpi_from_env()


In [None]:
# Path of the pickled dataset used throughout this notebook.
dataset = "../pickles/dataset_pca_maintenance.pkl"

# NOTE: unpickling can execute arbitrary code, only load files from a trusted source.
df_pca = pd.read_pickle(dataset)


In [None]:
# Estimator shared by the whole notebook. The seed is fixed so that every fit is
# reproducible; with random_state=None each fit would be seeded differently.
N_CLUSTERS = 5
RANDOM_STATE = 1984
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=RANDOM_STATE)


In [None]:
# Quick look at the first rows of the loaded dataset.
df_pca.head(n=5)


# 1 : Purpose / introduction / pitfalls
# 2 : Definition of the period
# 3 : ARI per period (month per month), determination of optimal period.
# 4 : ARI on determined rolling period to optimize maintenance.
# 5 : Conclusion

<hr>

# <u>1 Introduction : </u>

- The purpose of this notebook is to provide the necessary tools and answers to obtain the best stability of the model defined in the notebook "improvements_on_rfm". The goal is to find the time needed between two updates (re-trainings) of the model (kmeans with 5 clusters). We are targeting the highest stability possible while trying to lengthen the period as much as possible.
- Considering that the clustering model takes into account many temporal features, it is sensitive to how the dataset is split by period: because the model relies on the time between two orders (frequency) and the age of the account (oldest order), it is by nature not very stable across different time samples.

# <u>2 : Definition of the period : </u>

&emsp;While we have no real notion of the account creation (except the first time a client orders something), we must define the period based on an agreed upon metric. Here it is defined from each client's first order date ("most_ancient_order_dt"): the oldest first-order date in the dataset is t0 and the most recent first-order date is tf.<br>
&emsp;Using the actual creation date of the accounts might be a better alternative and a lead worth mentioning. <br>
&emsp;The variable checked is "most_ancient_order_dt", so we are looking at the most recent "most_ancient_order_dt" and the oldest "most_ancient_order_dt", which might seem counter-intuitive.


In [None]:
# Number of rows with no first-order date.
n_missing_dates = df_pca["most_ancient_order_dt"].isna().sum()
print(n_missing_dates)


In [None]:
# Few NAs in ["most_ancient_order_dt"]: those rows cannot be assigned a period, so drop them.
# Rebinding instead of `inplace=True` keeps the cell an explicit "input -> output" step.
df_pca = df_pca.dropna(subset=["most_ancient_order_dt"])


In [None]:
# Bounds of the studied time span: earliest and latest first-order dates.
most_ancient = df_pca["most_ancient_order_dt"].min()
most_recent = df_pca["most_ancient_order_dt"].max()

# Length of the span in (average) months. An explicit Timedelta of one average
# Gregorian month (365.2425 days / 12 = 2,629,746 s) is used because recent
# pandas versions reject the ambiguous month unit of np.timedelta64(1, 'M').
period = (most_recent - most_ancient) / pd.Timedelta(seconds=2629746)

print(f"""
The most acient order was placed at {most_ancient} and the most recent on {most_recent}.\
\nThe studied period is {math.ceil(period)} months (rounded up)
""")


In [None]:
def get_period(row, t_zero=None):
    """
    Return the number of months (rounded up) between row["most_ancient_order_dt"]
    and t_zero.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must expose a datetime-like value under the "most_ancient_order_dt" key.
    t_zero : datetime-like, optional
        Origin of the time axis. When omitted, the notebook-level
        ``most_ancient`` value is used (looked up at call time).

    Returns
    -------
    int
        Elapsed time expressed in average months, rounded up.
    """
    if t_zero is None:
        t_zero = most_ancient
    # One average Gregorian month (365.2425 days / 12 = 2,629,746 s); an explicit
    # Timedelta is used because recent pandas versions reject the ambiguous
    # month unit of np.timedelta64(1, 'M').
    one_month = pd.Timedelta(seconds=2629746)
    return math.ceil((row["most_ancient_order_dt"] - t_zero) / one_month)


In [None]:
# Tag every client with its period, computed row by row with get_period.
df_pca["period"] = df_pca.apply(get_period, axis=1)


In [None]:
# Number of clients in each period.
period_counts = df_pca["period"].value_counts()
period_counts


#### Observations :

&emsp;As expected, we have one 0 and one 25 (min/max). Each period will be the group of clients that first ordered between P0 and the studied period (PStudied), iteratively, so starting at 1 and ending at 25. The distribution otherwise seems quite good. Let's check.


In [None]:
# Histogram of the number of clients per period, one bin per month.
fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(8, 5), dpi=pc_dpi)

g = sns.histplot(data=df_pca, x="period", binwidth=1, ax=ax1)

# Title
fig.suptitle(
    "Amount of clients by the number of month between\n"
    " their first order and the most ancient order"
)

fig.tight_layout()
plt.show()


#### Observation :


&emsp;Due to our definition of the period (based on the most ancient order), it was expected to have more and more values as we get closer to PF(inal). <br> 
&emsp;In the future, the actual account creation date would be very useful.
<br><br>
&emsp;The ARI compares two labelings of the same samples, so both label vectors must have the same length and describe the same rows: we will select the same rows (same index) in the subset and in the reference dataset.


# <u> 3 : ARI per period (month per month), stability </u>

We will evaluate the stability with the adjusted rand score (cf sklearn doc.).

- 1 : Reference labels: prediction on the full spectrum of data, with a model fitted <u><b>once on the original dataset, not at every run</b></u>.
- 2 : Calculation of the ARI for each period in a reversed range (starting at 25, so the full dataset, and going towards 1): the model is refitted on each subset and its labels are compared with the reference labels of the rows with the same index (the samples need to be of the same length).
- 3 : Plotting the ARI for longer and longer periods of time, selection of the best score.

In [None]:
# Independent snapshot of the dataset, used later as input for the refitted models.
df_ari = df_pca.copy(deep=True)


In [None]:
# First rows of the working dataset.
df_pca.head(n=5)


In [None]:
# First rows of the snapshot.
df_ari.head(n=5)


In [None]:
# The label column is deliberately spelled "kluster" (kmeanscluster -> kluster).
# Clustering features: every column except the date, the period and any labels
# left over from a previous run of this cell (hence errors="ignore").
cluster_features = df_pca.drop(
    columns=["most_ancient_order_dt", "period", "kluster"],
    errors="ignore",
)

# Fit on the full dataset, then store the resulting reference labels.
kmeans.fit(cluster_features)
df_pca["kluster"] = kmeans.predict(cluster_features)


In [None]:
# Evaluation horizons in months, counting down from 25 to 1 in steps of 1.
eval_period = list(range(25, 0, -1))


In [None]:
# ARI between the reference labels (df_pca["kluster"]) and the labels produced by
# a model refitted on the clients whose period is <= max_period.
# Note: this refits the shared `kmeans` estimator at every iteration.

# Columns that are not clustering features. "kluster" is listed (and missing
# columns are ignored) so that labels can never leak into the features, even if
# df_ari was copied after the labels were added to df_pca.
non_feature_cols = ["most_ancient_order_dt", "period", "kluster"]

aris = {}

# A dedicated loop name avoids overwriting the notebook-level `period` value.
for max_period in eval_period:
    actual_slice = df_pca[df_pca["period"] <= max_period]
    sample = df_ari[df_ari["period"] <= max_period]
    # ARI is only meaningful when both label vectors describe the same rows.
    assert actual_slice.index.equals(sample.index), "reference and sample rows differ"

    sample_features = sample.drop(columns=non_feature_cols, errors="ignore")
    kmeans.fit(sample_features)
    labels_predict = kmeans.predict(sample_features)

    aris[max_period] = adjusted_rand_score(
        labels_true=actual_slice["kluster"].to_numpy(),
        labels_pred=labels_predict,
    )


In [None]:
# ARI as a function of the evaluation horizon.
fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(10, 6), dpi=pc_dpi)

eval_months = list(aris.keys())
ax1.plot(eval_months, list(aris.values()), color="#000331", marker="o", linewidth=1)

# Horizontal reference lines at ARI = 0.8 and ARI = 0.9.
ax1.axhline(y=0.8, linestyle="dotted", color="royalblue", linewidth=3)
ax1.axhline(y=0.9, linestyle="--", color="navy")

# Titles / labels. The metric is the Adjusted *Rand* score, not "random".
fig.suptitle("Evolution of Adjusted Rand score with increasing sample temporality")
ax1.set_xlabel("Period of evaluation, in months")
ax1.set_ylabel("Adjusted Rand Score")
ax1.set_xticks(eval_months)

fig.tight_layout()
plt.show()


#### Observation :

&emsp;The ARI shows that, to remain relevant, the training data must be at least 5 months old; this period is a little short, so we will bump it up to 8 months (next best period, with continuous improvement in stability after that). The accuracy represented by the ARI score peaks.<br>
&emsp;We will use this 8-month window as the period over which we will determine the optimal number of months between two maintenances to stay relevant.


# <u>4 : ARI on determined rolling period to optimize maintenance.</u>

&emsp;We determined 8 months as the optimal period for peak stability in our dataset. We will follow a process similar to the one in section 3, but using a rolling period instead of an extending one. We will shift that window month by month across the whole studied period and use it to determine when the algorithm must be retrained.

In [None]:
# ARI on a rolling window: at each step a model is refitted on the clients whose
# period lies in [window_start, window_end) and its labels are compared with the
# reference labels (df_pca["kluster"]) of the same rows.
# Note: this refits the shared `kmeans` estimator at every iteration.
duration = 8  # width of the first window, in months
start = 0     # first period of the first window

rolling_eval_period = list(range(duration, 26, 1))
rolling_aris = {}

# Columns that are not clustering features. "kluster" is listed (and missing
# columns are ignored) so that labels can never leak into the features.
non_feature_cols = ["most_ancient_order_dt", "period", "kluster"]

# NOTE(review): the upper bound is exclusive and the range stops at 25, so
# clients with period == 25 never enter a window. Confirm this is intended.
for offset, window_end in enumerate(rolling_eval_period):
    # Both bounds advance by one month per step.
    window_start = start + offset

    rolling_df = df_ari[(df_ari["period"] >= window_start) & (df_ari["period"] < window_end)]
    from_original = df_pca[(df_pca["period"] >= window_start) & (df_pca["period"] < window_end)]
    # ARI is only meaningful when both label vectors describe the same rows.
    assert from_original.index.equals(rolling_df.index), "reference and window rows differ"

    window_features = rolling_df.drop(columns=non_feature_cols, errors="ignore")
    kmeans.fit(window_features)
    labels_predict = kmeans.predict(window_features)

    rolling_aris[window_end] = adjusted_rand_score(
        labels_true=from_original["kluster"].to_numpy(),
        labels_pred=labels_predict,
    )


In [None]:
# ARI of each rolling window; the x axis uses the keys of rolling_aris.
fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(10, 6), dpi=pc_dpi)

window_ends = list(rolling_aris.keys())
ax1.plot(window_ends, list(rolling_aris.values()), color="#000331", marker="o", linewidth=1)

# Horizontal reference lines at ARI = 0.8 and ARI = 0.9.
ax1.axhline(y=0.8, linestyle="dotted", color="royalblue", linewidth=3)
ax1.axhline(y=0.9, linestyle="--", color="navy")

# Titles / labels. The metric is the Adjusted *Rand* score, not "random".
fig.suptitle("Evolution of ARI for temporal stability, rolling period = 8 months")
ax1.set_xlabel("Period of evaluation, (0->8 months, 1->9 months etc.)")
ax1.set_ylabel("Adjusted Rand Score")
ax1.set_xticks(window_ends)

fig.tight_layout()
plt.show()


# <u> 5 Conclusions :</u>

&emsp;The ARI over an 8 month rolling period (rolling monthly) stays high regardless of time. This might mean that we need a dataset with a longer total period to distinguish a decrease in stability. For now, it seems the clusters are stable for at least 17 months (25 - 8). This is a possible recommendation, although incomplete since we cannot predict data we do not have. <br>
&emsp;For now, maintenance could be done every 17 months according to our calculations - Having more data on an extended timespan might benefit our accuracy on this matter.