# Mise à jour du modèle

In [136]:
import itertools
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

warnings.filterwarnings('ignore')

In [137]:
# Load the order-level dataset; later cells read its customer_unique_id and
# order_purchase_timestamp columns.
orders_path = '/kaggle/input/completedf/df.gzip'
df = pd.read_parquet(orders_path)

In [138]:
# Load the precomputed RFM table (Recency / Frequency / Monetary and the cluster labels
# that later cells select from it).
rfm_path = '/kaggle/input/marketing/RFM.gzip'
RFM = pd.read_parquet(rfm_path)

In [139]:
# Move the current index into a regular column and fall back to a default 0..n-1 index.
# NOTE(review): presumably the index holds customer_unique_id (it is selected as a column
# further down) — confirm against the parquet file.
RFM = RFM.reset_index(drop=False)

In [140]:
# Keep only the three numeric RFM features that feed the clustering.
rfm_feature_names = ['Recency', 'Frequency', 'Monetary']
RFM_f = RFM[rfm_feature_names]
RFM_f

Unnamed: 0,Recency,Frequency,Monetary
0,159,1,129.90
1,162,1,18.90
2,585,1,69.00
3,369,1,25.99
4,336,1,180.00
...,...,...,...
95823,495,1,680.00
95824,310,1,64.89
95825,616,1,89.90
95826,167,1,115.00


In [141]:
# The features must be standardised because the clustering relies on distances.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_features = scaler.fit_transform(RFM_f.to_numpy())
scaled = scaled_features

In [142]:
# Wrap the standardised array back into a DataFrame carrying the RFM column names.
scaled = pd.DataFrame(
    data=scaled,
    columns=RFM_f.columns,
)
scaled

Unnamed: 0,Recency,Frequency,Monetary
0,-0.835085,-0.162323,-0.002082
1,-0.815534,-0.162323,-0.566398
2,1.941261,-0.162323,-0.311693
3,0.533536,-0.162323,-0.530353
4,0.318467,-0.162323,0.252622
...,...,...,...
95823,1.354709,-0.162323,2.794583
95824,0.149019,-0.162323,-0.332588
95825,2.143295,-0.162323,-0.205439
95826,-0.782947,-0.162323,-0.077833


In [143]:
# Attach each customer's id and existing cluster label to the standardised features.
# This is an index-aligned left join: it relies on `scaled` and `RFM` sharing the same row index.
customer_labels = RFM[['customer_unique_id', 'Clusters']]
RFM_scalled = scaled.join(customer_labels)
RFM_scalled

Unnamed: 0,Recency,Frequency,Monetary,customer_unique_id,Clusters
0,-0.835085,-0.162323,-0.002082,0000366f3b9a7992bf8c76cfdf3221e2,2
1,-0.815534,-0.162323,-0.566398,0000b849f77a49e4a4ce2b2a4ca5be3f,2
2,1.941261,-0.162323,-0.311693,0000f46a3911fa3c0805444483337064,1
3,0.533536,-0.162323,-0.530353,0000f6ccb0745a6a4b88665a16c9f078,1
4,0.318467,-0.162323,0.252622,0004aac84e0df4da2b147fca70cf8255,1
...,...,...,...,...,...
95823,1.354709,-0.162323,2.794583,fffcf5a5ff07b0908bd4e2dbc735a684,1
95824,0.149019,-0.162323,-0.332588,fffea47cd6d3cc0a88bd621562a9d061,1
95825,2.143295,-0.162323,-0.205439,ffff371b4d645b6ecea244b27531430a,1
95826,-0.782947,-0.162323,-0.077833,ffff5962728ec6157033ef9805bacc48,2


In [144]:
# Inner-join the scaled RFM rows with `df` on the customer id (one output row per matching
# row of `df`), then keep only the columns needed for the stability simulation.
kept_columns = ['Recency', 'Frequency', 'Monetary', 'customer_unique_id', 'Clusters', 'order_purchase_timestamp']
mise_jour = RFM_scalled.merge(df, on='customer_unique_id', how='inner')
mise_jour = mise_jour[kept_columns]

In [145]:
# Work on an independent copy so that `mise_jour` is not affected by the edits made below.
data_init = mise_jour.copy(deep=True)
data_init

Unnamed: 0,Recency,Frequency,Monetary,customer_unique_id,Clusters,order_purchase_timestamp
0,-0.835085,-0.162323,-0.002082,0000366f3b9a7992bf8c76cfdf3221e2,2,2018-05-10 10:56:27
1,-0.815534,-0.162323,-0.566398,0000b849f77a49e4a4ce2b2a4ca5be3f,2,2018-05-07 11:11:27
2,1.941261,-0.162323,-0.311693,0000f46a3911fa3c0805444483337064,1,2017-03-10 21:05:03
3,0.533536,-0.162323,-0.530353,0000f6ccb0745a6a4b88665a16c9f078,1,2017-10-12 20:29:41
4,0.318467,-0.162323,0.252622,0004aac84e0df4da2b147fca70cf8255,1,2017-11-14 19:45:42
...,...,...,...,...,...,...
99158,1.354709,-0.162323,2.794583,fffcf5a5ff07b0908bd4e2dbc735a684,1,2017-06-08 21:00:36
99159,0.149019,-0.162323,-0.332588,fffea47cd6d3cc0a88bd621562a9d061,1,2017-12-10 20:07:56
99160,2.143295,-0.162323,-0.205439,ffff371b4d645b6ecea244b27531430a,1,2017-02-07 15:49:16
99161,-0.782947,-0.162323,-0.077833,ffff5962728ec6157033ef9805bacc48,2,2018-05-02 15:17:41


In [146]:
# Order the rows by purchase timestamp (ascending), then inspect dtypes and non-null counts.
data_init = data_init.sort_values(by='order_purchase_timestamp')
data_init.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99163 entries, 71321 to 52609
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Recency                   99163 non-null  float64
 1   Frequency                 99163 non-null  float64
 2   Monetary                  99163 non-null  float64
 3   customer_unique_id        99163 non-null  object 
 4   Clusters                  99163 non-null  int32  
 5   order_purchase_timestamp  99163 non-null  object 
dtypes: float64(3), int32(1), object(2)
memory usage: 4.9+ MB


Je ne garde que le mois et l'année de la colonne order_purchase_timestamp afin d'obtenir une meilleure répartition de nos clients.

In [147]:
# Parse the purchase timestamps (object dtype so far) into datetime values.
purchase_ts = pd.to_datetime(data_init['order_purchase_timestamp'])
data_init['order_purchase_timestamp'] = purchase_ts

In [148]:
# Truncate every purchase timestamp to a 'YYYY-MM' string so that customers are grouped by month.
# The vectorised .dt accessor replaces the former per-row Python lambda (same result, much faster).
# The column must already be datetime64 here, so this cell cannot be re-run on its own output.
data_init['order_purchase_timestamp'] = data_init['order_purchase_timestamp'].dt.strftime('%Y-%m')
data_init

Unnamed: 0,Recency,Frequency,Monetary,customer_unique_id,Clusters,order_purchase_timestamp
71321,3.159985,-0.162323,-0.459178,b7d76e111c89f7ebf14761390f0f7d17,1,2016-09
28088,3.153468,-0.162323,-0.359991,4854e9b3feff728c13ee5fc7d1547e92,1,2016-09
240,3.101330,-0.162323,-0.022280,009b0127b727ab0ba422f6d9604487c7,1,2016-09
50891,3.088296,-0.162323,-0.433758,830d5b7aaa3b6f1e9ad63703bec97d23,1,2016-09
5691,2.977503,-0.162323,-0.154092,0eb1ee9dba87f5b36b4613a65074337c,1,2016-10
...,...,...,...,...,...,...
60423,-1.760534,-0.162323,-0.022280,9bb92bebd4cb7511e1a02d5e50bc4655,2,2018-09
38542,-1.773569,4.501869,0.308175,634420a0ea42302205032ed44ac7fccc,3,2018-10
68083,-1.786603,9.166061,1.522861,af5454198a97379394cacf676e1e96cb,3,2018-10
14814,-1.871327,4.501869,0.981794,262e1f1e26e92e86375f86840b4ffd63,3,2018-10


In [149]:
# Replace the shuffled post-sort index with a fresh 0..n-1 index; the previous index
# labels are kept as an extra column.
data_init = data_init.reset_index()

In [150]:
# Build the datasets for the stability simulation:
#   * list_df[0]  : initial training window, purchase month in (2017-01-01, 2018-01-01]
#   * list_df[1:] : cumulative snapshots of every row with purchase month <= bin_end,
#                   bin_end advancing one month at a time from 2017-02-01 to 2018-10-01.
# NOTE(review): the first cumulative snapshots end before the initial window does and, unlike
# it, include rows dated on or before 2017-01-01 — confirm this is the intended design.
RFM_FEATURES = ['Recency', 'Frequency', 'Monetary']
KMEANS_SEED = 42  # fixed seed so the ARI curve is reproducible from one run to the next

# The column holds 'YYYY-MM' strings at this point; parse them back into month-start timestamps.
data_init.order_purchase_timestamp = pd.to_datetime(data_init.order_purchase_timestamp)
purchase_month = data_init.order_purchase_timestamp

init_start = pd.Timestamp('2017-01-01')
init_end = pd.Timestamp('2018-01-01')
in_init_window = (purchase_month > init_start) & (purchase_month <= init_end)
list_df = [data_init.loc[in_init_window, RFM_FEATURES].reset_index(drop=True)]

# 'MS' = month start: yields 2017-02-01, 2017-03-01, ..., 2018-10-01 (21 cut-off dates).
for bin_end in pd.date_range(start='2017-02-01', end='2018-10-01', freq='MS'):
    snapshot = data_init.loc[purchase_month <= bin_end, RFM_FEATURES]
    list_df.append(snapshot.reset_index(drop=True))

# km_init is the reference model (fitted once); km_cumulative is re-fitted on each snapshot.
km_init = KMeans(n_clusters=4, init='k-means++', max_iter=1000, random_state=KMEANS_SEED)
km_cumulative = KMeans(n_clusters=4, init='k-means++', max_iter=1000, random_state=KMEANS_SEED)

## Réentraînement du modèle KMeans

Pour déterminer le moment où notre modèle devient obsolète, nous allons réentraîner le modèle K-Means à chaque itération et calculer le score ARI. `labels_true` correspond aux labels prédits par le K-Means entraîné sur les données initiales, et `labels_pred` aux labels obtenus par le K-Means réentraîné sur les nouvelles données.

sklearn.metrics.adjusted_rand_score(labels_true, labels_pred)

In [151]:
# Fit the reference model once on the initial window (list_df[0]).
km_init.fit(list_df[0])

# For every dataset, compare the labels predicted by the frozen reference model with the
# labels of a model re-fitted on that same dataset. The adjusted Rand index is invariant
# to label permutations, so the two models' cluster numbering does not need to match.
# The loop variable is deliberately not called `df`, so the orders DataFrame loaded at
# the top of the notebook is not overwritten.
list_ari = []
for snapshot in list_df:
    label_init = km_init.predict(snapshot)
    label_cumulative = km_cumulative.fit_predict(snapshot)
    list_ari.append(adjusted_rand_score(label_init, label_cumulative))

In [155]:
# Collect the per-dataset ARI scores into a single-column table for display and plotting.
d = ['ARI']
ari_table = pd.Series(list_ari, name=d[0]).to_frame()
ari_table

Unnamed: 0,ARI
0,0.999602
1,0.834389
2,0.845362
3,0.974384
4,0.855217
5,0.866606
6,0.903626
7,0.956071
8,0.963928
9,0.957698


In [156]:
# Switch pandas plotting to the plotly backend (global option: it also affects later cells).
pd.options.plotting.backend = "plotly"

# Line chart of the ARI score per iteration; the x axis is the row index of ari_table.
# Title and axis label fixed: "moins" was a typo for "mois" (month).
fig = ari_table.plot(
    title="RFM Predictions per mois for clients segmentation with KMeans using ARI",
    labels=dict(index="mois", value="ARI Score", variable=""),
)
fig.show()

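Sur ce graphique des scores ARI obtenus à chaque itération (par période de 1 mois), on remarque une forte décroissance après 15 mois sur les clients initiaux. (Remarque : le tableau affiché ci-dessus ne contient que 10 itérations ; cette conclusion est à revérifier après une ré-exécution complète du notebook.)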