In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
from function_utils import display_circles
import gc

In [None]:
# Use seaborn's "Pastel2" palette as the default colour cycle for the plots below.
sns.set_palette("Pastel2")

In [None]:
# Load the RFM segmentation table and index it by customer identifier.
df = (
    pd.read_csv("datasets/data_segmentation_rfm.csv")
    .set_index("customer_unique_id")
)
df.head(3)

In [None]:
# Print the column dtypes, non-null counts and memory footprint of the RFM table.
df.info()

# Préparation des données

In [None]:
# log(1 + x) transform of the three RFM variables, one output column per variable.
rfm_columns = ["Recency", "Frequency", "MonetaryValue"]
customers = pd.DataFrame({name: np.log1p(df[name]) for name in rfm_columns})
customers.tail()

In [None]:
# One histogram (with KDE) per log-transformed variable, sharing the x axis.
fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharex=True)
fig.suptitle('Distribution des variables après transformation logarithme (+1)')

panels = [
    ("Recency", "Distribution des jours passés depuis la dernière commande"),
    ("Frequency", "Distribution de la fréquence d'achats des clients"),
    ("MonetaryValue", "Distribution des sommes dépensées par les clients"),
]
for ax, (column, title) in zip(axes, panels):
    sns.histplot(ax=ax, x=column, data=customers, kde=True)
    ax.set_title(title)

plt.tight_layout()
plt.show();

- Standardisation

In [None]:
from sklearn.preprocessing import StandardScaler

# Centre and scale every log-transformed feature; the fitted scaler is kept for reuse.
scaler = StandardScaler()
customers_normalized = scaler.fit_transform(customers)

# Sanity check: each column should now have mean ~0 and standard deviation ~1.
column_means = customers_normalized.mean(axis=0).round(2)
column_stds = customers_normalized.std(axis=0).round(2)
print("Mean:", column_means)
print("Standard Deviation:", column_stds)

# Réduction de dimensions

In [None]:
from sklearn.decomposition import PCA

# Project the standardized RFM features onto their first two principal components.
n_comp = 2
features = customers.columns  # original feature names, used to label the correlation circle

pca = PCA(n_components=n_comp)
data_pca = pca.fit_transform(customers_normalized)
pcs = pca.components_  # loadings: one row per component, one column per feature

In [None]:
# Draw the correlation circle for the plane of the first two components (0-based pair (0, 1)).
# NOTE(review): display_circles comes from the project-local function_utils module;
# its exact plotting behaviour is not visible here.
display_circles(pcs, n_comp, pca, [(0,1)], labels = np.array(features))

# Modélisation

## K-Means

In [None]:
# Elbow method to pick the number of clusters.
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer


# Seed the estimator so the elbow curve is reproducible across runs
# (the other clustering models in this notebook use random_state=42).
model = KMeans(random_state=42)
visualizer = KElbowVisualizer(model, k =(2, 10), timings=False)
visualizer.fit(data_pca)
visualizer.show();

In [None]:
# Re-apply the "Pastel2" palette before drawing the cluster plots.
sns.set_palette("Pastel2")

In [None]:
# Fit K-Means with 4 clusters and assign a cluster to every customer.
# The seed makes the partition (and everything derived from km.labels_) reproducible.
km = KMeans(n_clusters=4, random_state=42)

# Cluster label of each row of the PCA projection
label = km.fit_predict(data_pca)

centroids = km.cluster_centers_
u_labels = np.unique(label)

# Scatter plot of the clusters in the PCA plane, with centroids drawn in black
plt.figure(figsize=(10,10))
for i in u_labels:
    plt.scatter(data_pca[label == i , 0] , data_pca[label == i , 1] , label = i)
plt.scatter(centroids[:,0] , centroids[:,1] , s = 80, alpha = 0.8, color = 'k')
plt.legend()
plt.title('Représentation des clusters du K-Means')
#plt.savefig("Présentation/Images/representation_kmeans.png")
plt.show();

### Description des clusters obtenus avec le modèle

In [None]:
# Attach each customer's cluster, then profile the clusters by the mean of the raw RFM values.
df["Cluster"] = km.labels_
rfm_columns = ["Recency", "Frequency", "MonetaryValue"]
df.groupby("Cluster")[rfm_columns].mean().round(2)

In [None]:
# Standardized features plus customer id and cluster label, one row per customer.
rfm_columns = ["Recency", "Frequency", "MonetaryValue"]
df_normalized = pd.DataFrame(customers_normalized, columns=rfm_columns).assign(
    ID=df.index,
    Cluster=km.labels_,
)

# Long format: one row per (customer, attribute) pair, ready for seaborn.
df_nor_melt = df_normalized.reset_index().melt(
    id_vars=["ID", "Cluster"],
    value_vars=rfm_columns,
    var_name="Attribute",
    value_name="Value",
)

df_nor_melt.head()

In [None]:
# Snake plot: mean standardized value of each attribute, one line per cluster.
# x and y are passed by keyword because recent seaborn releases only accept
# `data` positionally and reject positional x/y.
sns.lineplot(x="Attribute", y="Value", hue="Cluster", data=df_nor_melt)
plt.show()

### Evaluation du modèle

In [None]:
from yellowbrick.cluster import SilhouetteVisualizer

# Silhouette plot for the 4-cluster K-Means model on the PCA projection.
visualizer = SilhouetteVisualizer(km, colors='yellowbrick')

visualizer.fit(data_pca)        # Fit the data to the visualizer
visualizer.show();              # Finalize and render the figure

In [None]:
from sklearn.metrics import silhouette_score, davies_bouldin_score

# Internal validation metrics of the K-Means partition on the PCA projection.
silhouette = silhouette_score(data_pca, label)
davies_bouldin = davies_bouldin_score(data_pca, label)

print("Silhouette Score:", silhouette)
print("Score de Davies-Bouldin:", davies_bouldin)

In [None]:
from yellowbrick.cluster import intercluster_distance

# Inter-cluster distance map, computed with a fresh seeded 4-cluster K-Means.
distance_model = KMeans(n_clusters=4, random_state=42)
intercluster_distance(distance_model, data_pca);

In [None]:
# Drop the first experiment's large objects before loading the next dataset.
del data_pca, df, customers, df_normalized, df_nor_melt

In [None]:
# Run a garbage-collection pass now that the names above have been deleted.
gc.collect()

## K-Means (2)

In [None]:
# Load the enriched clustering table and index it by customer identifier.
data = (
    pd.read_csv("datasets/data_clustering_complete.csv")
    .set_index("customer_unique_id")
)
data.head()

In [None]:
# log(1 + x) transform of the five numeric features used for the second K-Means.
numeric_features = [
    "Recency",
    "Frequency",
    "MonetaryValue",
    "nb_days_before_delivered",
    "distance_customer_seller",
]
customers = pd.DataFrame({name: np.log1p(data[name]) for name in numeric_features})
customers.tail()

In [None]:
# Standardize the five log-transformed features; `scaler` stays fitted on them.
scaler = StandardScaler().fit(customers)
customers_transform = scaler.transform(customers)

In [None]:
# Reduce the standardized features to two principal components.
n_comp = 2
features = customers.columns  # feature names for the correlation circle

pca = PCA(n_components=n_comp)
# NOTE: customers_transform is rebound here, from the standardized feature matrix
# to its 2-column PCA projection.
customers_transform = pca.fit_transform(customers_transform)
pcs = pca.components_

In [None]:
# Correlation circle for the plane of the two fitted components.
# The PCA has only n_comp = 2 components, so the only valid 0-based pair is (0, 1);
# (1, 2) pointed at a third component that does not exist.
display_circles(pcs, n_comp, pca, [(0, 1)], labels=np.array(features))

In [None]:
# Elbow method to pick the number of clusters.

# Seed the estimator so the elbow curve is reproducible across runs.
model = KMeans(random_state=42)
visualizer = KElbowVisualizer(model, k =(2, 10), timings=False)
visualizer.fit(customers_transform)
visualizer.show();

In [None]:
# Apply the "white" style together with the "Pastel2" palette in one call.
# The first positional argument of set_theme is `context`, not `style`, and
# set_theme resets the palette to its own default, so both are passed by keyword.
sns.set_theme(style="white", palette="Pastel2")

In [None]:
# Fit K-Means with 5 clusters and assign a cluster to every customer.
# The seed makes the partition (and everything derived from model.labels_) reproducible.
model = KMeans(n_clusters=5, random_state=42)

# Cluster label of each row of the PCA projection
label = model.fit_predict(customers_transform)

centroids = model.cluster_centers_
u_labels = np.unique(label)

# Scatter plot of the clusters in the PCA plane, with centroids drawn in black
plt.figure(figsize=(10,10))
for i in u_labels:
    plt.scatter(customers_transform[label == i , 0] ,customers_transform[label == i , 1] , label = i)
plt.scatter(centroids[:,0] , centroids[:,1] , s = 80, alpha = 0.8, color = 'k')
plt.legend()
plt.title('Représentation des clusters du K-Means')
#plt.savefig("Présentation/Images/representation_kmeans.png")
plt.show();

In [None]:
def most_frequent(values):
    """Return the most common value of a Series (first entry of value_counts)."""
    return values.value_counts().index[0]


# Attach the cluster labels, then profile each cluster: mean of the numeric
# columns and most frequent value of the categorical ones.
data["Cluster"] = model.labels_

profile_spec = {
    "Recency": "mean",
    "Frequency": "mean",
    "MonetaryValue": "mean",
    "nb_days_before_delivered": "mean",
    "distance_customer_seller": "mean",
    "product_category_name": most_frequent,
    "customer_city": most_frequent,
    "review_score": "mean",
}
data.groupby("Cluster").agg(profile_spec).round(2)

In [None]:
# customers_transform holds the 2-column PCA projection at this point, so it cannot
# be labelled with the original feature names. Rebuild the standardized feature
# matrix from the already-fitted scaler instead. "review_score" is not one of the
# scaled features in `customers`, so it is not part of this table.
feature_names = list(customers.columns)
df_normalized = pd.DataFrame(scaler.transform(customers), columns=feature_names)
df_normalized['ID'] = data.index
df_normalized["Cluster"] = model.labels_

# Long format: one row per (customer, attribute) pair, ready for seaborn.
df_nor_melt = pd.melt(df_normalized.reset_index(),
                      id_vars=["ID", "Cluster"],
                      value_vars=feature_names,
                      var_name="Attribute",
                      value_name="Value")

df_nor_melt.head()

In [None]:
# Snake plot: mean standardized value of each attribute, one line per cluster.
# x and y are passed by keyword because recent seaborn releases only accept
# `data` positionally and reject positional x/y.
sns.lineplot(x="Attribute", y="Value", hue="Cluster", data=df_nor_melt)
plt.show()

In [None]:
# Silhouette plot for the 5-cluster K-Means model on the PCA projection.
visualizer = SilhouetteVisualizer(model, colors='yellowbrick')

visualizer.fit(customers_transform)        # Fit the data to the visualizer
visualizer.show();              # Finalize and render the figure

In [None]:
from sklearn.metrics import silhouette_score, davies_bouldin_score

# Internal validation metrics of the 5-cluster partition on the PCA projection.
silhouette = silhouette_score(customers_transform, label)
davies_bouldin = davies_bouldin_score(customers_transform, label)

print("Silhouette Score:", silhouette)
print("Score de Davies-Bouldin:", davies_bouldin)

In [None]:
# Inter-cluster distance map, computed with a fresh seeded 5-cluster K-Means.
distance_model = KMeans(n_clusters=5, random_state=42)
intercluster_distance(distance_model, customers_transform);

## Stabilité du clustering

In [None]:
from sklearn.metrics import adjusted_rand_score

# Stability check: refit K-Means with several seeds and compare every partition
# with the reference labels using the Adjusted Rand Index (1.0 = identical
# partitions, values near 0 = agreement no better than chance).
ari = pd.Series(
    {
        seed: adjusted_rand_score(
            label,
            KMeans(n_clusters=model.n_clusters, random_state=seed).fit_predict(customers_transform),
        )
        for seed in range(5)
    },
    name="ARI",
)
ari.index.name = "random_state"
ari

## K-Prototype

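Référence : [The k-prototype as clustering algorithm for mixed data type (categorical and numerical)](https://towardsdatascience.com/the-k-prototype-as-clustering-algorithm-for-mixed-data-type-categorical-and-numerical-fe7c50538ebb)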

In [None]:
from kmodes.kprototypes import KPrototypes

In [None]:
# Apply log(1 + x) in place to every float column of `data`.
# `cols` is kept because the inverse transform further down iterates over it.
# NOTE: this cell is not idempotent; running it twice applies the log twice.
cols = data.columns

float_columns = [name for name in cols if data[name].dtypes == "float"]
for name in float_columns:
    data[name] = np.log1p(data[name].values)

In [None]:
# Customers whose Recency is below 30, 60, ..., 180 (one subset per threshold).
# NOTE(review): if Recency is stored as float it has just been log1p-transformed,
# in which case these thresholds would no longer be in days; confirm the dtype.
(
    data_test_30d,
    data_test_60d,
    data_test_90d,
    data_test_120d,
    data_test_150d,
    data_test_180d,
) = (data.loc[data["Recency"] < threshold] for threshold in range(30, 181, 30))

In [None]:
# Preview the table after the in-place log transform of its float columns.
data.head()

In [None]:
# Columns routed to the categorical pipeline (imputation only).
cat_var = ["review_score", "customer_city", "product_category_name"]
# Columns routed to the numeric pipeline (imputation + standardization).
num_var = ["Frequency", "nb_days_before_delivered", "nb_days_before_delivered_estimation", 
           "MonetaryValue", "distance_customer_seller"]

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Categorical columns: fill missing values with the most frequent category.
# (`fill_value` is only used by strategy="constant", so it is not passed here.)
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

# Numeric columns: fill missing values with the median, then standardize.
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('scaler', StandardScaler())
])

# Output column order is the categorical columns first, then the numeric ones.
preprocessor = ColumnTransformer(
    transformers=[
    ('cat', cat_pipe, cat_var),
    ('num', num_pipe, num_var)
])

In [None]:
# Run the preprocessing and wrap the result in a DataFrame. The ColumnTransformer
# emits the categorical columns first and the numeric ones second, which is the
# order of cat_var followed by num_var.
preprocessed_values = preprocessor.fit_transform(data)
data_normalized = pd.DataFrame(preprocessed_values, columns=cat_var + num_var)

In [None]:
# Cast the numeric columns to float32 so that only the categorical columns keep
# the object dtype.
data_normalized = data_normalized.astype(dict.fromkeys(num_var, "float32"))

In [None]:
# Positional indices of the object-dtype (categorical) columns, as needed by KPrototypes.
object_columns = list(data_normalized.select_dtypes('object').columns)
catColumnsPos = [data_normalized.columns.get_loc(name) for name in object_columns]

print('Categorical columns           : {}'.format(object_columns))
print('Categorical columns position  : {}'.format(catColumnsPos))

In [None]:
# Convert the preprocessed frame to a NumPy array (the input passed to KPrototypes below).
dfMatrix = data_normalized.to_numpy()
dfMatrix

In [None]:
# Fit K-Prototypes with 4 clusters on the mixed-type matrix; `categorical` gives
# the positions of the categorical columns.
kprototype = KPrototypes(
    n_clusters=4,
    init='Huang',
    n_jobs=-1,
    random_state=42,
)
kprototype.fit_predict(dfMatrix, categorical=catColumnsPos)

In [None]:
# Display the centroids of the fitted K-Prototypes model.
kprototype.cluster_centroids_

In [None]:
# Store the K-Prototypes label of each row; labels are assigned positionally, so
# this relies on dfMatrix having the same row order as `data`.
data['cluster_id'] = kprototype.labels_

In [None]:
# Bundle the preprocessing and the K-Prototypes model into one pipeline object.
# NOTE(review): the pipeline is only constructed here, never fitted; fitting it
# would need a way to pass `categorical` to the model step.
pipeline_steps = [
    ("preprocessing", preprocessor),
    ("model", kprototype),
]
clustering_pipeline = Pipeline(pipeline_steps)

### Analyse du clustering

In [None]:
# Every float column of `data` was log1p-transformed before clustering. If
# review_score is one of them, it must be mapped back to its original scale
# before the integer cast: casting the log values directly would truncate them
# to 0 or 1, and the later expm1 pass skips the column once it is an integer.
# round() absorbs the floating-point error of expm1(log1p(x)).
if data["review_score"].dtypes == "float":
    data["review_score"] = np.expm1(data["review_score"]).round()
data["review_score"] = data["review_score"].astype("int32")

In [None]:
# Undo the earlier log(1 + x) transform: apply exp(x) - 1 to every column of
# `cols` that is still float.
float_columns = [name for name in cols if data[name].dtypes == "float"]
for name in float_columns:
    data[name] = np.expm1(data[name].values)

In [None]:
# Profile the K-Prototypes clusters by the mean of each numeric indicator.
profile_columns = [
    "Frequency",
    "MonetaryValue",
    "distance_customer_seller",
    "nb_days_before_delivered",
    "nb_days_before_delivered_estimation",
    "review_score",
]
data.groupby("cluster_id")[profile_columns].mean().round(2)