In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to local .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer
from sklearn.metrics import silhouette_score, silhouette_samples

import warnings
# NOTE(review): this silences every warning for the whole session, including
# deprecation and convergence warnings raised by the libraries used below.
# Consider narrowing the filter to specific categories or modules.
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'yellowbrick'

In [None]:
# Render floating-point values with 2 decimal places in pandas output
pd.options.display.float_format = '{:.2f}'.format

# Plot style: dark grid with grey dotted grid lines
grid_overrides = {"grid.color": ".6", "grid.linestyle": ":"}
sns.set_style("darkgrid", rc=grid_overrides)

## CARGA Y EXPLORACIÓN DE DATOS

In [None]:
# Location of the processed RFM dataset, relative to the notebook's working directory
DATA_PATH = os.path.join(os.pardir, 'datasets', 'processed', 'rfm_dataset.csv')

In [None]:
# Read the RFM table, using the 'Customer ID' column as the row index
data = pd.read_csv(
    DATA_PATH,
    index_col='Customer ID',
)

# Preview five random rows
data.sample(n=5)

In [None]:
# Summary of the frame: index, column dtypes, non-null counts and memory usage.
data.info()

In [None]:
# Descriptive statistics, transposed so each variable is a row
data.describe().transpose()

In [None]:
# Remove customers whose MonetaryValue is negative (dropped by index label)
negative_ids = data.loc[data['MonetaryValue'] < 0].index
data = data.drop(index=negative_ids)

In [None]:
# Show the column labels available after the cleaning step.
data.columns

#### VARIABLES SIN ESCALADO

In [None]:
# Execute the helper script inside this notebook's namespace; presumably it
# defines plotDistributions, which is called below (verify against the script).
%run ../auxiliar_functions/plotDistributions.py
    
# Call the helper on the unscaled variables; the second argument is presumably
# the figure title (TODO confirm against the helper's signature).
plotDistributions(data, 'Distribución de las variables')

#### TRANSFORMACIÓN DE VARIABLES

In [None]:
# Box-Cox power transform for Recency; min-max scaling to [-3, 3] for
# Frequency and MonetaryValue.
# NOTE: Box-Cox only accepts strictly positive input. If Recency can be 0 or
# negative, PowerTransformer will raise; method='yeo-johnson' handles that case.
rfm_columns = ['Recency', 'Frequency', 'MonetaryValue']

pt = PowerTransformer(method='box-cox')
ms = MinMaxScaler(feature_range=(-3,3))

transformer = ColumnTransformer(transformers=[('box-cox', pt, ['Recency']),
                              ('minmax', ms, ['Frequency', 'MonetaryValue'])])

scaled = transformer.fit_transform(data)

# Scaled variables as a DataFrame.
# ColumnTransformer emits its output in transformer order (Recency, Frequency,
# MonetaryValue) and drops every other column, so the labels are given
# explicitly in that order. Reusing data.columns would mislabel the columns if
# the frame's order differed, and would fail with a shape mismatch once extra
# columns such as 'cluster_labels' have been added to `data` and this cell is
# re-run.
scaled = pd.DataFrame(scaled, index=data.index, columns=rfm_columns)
scaled.sample(5)

In [None]:
# Descriptive statistics of the transformed variables, one variable per row
scaled.describe().transpose()

In [None]:
# Same helper call as for the raw data, now on the transformed variables.
plotDistributions(scaled, 'Distribución de las variables transformadas')

In [None]:
# Reduce the scaled features to 2 principal components; random_state is fixed
# so the decomposition is reproducible.
pca = PCA(n_components=2, random_state=0)
scaled_pca = pca.fit_transform(scaled)
# Fraction of the total variance captured by each retained component.
pca.explained_variance_ratio_

## KMeans

#### MÉTODO DEL CODO

In [None]:
# ELBOW METHOD
# ============================================================================
# Fit K-Means over the range of cluster counts given by k=(2, 12) on the PCA
# projection and plot the elbow curve used to choose the number of clusters.
# K-Means centroid initialisation is random, so random_state is fixed (the PCA
# step above also uses 0) to make the curve reproducible across runs.

model = KMeans(random_state=0)
visualizer = KElbowVisualizer(model, k=(2,12))

visualizer.fit(scaled_pca)        # Fit the data to the visualizer
visualizer.show()        # Finalize and render the figure
plt.show()


#### ENTRENAMIENTO DEL MODELO - KMEANS

In [None]:
# K-Means model with 5 clusters (presumably chosen from the elbow plot; confirm).
# random_state is fixed so cluster ids, and therefore the summary table and the
# plots that follow, are the same on every run.
kmeans_model = KMeans(n_clusters=5, random_state=0)

# Train the model. fit() is enough here: fit_transform() also returns the
# sample-to-centroid distance matrix, which was being computed and discarded.
kmeans_model.fit(scaled_pca)

# labels_ follows the row order of scaled_pca, which was built from `data`
# without reordering, so the labels align with data's rows.
data['cluster_labels'] = kmeans_model.labels_

# Per-cluster mean, median and standard deviation of each RFM variable,
# computed on the original (unscaled) values.
summary_stats = ['mean', 'median', 'std']
kmeansData = data.groupby('cluster_labels').agg({'Recency': summary_stats,
                                           'Frequency': summary_stats,
                                           'MonetaryValue': summary_stats})
kmeansData.T

In [None]:
# Interactive 3D view of the customers in RFM space, coloured by cluster label
fig = px.scatter_3d(
    data,
    x="Recency",
    y="Frequency",
    z="MonetaryValue",
    color="cluster_labels",
)
# Optional HTML export, left disabled: fig.write_html("scater_3d.html")
fig.show()

In [None]:
# Box plots of each RFM variable per cluster, one panel per variable.
rfm_features = ['Frequency', 'Recency', 'MonetaryValue']
boxplot_path = os.path.join('..', 'img', 'kmeans_clusters_boxplots_dist.png')

fig, ax = plt.subplots(1, 3, figsize=(18, 6))
for i, feat in enumerate(rfm_features):
    sns.boxplot(x='cluster_labels', y=feat, data=data, ax=ax[i])
    ax[i].set_title(feat)

# savefig does not create missing directories; make sure the target folder
# exists so the cell does not fail with FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(boxplot_path), exist_ok=True)
plt.savefig(boxplot_path)
plt.show()