In [1]:
%matplotlib notebook
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Load the raw event and access tables from local pickle files.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
eventos = pd.read_pickle("./datasets/evento.pkl")
acessos = pd.read_pickle("./datasets/acesso.pkl")

In [3]:
# Per-consumer event aggregates: number of events and total price.
consumidor_evento = (
    eventos
    .groupby("id_consumidor_ecommerce")
    .agg(
        id_evento=("id_evento", "count"),
        preco=("preco", "sum"),
    )
    .reset_index()
)

# Per-consumer number of access records.
consumidor_acessos = (
    acessos
    .groupby("id_consumidor_ecommerce")["id_acessos"]
    .count()
    .reset_index()
)

In [4]:
# Combine the per-consumer event and access aggregates. The inner merge keeps
# only consumers present in both frames and yields a fresh 0..n-1 index, so no
# leftover positional "index" column is created (the previous
# join(...).reset_index() added one).
vendas_acessos = (
    consumidor_evento
    .merge(
        consumidor_acessos,
        on="id_consumidor_ecommerce",
        how="inner",
        # Both inputs are groupby results, so each consumer id appears once per side.
        validate="one_to_one",
    )
    # Chained rename instead of inplace=True keeps the cell a single expression.
    .rename(columns={
        "id_evento": "count_compras",
        "preco": "sum_preco",
        "id_acessos": "count_acessos",
    })
)

In [5]:
# Numeric columns used as clustering inputs (the consumer id is left out).
features = vendas_acessos.loc[:, ["count_compras", "sum_preco", "count_acessos"]]

In [6]:
# Standardize every feature column before clustering so that columns with
# large magnitudes (e.g. sum_preco) do not dominate the distance computation.
scaler = StandardScaler()
# DataFrame.ffill() replaces fillna(method="ffill"): the `method` keyword of
# fillna is deprecated in recent pandas releases. The forward-fill itself is
# unchanged.
# NOTE(review): the inputs are count/sum aggregates, which are not expected to
# contain NaN - this fill is presumably only a safeguard; confirm.
scaled_features = scaler.fit_transform(features.ffill())

In [7]:
# Sweep candidate cluster counts and record two diagnostics for each k:
# the KMeans inertia (within-cluster sum of squares) and the silhouette score.
sse = []
silhouette_coefficients = []
k_axis = range(2, 11)  # silhouette_score is undefined for a single cluster, so start at 2
for k in k_axis:
    kmeans = KMeans(random_state=42, n_clusters=k)
    kmeans.fit(scaled_features)
    sse.append(kmeans.inertia_)

    score = silhouette_score(scaled_features, kmeans.labels_)
    silhouette_coefficients.append(score)

# Column keys are kept exactly as before (including the "silhoutte" spelling)
# so that any other cell reading `scores` keeps working.
scores = pd.DataFrame(index=k_axis, data={"elbow": sse, "silhoutte": silhouette_coefficients})

# Inertia and silhouette live on very different scales (silhouette is bounded
# by [-1, 1]); drawing them on one shared y-axis flattens the silhouette curve.
# Give each metric its own labelled panel instead.
fig_scores, (ax_elbow, ax_silhouette) = plt.subplots(1, 2, figsize=(10, 4))

sns.lineplot(x=scores.index, y=scores["elbow"], marker="o", ax=ax_elbow)
ax_elbow.set(title="Elbow method", xlabel="k (number of clusters)", ylabel="SSE (inertia)")

sns.lineplot(x=scores.index, y=scores["silhoutte"], marker="o", ax=ax_silhouette)
ax_silhouette.set(title="Silhouette analysis", xlabel="k (number of clusters)", ylabel="silhouette score")

fig_scores.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

In [20]:
# Fit the final model and visualise the clusters in the original
# (unscaled) feature space.
# NOTE(review): n_clusters=4 is presumably read off the elbow/silhouette plot - confirm.
kmeans = KMeans(random_state=42, n_clusters=4)
kmeans.fit(scaled_features)

fig = plt.figure()
ax = fig.add_subplot(projection="3d")
labels = kmeans.labels_
u_labels = np.unique(labels)
for cluster in u_labels:
    # Build the row mask once per cluster instead of once per plotted axis.
    in_cluster = labels == cluster
    members = features.iloc[in_cluster]
    ax.scatter(
        members.iloc[:, 0],
        members.iloc[:, 1],
        members.iloc[:, 2],
        label=f"cluster {cluster}",
    )

# Label the axes with the plotted column names so the figure stands on its own.
ax.set_xlabel(features.columns[0])
ax.set_ylabel(features.columns[1])
ax.set_zlabel(features.columns[2])
ax.legend()
plt.show()

<IPython.core.display.Javascript object>

In [18]:
# Preview the features with each row's cluster label attached. `assign` returns
# a new frame that is only displayed here; `features` itself is not modified.
features.assign(cluster = labels )

Unnamed: 0,count_compras,sum_preco,count_acessos,cluster
0,1,1655.69,1,0
1,2,4399.66,2,0
2,1,1497.03,1,0
3,1,1594.61,1,0
4,1,1891.57,1,0
5,3,6244.14,3,0
6,2,3762.66,2,0
7,4,7913.98,4,0
8,7,13403.77,7,0
9,7,15090.97,7,0


In [23]:
from pathlib import Path

# Persist the feature table for a later step. The target file is derived from
# the directory that was just created, so the two locations cannot drift apart
# (previously the directory name was repeated in a separate string literal).
p = Path("pre_processed_data")
p.mkdir(parents=True, exist_ok=True)
# NOTE(review): `features` is saved without the cluster labels, because the
# earlier `features.assign(cluster=...)` result is never stored - confirm this
# is what the consumer of this file expects.
features.to_pickle(p / "decision_tree")