# Import Packages

In [None]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

import json

from utils import cluster_topk_classes
from modules.models import KMeansModels, KMeansModelConfig 
from constants import PRODUCT_EMBEDDINGS_PATH, CLASS_EMBEDDINGS_PATH, RANDOM_STATE, DEVICE, CLEANED_GPC_PATH

## Read CSVs

In [None]:
# Load the three input tables in one pass, in the same order as before:
# product embeddings, class embeddings, and the file at CLEANED_GPC_PATH.
df_products, df_classes, df_class = (
    pd.read_csv(csv_path)
    for csv_path in (PRODUCT_EMBEDDINGS_PATH, CLASS_EMBEDDINGS_PATH, CLEANED_GPC_PATH)
)

## Load Embeddings

In [None]:
# Each cell of the "embeddings" column is a JSON-encoded string; decode every
# row and collect the results in a plain list.
products_embeddings = list(map(json.loads, df_products["embeddings"]))
classes_embeddings = list(map(json.loads, df_classes["embeddings"]))

## Visualize Embeddings

### Products Embeddings

In [None]:
# Project the product embeddings to two dimensions with t-SNE so they can be
# inspected on a scatter plot.
tsne = TSNE(n_components=2, perplexity=30, learning_rate=400, random_state=RANDOM_STATE)
products = tsne.fit_transform(np.array(products_embeddings))

# No `c=` values are supplied, so a colormap would be ignored (matplotlib
# emits a warning about it); draw the points in the default colour instead.
plt.scatter(products[:, 0], products[:, 1])

plt.xlabel("TSNE 1")
plt.ylabel("TSNE 2")
plt.title("TSNE - 2 Components")
plt.show()

### Classes Embeddings

In [None]:
# Project the class embeddings to two dimensions with t-SNE so they can be
# inspected on a scatter plot.
tsne = TSNE(n_components=2, perplexity=30, learning_rate=400, random_state=RANDOM_STATE)
classes = tsne.fit_transform(np.array(classes_embeddings))

# No `c=` values are supplied, so a colormap would be ignored (matplotlib
# emits a warning about it); draw the points in the default colour instead.
plt.scatter(classes[:, 0], classes[:, 1])

plt.xlabel("TSNE 1")
plt.ylabel("TSNE 2")
plt.title("TSNE - 2 Components")
plt.show()

## Fit Model

In [None]:

# Elbow method: fit K-Means for a range of cluster counts and record the
# inertia of each fitted model.
inertia = []
K = [1] + list(range(5, 100, 5))

# Convert the list-of-lists embeddings to an array once, instead of letting
# every fit repeat the same conversion.
product_matrix = np.asarray(products_embeddings)

for k in K:
    kmeans = KMeans(n_clusters=k, random_state=RANDOM_STATE)
    kmeans.fit(product_matrix)
    inertia.append(kmeans.inertia_)


# Plot inertia against the number of clusters to look for the "elbow".
plt.figure(figsize=(8,5))
plt.plot(K, inertia, 'bo-')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method For Optimal k')
plt.show()


In [None]:
# Silhouette analysis over the candidate cluster counts in K.
sil_scores = []
# Cluster counts that actually produced a score; kept in step with
# sil_scores so the plot's x and y series can never get out of sync.
scored_k = []

for k in K:
    # A silhouette score needs at least two clusters, so skip smaller k
    # without paying for a fit.
    if k < 2:
        continue
    kmeans = KMeans(n_clusters=k, random_state=RANDOM_STATE)
    labels = kmeans.fit_predict(products_embeddings)
    # Guard against a fit that still ends up with a single distinct label.
    if len(set(labels)) <= 1:
        continue
    sil_score = silhouette_score(products_embeddings, labels)
    sil_scores.append(sil_score)
    scored_k.append(k)

plt.figure(figsize=(8,5))
# Plot against the counts that were scored rather than a fixed slice of K,
# which would mismatch in length if any k other than the first was skipped.
plt.plot(scored_k, sil_scores, 'ro-')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Method For Optimal k')
plt.show()


In [None]:
# Fit the final model with a fixed 40 clusters on the product embeddings.
kmeans = KMeans(n_clusters=40, random_state=RANDOM_STATE).fit(products_embeddings)
# The fitted estimator exposes the per-sample cluster assignment as `labels_`.
labels = kmeans.labels_

In [None]:
# Start with an empty bucket for every cluster index so that clusters without
# members still appear in the mapping.
cluster_items = {cluster_id: [] for cluster_id in range(kmeans.n_clusters)}
# Store each product as (row position, embedding) under its assigned cluster.
for position, assigned_cluster in enumerate(labels):
    member = (position, products_embeddings[position])
    cluster_items[assigned_cluster].append(member)

In [None]:
# Inspect the centroid of cluster 0. No trailing comma, so the array itself is
# displayed rather than a one-element tuple wrapping it.
kmeans.cluster_centers_[0]

In [None]:
# Build a one-row tensor holding only the centroid of cluster 0. Slicing with
# [:1] keeps the data as a single ndarray, which avoids the slow
# list-of-ndarrays conversion path (and the warning that comes with it).
cluster_embeddings = torch.as_tensor(kmeans.cluster_centers_[:1], dtype=torch.float16, device=DEVICE)
# as_tensor accepts both the decoded Python list and an existing tensor, so
# re-running this cell after the name has been rebound to a tensor does not
# trigger the copy-construct warning that torch.tensor() would emit.
classes_embeddings = torch.as_tensor(classes_embeddings, dtype=torch.float16, device=DEVICE)
# Project helper; presumably returns the 3 best-matching classes for each
# cluster embedding — confirm against utils.cluster_topk_classes.
classes = cluster_topk_classes(cluster_embeddings, classes_embeddings, 3)

In [None]:
# Flatten the nested result by one level: concatenate every inner list into a
# single flat list, then rebind `classes` to it.
flattened = []
for row in classes.tolist():
    flattened.extend(row)
classes = flattened

In [None]:
# Display the current value of `classes`.
classes

In [None]:
# Display the (row position, embedding) pairs stored for cluster 0.
cluster_items[0]

In [None]:
from collections import Counter

# Tally how many times each value occurs in `classes`.
counter = Counter(classes)

# most_common(1) yields a one-element list holding the (value, frequency)
# pair with the highest frequency; split that pair into its two parts.
top_entry = counter.most_common(1)[0]
mode = top_entry[0]
count = top_entry[1]

In [None]:
# Display the most frequent value in `classes` together with its frequency.
mode, count

In [None]:
# Show the rows of df_class whose "id" equals 3345.
# NOTE(review): the id is hard-coded; it looks like it was copied from an
# earlier run's output (possibly `mode`) — confirm before reusing this cell.
df_class.loc[df_class["id"].eq(3345)]

In [None]:
# Display the product embeddings DataFrame.
df_products