# Import Packages

In [13]:
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import accuracy_score, classification_report
from collections import Counter
import json

from constants import PRODUCT_TEST_EMBEDDINGS_PATH, CLASS_EMBEDDINGS_PATH, RANDOM_STATE, DEVICE, CLEANED_GPC_PATH, CLEANED_TEST_DATA_PATH

In [14]:
# Load the four CSV files named by the imported path constants into DataFrames.
# The files are read in the same order as the names on the left-hand side.
product_df, class_df, product_embedding_df, class_embedding_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        CLEANED_TEST_DATA_PATH,
        CLEANED_GPC_PATH,
        PRODUCT_TEST_EMBEDDINGS_PATH,
        CLASS_EMBEDDINGS_PATH,
    )
)

In [18]:
# Preview the first rows of the product embedding table as the cell output.
product_embedding_df.head()

Unnamed: 0,id,embeddings
0,0,"[0.0131988525390625, 0.0203704833984375, -0.00..."
1,1,"[0.0016574859619140625, 0.041656494140625, 0.0..."
2,2,"[0.0162811279296875, 0.018951416015625, -0.025..."
3,3,"[0.0190277099609375, 0.0229949951171875, -0.01..."
4,4,"[0.0226593017578125, 0.0291900634765625, -0.02..."


In [None]:
# Attach the descriptive columns to each embedding table via the shared "id" column.
# No `how` is given, so pandas performs an inner join: ids present on only one side
# do not appear in the result.
product_full = pd.merge(product_embedding_df, product_df, on="id")
class_full = pd.merge(class_embedding_df, class_df, on="id")

In [None]:
def _embedding_column_to_tensor(frame):
    """Decode the JSON-encoded "embeddings" column of `frame` into a float16 tensor on DEVICE.

    Each cell of the column is a JSON string holding one list of numbers; the
    decoded lists are stacked in row order into a single tensor.
    """
    decoded_rows = list(map(json.loads, frame["embeddings"].tolist()))
    return torch.tensor(decoded_rows, dtype=torch.float16, device=DEVICE)


products_embeddings = _embedding_column_to_tensor(product_full)
classes_embeddings = _embedding_column_to_tensor(class_full)

In [None]:
# Copy both embedding tensors to host memory as NumPy arrays (the form passed to
# scikit-learn's NearestNeighbors further down).
#
# torch.as_tensor returns an existing tensor unchanged and wraps an ndarray, so this
# cell is safe to re-run: on a second execution the names already hold ndarrays, and
# calling `.cpu()` on them directly would raise
# "'numpy.ndarray' object has no attribute 'cpu'".
products_embeddings = np.array(torch.as_tensor(products_embeddings).cpu())
classes_embeddings = np.array(torch.as_tensor(classes_embeddings).cpu())

In [None]:
# Candidate neighbour counts: 1, followed by every multiple of 3 from 3 through 78.
K = [1, *range(3, 80, 3)]

# Seed NumPy's global random number generator.
np.random.seed(RANDOM_STATE)

In [None]:
# Sweep over the candidate k values and record, for each k, the average "confidence"
# of the nearest class: the share of the total inverse-distance weight that the
# single heaviest neighbour holds among a product's k nearest class embeddings.
#
# A single brute-force search at the largest k is sufficient. kneighbors returns each
# row's neighbours sorted by increasing distance, so the first k columns of the
# max-k result are exactly the distances a separate k-neighbour search would return.
# This replaces one full fit + search per value in K.
knn = NearestNeighbors(n_neighbors=max(K), metric='cosine', algorithm='brute')
knn.fit(classes_embeddings)
distances, indices = knn.kneighbors(products_embeddings)

# Inverse-distance weights; the small epsilon keeps a zero distance from dividing by zero.
weights = 1 / (distances + 1e-8)

avg_confidences = []
for k in K:
    top_k_weights = weights[:, :k]
    # Per product: weight of the heaviest neighbour relative to the sum over its k neighbours.
    confidences = top_k_weights.max(axis=1) / top_k_weights.sum(axis=1)
    avg_confidences.append(confidences.mean())

plt.figure(figsize=(8,5))
plt.plot(K, avg_confidences, 'bo-')
plt.xlabel('Number of Neighbors (k)')
plt.ylabel('Average Confidence Score')
plt.title('KNN Performance')
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
# Final model: 3 nearest class embeddings per query, found by exhaustive
# (brute-force) search under cosine distance.
knn = NearestNeighbors(
    algorithm='brute',
    metric='cosine',
    n_neighbors=3,
)
knn.fit(classes_embeddings)

In [None]:
# For every product embedding, look up the row positions (into classes_embeddings)
# of its nearest class embeddings.
# Only the indices are used afterwards, so ask for them alone instead of unpacking the
# distances into `_`: in a notebook, binding `_` yourself stops IPython from updating
# its "last output" variable.
indices = knn.kneighbors(products_embeddings, return_distance=False)

In [None]:
# Predict a class per product by majority vote over its neighbours' class *names*.
#
# The vote must be taken over names, not over the neighbour row indices: the indices
# returned for one query are always distinct, so counting them gives every neighbour
# exactly one vote, the "confidence" is always 1/k, and rows that share a class name
# never pool their votes.
#
# Ties are resolved in favour of the class met first in the neighbour list
# (Counter.most_common keeps first-encountered order for equal counts).
#
# Outputs (one entry per product):
#   pred_class_names - winning class name
#   confidences      - fraction of the neighbours that carry the winning name
#   pred_classes     - row position in class_full of the first neighbour with that name
class_names = class_full["class_name"].to_numpy()

pred_classes = []
pred_class_names = []
confidences = []
for idx in indices:
    neighbour_names = [class_names[i] for i in idx]
    winner, n_votes = Counter(neighbour_names).most_common(1)[0]
    pred_class_names.append(winner)
    confidences.append(n_votes / len(idx))
    pred_classes.append(idx[neighbour_names.index(winner)])

In [None]:
# Add the predicted class name and its vote-based confidence as two new columns
# (overwriting them if they already exist).
product_full = product_full.assign(
    predicted_class_name=pred_class_names,
    prediction_confidence=confidences,
)

In [None]:
# Preview the first rows of the product table, now including the prediction columns.
product_full.head()