In [1]:
%pip install scikit-learn==1.7.2

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [2]:
from collections import defaultdict

import numpy as np
import json
import pickle

from sklearn.cluster import KMeans

In [3]:
# Which embedding set to index. The two variants live in the same directory and
# differ only by the "tuned_" filename prefix, so one flag replaces the
# comment/uncomment toggle:
#   True  -> tuned_content_embeddings.pkl -> tuned_index_rqkmeans.json
#   False -> content_embeddings.pkl       -> index_rqkmeans.json
use_tuned_embeddings = True

beauty_data_dir = '../data/Beauty'
embedding_file_prefix = 'tuned_' if use_tuned_embeddings else ''

embeddings_input_path = f'{beauty_data_dir}/{embedding_file_prefix}content_embeddings.pkl'
semantic_index_output_path = f'{beauty_data_dir}/{embedding_file_prefix}index_rqkmeans.json'

In [4]:
import os

# Report the on-disk size of the embeddings pickle, in bytes.
# os.stat raises if the file is missing, so this doubles as an existence check.
embeddings_size_bytes = os.stat(embeddings_input_path).st_size
print(embeddings_size_bytes)

446333898


In [5]:
# NOTE(security): pickle.load can execute arbitrary code while deserialising.
# Only open embedding files produced by a trusted pipeline.
with open(embeddings_input_path, 'rb') as f:
    data = pickle.load(f)

# The rest of the notebook pairs item_ids[i] with X[i] purely by position.
item_ids = np.array(data['item_id'], dtype=np.int64)
X = np.array(data['embedding'], dtype=np.float32)

# Fail fast on a malformed file instead of producing a silently misaligned index.
assert X.ndim == 2, f'expected a 2-D embedding matrix, got shape {X.shape}'
assert item_ids.shape == (X.shape[0],), (
    f'{item_ids.shape[0]} item ids for {X.shape[0]} embedding rows'
)

In [6]:
class RQKMeans:
    """Residual-quantisation k-means: a stack of KMeans models fit on successive residuals.

    The first level clusters the input vectors. Every following level clusters
    what remains after subtracting the centroid selected at the previous level.
    A vector is therefore encoded as one cluster index per level.

    Args:
        num_clusters: Number of centroids in each level's codebook.
        num_codebooks: Number of residual levels, i.e. the length of each code.
        init: Passed unchanged to every ``KMeans``.
        max_iter: Passed unchanged to every ``KMeans``.
        tol: Passed unchanged to every ``KMeans``.
        verbose: Passed unchanged to every ``KMeans``.
        random_state: An integer seed is offset by the level index so that each
            level is seeded differently. Any other value (``None`` or a
            ``RandomState`` instance) is handed to every ``KMeans`` as-is.

    Attributes:
        models: The per-level ``KMeans`` instances, in the order they are fit.
    """

    def __init__(
            self,
            num_clusters,
            num_codebooks,
            init='k-means++',
            max_iter=300,
            tol=1e-4,
            verbose=0,
            random_state=42
    ):
        def level_seed(level):
            # Only integer seeds can be offset; `None + level` would raise TypeError.
            if isinstance(random_state, (int, np.integer)):
                return random_state + level
            return random_state

        self.models = [
            KMeans(
                n_clusters=num_clusters,
                init=init,
                max_iter=max_iter,
                tol=tol,
                verbose=verbose,
                random_state=level_seed(i),
            ) for i in range(num_codebooks)
        ]

    def fit(self, X, y=None):
        """Fit the levels one after another, each on the previous level's residual.

        Args:
            X: Array of input vectors, one per row. It is not modified.
            y: Ignored.

        Returns:
            ``self``.
        """
        residual = X
        for model in self.models:
            assignments = model.fit_predict(residual)
            # What the next level has to explain: the vector minus its centroid.
            residual = residual - model.cluster_centers_[assignments]
        return self

    def predict(self, X):
        """Encode vectors with the fitted codebooks.

        Args:
            X: Array of input vectors, one per row. It is not modified.

        Returns:
            Integer array with one row per input vector and one column per
            level, holding the cluster index chosen at that level.
        """
        codes = []
        residual = X
        for model in self.models:
            level_codes = model.predict(residual)
            codes.append(level_codes)
            residual = residual - model.cluster_centers_[level_codes]
        return np.stack(codes, axis=-1)

In [7]:
# Three residual levels with 256 centroids each; the collision-resolution cell
# further down draws its extra code from a 256-value alphabet as well.
rq_kmeans = RQKMeans(
    num_clusters=256,
    num_codebooks=3,
    max_iter=1000,
)

In [8]:
# Fit the codebooks level by level on the full embedding matrix.
# fit() returns the model itself, which is what the cell displays.
rq_kmeans.fit(X)

<__main__.RQKMeans at 0x7f723f55e950>

In [9]:
# Encode every item: one cluster index per codebook level.
clusters = rq_kmeans.predict(X)

In [11]:
# First ten codes; each row holds one item's per-level cluster indices.
clusters[:10]

array([[243,  23,  64],
       [ 56,  70, 103],
       [  7,  16, 199],
       [  7,  29, 178],
       [ 43, 216,  90],
       [ 68,  60,   2],
       [161,  82, 120],
       [161, 128,   7],
       [233, 137, 240],
       [233,  71, 120]], dtype=int32)

In [12]:
# (number of items, number of codebook levels)
clusters.shape

(12101, 3)

In [13]:
# First ten item ids. In the recorded output they are not in ascending order
# (9 precedes 8), so row position must not be treated as the item id.
item_ids[:10]

array([0, 1, 2, 3, 4, 5, 6, 7, 9, 8])

In [14]:
# Number of ids; should equal the number of rows in X and in clusters.
item_ids.shape

(12101,)

In [15]:
# Peek at the first ten embedding vectors.
X[:10]

array([[-0.00584892, -0.02669609,  0.0093983 , ...,  0.00875165,
         0.0034929 ,  0.00049796],
       [-0.01031132, -0.02947189,  0.0138112 , ..., -0.00578546,
         0.00503929, -0.01395363],
       [-0.00554423,  0.00016405,  0.01172988, ...,  0.00759633,
        -0.0054365 , -0.00408524],
       ...,
       [ 0.00458405, -0.0070419 ,  0.01724852, ...,  0.00067518,
         0.00887071, -0.00383712],
       [-0.00571675,  0.00032094,  0.01758219, ..., -0.00190286,
        -0.00962625,  0.00325021],
       [-0.00188042,  0.00160439,  0.02096209, ...,  0.0001148 ,
        -0.00616271,  0.00545511]], shape=(10, 4096), dtype=float32)

In [16]:
# (number of items, embedding dimension)
X.shape

(12101, 4096)

In [17]:
# Size of the alphabet the extra, collision-resolving code is drawn from
# (the same value as num_clusters used for the codebooks).
collision_alphabet_size = 256
# Fixed seed: with the global unseeded generator the saved index differed on every run.
collision_rng = np.random.default_rng(42)

# Create semantics mapping
#   inter:     item id -> list with one cluster index per codebook level
#   sem_2_ids: tuple of those indices -> ids of every item that received it
# The loop variables deliberately do not reuse the names `item_ids` / `clusters`,
# so both arrays keep their values and this cell can be re-run.
inter = {}
sem_2_ids = defaultdict(list)
for raw_id, item_codes in zip(item_ids, clusters):
    item_id = int(raw_id)
    code = item_codes.tolist()
    inter[item_id] = code
    sem_2_ids[tuple(code)].append(item_id)

# A repeated id would overwrite its entry in `inter` and later receive several
# collision codes, so refuse to continue in that case.
assert len(inter) == len(item_ids), 'item ids must be unique'

# Solve collisions: items sharing the same code get distinct random values
# appended, which makes the full codes unique.
for colliding_ids in sem_2_ids.values():
    # More items than alphabet values could not be told apart.
    assert len(colliding_ids) <= collision_alphabet_size
    collision_solvers = collision_rng.permutation(collision_alphabet_size)[:len(colliding_ids)].tolist()
    for item_id, collision_solver in zip(colliding_ids, collision_solvers):
        # Every item belongs to exactly one group (including groups of size one),
        # so every code gains exactly one trailing element.
        inter[item_id].append(collision_solver)

# Save semantics (json.dump writes the integer item ids as string keys)
with open(semantic_index_output_path, 'w') as f:
    json.dump(inter, f)

Посмотрим, насколько близки в пространстве эмбеддинги до и после tower (по идее, это лучше перенести в cf_finetune, в раздел с прокси-метриками).

In [19]:
# Content embeddings file without the "tuned_" prefix; used below as the
# baseline the tuned embeddings are compared against.
old_embeddings_input_path = '../data/Beauty/content_embeddings.pkl'

In [25]:
# NOTE(security): pickle.load can execute arbitrary code while deserialising.
# Only open embedding files produced by a trusted pipeline.
with open(old_embeddings_input_path, 'rb') as f:
    old_data = pickle.load(f)

# old_item_ids[i] is the id of the item whose embedding is old_X[i].
old_item_ids = np.array(old_data['item_id'], dtype=np.int64)
old_X = np.array(old_data['embedding'], dtype=np.float32)

# Fail fast on a malformed file instead of comparing misaligned rows later.
assert old_X.ndim == 2, f'expected a 2-D embedding matrix, got shape {old_X.shape}'
assert old_item_ids.shape == (old_X.shape[0],), (
    f'{old_item_ids.shape[0]} item ids for {old_X.shape[0]} embedding rows'
)

In [26]:
import torch
import torch.nn.functional as F

# Pair the two embedding sets by item id rather than by row position: nothing
# guarantees that both files list the items in the same order. The ids are read
# from the loaded payloads so the result does not depend on notebook variables
# that other cells may have rebound.
tuned_ids = np.array(data['item_id'], dtype=np.int64)
baseline_ids = np.array(old_data['item_id'], dtype=np.int64)
shared_ids, tuned_rows, baseline_rows = np.intersect1d(
    tuned_ids, baseline_ids, return_indices=True
)
assert len(shared_ids) > 0, 'the two embedding files share no item ids'

# cosine_similarity normalises its inputs itself, so no explicit F.normalize is needed.
cos = F.cosine_similarity(
    torch.tensor(X[tuned_rows]), torch.tensor(old_X[baseline_rows]), dim=1
)
print('mean cos:', cos.mean().item(), 'std:', cos.std().item())

mean cos: 0.012486394494771957 std: 0.012419095262885094
