In [1]:
import pandas as pd
import numpy as np
import json
from scipy import sparse as sp
from tqdm.notebook import tqdm
from collections import defaultdict

In [2]:
import sys
sys.path.append('../')

from src.utils import get_shard_path, ProductEncoder, make_coo_row
from src.metrics import normalized_average_precision

In [4]:
product_encoder = ProductEncoder('data/raw/products.csv')

In [5]:
rows = []
for shard_id in range(4):
    for js in tqdm(json.loads(l) for l in open(get_shard_path(shard_id))):
        rows.append(make_coo_row(js["transaction_history"], product_encoder))

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [6]:
X_sparse = sp.vstack(rows)

In [7]:
X_sparse.shape

(183938, 43038)

In [8]:
X_stored = X_sparse.tocsr()

In [10]:
from sklearn.decomposition import TruncatedSVD

In [11]:
svd = TruncatedSVD(n_components=128)
X_dense = svd.fit_transform(X_sparse)

In [13]:
from sklearn.neighbors import NearestNeighbors

In [14]:
num_neighbours = 256
knn = NearestNeighbors(n_neighbors=num_neighbours, metric="cosine")
knn.fit(X_dense)

In [15]:
valid_data = [json.loads(l) for l in open(get_shard_path(7))][:3000]

In [16]:
m_ap = []
for js in tqdm(valid_data):
    row_sparse = make_coo_row(js["transaction_history"], product_encoder)
    row_dense = svd.transform(row_sparse)
    knn_result = knn.kneighbors(row_dense, n_neighbors=num_neighbours)
    neighbors = knn_result[1]
    scores = np.asarray(X_stored[neighbors[0]].sum(axis=0)[0]).flatten()
    top_indices = np.argsort(-scores)
    recommended_items = product_encoder.toPid(top_indices[:30])
    gt_items = js["target"][0]["product_ids"]
    m_ap.append(normalized_average_precision(gt_items, recommended_items, k=30))
print(np.mean(m_ap))

  0%|          | 0/3000 [00:00<?, ?it/s]

0.09657134817881471


In [17]:
! mkdir -p ../tmp/u2u

In [18]:
import pickle
pickle.dump(X_stored, open('../tmp/u2u/X_stored.pkl', "wb"))
pickle.dump(svd, open('../tmp/u2u/svd.pkl', "wb"))
pickle.dump(knn, open('../tmp/u2u/knn.pkl', "wb"))

In [19]:
! ls -lah ../tmp/u2u

[38;2;64;120;242md[39m[38;5;2mr[39m[38;5;3mw[39m[38;5;1mx[39m[38;5;2mr[39m[38;5;245m-[39m[38;5;1mx[39m[38;5;2mr[39m[38;5;245m-[39m[38;5;1mx[39m[38;5;6m[39m [38;5;2msap[39m [38;5;1msap[39m [38;5;184m52[39m [38;5;184mB[39m  [38;5;40mTue Nov 12 01:41:23 2024[39m [38;2;64;120;242m .[39m
[38;2;64;120;242md[39m[38;5;2mr[39m[38;5;3mw[39m[38;5;1mx[39m[38;5;2mr[39m[38;5;245m-[39m[38;5;1mx[39m[38;5;2mr[39m[38;5;245m-[39m[38;5;1mx[39m[38;5;6m[39m [38;5;2msap[39m [38;5;1msap[39m [38;5;184m12[39m [38;5;184mB[39m  [38;5;40mTue Nov 12 01:41:22 2024[39m [38;2;64;120;242m ..[39m
.[38;5;2mr[39m[38;5;3mw[39m[38;5;245m-[39m[38;5;2mr[39m[38;5;245m-[39m[38;5;245m-[39m[38;5;2mr[39m[38;5;245m-[39m[38;5;245m-[39m[38;5;6m[39m [38;5;2msap[39m [38;5;1msap[39m [38;5;178m90[39m [38;5;178mMB[39m [38;5;40mTue Nov 12 01:41:23 2024[39m  knn.pkl
.[38;5;2mr[39m[38;5;3mw[39m[38;5;245m-[39m[38;5;2mr[39m[38;5;245m-

# FAISS
[Вики faiss](https://github.com/facebookresearch/faiss/wiki)

In [21]:
import faiss

In [22]:
index = faiss.index_factory(128, "IVF256,PQ32", faiss.METRIC_INNER_PRODUCT)
index.train(X_dense)
index.add(X_dense)

[Индексы в faiss](https://github.com/facebookresearch/faiss/wiki/Faiss-indexes)

In [27]:
index.nprobe = 128

In [28]:
m_ap = []
for js in tqdm(valid_data):
    row_sparse = make_coo_row(js["transaction_history"], product_encoder)
    row_dense = svd.transform(row_sparse)
    knn_result = index.search(row_dense, num_neighbours)
    neighbors = knn_result[1]
    scores = np.asarray(X_stored[neighbors[0]].sum(axis=0)[0]).flatten()
    top_indices = np.argsort(-scores)
    recommended_items = product_encoder.toPid(top_indices[:30])
    gt_items = js["target"][0]["product_ids"]
    m_ap.append(normalized_average_precision(gt_items, recommended_items, k=30))
print(np.mean(m_ap))

  0%|          | 0/3000 [00:00<?, ?it/s]

0.08143530234836167


In [None]:
# ???

In [24]:
faiss.write_index(index, '../tmp/u2u/faiss.idx')

In [None]:
! ls -lah ../tmp/u2u