In [1]:
import pandas as pd
import numpy as np
import json
from scipy import sparse as sp
from tqdm.autonotebook import tqdm
from collections import defaultdict



In [2]:
import sys
sys.path.append('../src')

from dataset import load_data, split_to_chunks
from train_valid_split import train_valid_split
from utils import get_shard_path, ProductEncoder, make_coo_row
from metrics import normalized_average_precision

## Download and preprocessing data

In [3]:
load_data()

  0%|          | 0.00/625M [00:00<?, ?B/s]

Downloading data


100%|██████████| 625M/625M [00:46<00:00, 13.6MB/s] 
 29%|██▊       | 2/7 [00:00<00:00, 13.11it/s]

Download completed
Extracting


100%|██████████| 7/7 [00:17<00:00,  2.49s/it]


In [4]:
split_to_chunks()

0it [00:00, ?it/s]

split_data_to_chunks: ../data/raw/purchases.csv -> ../data/jsons/


92it [05:06,  3.00s/it]


In [5]:
train_valid_split()

  0%|          | 0/8 [00:00<?, ?it/s]

process shards


100%|██████████| 8/8 [02:03<00:00, 15.34s/it]


## Encoding

In [6]:
product_encoder = ProductEncoder('../data/raw/products.csv')

In [7]:
rows = []
for shard_id in range(1):
    for js in tqdm(json.loads(l) for l in open(get_shard_path(shard_id))):
        rows.append(make_coo_row(js["transaction_history"], product_encoder))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [8]:
X_sparse = sp.vstack(rows)
X_stored = X_sparse.tocsr()

In [9]:
from sklearn.decomposition import TruncatedSVD

In [10]:
svd = TruncatedSVD(n_components=128)
X_dense = svd.fit_transform(X_sparse)

# FAISS
[Вики faiss](https://github.com/facebookresearch/faiss/wiki)

installing: ! conda install -c pytorch faiss-cpu

In [11]:
import faiss

In [12]:
index = faiss.index_factory(128, "IVF256,PQ32", faiss.METRIC_INNER_PRODUCT)
index.train(X_dense)
index.add(X_dense)
index.nprobe = 20

In [13]:
valid_data = [json.loads(l) for l in open(get_shard_path(7))][:3000]
num_neighbours = 256

In [14]:
m_ap = []
for js in tqdm(valid_data):
    row_sparse = make_coo_row(js["transaction_history"], product_encoder)
    row_dense = svd.transform(row_sparse)
    
    faiss_result = index.search(row_dense, num_neighbours)
    neighbors = faiss_result[1]
    
    scores = np.asarray(X_stored[neighbors[0]].sum(axis=0)[0]).flatten()
    top_indices = np.argsort(-scores)
    
    recommended_items = product_encoder.toPid(top_indices[:30])
    
    gt_items = js["target"][0]["product_ids"]
    m_ap.append(normalized_average_precision(gt_items, recommended_items, k=30))
print(np.mean(m_ap))

HBox(children=(IntProgress(value=0, max=3000), HTML(value='')))


0.08550227944783727


In [21]:
faiss.write_index(index, '../temp/faiss.idx')

In [22]:
! ls -lah ../temp/

total 1257504
drwxr-xr-x  4 a18339743  staff   128B Feb 20 11:56 [1m[36m.[m[m
drwxr-xr-x  9 a18339743  staff   288B Feb 20 11:36 [1m[36m..[m[m
-rw-r--r--  1 a18339743  staff   596M Feb 20 11:36 data.zip
-rw-r--r--  1 a18339743  staff   2.0M Feb 20 11:56 faiss.idx


In [24]:
new_index = faiss.read_index('../temp/faiss.idx')
new_index.is_trained

True