In [1]:
import pandas as pd
import numpy as np

from rectools.metrics import MAP, calc_metrics
from rectools.models import PopularModel, RandomModel
from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import LightFMWrapperModel

from pathlib import Path
from tqdm import tqdm

from lightfm import LightFM

  from .autonotebook import tqdm as notebook_tqdm


# Загрузка данных

In [2]:
data_path = Path("../data")

In [3]:
%%time
users = pd.read_csv(data_path / 'users.csv')
items = pd.read_csv(data_path / 'items.csv')
interactions = pd.read_csv(data_path / 'interactions.csv')

CPU times: user 2.36 s, sys: 404 ms, total: 2.76 s
Wall time: 2.76 s


In [4]:
interactions.head()

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0


# Препроцессинг, сплит

In [5]:
# preprocessing
Columns.Datetime = 'last_watch_dt'
interactions.drop(interactions[interactions[Columns.Datetime].str.len() != 10].index, inplace=True)
interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime], format='%Y-%m-%d')
max_date = interactions[Columns.Datetime].max()
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

# train test split
train = interactions[interactions[Columns.Datetime] < max_date - pd.Timedelta(days=7)].copy()
test = interactions[interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=7)].copy()
print(f"train: {train.shape}")
print(f"test: {test.shape}")
train.drop(train.query("total_dur < 300").index, inplace=True)

# cold users filtering
cold_users = set(test[Columns.User]) - set(train[Columns.User])
test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)

train: (4985269, 6)
test: (490982, 6)


# User, item features

In [6]:
# user features
users.isnull().sum()
users.fillna('Unknown', inplace=True)
users.nunique()
users = users.loc[users[Columns.User].isin(train[Columns.User])].copy()
user_features_frames = []
for feature in ["sex", "age", "income"]:
    feature_frame = users.reindex(columns=[Columns.User, feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)

# item features
items.isnull().sum()
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()
items.nunique()

item_id         14019
content_type        2
title           13454
title_orig       9724
release_year      104
genres           2559
countries         666
for_kids            2
age_rating          6
studios            38
directors        7414
actors          11830
description     13791
keywords        13583
dtype: int64

In [7]:
# explode genres to flatten table
items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"
genre_feature["value"].value_counts()
content_feature = items.reindex(columns=[Columns.Item, "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"
item_features = pd.concat((genre_feature, content_feature))

In [8]:
item_features

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre
...,...,...,...
15958,6443,series,content_type
15959,2367,series,content_type
15960,10632,series,content_type
15961,4538,series,content_type


# LightFM params

In [9]:
K_RECOS = 10
RANDOM_STATE = 42
NUM_THREADS = 16
N_FACTORS = 32
N_EPOCHS = 1 # Lightfm
USER_ALPHA = 0 # Lightfm
ITEM_ALPHA = 0 # Lightfm
LEARNING_RATE = 0.05 # Lightfm

# Оценка LightFM через кросс-валидацию

In [10]:
from rectools.model_selection import TimeRangeSplitter, cross_validate

In [11]:
n_splits = 3

splitter = TimeRangeSplitter(
    test_size="14D",
    n_splits=n_splits,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

In [12]:
%%time
dataset = Dataset.construct(
    interactions_df=interactions,  # train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

CPU times: user 925 ms, sys: 171 ms, total: 1.1 s
Wall time: 1.1 s


In [13]:
splitter.get_test_fold_borders(dataset.interactions)

[(Timestamp('2021-07-12 00:00:00', freq='14D'),
  Timestamp('2021-07-26 00:00:00', freq='14D')),
 (Timestamp('2021-07-26 00:00:00', freq='14D'),
  Timestamp('2021-08-09 00:00:00', freq='14D')),
 (Timestamp('2021-08-09 00:00:00', freq='14D'),
  Timestamp('2021-08-23 00:00:00', freq='14D'))]

In [14]:
from rectools.metrics import MeanInvUserFreq, Serendipity

In [20]:
models = {
    "random": RandomModel(random_state=42),
    "popular": PopularModel(),
    "most_raited": PopularModel(popularity="sum_weight"),
    "lightfm": LightFMWrapperModel(
            LightFM(
                no_components=N_FACTORS, 
                loss='warp', 
                random_state=RANDOM_STATE,
                learning_rate=LEARNING_RATE,
                user_alpha=USER_ALPHA,
                item_alpha=ITEM_ALPHA,
            )),
}

metrics_name = {
    'MAP': MAP,
    "novelty": MeanInvUserFreq,
    "serendipity": Serendipity,
}

metrics = {}
k=10
for metric_name, metric in metrics_name.items():
    metrics[f'{metric_name}@{k}'] = metric(k=k)

K_RECS = 10

In [21]:
%%time

cv_results = cross_validate(
    dataset=dataset,
    splitter=splitter,
    models=models,
    metrics=metrics,
    k=K_RECS,
    filter_viewed=True,
)

CPU times: user 6min 36s, sys: 10min 48s, total: 17min 25s
Wall time: 3min 30s


In [22]:
pd.DataFrame(cv_results["splits"])

Unnamed: 0,i_split,start,end,train,train_users,train_items,test,test_users,test_items
0,0,2021-07-12,2021-07-26,3239125,646423,14730,398993,122488,7394
1,1,2021-07-26,2021-08-09,3892558,742256,15085,458757,135624,7711
2,2,2021-08-09,2021-08-23,4649162,850489,15415,521381,151629,7705


In [23]:
pd.DataFrame(cv_results["metrics"])

Unnamed: 0,model,i_split,MAP@10,novelty@10,serendipity@10
0,random,0,0.000183,15.504172,6e-06
1,popular,0,0.113264,3.753619,3e-06
2,most_raited,0,0.11657,3.808252,4e-06
3,lightfm,0,0.113233,4.233231,2.4e-05
4,random,1,0.000149,15.56027,7e-06
5,popular,1,0.096352,3.711794,3e-06
6,most_raited,1,0.097129,3.768894,4e-06
7,lightfm,1,0.096614,4.210381,2.9e-05
8,random,2,0.000163,15.609173,7e-06
9,popular,2,0.086029,3.703142,3e-06


In [24]:
pivot_results = (
    pd.DataFrame(cv_results["metrics"])
    .drop(columns="i_split")
    .groupby(["model"], sort=False)
    .agg(["mean", "std"])
)
mean_metric_subset = [(metric, "mean") for metric in pivot_results.columns.levels[0]]
(
    pivot_results.style
    .highlight_min(subset=mean_metric_subset, color='lightcoral', axis=0)
    .highlight_max(subset=mean_metric_subset, color='lightgreen', axis=0)
)

Unnamed: 0_level_0,MAP@10,MAP@10,novelty@10,novelty@10,serendipity@10,serendipity@10
Unnamed: 0_level_1,mean,std,mean,std,mean,std
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
random,0.000165,1.7e-05,15.557872,0.052541,7e-06,1e-06
popular,0.098549,0.01375,3.722852,0.026994,3e-06,0.0
most_raited,0.099943,0.015413,3.780579,0.024062,4e-06,1e-06
lightfm,0.099325,0.01277,4.21284,0.01928,2.8e-05,4e-06


Видно, что lightfm по map находится на уровне popular и most_rated моделей, но обладает лучшим novelty по сравнению c ними.  
К тому же, обладает лучшим serendipity среди всех моделей, что является следствием хороших map и novelty.

# Обучение

In [26]:
models = {}
lightfm_losses = ('warp',)

for loss in lightfm_losses:
    models[f"LightFM_{loss}_{N_FACTORS}"] = LightFMWrapperModel(
        LightFM(
            no_components=N_FACTORS, 
            loss=loss, 
            random_state=RANDOM_STATE,
            learning_rate=LEARNING_RATE,
            user_alpha=USER_ALPHA,
            item_alpha=ITEM_ALPHA,
        ),
        epochs=N_EPOCHS,
        num_threads=NUM_THREADS,
    )

In [27]:
metrics_name = {
    'MAP': MAP,
}

metrics = {}
for metric_name, metric in metrics_name.items():
    for k in range(1, 11):
        metrics[f'{metric_name}@{k}'] = metric(k=k)

In [28]:
%%time
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

CPU times: user 734 ms, sys: 1.26 ms, total: 736 ms
Wall time: 734 ms


In [29]:
splitter.get_test_fold_borders(dataset.interactions)

[(Timestamp('2021-07-04 00:00:00', freq='14D'),
  Timestamp('2021-07-18 00:00:00', freq='14D')),
 (Timestamp('2021-07-18 00:00:00', freq='14D'),
  Timestamp('2021-08-01 00:00:00', freq='14D')),
 (Timestamp('2021-08-01 00:00:00', freq='14D'),
  Timestamp('2021-08-15 00:00:00', freq='14D'))]

In [30]:
123 in dataset.user_id_map.external_ids

True

In [31]:
TEST_USERS = test[Columns.User].unique()

In [32]:
models

{'LightFM_warp_32': <rectools.models.lightfm.LightFMWrapperModel at 0x7f07a0462b30>}

In [33]:
%%time
results = []
for model_name, model in models.items():
    print(f"Fitting model {model_name}...")
    model_quality = {'model': model_name}

    model.fit(dataset)
    recos = model.recommend(
        users=TEST_USERS,
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    metric_values = calc_metrics(metrics, recos, test, train)
    model_quality.update(metric_values)
    results.append(model_quality)

Fitting model LightFM_warp_32...
CPU times: user 3min 12s, sys: 3min 4s, total: 6min 17s
Wall time: 28.1 s


In [34]:
%%time
model.recommend(
    users=[1],
    dataset=dataset,
    k=10,
    filter_viewed=False,
)

CPU times: user 174 ms, sys: 193 ms, total: 367 ms
Wall time: 266 ms


Unnamed: 0,user_id,item_id,score,rank
0,1,15297,-202.800873,1
1,1,10440,-203.007812,2
2,1,9728,-203.222626,3
3,1,13865,-203.347702,4
4,1,4151,-203.373535,5
5,1,3734,-203.477997,6
6,1,2657,-203.643143,7
7,1,142,-203.849182,8
8,1,4880,-203.876297,9
9,1,14431,-203.985931,10


In [35]:
metric_values = calc_metrics(metrics, recos, test, train)["MAP@10"]
metric_values

0.0757090028380533

# Тюнинг гиперпараметров

In [39]:
import optuna

In [36]:
import logging
logging.basicConfig(filename="optuna.log",
                    filemode='a',
                    format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
                    datefmt='%H:%M:%S',
                    level=logging.DEBUG)

logging.info("Начало тюнинга")

In [37]:
def objective(trial):
    param = {
        "no_components": trial.suggest_int("no_components", 4, 64),
        "loss": trial.suggest_categorical("loss", ['logistic', 'bpr', 'warp']),
        "random_state": RANDOM_STATE,
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "user_alpha": trial.suggest_float("user_alpha", 0, 0.3),
        "item_alpha": trial.suggest_float("item_alpha", 0, 0.3),
    }
    model = LightFMWrapperModel(
        LightFM(
            **param,
        ),
        epochs=N_EPOCHS,
        num_threads=NUM_THREADS,
    )
    model.fit(dataset)
    recos = model.recommend(
        users=TEST_USERS,
        dataset=dataset,
        k=10,
        filter_viewed=True,
    )

    metric_value = calc_metrics({"MAP@10": MAP(k=10)}, recos, test, train)["MAP@10"]
    logging.info(f"{metric_value=}: {param=}")
    return metric_value

In [41]:
study = optuna.create_study(direction="maximize")
study.enqueue_trial(
    {'no_components': 61, 'loss': 'warp', 'random_state': 42, 'learning_rate': 0.22958684083647646, 'user_alpha': 0.10227312715865738, 'item_alpha': 0.049429867705118334}
)
study.optimize(objective, n_trials=10, show_progress_bar=True, n_jobs=-1)
logging.info(f"Number of finished trials: {len(study.trials)}")

logging.info("Best trial:")
trial = study.best_trial

logging.info(f"  Value: {trial.value}")

logging.info("  Params: ")
for key, value in trial.params.items():
    logging.info(f"    {key}: {value}")

[I 2023-12-05 23:01:03,987] A new study created in memory with name: no-name-f9b64cf8-36f8-4391-8305-b748027b39e3
Best trial: 5. Best value: 0.0761124:  10%|█         | 1/10 [00:34<05:13, 34.84s/it]

[I 2023-12-05 23:01:38,822] Trial 5 finished with value: 0.07611236791211336 and parameters: {'no_components': 13, 'loss': 'warp', 'learning_rate': 0.020330492377470213, 'user_alpha': 0.18005225425507979, 'item_alpha': 0.20668417362500002}. Best is trial 5 with value: 0.07611236791211336.


Best trial: 5. Best value: 0.0761124:  20%|██        | 2/10 [01:42<07:10, 53.86s/it]

[I 2023-12-05 23:02:45,992] Trial 4 finished with value: 0.06246787480664642 and parameters: {'no_components': 39, 'loss': 'warp', 'learning_rate': 0.22356411590259528, 'user_alpha': 0.059882260728142955, 'item_alpha': 0.03312457124342167}. Best is trial 5 with value: 0.07611236791211336.
[I 2023-12-05 23:02:46,044] Trial 6 finished with value: 0.07201369642366953 and parameters: {'no_components': 30, 'loss': 'warp', 'learning_rate': 0.06966052814958455, 'user_alpha': 0.1847080642288032, 'item_alpha': 0.1680844771220847}. Best is trial 5 with value: 0.07611236791211336.


Best trial: 5. Best value: 0.0761124:  40%|████      | 4/10 [02:24<03:20, 33.35s/it]

[I 2023-12-05 23:03:28,285] Trial 9 finished with value: 0.0 and parameters: {'no_components': 4, 'loss': 'bpr', 'learning_rate': 0.1323815991899607, 'user_alpha': 0.2552118339242624, 'item_alpha': 0.12897975245468948}. Best is trial 5 with value: 0.07611236791211336.


Best trial: 5. Best value: 0.0761124:  50%|█████     | 5/10 [03:10<03:05, 37.18s/it]

[I 2023-12-05 23:04:14,005] Trial 0 finished with value: 0.07420793037467797 and parameters: {'no_components': 61, 'loss': 'warp', 'learning_rate': 0.22958684083647646, 'user_alpha': 0.10227312715865738, 'item_alpha': 0.049429867705118334}. Best is trial 5 with value: 0.07611236791211336.


Best trial: 5. Best value: 0.0761124:  60%|██████    | 6/10 [03:12<01:46, 26.51s/it]

[I 2023-12-05 23:04:16,372] Trial 7 finished with value: 3.2244483924463904e-06 and parameters: {'no_components': 13, 'loss': 'bpr', 'learning_rate': 0.04237010229387936, 'user_alpha': 0.2867793650079866, 'item_alpha': 0.08781058101917623}. Best is trial 5 with value: 0.07611236791211336.


Best trial: 5. Best value: 0.0761124:  70%|███████   | 7/10 [04:22<01:59, 39.85s/it]

[I 2023-12-05 23:05:26,686] Trial 2 finished with value: 2.358733953890865e-06 and parameters: {'no_components': 30, 'loss': 'bpr', 'learning_rate': 0.15728511549727853, 'user_alpha': 0.2878851160861324, 'item_alpha': 0.03225543338260445}. Best is trial 5 with value: 0.07611236791211336.


Best trial: 5. Best value: 0.0761124:  80%|████████  | 8/10 [04:43<01:08, 34.01s/it]

[I 2023-12-05 23:05:47,288] Trial 3 finished with value: 0.0002971182167947741 and parameters: {'no_components': 6, 'loss': 'logistic', 'learning_rate': 0.1843255368705906, 'user_alpha': 0.19607256289163963, 'item_alpha': 0.21290636598997872}. Best is trial 5 with value: 0.07611236791211336.


Best trial: 5. Best value: 0.0761124:  90%|█████████ | 9/10 [05:25<00:36, 36.45s/it]

[I 2023-12-05 23:06:29,351] Trial 1 finished with value: 0.00024653083444832143 and parameters: {'no_components': 56, 'loss': 'logistic', 'learning_rate': 0.1418549745255148, 'user_alpha': 0.264283761569392, 'item_alpha': 0.16759226303765906}. Best is trial 5 with value: 0.07611236791211336.


Best trial: 5. Best value: 0.0761124: 100%|██████████| 10/10 [05:45<00:00, 34.51s/it]

[I 2023-12-05 23:06:49,055] Trial 8 finished with value: 0.00024033026324042972 and parameters: {'no_components': 44, 'loss': 'logistic', 'learning_rate': 0.18616790854856374, 'user_alpha': 5.7608055564639656e-05, 'item_alpha': 0.11703570310123583}. Best is trial 5 with value: 0.07611236791211336.





In [44]:
trial.value

0.07611236791211336

In [43]:
trial.params

{'no_components': 13,
 'loss': 'warp',
 'learning_rate': 0.020330492377470213,
 'user_alpha': 0.18005225425507979,
 'item_alpha': 0.20668417362500002}

In [59]:
params = trial.params
params['random_state'] = RANDOM_STATE

In [64]:
tuned_model = LightFMWrapperModel(
    LightFM(
        **params
    ),
    epochs=N_EPOCHS,
    num_threads=NUM_THREADS,
)

In [65]:
tuned_model.fit(dataset)
recos = tuned_model.recommend(
    users=TEST_USERS,
    dataset=dataset,
    k=K_RECOS,
    filter_viewed=True,
)
metric_values = calc_metrics(metrics, recos, test, train)

In [66]:
metric_values

{'MAP@1': 0.039804840419051836,
 'MAP@2': 0.05423074630512508,
 'MAP@3': 0.06404834838527128,
 'MAP@4': 0.06901814704847893,
 'MAP@5': 0.07158353122010752,
 'MAP@6': 0.07319099257540852,
 'MAP@7': 0.07395958940331449,
 'MAP@8': 0.07459030933867374,
 'MAP@9': 0.07522977829103579,
 'MAP@10': 0.07575308864468179}

In [67]:
pd.DataFrame(metric_values.items(), columns=['Metric', 'Value'])

Unnamed: 0,Metric,Value
0,MAP@1,0.039805
1,MAP@2,0.054231
2,MAP@3,0.064048
3,MAP@4,0.069018
4,MAP@5,0.071584
5,MAP@6,0.073191
6,MAP@7,0.07396
7,MAP@8,0.07459
8,MAP@9,0.07523
9,MAP@10,0.075753


# ANN

## nmslib

In [68]:
import nmslib

In [69]:
model = tuned_model

In [70]:
user_embeddings, item_embeddings = model.get_vectors(dataset)

In [71]:
item_embeddings.shape

(14019, 15)

In [72]:
def augment_inner_product(factors):
    normed_factors = np.linalg.norm(factors, axis=1)
    max_norm = normed_factors.max()
    
    extra_dim = np.sqrt(max_norm ** 2 - normed_factors ** 2).reshape(-1, 1)
    augmented_factors = np.append(factors, extra_dim, axis=1)
    return max_norm, augmented_factors

In [73]:
print('pre shape: ', item_embeddings.shape)
max_norm, augmented_item_embeddings = augment_inner_product(item_embeddings)
augmented_item_embeddings.shape

pre shape:  (14019, 15)


(14019, 16)

In [74]:
extra_zero = np.zeros((user_embeddings.shape[0], 1))
augmented_user_embeddings = np.append(user_embeddings, extra_zero, axis=1)
augmented_user_embeddings.shape

(756562, 16)

In [75]:
user_id = 30

In [76]:
user_embeddings[user_id]

array([-5.63961075e-05,  1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00])

In [77]:
augmented_user_embeddings[user_id]

array([-5.63961075e-05,  1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00])

In [78]:
# Set index parameters
# These are the most important ones
M = 48
efC = 100

num_threads = 4
index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post' : 0}
print('Index-time parameters', index_time_params)

Index-time parameters {'M': 48, 'indexThreadQty': 4, 'efConstruction': 100, 'post': 0}


In [79]:
# Number of neighbors 
K=10
# Space name should correspond to the space name 
# used for brute-force search
space_name='negdotprod'
index = nmslib.init(method='hnsw', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR) 
index.addDataPointBatch(augmented_item_embeddings) 
index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC}
index.createIndex(index_time_params) 
print('Index-time parameters', index_time_params)

Index-time parameters {'M': 48, 'indexThreadQty': 4, 'efConstruction': 100}


In [80]:
# Setting query-time parameters
efS = 100
query_time_params = {'efSearch': efS}
print('Setting query-time parameters', query_time_params)
index.setQueryTimeParams(query_time_params)

Setting query-time parameters {'efSearch': 100}


In [81]:
augmented_user_embeddings.shape

(756562, 16)

In [82]:
query_matrix = augmented_user_embeddings[:1000, :]

In [83]:
query_matrix.shape

(1000, 16)

In [84]:
user_id = 10973
one_user_matrix = augmented_user_embeddings[user_id, :]


In [85]:
%%time
# Querying
nbrs = index.knnQueryBatch([one_user_matrix], k = K, num_threads = num_threads)

CPU times: user 904 µs, sys: 0 ns, total: 904 µs
Wall time: 324 µs


In [86]:
augmented_user_embeddings.shape[1]

16

In [87]:
nbrs

[(array([ 31,  19,  43,  32, 121,  62, 100, 173,  86, 268], dtype=int32),
  array([-4.7851790e-07, -4.2876775e-07, -3.3568671e-07, -3.3513595e-07,
         -3.3299099e-07, -3.0817492e-07, -2.5874357e-07, -2.4364081e-07,
         -2.3202978e-07, -2.2550529e-07], dtype=float32))]

In [88]:
augmented_item_embeddings.shape

(14019, 16)

In [89]:
M

48

In [90]:
augmented_user_embeddings.shape[1]

16

## FAISS

In [91]:
import faiss
import numpy as np

efC = 200  # Size of the priority queue to explore at construction time
efS = 200  # Size of the priority queue to explore at search time
M = 48

num_threads = 4

# Creating a FAISS HNSW index
index = faiss.index_factory(augmented_user_embeddings.shape[1], f"HNSW{M}", faiss.METRIC_L2)

# Adding the item embeddings to the index
index.add(augmented_item_embeddings)

# Setting the parameters efC and efS
index.hnsw.efConstruction = efC
index.hnsw.efSearch = efS

In [92]:
%%time
# Performing the search for the nearest neighbors of the query
D, I = index.search(np.array([one_user_matrix]), K)


CPU times: user 50.4 ms, sys: 0 ns, total: 50.4 ms
Wall time: 16.8 ms


FAISS работает гораздо быстрее. Также есть возможность работы с GPU при необходимости

In [93]:

# Displaying the results
print("Indices of Nearest Neighbors:", I)
print("Distances to Nearest Neighbors:", D)

Indices of Nearest Neighbors: [[1008 1339 1525 2453 2611 3123 3347 3574 3835 3921]]
Distances to Nearest Neighbors: [[1.9999999 1.9999999 1.9999999 1.9999999 1.9999999 1.9999999 1.9999999
  1.9999999 1.9999999 1.9999999]]


In [94]:
model

<rectools.models.lightfm.LightFMWrapperModel at 0x7f075c22fa00>

In [95]:
import dill

path_model = '../models/tuned_lightfm.dill'
with open(path_model, 'wb') as f:
    dill.dump(model, f)

In [96]:
with open(path_model, 'rb') as f:
        model = dill.load(f)

In [97]:
model

<rectools.models.lightfm.LightFMWrapperModel at 0x7f075c276b60>