In [2]:
import os

In [3]:
os.environ["OPENBLAS_NUM_THREADS"] = "1"  # For implicit ALS

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
import pandas as pd
import numpy as np

from implicit.als import AlternatingLeastSquares

from rectools.metrics import Precision, Recall, MAP, calc_metrics
from rectools.models import PopularModel, RandomModel, ImplicitALSWrapperModel
from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel

# import matplotlib.pyplot as plt
# import seaborn as sns

# import matplotlib.pyplot as plt
from pathlib import Path
import typing as tp
from tqdm import tqdm

from lightfm import LightFM

from implicit.bpr import BayesianPersonalizedRanking

from implicit.lmf import LogisticMatrixFactorization

In [6]:
DATA_PATH = Path("../data/raw/kion_train/")

# LOAD DATA 

In [7]:
%%time
users = pd.read_csv(DATA_PATH / 'users.csv')
items = pd.read_csv(DATA_PATH / 'items.csv')
interactions = pd.read_csv(DATA_PATH / 'interactions.csv')

CPU times: user 2.23 s, sys: 896 ms, total: 3.13 s
Wall time: 5.99 s


# Preprocess

In [21]:
interactions.drop(interactions[interactions[Columns.Datetime].str.len() != 10].index, inplace=True)

### Add new users

In [8]:
Columns.Datetime = 'last_watch_dt'

Добавим трех пользователей, для проверки рекомендаций

In [9]:
np.random.seed(42)

In [10]:
interactions[interactions[Columns.User] == 400000]

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct


In [11]:
def random_dates(start, end, n=10):

    ndays = (end - start).days + 1

    return pd.to_timedelta(np.random.randint(0, ndays, n), unit='D') + start

1й пользователь, который смотрит только новые фильмы, с 2015 года

In [12]:
item_year = items.query("release_year > 2015")[Columns.Item].unique()
new_users_last_year = pd.DataFrame()
new_users_last_year['item_id'] = item_year[np.random.randint(len(item_year), size=10)]
new_users_last_year[Columns.User] = 100000
new_users_last_year[Columns.Datetime] = random_dates(pd.to_datetime(interactions[Columns.Datetime].min()), 
                                           pd.to_datetime((interactions[Columns.Datetime].max())))
new_users_last_year['total_dur'] = np.random.randint(interactions['total_dur'].quantile(0.1), interactions['total_dur'].quantile(0.9), size=10)
new_users_last_year['watched_pct'] = np.random.randint(100, size=10)

2й пользователь, который смотрит только фэнтези.

In [13]:
item_fi = items[Columns.Item][items['genres'].apply(lambda x: 'фэнтези' in x)].unique()
new_users_fi = pd.DataFrame()
new_users_fi['item_id'] = item_fi[np.random.randint(len(item_fi), size=10)]
new_users_fi[Columns.User] = 200000
new_users_fi[Columns.Datetime] = random_dates(pd.to_datetime(interactions[Columns.Datetime].min()), 
                                           pd.to_datetime((interactions[Columns.Datetime].max())))
new_users_fi['total_dur'] = np.random.randint(interactions['total_dur'].quantile(0.1), interactions['total_dur'].quantile(0.9), size=10)
new_users_fi['watched_pct'] = np.random.randint(100, size=10)

И 3й пользователь, который фанат фильмов США.

In [14]:
item_country = items.query("countries == 'США'")[Columns.Item].unique()
new_users_country = pd.DataFrame()
new_users_country['item_id'] = item_country[np.random.randint(len(item_country), size=10)]
new_users_country[Columns.User] = 400000
new_users_country[Columns.Datetime] = random_dates(pd.to_datetime(interactions[Columns.Datetime].min()), 
                                           pd.to_datetime((interactions[Columns.Datetime].max())))
new_users_country['total_dur'] = np.random.randint(interactions['total_dur'].quantile(0.1), interactions['total_dur'].quantile(0.9), size=10)
new_users_country['watched_pct'] = np.random.randint(100, size=10)

In [22]:
interactions = pd.concat([interactions, new_users_last_year, new_users_fi, new_users_country])

In [25]:
interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime], format='%Y-%m-%d')

In [27]:
max_date = interactions[Columns.Datetime].max()

In [28]:
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

In [29]:
train = interactions[interactions[Columns.Datetime] < max_date - pd.Timedelta(days=7)].copy()
test = interactions[interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=7)].copy()

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (4985298, 6)
test: (490983, 6)


In [30]:
train.drop(train.query("total_dur < 300").index, inplace=True)

In [31]:
# отфильтруем холодных пользователей из теста
cold_users = set(test[Columns.User]) - set(train[Columns.User])

In [32]:
test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)

In [37]:
train.query("user_id == 400000")

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct,weight
0,400000,7330,2021-04-25,4929,44.0,3
3,400000,11209,2021-06-15,4901,70.0,3
4,400000,363,2021-04-29,6373,8.0,1
5,400000,8780,2021-03-27,8613,87.0,3
6,400000,4423,2021-04-21,8726,0.0,1
7,400000,3155,2021-06-02,7250,7.0,1
8,400000,8504,2021-07-01,5318,87.0,3
9,400000,1761,2021-05-04,2104,62.0,3


# Prepare features

## User features

In [16]:
users.isnull().sum()

user_id         0
age         14095
income      14776
sex         13831
kids_flg        0
dtype: int64

In [17]:
users.fillna('Unknown', inplace=True)

In [18]:
users.nunique()

user_id     840197
age              7
income           7
sex              3
kids_flg         2
dtype: int64

In [19]:
users = users.loc[users[Columns.User].isin(train[Columns.User])].copy()

In [20]:
users

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
3,721985,age_45_54,income_20_40,Ж,0
4,704055,age_35_44,income_60_90,Ж,0
5,1037719,age_45_54,income_60_90,М,0
...,...,...,...,...,...
840184,529394,age_25_34,income_40_60,Ж,0
840186,80113,age_25_34,income_40_60,Ж,0
840188,312839,age_65_inf,income_60_90,Ж,0
840189,191349,age_45_54,income_40_60,М,1


In [21]:
user_features_frames = []
for feature in ["sex", "age", "income"]:
    feature_frame = users.reindex(columns=[Columns.User, feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)
user_features.head()

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
5,1037719,М,sex


In [22]:
user_features.query(f"id == 973171")

Unnamed: 0,id,value,feature
0,973171,М,sex
0,973171,age_25_34,age
0,973171,income_60_90,income


# Item features

In [23]:
items.isnull().sum()

item_id             0
content_type        0
title               0
title_orig       4745
release_year       98
genres              0
countries          37
for_kids        15397
age_rating          2
studios         14898
directors        1509
actors           2619
description         2
keywords          423
dtype: int64

In [24]:
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()

In [25]:
items.head()

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ..."
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео..."
2,10716,film,Тактическая сила,Tactical Force,2011.0,"криминал, зарубежные, триллеры, боевики, комедии",Канада,,16.0,,Адам П. Калтраро,"Адриан Холмс, Даррен Шалави, Джерри Вассерман,...",Профессиональный рестлер Стив Остин («Все или ...,"Тактическая, сила, 2011, Канада, бандиты, ганг..."
3,7868,film,45 лет,45 Years,2015.0,"драмы, зарубежные, мелодрамы",Великобритания,,16.0,,Эндрю Хэй,"Александра Риддлстон-Барретт, Джеральдин Джейм...","Шарлотта Рэмплинг, Том Кортни, Джеральдин Джей...","45, лет, 2015, Великобритания, брак, жизнь, лю..."
4,16268,film,Все решает мгновение,,1978.0,"драмы, спорт, советские, мелодрамы",СССР,,12.0,Ленфильм,Виктор Садовский,"Александр Абдулов, Александр Демьяненко, Алекс...",Расчетливая чаровница из советского кинохита «...,"Все, решает, мгновение, 1978, СССР, сильные, ж..."


In [26]:
items.nunique()

item_id         14019
content_type        2
title           13454
title_orig       9724
release_year      104
genres           2559
countries         666
for_kids            2
age_rating          6
studios            38
directors        7414
actors          11830
description     13791
keywords        13583
dtype: int64

### Genre

In [27]:
# Explode genres to flatten table
items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"
genre_feature.head()

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


### Content

In [28]:
content_feature = items.reindex(columns=[Columns.Item, "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"

In [29]:
content_feature

Unnamed: 0,id,value,feature
0,10711,film,content_type
1,2508,film,content_type
2,10716,film,content_type
3,7868,film,content_type
4,16268,film,content_type
...,...,...,...
15958,6443,series,content_type
15959,2367,series,content_type
15960,10632,series,content_type
15961,4538,series,content_type


In [30]:
item_features = pd.concat((genre_feature, content_feature))

In [31]:
item_features

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre
...,...,...,...
15958,6443,series,content_type
15959,2367,series,content_type
15960,10632,series,content_type
15961,4538,series,content_type


# Metrics

In [32]:
metrics_name = {
    'Precision': Precision,
    'Recall': Recall,
    'MAP': MAP,
}

metrics = {}
for metric_name, metric in metrics_name.items():
    for k in range(1, 11):
        metrics[f'{metric_name}@{k}'] = metric(k=k)

In [33]:
metrics

{'Precision@1': Precision(k=1),
 'Precision@2': Precision(k=2),
 'Precision@3': Precision(k=3),
 'Precision@4': Precision(k=4),
 'Precision@5': Precision(k=5),
 'Precision@6': Precision(k=6),
 'Precision@7': Precision(k=7),
 'Precision@8': Precision(k=8),
 'Precision@9': Precision(k=9),
 'Precision@10': Precision(k=10),
 'Recall@1': Recall(k=1),
 'Recall@2': Recall(k=2),
 'Recall@3': Recall(k=3),
 'Recall@4': Recall(k=4),
 'Recall@5': Recall(k=5),
 'Recall@6': Recall(k=6),
 'Recall@7': Recall(k=7),
 'Recall@8': Recall(k=8),
 'Recall@9': Recall(k=9),
 'Recall@10': Recall(k=10),
 'MAP@1': MAP(k=1, divide_by_k=False),
 'MAP@2': MAP(k=2, divide_by_k=False),
 'MAP@3': MAP(k=3, divide_by_k=False),
 'MAP@4': MAP(k=4, divide_by_k=False),
 'MAP@5': MAP(k=5, divide_by_k=False),
 'MAP@6': MAP(k=6, divide_by_k=False),
 'MAP@7': MAP(k=7, divide_by_k=False),
 'MAP@8': MAP(k=8, divide_by_k=False),
 'MAP@9': MAP(k=9, divide_by_k=False),
 'MAP@10': MAP(k=10, divide_by_k=False)}

# Models

In [34]:
K_RECOS = 10
RANDOM_STATE = 42
NUM_THREADS = 16
N_FACTORS = (32,)
N_EPOCHS = 1 # Lightfm
USER_ALPHA = 0 # Lightfm
ITEM_ALPHA = 0 # Lightfm
LEARNING_RATE = 0.05 # Lightfm

In [35]:
models = {
    'popular': PopularModel(),
}

In [36]:
implicit_models = {
    'ALS': AlternatingLeastSquares,
}
for implicit_name, implicit_model in implicit_models.items():
    for is_fitting_features in (True, False):
        for n_factors in N_FACTORS:
            models[f"{implicit_name}_{n_factors}_{is_fitting_features}"] = (
                ImplicitALSWrapperModel(
                    model=implicit_model(
                        factors=n_factors, 
                        random_state=RANDOM_STATE, 
                        num_threads=NUM_THREADS,
                    ),
                    fit_features_together=is_fitting_features,
                )
            )

In [37]:
lightfm_losses = ('logistic', 'bpr', 'warp')

for loss in lightfm_losses:
    for n_factors in N_FACTORS:
        models[f"LightFM_{loss}_{n_factors}"] = LightFMWrapperModel(
            LightFM(
                no_components=n_factors, 
                loss=loss, 
                random_state=RANDOM_STATE,
                learning_rate=LEARNING_RATE,
                user_alpha=USER_ALPHA,
                item_alpha=ITEM_ALPHA,
            ),
            epochs=N_EPOCHS,
            num_threads=NUM_THREADS,
        )

In [38]:
models

{'popular': <rectools.models.popular.PopularModel at 0x7fee5204fed0>,
 'ALS_32_True': <rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7fee5204ca50>,
 'ALS_32_False': <rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7fee5204c7d0>,
 'LightFM_logistic_32': <rectools.models.lightfm.LightFMWrapperModel at 0x7fee5204cd90>,
 'LightFM_bpr_32': <rectools.models.lightfm.LightFMWrapperModel at 0x7fee520d0450>,
 'LightFM_warp_32': <rectools.models.lightfm.LightFMWrapperModel at 0x7fee5204f150>}

In [39]:
%%time
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

CPU times: user 1.65 s, sys: 151 ms, total: 1.8 s
Wall time: 1.8 s


In [40]:
TEST_USERS = test[Columns.User].unique()

In [41]:
%%time
results = []
for model_name, model in models.items():
    print(f"Fitting model {model_name}...")
    model_quality = {'model': model_name}

    model.fit(dataset)
    recos = model.recommend(
        users=TEST_USERS,
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    metric_values = calc_metrics(metrics, recos, test, train)
    model_quality.update(metric_values)
    results.append(model_quality)

Fitting model popular...
Fitting model ALS_32_True...
Fitting model ALS_32_False...
Fitting model LightFM_logistic_32...
Fitting model LightFM_bpr_32...
Fitting model LightFM_warp_32...
CPU times: user 8min 42s, sys: 5.81 s, total: 8min 48s
Wall time: 8min 49s


In [42]:
df_quality = pd.DataFrame(results).T

df_quality.columns = df_quality.iloc[0]

df_quality.drop('model', inplace=True)

In [43]:
df_quality.style.highlight_max(color='lightgreen', axis=1)

model,popular,ALS_32_True,ALS_32_False,LightFM_logistic_32,LightFM_bpr_32,LightFM_warp_32
Precision@1,0.069368,0.081353,0.062004,0.000363,0.034697,0.077308
Recall@1,0.035863,0.041987,0.030862,0.000116,0.019555,0.039141
Precision@2,0.063681,0.069063,0.054688,0.000469,0.02539,0.068408
Recall@2,0.064597,0.069557,0.053593,0.000301,0.028291,0.067837
Precision@3,0.059233,0.059227,0.050804,0.000313,0.020193,0.061027
Recall@3,0.08808,0.087536,0.073736,0.000301,0.032985,0.089641
Precision@4,0.057348,0.052577,0.047118,0.000241,0.016964,0.055219
Recall@4,0.112881,0.102227,0.090174,0.000306,0.036404,0.106709
Precision@5,0.051035,0.046967,0.04348,0.0002,0.014681,0.05038
Recall@5,0.124184,0.112856,0.102644,0.000319,0.039087,0.120754


# Approximate Nearest Neighbors 

In [44]:
import nmslib

Your CPU supports instructions that this binary was not compiled to use: AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib


# HNSW algorithm parameters

## Search parameters:
* ```ef``` - the size of the dynamic list for the nearest neighbors (used during the search). Higher ```ef```
leads to more accurate but slower search. ```ef``` cannot be set lower than the number of queried nearest neighbors
```k```. The value ```ef``` of can be anything between ```k``` and the size of the dataset.
* ```k``` number of nearest neighbors to be returned as the result.
The ```knn_query``` function returns two numpy arrays, containing labels and distances to the k found nearest 
elements for the queries. Note that in case the algorithm is not be able to find ```k``` neighbors to all of the queries,
(this can be due to problems with graph or ```k```>size of the dataset) an exception is thrown.


## Construction parameters:
* ```M``` - the number of bi-directional links created for every new element during construction. Reasonable range for ```M``` 
is 2-100. Higher ```M``` work better on datasets with high intrinsic dimensionality and/or high recall, while low ```M``` work 
better for datasets with low intrinsic dimensionality and/or low recalls. The parameter also determines the algorithm's memory 
consumption, which is roughly ```M * 8-10``` bytes per stored element.  
As an example for ```dim```=4 random vectors optimal ```M``` for search is somewhere around 6, while for high dimensional datasets 
(word embeddings, good face descriptors), higher ```M``` are required (e.g. ```M```=48-64) for optimal performance at high recall. 
The range ```M```=12-48 is ok for the most of the use cases. When ```M``` is changed one has to update the other parameters. 
Nonetheless, ef and ef_construction parameters can be roughly estimated by assuming that ```M```*```ef_{construction}``` is 
a constant.

* ```ef_construction``` - the parameter has the same meaning as ```ef```, but controls the index_time/index_accuracy. Bigger 
ef_construction leads to longer construction, but better index quality. At some point, increasing ef_construction does
not improve the quality of the index. One way to check if the selection of ef_construction was ok is to measure a recall 
for M nearest neighbor search when ```ef``` =```ef_construction```: if the recall is lower than 0.9, than there is room 
for improvement.
* ```num_elements``` - defines the maximum number of elements in the index. The index can be extened by saving/loading(load_index
function has a parameter which defines the new maximum number of elements).

Еще источники: 
- [Nmslib Docs](https://github.com/nmslib/nmslib/blob/master/manual/methods.md)
- [Pinecone Vector Indexes](https://www.pinecone.io/learn/vector-indexes/)

<img src="https://d33wubrfki0l68.cloudfront.net/4c635fabb268a4af60109a506300a2dfda612063/d2535/images/similarity-search-indexes17.jpg">

<img src="https://d33wubrfki0l68.cloudfront.net/96d80cd46c2d12df99c044c860a8a5fb00cf6376/d59ca/images/similarity-search-indexes18.jpg">

In [45]:
import time

In [46]:
user_embeddings, item_embeddings = model.get_vectors(dataset)

In [47]:
user_embeddings.shape, item_embeddings.shape

((756562, 34), (14019, 34))

In [48]:
def augment_inner_product(factors):
    normed_factors = np.linalg.norm(factors, axis=1)
    max_norm = normed_factors.max()
    
    extra_dim = np.sqrt(max_norm ** 2 - normed_factors ** 2).reshape(-1, 1)
    augmented_factors = np.append(factors, extra_dim, axis=1)
    return max_norm, augmented_factors

In [49]:
print('pre shape: ', item_embeddings.shape)
max_norm, augmented_item_embeddings = augment_inner_product(item_embeddings)
augmented_item_embeddings.shape

pre shape:  (14019, 34)


(14019, 35)

In [50]:
extra_zero = np.zeros((user_embeddings.shape[0], 1))
augmented_user_embeddings = np.append(user_embeddings, extra_zero, axis=1)
augmented_user_embeddings.shape

(756562, 35)

In [51]:
user_id = 30

In [52]:
user_embeddings[user_id]

array([-2.05896729e+02,  1.00000000e+00, -4.52711433e-02,  3.91230196e-01,
        4.04265732e-01, -4.95160520e-01,  2.22277611e-01, -4.93303478e-01,
       -3.73352468e-02,  1.99019402e-01,  1.68944001e-01,  1.06457457e-01,
       -5.36034182e-02, -2.60553062e-01,  4.19317544e-01, -1.66068971e-03,
       -2.99279541e-01, -3.29282790e-01, -3.25635701e-01, -2.32122898e-01,
        1.45212173e-01,  5.44523180e-01, -2.62969434e-01, -1.23370670e-01,
       -5.34298480e-01, -1.21053249e-01,  1.40258759e-01, -1.00327849e-01,
       -1.89747423e-01,  5.76154925e-02,  1.94223195e-01,  2.62569904e-01,
       -2.33748466e-01, -1.12916067e-01])

In [53]:
augmented_user_embeddings[user_id]

array([-2.05896729e+02,  1.00000000e+00, -4.52711433e-02,  3.91230196e-01,
        4.04265732e-01, -4.95160520e-01,  2.22277611e-01, -4.93303478e-01,
       -3.73352468e-02,  1.99019402e-01,  1.68944001e-01,  1.06457457e-01,
       -5.36034182e-02, -2.60553062e-01,  4.19317544e-01, -1.66068971e-03,
       -2.99279541e-01, -3.29282790e-01, -3.25635701e-01, -2.32122898e-01,
        1.45212173e-01,  5.44523180e-01, -2.62969434e-01, -1.23370670e-01,
       -5.34298480e-01, -1.21053249e-01,  1.40258759e-01, -1.00327849e-01,
       -1.89747423e-01,  5.76154925e-02,  1.94223195e-01,  2.62569904e-01,
       -2.33748466e-01, -1.12916067e-01,  0.00000000e+00])

In [54]:
item_id = 0

In [55]:
item_embeddings[item_id]

array([ 1.        , -1.48405325,  0.23708403, -0.11246382,  0.14794403,
       -0.34387285,  0.72653121,  0.81500947, -0.13408449, -1.05674577,
        0.21708079, -0.02391037, -0.27358592,  0.15148595, -0.00224125,
        0.63105565,  0.92966682,  0.19560455,  0.00708316, -1.04372263,
        0.85816073, -0.97662985, -0.27525479,  0.30821079, -0.51743782,
       -1.08208537, -0.57330936,  0.21116838, -0.12294419,  0.42426503,
        0.70090687, -1.09553933, -0.80954003, -0.09136926])

In [56]:
augmented_item_embeddings[item_id]

array([ 1.00000000e+00, -1.48405325e+00,  2.37084031e-01, -1.12463817e-01,
        1.47944033e-01, -3.43872845e-01,  7.26531208e-01,  8.15009475e-01,
       -1.34084493e-01, -1.05674577e+00,  2.17080787e-01, -2.39103660e-02,
       -2.73585916e-01,  1.51485950e-01, -2.24125385e-03,  6.31055653e-01,
        9.29666817e-01,  1.95604548e-01,  7.08316267e-03, -1.04372263e+00,
        8.58160734e-01, -9.76629853e-01, -2.75254786e-01,  3.08210790e-01,
       -5.17437816e-01, -1.08208537e+00, -5.73309362e-01,  2.11168379e-01,
       -1.22944191e-01,  4.24265027e-01,  7.00906873e-01, -1.09553933e+00,
       -8.09540033e-01, -9.13692564e-02,  4.74953720e+00])

In [57]:
# Set index parameters
# These are the most important ones
M = 48
efC = 100

num_threads = 4
index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post' : 0}
print('Index-time parameters', index_time_params)

Index-time parameters {'M': 48, 'indexThreadQty': 4, 'efConstruction': 100, 'post': 0}


In [58]:
# Number of neighbors 
K=10

In [59]:
# Space name should correspond to the space name 
# used for brute-force search
space_name='negdotprod'

In [60]:
# Intitialize the library, specify the space, the type of the vector and add data points 
index = nmslib.init(method='hnsw', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR) 
index.addDataPointBatch(augmented_item_embeddings) 

14019

In [61]:
index

<nmslib.FloatIndex method='hnsw' space='negdotprod' at 0x600001c89800>

In [62]:
# Create an index
start = time.time()
index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC}
index.createIndex(index_time_params) 
end = time.time() 
print('Index-time parameters', index_time_params)
print('Indexing time = %f' % (end-start))

Index-time parameters {'M': 48, 'indexThreadQty': 4, 'efConstruction': 100}
Indexing time = 0.202522


In [63]:
# Setting query-time parameters
efS = 100
query_time_params = {'efSearch': efS}
print('Setting query-time parameters', query_time_params)
index.setQueryTimeParams(query_time_params)

Setting query-time parameters {'efSearch': 100}


In [64]:
query_matrix = augmented_user_embeddings[:1000, :]

In [65]:
# Querying
query_qty = query_matrix.shape[0]
start = time.time() 
nbrs = index.knnQueryBatch(query_matrix, k = K, num_threads = num_threads)
end = time.time() 
print('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' % 
      (end-start, float(end-start)/query_qty, num_threads*float(end-start)/query_qty)) 

kNN time total=0.008626 (sec), per query=0.000009 (sec), per query adjusted for thread number=0.000035 (sec)


In [66]:
nbrs[0]

(array([ 8867, 12982,  3527, 11758,  8287,  3174,  2256,  4153, 10348,
          118], dtype=int32),
 array([184.05742, 184.178  , 184.39885, 184.52432, 184.53716, 184.6088 ,
        185.01271, 185.10115, 185.10686, 185.11636], dtype=float32))

In [67]:
nbrs[0][1]

array([184.05742, 184.178  , 184.39885, 184.52432, 184.53716, 184.6088 ,
       185.01271, 185.10115, 185.10686, 185.11636], dtype=float32)

In [68]:
def recommend_all(query_factors, index_factors, topn=10):
    output = query_factors.dot(index_factors.T)
    argpartition_indices = np.argpartition(output, -topn)[:, -topn:]

    x_indices = np.repeat(np.arange(output.shape[0]), topn)
    y_indices = argpartition_indices.flatten()
    top_value = output[x_indices, y_indices].reshape(output.shape[0], topn)
    top_indices = np.argsort(top_value)[:, ::-1]

    y_indices = top_indices.flatten()
    top_indices = argpartition_indices[x_indices, y_indices]
    labels = top_indices.reshape(-1, topn)
    distances = output[x_indices, top_indices].reshape(-1, topn)
    return labels, distances

In [69]:
recommend_all(user_embeddings[[0], :], item_embeddings)

(array([[ 8867, 12982,  3527, 11758,  8287,  3174,  2256,  4153, 10348,
           118]]),
 array([[-184.05740336, -184.17797718, -184.39885122, -184.52432216,
         -184.53716474, -184.60879525, -185.01268947, -185.10115671,
         -185.10684388, -185.11636774]]))

In [70]:
item_embeddings[:1000, :].shape, user_embeddings.shape

((1000, 34), (756562, 34))

In [71]:
query_matrix_not_augmented = user_embeddings[:1000, :]

In [72]:
%%timeit
recommend_all(query_matrix_not_augmented, item_embeddings)

206 ms ± 10.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [73]:
%%timeit
index.knnQueryBatch(query_matrix, k = K, num_threads = num_threads)

7.29 ms ± 103 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [74]:
import hnswlib

In [75]:
%%time
max_elements, dim = augmented_item_embeddings.shape
hnsw = hnswlib.Index("ip", dim) # possible options for space are l2, cosine or ip

# Initing index - the maximum number of elements should be known beforehand
hnsw.init_index(max_elements, M, efC)

# Element insertion (can be called several times)
hnsw.add_items(augmented_item_embeddings)

CPU times: user 952 ms, sys: 47.7 ms, total: 1e+03 ms
Wall time: 94.4 ms


In [76]:
# Controlling the recall by setting ef, should always be > k
hnsw.set_ef(efS)

In [77]:
label, distance = hnsw.knn_query(query_matrix, k=k)

In [78]:
label

array([[ 8867, 12982,  3527, ...,  4153, 10348,   118],
       [ 8867, 12982,  3527, ...,  4153,   118, 12243],
       [ 8287, 11758, 13721, ...,  3174,  3809, 10330],
       ...,
       [ 8287, 11758,  8867, ..., 12139, 12982,  6492],
       [ 8867, 11758,  8287, ..., 12512,  4153,  6667],
       [12982,  8867,  3527, ..., 13772,  8513,  4153]], dtype=uint64)

In [79]:
1 - distance

array([[-184.0574 , -184.17798, -184.39886, ..., -185.10114, -185.10686,
        -185.11635],
       [-220.61137, -220.6664 , -220.91142, ..., -221.55711, -221.60637,
        -221.63803],
       [-198.92917, -198.95955, -199.00185, ..., -199.3266 , -199.36307,
        -199.36728],
       ...,
       [-200.17957, -200.48221, -200.48912, ..., -201.06464, -201.19824,
        -201.22412],
       [-219.26947, -219.31493, -219.50241, ..., -220.13098, -220.14204,
        -220.14986],
       [-191.45169, -191.6257 , -191.84825, ..., -192.61856, -192.62114,
        -192.67528]], dtype=float32)

In [80]:
item_embeddings[8867].dot(user_embeddings[0])

-184.05740336207296

In [81]:
labels, distances = recommend_all(user_embeddings[:1000, :], item_embeddings)
print(labels)
print(distances)

[[ 8867 12982  3527 ...  4153 10348   118]
 [ 8867 12982  3527 ...  4153   118 12243]
 [ 8287 11758 13721 ...  3174  3809 10330]
 ...
 [ 8287 11758  8867 ... 12139 12982  6492]
 [ 8867 11758  8287 ... 12512  4153  6667]
 [12982  8867  3527 ... 13772  8513  4153]]
[[-184.05740336 -184.17797718 -184.39885122 ... -185.10115671
  -185.10684388 -185.11636774]
 [-220.61137902 -220.66641107 -220.91140402 ... -221.55709346
  -221.6063776  -221.63805658]
 [-198.92916239 -198.95951994 -199.00185253 ... -199.32658814
  -199.36309606 -199.36727971]
 ...
 [-200.17957133 -200.48221844 -200.48913154 ... -201.06461744
  -201.19825035 -201.22410638]
 [-219.26946139 -219.31492795 -219.50240637 ... -220.13098585
  -220.142046   -220.14982973]
 [-191.45166778 -191.6257109  -191.8482294  ... -192.61855672
  -192.62113107 -192.67528727]]


# Домашнее задание

Домашнее задание состоит из нескольких блоков.


## Эксперименты в ipynb ноутбуках (11 баллов)
- Необходимо будет перебрать $N$ моделей $(N \geq 2)$ матричной факторизации и перебрать у них $K$ гиперпараметров $(K \geq 2)$ **(5 баллов)**
    - Для перебора гиперпараметров можно использовать [`Optuna`](https://github.com/optuna/optuna), [`Hyperopt`](https://github.com/hyperopt/hyperopt)
- Воспользоваться методом приближенного поиска соседей для выдачи рекомендаций. **(2 балла)**
    - Можно использовать любые удобные: [`Annoy`](https://github.com/spotify/annoy), [`nmslib`](https://github.com/nmslib/nmslib) и.т.д
- Добавить 3 "аватаров" (искусственных пользователей) и посмотреть рекомендации итоговой модели на них. Объяснить почему добавили именно таких пользователей. **(2 балла)**
- Придумать как можно обработать рекомендации для холодных пользователей. **(2 балла)**

Примечание: за невоспроизводимый код в ноутбуках (например, нарушен порядок выполнения ячеек, вызываются переменные, которые нигде не были объявлены ранее и.т.п) будут штрафы на усмотрение проверяющего.


## Реализация итоговой модели в сервисе (9 баллов)
- Пробитие бейзлайна $MAP@10 \geq 0.074921$ **(6 баллов)**
- Код сервиса соответствует критериям читаемости и воспроизводимости **(3 балла)**



