In [92]:
import os
import pickle
import time
import warnings
from pathlib import Path
from pprint import pprint

import numpy as np
import optuna
import pandas as pd
from tqdm.auto import tqdm

from implicit.als import AlternatingLeastSquares
from implicit.bpr import BayesianPersonalizedRanking
from implicit.lmf import LogisticMatrixFactorization
from lightfm import LightFM
import nmslib
import rii
import anopq

from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import calc_metrics
from rectools.metrics.ranking import MAP
from rectools.models import LightFMWrapperModel

# Определение переменных окружения
os.environ["OPENBLAS_NUM_THREADS"] = "1"


warnings.filterwarnings('ignore')

In [93]:
DATA_PATH = Path("/Users/tanchik/Desktop/Настоящее/учеба/RecSys/RecoServiceTemplate/kion_train")
users = pd.read_csv(DATA_PATH / 'users.csv')
items = pd.read_csv(DATA_PATH / 'items.csv')
interactions = pd.read_csv(DATA_PATH / 'interactions.csv')

### Preprocess

In [94]:
interactions_df = interactions.drop(columns='total_dur')
interactions_df['watched_pct'] = interactions_df['watched_pct'] / 100
interactions_df.rename(columns={'user_id': Columns.User, 'item_id': Columns.Item,
                                'last_watch_dt': Columns.Datetime, 'watched_pct': Columns.Weight}, inplace=True)
interactions_df.dropna(inplace=True)

In [95]:
mask = interactions_df['weight'] < 0.10
indexes_to_drop = interactions_df[mask].index
interactions_df.drop(index=indexes_to_drop, inplace=True)
interactions_df[Columns.Datetime] = pd.to_datetime(interactions_df[Columns.Datetime], format='%Y-%m-%d')

### Подготовка моделей
________________________________________________________________________________________________________

In [96]:
max_date = interactions_df[Columns.Datetime].max()

train = interactions_df[interactions_df[Columns.Datetime] < max_date - pd.Timedelta(days=7)].copy()
test = interactions_df[interactions_df[Columns.Datetime] >= max_date - pd.Timedelta(days=7)].copy()

print(f"train: {train.shape}")
print(f"test: {test.shape}")

cold_users = set(test[Columns.User]) - set(train[Columns.User])
test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)

train: (3404098, 4)
test: (317219, 4)


#### Работа с фичами

In [97]:
users.head(1)

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1


In [98]:
items.head(1)

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ..."


In [99]:
items = items.loc[items[Columns.Item].isin(interactions_df[Columns.Item])].copy()
users = users.loc[users[Columns.User].isin(interactions_df[Columns.User])].copy()

In [100]:
users.drop(columns=['kids_flg', 'income'], inplace=True)
print(users.isnull().mean())
users.fillna('Unknown', inplace=True)
print(users.isnull().mean())

user_id    0.000000
age        0.011622
sex        0.011237
dtype: float64
user_id    0.0
age        0.0
sex        0.0
dtype: float64


In [101]:
items.drop(columns=['keywords', 'description', 'studios', 'age_rating',
                    'for_kids', 'countries', 'release_year', 'title_orig', 'title'], inplace=True)
print(items.isnull().mean())
items.fillna('Unknown', inplace=True)
print(items.isnull().mean())

item_id         0.000000
content_type    0.000000
genres          0.000000
directors       0.062595
actors          0.139557
dtype: float64
item_id         0.0
content_type    0.0
genres          0.0
directors       0.0
actors          0.0
dtype: float64


In [102]:
def get_feature_df(features: tuple, id_label: str, items: pd.DataFrame) -> pd.DataFrame:
    df = pd.DataFrame(columns=['id', 'value', 'feature'])
    for feature in features:
        feature, multi_label_binary_flag = feature
        if multi_label_binary_flag:
            items[feature] = items[feature].str.lower().str.replace(", ", ",", regex=False).str.split(",")
            feature_df = items[[id_label, feature]].explode(feature)
            feature_df.columns = ["id", "value"]
            feature_df["feature"] = feature
        else:
            feature_df = items.reindex(columns=[id_label, feature])
            feature_df.columns = ["id", "value"]
            feature_df["feature"] = feature
        df = pd.concat([df, feature_df])
    return df

создание датасета для items

In [103]:
featurse = (('genres', True), ('content_type', False),
            ('actors', True), ('directors', True))
id_label = 'item_id'
items_features = get_feature_df(featurse, id_label, items)

In [104]:
pd.concat([items_features.head(), items_features.tail()])

Unnamed: 0,id,value,feature
0,10711,драмы,genres
0,10711,зарубежные,genres
0,10711,детективы,genres
0,10711,мелодрамы,genres
1,2508,зарубежные,genres
15960,10632,амир камдин,directors
15960,10632,эрик эгер,directors
15961,4538,марк о’коннор,directors
15961,4538,конор макмахон,directors
15962,3206,михаил миронов,directors


In [105]:
items_features['feature'].unique()

array(['genres', 'content_type', 'actors', 'directors'], dtype=object)

создание матрицы для юезров

In [106]:
featurse = (('age', False), ('sex', False))
id_label = 'user_id'
users_features = get_feature_df(featurse, id_label, users)
pd.concat([users_features.head(), users_features.tail()])

Unnamed: 0,id,value,feature
0,973171,age_25_34,age
1,962099,age_18_24,age
3,721985,age_45_54,age
4,704055,age_35_44,age
5,1037719,age_45_54,age
840186,80113,Ж,sex
840188,312839,Ж,sex
840189,191349,М,sex
840190,393868,М,sex
840195,590706,Ж,sex


In [107]:
users_features['feature'].unique()

array(['age', 'sex'], dtype=object)

### Создание моделей

In [108]:
K_RECOS = 10
RANDOM_STATE = 42
NUM_THREADS = 6
USER_ALPHA = 0  # Lightfm
ITEM_ALPHA = 0  # Lightfm

In [109]:
dataset = Dataset.construct(
    interactions_df=interactions_df,
    user_features_df=users_features,
    cat_user_features=["sex", "age"],
    item_features_df=items_features,
    cat_item_features=['genres', 'content_type', 'actors', 'directors'],
)

In [110]:
with open('dataset.pkl', 'wb') as file:
    pickle.dump(dataset, file)

In [86]:
TEST_USERS = test[Columns.User].unique()

Тюнинг параметров

In [None]:
metrics = {
    'MAP@10': MAP(k=10)
}


def objective(trial):
    no_components = trial.suggest_int('no_components', 32, 50)
    epochs = trial.suggest_int('epochs', 10, 50)
    learning_rate = trial.suggest_float('learning_rate', 0.03, 0.05)
    loss_function = trial.suggest_categorical('loss', ['logistic', 'bpr', 'warp'])

    model = LightFMWrapperModel(
        LightFM(
            no_components=no_components,
            loss=loss_function,
            random_state=RANDOM_STATE,
            learning_rate=learning_rate,
            user_alpha=USER_ALPHA,
            item_alpha=ITEM_ALPHA,
        ),
        epochs=epochs,
        num_threads=NUM_THREADS,
    )
    model.fit(dataset)
    recos = model.recommend(
        users=TEST_USERS,
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    metric_values = calc_metrics(metrics, recos, test, train)

    return metric_values['MAP@10']


study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5, n_jobs=6)

print('Лучшие параметры: ', study.best_params)

### Рекомендации с nmclib

In [111]:
class ANNRecommendation:
    def __init__(self, model, dataset, M=48, efc=100, num_threads=6, K=10, space_name='negdotprod'):
        self.index_time_params = {'M': M, 'indexThreadQty': num_threads,
                                  'efConstruction': efc, 'post': 0}
        self.query_time_params = {'efSearch': efc}
        self.model = model
        self.dataset = dataset
        self.num_threads = num_threads
        self.K = K
        self.space_name = space_name

    def get_vectors(self):
        user_embeddings, item_embeddings = self.model.get_vectors(dataset)
        user_shape, item_shape = user_embeddings.shape, item_embeddings.shape
        print(f'Размер эмбединга для юзеров: {user_shape} \n Размер эмбединга для айтемо: {item_shape}')
        return user_embeddings, item_embeddings

    def augment_inner_product(self, user_embeddings, item_embeddings):
        normed_factors = np.linalg.norm(item_embeddings, axis=1)
        max_norm = normed_factors.max()

        extra_dim = np.sqrt(max_norm ** 2 - normed_factors ** 2).reshape(-1, 1)
        self.augmented_item_embeddings = np.append(item_embeddings, extra_dim, axis=1)

        extra_zero = np.zeros((user_embeddings.shape[0], 1))
        self.augmented_user_embeddings = np.append(user_embeddings, extra_zero, axis=1)

    def create_index(self):
        self.index = nmslib.init(method='hnsw', space=self.space_name, data_type=nmslib.DataType.DENSE_VECTOR)
        self.index.addDataPointBatch(self.augmented_item_embeddings)
        self.index.createIndex(self.index_time_params)

    def create_query_params(self):
        self.index.setQueryTimeParams(self.query_time_params)

    def fit(self):
        user_embeddings, item_embeddings = self.get_vectors()
        self.augment_inner_product(user_embeddings, item_embeddings)
        self.create_index()
        self.create_query_params()

    def get_recommendation(self, users):
        users_intermal_ids = self.dataset.user_id_map.convert_to_internal(users)
        query_matrix = self.augmented_user_embeddings[users_intermal_ids, :]
        query_qty = query_matrix.shape[0]
        start = time.time()
        nbrs = self.index.knnQueryBatch(query_matrix, k=self.K, num_threads=self.num_threads)
        end = time.time()
        print('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' %
              (end-start, float(end-start)/query_qty, self.num_threads*float(end-start)/query_qty))
        results = nbrs[0][0]
        items = self.dataset.item_id_map.convert_to_external(results)
        return items



[I 2023-12-15 18:04:52,459] Trial 3 finished with value: 0.0793295196069672 and parameters: {'no_components': 36, 'epochs': 37, 'learning_rate': 0.049812853066957014, 'loss': 'warp'}. Best is trial 3 with value: 0.0793295196069672.


In [124]:
model = LightFMWrapperModel(
    LightFM(
        no_components=36,
        loss='bpr',
        random_state=RANDOM_STATE,
        learning_rate=0.03,
        user_alpha=0.0008,
        item_alpha=0.0004,
    ),
    epochs=50,
    num_threads=NUM_THREADS,
)

model.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7fa6d5f360d0>

In [None]:
recos = model.recommend(
        users=list(interactions_df['user_id']),
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
metric_values = calc_metrics(metrics, recos, interactions_df, interactions_df)
print(metric_values)

In [113]:
with open('light_fm.pickle', 'wb') as fle:
    pickle.dump(model, fle, protocol=pickle.HIGHEST_PROTOCOL)

In [116]:
ann = ANNRecommendation(model, dataset)
ann.fit()

Размер эмбединга для юзеров: (762727, 44) 
 Размер эмбединга для айтемо: (13851, 44)


In [121]:
ann.get_recommendation([7])

kNN time total=0.000470 (sec), per query=0.000470 (sec), per query adjusted for thread number=0.002821 (sec)


array([10440, 13865, 15297,  9728,  4151,  3734,  7107, 14461,  9996,
        8636])

rii

In [28]:
user_embeddings, item_embeddings = model.get_vectors(dataset)

In [29]:
item_embeddings = item_embeddings.astype(np.float32)
user_embeddings = user_embeddings.astype(np.float32)

In [30]:
item_embeddings.shape

(13711, 38)

In [31]:
e = rii.Rii(fine_quantizer=nanopq.PQ(M=19).fit(vecs=item_embeddings))
e.add_configure(vecs=user_embeddings)

M: 19, Ks: 256, metric : <class 'numpy.uint8'>, code_dtype: l2
iter: 20, seed: 123
Training the subspace: 0 / 19
Training the subspace: 1 / 19
Training the subspace: 2 / 19
Training the subspace: 3 / 19
Training the subspace: 4 / 19
Training the subspace: 5 / 19
Training the subspace: 6 / 19
Training the subspace: 7 / 19
Training the subspace: 8 / 19
Training the subspace: 9 / 19
Training the subspace: 10 / 19
Training the subspace: 11 / 19
Training the subspace: 12 / 19
Training the subspace: 13 / 19
Training the subspace: 14 / 19
Training the subspace: 15 / 19
Training the subspace: 16 / 19
Training the subspace: 17 / 19
Training the subspace: 18 / 19
Encoding the subspace: 0 / 19
SIMD support: avx
Encoding the subspace: 1 / 19
Encoding the subspace: 2 / 19
Encoding the subspace: 3 / 19
Encoding the subspace: 4 / 19
Encoding the subspace: 5 / 19
Encoding the subspace: 6 / 19
Encoding the subspace: 7 / 19
Encoding the subspace: 8 / 19
Encoding the subspace: 9 / 19
Encoding the subspac

<rii.rii.Rii at 0x7fa72d0fafd0>

In [43]:
user_vector = user_embeddings[4]
topk = 10
ids, dists = e.query(q=user_vector, topk=topk)

print("Индексы ближайших элементов:", ids)
print("Расстояния:", dists)

Индексы ближайших элементов: [     4  47735 104593  54227 163929 174633 122817 146169  12746 143843]
Расстояния: [1.5420351  1.5428884  1.54295492 1.55285871 1.55553508 1.55553508
 1.55553508 1.5556016  1.5556016  1.56001651]
