In [167]:
import os
os.environ["OPENBLAS_NUM_THREADS"] = "1"

In [168]:
from implicit.bpr import BayesianPersonalizedRanking
from lightfm import LightFM
from tqdm import tqdm
import typing as tp
from pathlib import Path

from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel
from rectools.dataset import Dataset
from rectools import Columns
from rectools.models import PopularModel, RandomModel, ImplicitALSWrapperModel
from rectools.metrics import Precision, Recall, MAP, calc_metrics
from implicit.als import AlternatingLeastSquares
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from implicit.lmf import LogisticMatrixFactorization

In [169]:
users = pd.read_csv('/Users/tanchik/Desktop/Настоящее/учеба/RecSys/RecoServiceTemplate/kion_train/users.csv')
items = pd.read_csv('/Users/tanchik/Desktop/Настоящее/учеба/RecSys/RecoServiceTemplate/kion_train/items.csv')
interactions = pd.read_csv(
    '/Users/tanchik/Desktop/Настоящее/учеба/RecSys/RecoServiceTemplate/kion_train/interactions.csv')

In [170]:
interactions_df = interactions.drop(columns='total_dur')
interactions_df['watched_pct'] = interactions_df['watched_pct'] / 100
interactions_df.rename(columns={'user_id': Columns.User, 'item_id': Columns.Item,
                                'last_watch_dt': Columns.Datetime, 'watched_pct': Columns.Weight}, inplace=True)

In [171]:
interactions_df.dropna(inplace=True)

In [172]:
interactions_df[Columns.Datetime] = pd.to_datetime(interactions_df[Columns.Datetime], format='%Y-%m-%d')

In [173]:
max_date = interactions_df[Columns.Datetime].max()

In [174]:
train = interactions_df[interactions_df[Columns.Datetime] < max_date - pd.Timedelta(days=7)].copy()
test = interactions_df[interactions_df[Columns.Datetime] >= max_date - pd.Timedelta(days=7)].copy()

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (4984443, 4)
test: (490980, 4)


In [175]:
cold_users = set(test[Columns.User]) - set(train[Columns.User])
test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)

In [176]:
users.fillna('Unknown', inplace=True)
users = users.loc[users[Columns.User].isin(train[Columns.User])].copy()

In [177]:
user_features_frames = []
for feature in ["sex", "age", "income"]:
    feature_frame = users.reindex(columns=[Columns.User, feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)
user_features.head()

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
5,1037719,М,sex


In [178]:
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()

In [179]:
items.fillna('Unknown', inplace=True)

In [180]:
# Explode genres to flatten table
items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"
genre_feature.head()

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


In [181]:
content_feature = items.reindex(columns=[Columns.Item, "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"

In [182]:
item_features = pd.concat([genre_feature, content_feature])

In [183]:
metrics_name = {
    'Precision': Precision,
    'Recall': Recall,
    'MAP': MAP,
}

metrics = {}
for metric_name, metric in metrics_name.items():
    for k in range(1, 11):
        metrics[f'{metric_name}@{k}'] = metric(k=k)

In [184]:
K_RECOS = 10
RANDOM_STATE = 42
NUM_THREADS = 6
N_FACTORS = (32,)
N_EPOCHS = 1  # Lightfm
USER_ALPHA = 0  # Lightfm
ITEM_ALPHA = 0  # Lightfm
LEARNING_RATE = 0.05  # Lightfm

In [185]:
interactions_df.isnull().mean()

user_id     0.0
item_id     0.0
datetime    0.0
weight      0.0
dtype: float64

In [197]:
models = {
    'popular': PopularModel(),
}

In [190]:
lightfm_losses = 'bpr'
models = {}

models[f"LightFM_bpr_32"] = LightFMWrapperModel(
            LightFM(
                no_components=32,
                loss=lightfm_losses,
                random_state=RANDOM_STATE,
                learning_rate=LEARNING_RATE,
                user_alpha=USER_ALPHA,
                item_alpha=ITEM_ALPHA,
            ),
            epochs=N_EPOCHS,
            num_threads=NUM_THREADS,
        )

In [191]:
%%time
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

CPU times: user 1.39 s, sys: 266 ms, total: 1.65 s
Wall time: 1.68 s


In [164]:
TEST_USERS = test[Columns.User].unique()

In [198]:
%%time
results = []
for model_name, model in models.items():
    print(f"Fitting model {model_name}...")
    model_quality = {'model': model_name}

    model.fit(dataset)
    recos = model.recommend(
        users=TEST_USERS,
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    metric_values = calc_metrics(metrics, recos, test, train)
    print(metric_values)
    model_quality.update(metric_values)
    results.append(model_quality)

Fitting model popular...
{'Precision@1': 0.07330855141970494, 'Recall@1': 0.038149593189863926, 'Precision@2': 0.0692635125043562, 'Recall@2': 0.07101151735469721, 'Precision@3': 0.06622523883016092, 'Recall@3': 0.10040097751049191, 'Precision@4': 0.05938324565625052, 'Recall@4': 0.11887933072414097, 'Precision@5': 0.05273569093413432, 'Recall@5': 0.13047422807981374, 'Precision@6': 0.04737743186356671, 'Recall@6': 0.13935091270991617, 'Precision@7': 0.04243468077086291, 'Recall@7': 0.14504771897356164, 'Precision@8': 0.03885000580826101, 'Recall@8': 0.15111143638232485, 'Precision@9': 0.035987247640278726, 'Recall@9': 0.15713799491725927, 'Precision@10': 0.03370865762790621, 'Recall@10': 0.16334924892186042, 'MAP@1': 0.038149593189863926, 'MAP@2': 0.05525989453907726, 'MAP@3': 0.06573912068795446, 'MAP@4': 0.07095273573862776, 'MAP@5': 0.0736904806226815, 'MAP@6': 0.07547305264521456, 'MAP@7': 0.07645764528290082, 'MAP@8': 0.07739640676688721, 'MAP@9': 0.07820339897519692, 'MAP@10': 0

In [200]:
recos['item_id'].unique()

array([10440, 15297,  9728, 13865,  4151,  3734,  2657,   142,  6809,
        9996,  4880,  8636,  4740, 12192, 11237,  1844,  7571, 12995,
        4457, 14431, 14741,  7829,  4495,  7417, 14703,  4436,  7102,
        7107, 16228, 11863,  7793, 12173,   849,  7626, 13018, 16166,
        3784])

In [44]:
class ANNRecommendation:
    def __init__(self, model, dataset, M = 48, efc = 100, num_threads = 6, K = 10):
        self.model = model
        self.dataset = dataset
        self.M = M
        self.efc = efc
        self.K = K
        self.space_name = 'negdotprod'
        self.num_threads = num_threads
        self.user_embeddings = None
        self.item_embeddings = None
        self.augmented_factors = None
        self.augmented_user_embeddings = None
        self.index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efc, 'post': 0}
    
    def get_vectors(self):
        self.user_embeddings, self.item_embeddings = model.get_vectors(dataset)
        user_shape, item_shape = self.user_embeddings.shape, self.item_embeddings.shape
        print(f'Размер эмбединга для юзеров: {user_shape} \n Размер эмбединга для айтемо: {item_shape}')
        
    def augment_inner_product(self):
        normed_factors = np.linalg.norm(self.item_embeddings, axis=1)
        max_norm = normed_factors.max()

        extra_dim = np.sqrt(max_norm ** 2 - normed_factors ** 2).reshape(-1, 1)
        self.augmented_item_embeddings = np.append(self.item_embeddings, extra_dim, axis=1)
        
    def get_augment_user_embeddings(self):
        extra_zero = np.zeros((self.user_embeddings.shape[0], 1))
        self.augmented_user_embeddings = np.append(self.user_embeddings, extra_zero, axis=1)
        
    def create_index(self):
        self.index = nmslib.init(method='hnsw', space=self.space_name, data_type=nmslib.DataType.DENSE_VECTOR)
        self.index.addDataPointBatch(self.augmented_item_embeddings)
        start = time.time()
        self.index.createIndex(self.index_time_params)
        end = time.time()
        print('Index-time parameters', self.index_time_params)
        print('Indexing time = %f' % (end-start))
        
    def create_query_params(self):
        query_time_params = {'efSearch': self.efc}
        print('Setting query-time parameters', query_time_params)
        self.index.setQueryTimeParams(query_time_params)
        
    def fit(self):
        self.get_vectors()
        self.augment_inner_product()
        self.get_augment_user_embeddings()
        self.create_index()
        self.create_query_params()
        
    def save_to_file(self, filename):
        state = {k: v for k, v in self.__dict__.items() if k != 'index'}
        with open(filename, 'wb') as file:
            pickle.dump(state, file)
         
    def get_recommendation(self, users):
        users_intermal_ids = self.dataset.user_id_map.convert_to_internal(users)
        query_matrix = self.augmented_user_embeddings[users_intermal_ids, :]
        query_qty = query_matrix.shape[0]
        start = time.time()
        nbrs = self.index.knnQueryBatch(query_matrix, k=self.K, num_threads=self.num_threads)
        end = time.time()
        print('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' % 
              (end-start, float(end-start)/query_qty, self.num_threads*float(end-start)/query_qty))
        results = nbrs[0][0]
        
        items = dataset.item_id_map.convert_to_external(results)
        return items

In [35]:
ann = ANNRecommendation(model, dataset)
ann.fit()

Размер эмбединга для юзеров: (962151, 52) 
 Размер эмбединга для айтемо: (15605, 52)
Index-time parameters {'M': 48, 'indexThreadQty': 6, 'efConstruction': 100, 'post': 0}
Indexing time = 0.225880
Setting query-time parameters {'efSearch': 100}


In [42]:
with open('imlicit.pkl', 'wb') as file:
    pickle.dump(ann, file)

TypeError: cannot pickle 'nmslib.dist.FloatIndex' object

In [33]:
ann.get_recommendation([1000, 45])

kNN time total=0.005111 (sec), per query=0.002555 (sec), per query adjusted for thread number=0.015333 (sec)


array([10440,  9728, 13865, 15297,  7829, 12995, 12356,  4457,  3734,
        7793])

In [109]:
model.u2i_dist

<Distance.DOT: 1>