In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the genome-scores, genome-tags and tag CSV files of the MovieLens 20M dataset.
# NOTE(review): none of these three frames is referenced again in the visible
# part of the notebook — confirm they are still needed.
genome_scores = pd.read_csv('../input/movielens-20m-dataset/genome_scores.csv')
genome_tags = pd.read_csv('../input/movielens-20m-dataset/genome_tags.csv')
tag = pd.read_csv('../input/movielens-20m-dataset/tag.csv')

In [15]:
# Load the ratings table; `timestamp` is parsed into datetimes because later
# cells compare it against `max_date - pd.Timedelta(...)`.
ratings = pd.read_csv('../input/movielens-20m-dataset/rating.csv', parse_dates=['timestamp'])
ratings.head()

In [14]:
# Load the movie table (later cells read its `movieId`, `title` and `genres` columns).
movies = pd.read_csv('../input/movielens-20m-dataset/movie.csv')
movies.head()

In [78]:
# Keep only movies that have at least one rating.
# `.copy()` makes the result an independent frame, so adding columns to it in
# later cells does not trigger SettingWithCopyWarning.
movies = movies[movies['movieId'].isin(ratings['movieId'].unique())].copy()
# Distinct genre labels; the `genres` column stores them '|'-separated.
genres = list(movies['genres'].str.split('|').explode().unique())
len(genres)

In [16]:
# Extract the release year from the title.
# The year is taken from a 4-digit group in parentheses at the END of the title.
# Slicing from the first '(' to the first ')' returns the wrong text whenever a
# title contains an earlier parenthesised part (e.g. an alternative title).
# Titles without a trailing "(YYYY)" get NaN; the year is kept as a string.
# `assign` returns a new frame instead of writing into a possibly filtered view.
movies = movies.assign(
    year=movies['title'].str.extract(r'\((\d{4})\)\s*$', expand=False)
)

In [49]:
# See how the popularity of the 3 most-rated movies changed over time.
# The ids are taken from the data (most-rated first) instead of being hard-coded,
# and the frame is read from `ratings` — the name `rating` is not defined anywhere
# in the notebook and raised NameError on a fresh kernel.
top_movie_ids = ratings['movieId'].value_counts().index[:3]
popular_movies = ratings[ratings['movieId'].isin(top_movie_ids)]

fig, axes = plt.subplots(len(top_movie_ids), figsize=(10, 12))

# One histogram of rating timestamps per movie, titled with the movie's name.
# `atleast_1d` keeps the loop valid when fewer than two panels are created.
for ax, movie_id in zip(np.atleast_1d(axes), top_movie_ids):
    sns.histplot(
        popular_movies.loc[popular_movies['movieId'] == movie_id, 'timestamp'],
        ax=ax,
    )
    ax.set_title(movies.loc[movies['movieId'] == movie_id, 'title'].iloc[0])

plt.show()

In [50]:
# Work on an independent copy of the ratings frame.
# The source is `ratings`: the name `rating` is never defined in this notebook,
# so the previous form failed with NameError on a fresh kernel.
ratings = ratings.copy()

In [51]:
# Keep only movies that were rated more than 4 times (i.e. at least 5 ratings).
movie_frec = ratings['movieId'].value_counts()
# Look up each row's movie frequency and keep the rows whose movie is frequent enough.
ratings = ratings.loc[ratings['movieId'].map(movie_frec) >= 5]

In [52]:
# Internal index -> raw userId, numbered in order of first appearance in `ratings`.
users_inv_mapping = {
    idx: user for idx, user in enumerate(ratings['userId'].unique())
}
# Raw userId -> internal index (inverse of the dict above).
users_mapping = dict(zip(users_inv_mapping.values(), users_inv_mapping.keys()))

In [53]:
# Internal index -> raw movieId, numbered in ascending movieId order.
items_inv_mapping = {
    idx: item for idx, item in enumerate(np.sort(ratings['movieId'].unique()))
}
# Raw movieId -> internal index (inverse of the dict above).
items_mapping = dict(zip(items_inv_mapping.values(), items_inv_mapping.keys()))
len(items_mapping)

In [54]:
# Report how many distinct users and movies remain after filtering.
n_users = ratings['userId'].nunique()
n_movies = ratings['movieId'].nunique()
print(f"Количество пользователей: {n_users}")
print(f"Количество фильмов: {n_movies}")

In [56]:
# Time span covered by the ratings; `max_date` anchors the train/test split below.
timestamps = ratings['timestamp']
min_date, max_date = timestamps.min(), timestamps.max()

print(f"min дата в ratings: {min_date}")
print(f"max дата в ratings: {max_date}")

In [57]:
# Делим датасет на тренировочную и тестовую часть
train = ratings[(ratings['timestamp'] < max_date - pd.Timedelta(days=7))]
test = ratings[(ratings['timestamp'] >= max_date - pd.Timedelta(days=7))]

In [58]:
# Predict only for "warm" users, i.e. users that also occur in the training part.
warm_users = train['userId'].unique()
test = test.loc[test['userId'].isin(warm_users)]

In [59]:
import scipy.sparse as sp
from itertools import islice, cycle
from more_itertools import pairwise
from tqdm.auto import tqdm

def get_coo_matrix(df,
                   user_col='userId',
                   item_col='movieId',
                   weight_col='rating',
                   users_mapping=users_mapping,
                   items_mapping=items_mapping):
    """Build a sparse user x item interaction matrix in COO format.

    Parameters
    ----------
    df : pandas.DataFrame
        Interactions; must contain `user_col`, `item_col` and, unless
        `weight_col` is None, `weight_col`.
    user_col, item_col : str
        Names of the columns holding raw user / item ids.
    weight_col : str or None
        Column used as the matrix values (cast to float32). If None, every
        interaction gets weight 1.
    users_mapping, items_mapping : dict
        Raw id -> internal integer index (row / column of the matrix).
        The defaults are the module-level mappings as they existed when this
        function was defined.

    Returns
    -------
    scipy.sparse.coo_matrix
        The shape is fixed by the mappings (largest index + 1 on each axis),
        not by the ids that happen to occur in `df`. Matrices built from
        different subsets (e.g. train and test) therefore have the same
        dimensions.

    Raises
    ------
    ValueError
        If `df` contains a user or item id that is missing from the mappings.
    """
    if weight_col is None:
        weights = np.ones(len(df), dtype=np.float32)
    else:
        weights = df[weight_col].to_numpy(dtype=np.float32)

    rows = df[user_col].map(users_mapping.get)
    cols = df[item_col].map(items_mapping.get)
    # `.get` yields None for unknown ids; fail with a clear message rather than
    # passing NaN indices on to scipy.
    if rows.isna().any() or cols.isna().any():
        raise ValueError(
            f"'{user_col}' or '{item_col}' contains ids that are missing from the mappings"
        )

    # Explicit shape, so trailing users/items that are absent from `df` still
    # get a row/column.
    shape = (
        max(users_mapping.values(), default=-1) + 1,
        max(items_mapping.values(), default=-1) + 1,
    )
    interaction_matrix = sp.coo_matrix(
        (weights, (rows.to_numpy(dtype=np.int64), cols.to_numpy(dtype=np.int64))),
        shape=shape,
    )
    return interaction_matrix

In [72]:
def compute_metrics(df_true, df_pred, top_N):
    """Compute Precision@k, Recall@k, MAP@k (k = top_N) and MRR.

    Parameters
    ----------
    df_true : pandas.DataFrame
        Ground-truth interactions with `userId` and `movieId` columns.
    df_pred : pandas.DataFrame
        Recommendations with `userId`, `movieId` and a 1-based `rank` column.
    top_N : int
        Cut-off k for Precision, Recall and MAP.

    Returns
    -------
    pandas.Series
        Indexed by 'Precision@k', 'Recall@k', 'MAP@k' and 'MRR'. All values
        are averaged over the users present in `df_true`.
    """
    result = {}
    # Left join: every true interaction gets the rank at which it was
    # recommended, or NaN if it was not recommended at all.
    test_recs = df_true.set_index(['userId', 'movieId']).join(df_pred.set_index(['userId', 'movieId']))
    # Sorting by rank (NaN last) makes cumcount below count hits in rank order.
    test_recs = test_recs.sort_values(by=['userId', 'rank'])

    # Number of true items per user (np.size also counts the NaN-rank rows).
    test_recs['users_item_count'] = test_recs.groupby(level='userId')['rank'].transform(np.size)
    test_recs['reciprocal_rank'] = (1 / test_recs['rank']).fillna(0)
    # (number of hits up to this rank) / rank == precision at that hit's rank.
    test_recs['cumulative_rank'] = test_recs.groupby(level='userId').cumcount() + 1
    test_recs['cumulative_rank'] = test_recs['cumulative_rank'] / test_recs['rank']

    users_count = test_recs.index.get_level_values('userId').nunique()
    k = top_N
    hit_k = f'hit@{k}'
    test_recs[hit_k] = test_recs['rank'] <= k
    result[f'Precision@{k}'] = (test_recs[hit_k] / k).sum() / users_count
    result[f'Recall@{k}'] = (test_recs[hit_k] / test_recs['users_item_count']).sum() / users_count

    # Only hits inside the top-k contribute to MAP@k. Without this mask, hits
    # at ranks beyond k were counted whenever df_pred held more than k rows
    # per user.
    precision_at_hits = test_recs['cumulative_rank'].where(test_recs[hit_k], 0)
    result[f'MAP@{top_N}'] = (precision_at_hits / test_recs['users_item_count']).sum() / users_count
    result['MRR'] = test_recs.groupby(level='userId')['reciprocal_rank'].max().mean()
    return pd.Series(result)

In [64]:
from implicit.als import AlternatingLeastSquares

def generate_implicit_recs_mapper(model, train_matrix, N, user_mapping, item_inv_mapping):
    """Create a function that maps a raw user id to a list of recommended raw item ids.

    The returned function translates the user through `user_mapping`, asks
    `model.recommend` for `N` items (already-liked items filtered out) and
    translates each recommended item back through `item_inv_mapping`.

    `model.recommend` is expected to return an iterable of (item, score) pairs;
    each element is unpacked into exactly two values.
    """
    def _recs_mapper(user):
        internal_user = user_mapping[user]
        scored_items = model.recommend(
            internal_user,
            train_matrix,
            N=N,
            filter_already_liked_items=True,
        )
        raw_items = []
        for internal_item, _score in scored_items:
            raw_items.append(item_inv_mapping[internal_item])
        return raw_items

    return _recs_mapper

In [69]:
top_N = 15

# User x item matrix of the training ratings, converted to CSR.
m = get_coo_matrix(train).tocsr()

als_model = AlternatingLeastSquares(factors=32, iterations=30, num_threads=4)
# The model is fitted on the transposed matrix.
als_model.fit(m.T)

mapper = generate_implicit_recs_mapper(als_model, m, top_N, users_mapping, items_inv_mapping)

# One row per test user -> list of recommended movies -> one row per
# (user, movie) pair, in recommendation order.
recs = (
    pd.DataFrame({'userId': test['userId'].unique()})
    .assign(movieId=lambda frame: frame['userId'].map(mapper))
    .explode('movieId')
)
# 1-based position of each recommendation within its user's list.
recs['rank'] = recs.groupby('userId').cumcount() + 1
metrics = compute_metrics(test, recs, top_N)

In [64]:
# Display the metrics computed for the ALS recommendations.
metrics

# LightFM

In [48]:
from lightfm.data import Dataset
from lightfm import LightFM

In [85]:
# Register every user id and movie id from `ratings` with the LightFM dataset.
dataset = Dataset()
dataset.fit(
    users=ratings['userId'].unique(),
    items=ratings['movieId'].unique(),
)

In [86]:
# Register the genre labels collected in `genres` as item features.
dataset.fit_partial(item_features=genres)

In [87]:
# Dataset.mapping() returns a tuple; positions 0, 2 and 3 are used here as the
# user-id, item-id and item-feature mappings (raw value -> internal index).
raw_mappings = dataset.mapping()
lightfm_mapping = {
    'users_mapping': raw_mappings[0],
    'items_mapping': raw_mappings[2],
    'item_features_mapping': raw_mappings[3],
}
# Inverse mappings (internal index -> raw id). 'items_inv_mapping' is read when
# recommendations are generated; it was previously never created, which caused
# a KeyError there.
lightfm_mapping['users_inv_mapping'] = {
    idx: raw for raw, idx in lightfm_mapping['users_mapping'].items()
}
lightfm_mapping['items_inv_mapping'] = {
    idx: raw for raw, idx in lightfm_mapping['items_mapping'].items()
}
print('users_mapping len - ', len(lightfm_mapping['users_mapping']))
print('items_mapping len - ', len(lightfm_mapping['items_mapping']))
print('Users item_features_mapping len - ', len(lightfm_mapping['item_features_mapping']))

In [113]:
# build_interactions consumes (user_id, item_id, weight) triples:
# [(user_id1, item_id1, weight), (user_id2, item_id2, weight), ...]
interaction_triples = zip(*train[['userId', 'movieId', 'rating']].to_numpy().T)
interactions_matrix, weights_matrix = dataset.build_interactions(interaction_triples)

In [197]:
def concat_last_to_list(t):
    """Return the pair (first element of `t`, second element of `t`).

    Any elements after the second one are ignored.
    """
    tail = list(t[1:])
    return (t[0], tail[0])


def df_to_tuple_list_iterator(df):
    """Lazily yield one (first column, second column) pair per row of `df`."""
    row_tuples = zip(*df.to_numpy().T)
    return (concat_last_to_list(row) for row in row_tuples)

In [204]:
# Item features are passed to LightFM as (movieId, [feature, ...]) pairs, e.g.
# (1, ['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy'])
# The `features` column is built here from the '|'-separated `genres` column.
# It was previously never created in the notebook, so selecting it raised a
# KeyError on a fresh kernel. `assign` returns a new frame, so nothing is
# written into a filtered view.
movies = movies[movies['movieId'].isin(ratings['movieId'].unique())].assign(
    features=lambda frame: frame['genres'].str.split('|')
)

train_items_features = dataset.build_item_features(
    df_to_tuple_list_iterator(
        movies[['movieId', 'features']]
    )
)
train_items_features

In [220]:
# LightFM model with WARP loss; `random_state` is fixed.
lfm_model = LightFM(
    no_components=64,
    learning_rate=0.05,
    loss='warp',
    max_sampled=5,
    random_state=23,
)

In [285]:
num_epochs = 15
# One fit_partial call per iteration, so the tqdm bar advances once per call.
# NOTE(review): `weights_matrix` (not `interactions_matrix`) is passed as the
# interactions argument — confirm this is intended.
for epoch in tqdm(range(num_epochs)):
    lfm_model.fit_partial(
        weights_matrix,
        item_features=train_items_features,
        num_threads=4,
    )

In [311]:
def generate_lightfm_recs_mapper(model, item_ids, known_items, item_features, N, user_mapping, item_inv_mapping, num_threads=4):
    """Create a function that maps a raw user id to its top-N unseen raw item ids.

    Parameters
    ----------
    model : object with `predict(user_id, item_ids, item_features=..., num_threads=...)`
        Returns one score per entry of `item_ids`.
    item_ids : sequence of int
        Internal item indices to score.
    known_items : dict
        Raw user id -> collection of raw item ids the user already interacted
        with; these are removed from the recommendations.
    item_features : passed through to `model.predict`.
    N : int
        Number of recommendations to return per user.
    user_mapping : dict
        Raw user id -> internal user index.
    item_inv_mapping : dict
        Internal item index -> raw item id.
    num_threads : int
        Passed through to `model.predict`.

    Returns
    -------
    callable
        `f(user) -> list` of at most N raw item ids, best score first.
    """
    candidate_ids = np.asarray(item_ids)

    def _recs_mapper(user):
        user_id = user_mapping[user]
        scores = np.asarray(
            model.predict(user_id, item_ids, item_features=item_features, num_threads=num_threads)
        )

        # `known_items` is keyed by the RAW user id, so it is looked up with
        # `user`. Using the internal index here picked up no items, or another
        # user's items.
        seen = set(known_items[user]) if user in known_items else set()

        # Ask for enough candidates to still have N left after filtering, but
        # never more than there are scores.
        total_N = min(N + len(seen), len(scores))
        if total_N <= 0:
            return []

        # Select the total_N best positions, then order them by descending score.
        top_positions = np.argpartition(scores, -total_N)[-total_N:]
        top_positions = top_positions[np.argsort(scores[top_positions])[::-1]]

        # Positions index into `item_ids`; translate them to internal ids
        # first, then to raw ids.
        final_recs = [item_inv_mapping[item] for item in candidate_ids[top_positions].tolist()]
        final_recs = [item for item in final_recs if item not in seen]
        return final_recs[:N]
    return _recs_mapper

In [298]:
# Raw userId -> list of movieIds that user rated in the training part.
train_movies_by_user = train.groupby('userId')['movieId']
known_items = train_movies_by_user.apply(list).to_dict()
len(known_items)

# Every internal item index known to the LightFM dataset.
all_cols = [*lightfm_mapping['items_mapping'].values()]

In [324]:
mapper = generate_lightfm_recs_mapper(
    lfm_model, 
    item_ids=all_cols, 
    known_items=known_items,
    N=top_N,
    item_features=train_items_features, 
    user_mapping=lightfm_mapping['users_mapping'],
    item_inv_mapping=lightfm_mapping['items_inv_mapping'],
    num_threads=4
)

# Start from a fresh frame with one row per test user. The `recs` frame left
# over from the ALS section is already exploded (one row per recommendation)
# and carries a `rank` column. Mapping over it would call the mapper once per
# old row and produce duplicated recommendations with wrong ranks.
recs = pd.DataFrame({
    'userId': test['userId'].unique()
})
recs['movieId'] = recs['userId'].map(mapper)

recs = recs.explode('movieId')
# 1-based position of each recommendation within its user's list.
recs['rank'] = recs.groupby('userId').cumcount() + 1

metrics = compute_metrics(test, recs, top_N)

In [325]:
# Display the metrics computed for the LightFM recommendations.
metrics