In [1]:
! pip install kagglehub Rectools

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("CooperUnion/anime-recommendations-database")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /home/lockr/.cache/kagglehub/datasets/CooperUnion/anime-recommendations-database/versions/1


In [3]:
import pandas as pd
import numpy as np
from lightfm import LightFM
from tqdm.autonotebook import tqdm as notebook_tqdm
from rectools.models.lightfm import LightFMWrapperModel
from rectools.dataset import Dataset
from rectools import Columns

def sparsing(features, column):
    features = pd.get_dummies(features)
    features_frames = []
    for feature in features.columns[1:]:
        feature_frame = features.reindex(columns=[column, feature])
        feature_frame.columns = ["id", "value"]
        feature_frame["feature"] = feature
        features_frames.append(feature_frame)
    return pd.concat(features_frames)

In [21]:
anime = pd.read_csv(path + '/anime.csv')
ratings = pd.read_csv(path + '/rating.csv')
anime.drop('rating', axis=1, inplace=True)
anime.rename(columns={'anime_id': 'id'}, inplace=True)
ratings = ratings[ratings['rating']!=-1]
ratings.columns = [Columns.User, Columns.Item,  Columns.Weight]
ratings['datetime'] = ratings.groupby(Columns.User).cumcount()


In [79]:
from typing import Mapping, Sequence

import numpy as np
import pandas as pd
import polars as pl

def evaluate(ref_path, pred_path, train_path, K):
    ndcg_weights = 1.0 / np.log2(np.arange(0, K) + 2)
    ndcg_idcg = ndcg_weights.cumsum()
    def evaluate_inside(ref_path, pred_path, train_path):

        submission = pl.read_parquet(pred_path)
        ref_df = pl.read_parquet(ref_path)
        train = pl.read_parquet(train_path)

        submission = (
            submission
            .select(
                pl.col("user_id").cast(pl.Int64),
                pl.col("item_id").cast(pl.List(pl.Int64)).alias("predicted"),
            )
            .unique(subset="user_id")
            .with_columns(
                pl.col("predicted").list.unique(maintain_order=True)
            )
        )

        ground_truth = ref_df.with_columns(pl.col("item_id").alias("ground_truth"))

        submission_with_gt = ground_truth.join(submission, on="user_id", how="left")

        metrics_per_user = submission_with_gt.select(
            pl.col("user_id"),
            pl.struct("predicted", "ground_truth").map_elements(ndcg_per_user).alias("ndcg"),
        )
        mean_ndcg = metrics_per_user.select(pl.col("ndcg").mean())["ndcg"][0]
        
        metrics_per_user = submission_with_gt.select(
            pl.col("user_id"),
            pl.struct("predicted", "ground_truth").map_elements(hitrate_per_user).alias("hitrate"),
        )
        mean_hitrate = metrics_per_user.select(pl.col("hitrate").mean())["hitrate"][0]
        
        coverage = compute_coverage(submission, train)
        novelty = compute_novelty(submission, train)

        return {'ndcg': mean_ndcg,
                'hitrate': mean_hitrate,
                'coverage': coverage,
                'novelty': novelty}


    def ndcg_per_user(pl_struct: Mapping[str, Sequence[int]]) -> float:

        predicted = pl_struct["predicted"]
        ground_truth = pl_struct["ground_truth"]

        if predicted is None:
            return 0.0

        assert ground_truth is not None
        assert len(ground_truth) > 0

        predicted_np = np.array(predicted[:K])
        ground_truth_np = np.array(ground_truth)

        predicted_count = min(len(predicted_np), K)
        gt_count = min(len(ground_truth_np), K)

        hits = (predicted_np.reshape(-1, 1) == ground_truth_np.reshape(1, -1)).sum(axis=1)
        dcg = (hits * ndcg_weights[:predicted_count]).sum()
        idcg = ndcg_idcg[gt_count - 1]
        ndcg = dcg / idcg
        return ndcg


    def hitrate_per_user(pl_struct: Mapping[str, Sequence[int]]) -> float:

        predicted = pl_struct["predicted"]
        ground_truth = pl_struct["ground_truth"]

        if predicted is None:
            return 0.0

        assert ground_truth is not None
        assert len(ground_truth) > 0

        predicted_np = np.array(predicted[:K])
        ground_truth_np = np.array(ground_truth)

        hitrate = int(len(np.intersect1d(predicted_np, ground_truth_np)) > 0)

        return hitrate


    def compute_coverage(submission, train):
        
        list_of_lists = submission.select('predicted').to_series().to_list()
        all_pred_items = [x for xs in list_of_lists for x in xs]
        all_pred_items = set(all_pred_items)
        
        all_train_items = train.select('item_id').unique().to_series().to_list()
        
        coverage = len(all_pred_items.intersection(all_train_items)) / len(all_train_items)
        
        return coverage


    def compute_novelty(submission, train):
        
        num_interactions = len(train)
        item_stats = train.group_by('item_id').count()
        item_stats = item_stats.with_columns(-np.log2(pl.col('count') / num_interactions).alias('item_novelty'))
        item_stats = item_stats.with_columns((pl.col('item_novelty') / np.log2(num_interactions)))
        item_stats = item_stats.select('item_id', 'item_novelty').to_pandas()
        
        list_of_lists = submission.select('predicted').to_series().to_list()
        all_pred_items = [x for xs in list_of_lists for x in xs]
        num_recommendations = len(all_pred_items)
        
        recs_items = pd.Series(all_pred_items).value_counts().reset_index()
        recs_items.columns = ['item_id', 'item_count']
        recs_items = pd.merge(recs_items, item_stats)
        recs_items['product'] = recs_items['item_count'] * recs_items['item_novelty']

        novelty = recs_items['product'].sum() / num_recommendations

        return novelty
    
    return evaluate_inside(ref_path, pred_path, train_path)

In [73]:
from rectools.model_selection.last_n_split import LastNSplitter
from rectools.dataset.interactions import Interactions

splitter  = LastNSplitter(10, filter_cold_users=True, filter_cold_items=True, filter_already_seen=True)

train_ids, test_ids = [], []

for train_ids, test_ids, _ in splitter.split(Interactions(ratings)):
    train_ids, test_ids = train_ids, test_ids


train = ratings.iloc[train_ids, :]
test = ratings.iloc[test_ids, :]


dataset = Dataset.construct(
    interactions_df=train, 
    # item_features_df=anime,
    )

model = LightFMWrapperModel(LightFM(loss='warp-kos', no_components=10), epochs=10, num_threads=12, verbose=True)
model.fit(dataset)

# Make recommendations
recos = model.recommend(
    users=test.user_id.unique(),
    dataset=dataset,
    k=10,
    filter_viewed=True
)


pd.DataFrame(recos.groupby('user_id')['item_id'].apply(set)).to_parquet('submission.parquet')
pd.DataFrame(test.groupby('user_id')['item_id'].apply(set)).to_parquet('gt.parquet')
ratings.to_parquet('train.parquet')

Epoch: 100%|██████████| 10/10 [00:31<00:00,  3.12s/it]


In [80]:
evaluate('gt.parquet', 'submission.parquet', 'train.parquet', 10)

  item_stats = train.group_by('item_id').count()


{'ndcg': 0.024063982545459103,
 'hitrate': 0.16958783808715547,
 'coverage': 0.14808099123602297,
 'novelty': 0.4088066253854988}

In [82]:
from rectools.model_selection.last_n_split import LastNSplitter
from rectools.dataset.interactions import Interactions

NUM_OF_SAMPELS = 3

splitter  = LastNSplitter(NUM_OF_SAMPELS, filter_cold_users=True, filter_cold_items=True, filter_already_seen=True)

train_ids, test_ids = [], []

for train_ids, test_ids, _ in splitter.split(Interactions(ratings)):
    train_ids, test_ids = train_ids, test_ids


train = ratings.iloc[train_ids, :]
test = ratings.iloc[test_ids, :]


dataset = Dataset.construct(
    interactions_df=train, 
    # item_features_df=anime,
    )

model = LightFMWrapperModel(LightFM(loss='warp-kos', no_components=10), epochs=10, num_threads=12, verbose=True)
model.fit(dataset)

# Make recommendations
recos = model.recommend(
    users=test.user_id.unique(),
    dataset=dataset,
    k=NUM_OF_SAMPELS,
    filter_viewed=True
)


pd.DataFrame(recos.groupby('user_id')['item_id'].apply(set)).to_parquet('submission.parquet')
pd.DataFrame(test.groupby('user_id')['item_id'].apply(set)).to_parquet('gt.parquet')
ratings.to_parquet('train.parquet')

evaluate('gt.parquet', 'submission.parquet', 'train.parquet', NUM_OF_SAMPELS)

Epoch: 100%|██████████| 10/10 [00:34<00:00,  3.45s/it]
  item_stats = train.group_by('item_id').count()


{'ndcg': 0.011904855062562694,
 'hitrate': 0.03442646564555625,
 'coverage': 0.10879419764279238,
 'novelty': 0.40193698130963096}

In [83]:
from rectools.model_selection.last_n_split import LastNSplitter
from rectools.dataset.interactions import Interactions

NUM_OF_SAMPELS = 1

splitter  = LastNSplitter(NUM_OF_SAMPELS, filter_cold_users=True, filter_cold_items=True, filter_already_seen=True)

train_ids, test_ids = [], []

for train_ids, test_ids, _ in splitter.split(Interactions(ratings)):
    train_ids, test_ids = train_ids, test_ids


train = ratings.iloc[train_ids, :]
test = ratings.iloc[test_ids, :]


dataset = Dataset.construct(
    interactions_df=train, 
    # item_features_df=anime,
    )

model = LightFMWrapperModel(LightFM(loss='warp', no_components=10), epochs=10, num_threads=12, verbose=True)
model.fit(dataset)

# Make recommendations
recos = model.recommend(
    users=test.user_id.unique(),
    dataset=dataset,
    k=NUM_OF_SAMPELS,
    filter_viewed=True
)


pd.DataFrame(recos.groupby('user_id')['item_id'].apply(set)).to_parquet('submission.parquet')
pd.DataFrame(test.groupby('user_id')['item_id'].apply(set)).to_parquet('gt.parquet')
ratings.to_parquet('train.parquet')

evaluate('gt.parquet', 'submission.parquet', 'train.parquet', NUM_OF_SAMPELS)

Epoch: 100%|██████████| 10/10 [00:34<00:00,  3.42s/it]
  item_stats = train.group_by('item_id').count()


{'ndcg': 0.006001658749905753,
 'hitrate': 0.006001658749905753,
 'coverage': 0.08562506295960512,
 'novelty': 0.39769777619904334}

In [84]:
from rectools.model_selection.last_n_split import LastNSplitter
from rectools.dataset.interactions import Interactions

NUM_OF_SAMPELS = 10

splitter  = LastNSplitter(NUM_OF_SAMPELS, filter_cold_users=True, filter_cold_items=True, filter_already_seen=True)

train_ids, test_ids = [], []

for train_ids, test_ids, _ in splitter.split(Interactions(ratings)):
    train_ids, test_ids = train_ids, test_ids


train = ratings.iloc[train_ids, :]
test = ratings.iloc[test_ids, :]


dataset = Dataset.construct(
    interactions_df=train, 
    # item_features_df=anime,
    )

model = LightFMWrapperModel(LightFM(loss='warp', no_components=10), epochs=10, num_threads=12, verbose=True)
model.fit(dataset)

# Make recommendations
recos = model.recommend(
    users=test.user_id.unique(),
    dataset=dataset,
    k=NUM_OF_SAMPELS,
    filter_viewed=True
)


pd.DataFrame(recos.groupby('user_id')['item_id'].apply(set)).to_parquet('submission.parquet')
pd.DataFrame(test.groupby('user_id')['item_id'].apply(set)).to_parquet('gt.parquet')
ratings.to_parquet('train.parquet')

evaluate('gt.parquet', 'submission.parquet', 'train.parquet', NUM_OF_SAMPELS)

Epoch: 100%|██████████| 10/10 [00:16<00:00,  1.69s/it]
  item_stats = train.group_by('item_id').count()


{'ndcg': 0.025885971358933446,
 'hitrate': 0.19157964650750894,
 'coverage': 0.1386118666263725,
 'novelty': 0.4000985249487645}

In [85]:
from rectools.model_selection.last_n_split import LastNSplitter
from rectools.dataset.interactions import Interactions

NUM_OF_SAMPELS = 10

splitter  = LastNSplitter(NUM_OF_SAMPELS, filter_cold_users=True, filter_cold_items=True, filter_already_seen=True)

train_ids, test_ids = [], []

for train_ids, test_ids, _ in splitter.split(Interactions(ratings)):
    train_ids, test_ids = train_ids, test_ids


train = ratings.iloc[train_ids, :]
test = ratings.iloc[test_ids, :]


dataset = Dataset.construct(
    interactions_df=train, 
    # item_features_df=anime,
    )

model = LightFMWrapperModel(LightFM(loss='warp', no_components=100), epochs=10, num_threads=12, verbose=True)
model.fit(dataset)

# Make recommendations
recos = model.recommend(
    users=test.user_id.unique(),
    dataset=dataset,
    k=NUM_OF_SAMPELS,
    filter_viewed=True
)


pd.DataFrame(recos.groupby('user_id')['item_id'].apply(set)).to_parquet('submission.parquet')
pd.DataFrame(test.groupby('user_id')['item_id'].apply(set)).to_parquet('gt.parquet')
ratings.to_parquet('train.parquet')

evaluate('gt.parquet', 'submission.parquet', 'train.parquet', NUM_OF_SAMPELS)

Epoch: 100%|██████████| 10/10 [00:37<00:00,  3.78s/it]
  item_stats = train.group_by('item_id').count()


{'ndcg': 0.03229466508893545,
 'hitrate': 0.22054536732961882,
 'coverage': 0.28991638964440414,
 'novelty': 0.42363002415881124}

In [86]:
from rectools.model_selection.last_n_split import LastNSplitter
from rectools.dataset.interactions import Interactions

NUM_OF_SAMPELS = 10

splitter  = LastNSplitter(NUM_OF_SAMPELS, filter_cold_users=True, filter_cold_items=True, filter_already_seen=True)

train_ids, test_ids = [], []

for train_ids, test_ids, _ in splitter.split(Interactions(ratings)):
    train_ids, test_ids = train_ids, test_ids


train = ratings.iloc[train_ids, :]
test = ratings.iloc[test_ids, :]


dataset = Dataset.construct(
    interactions_df=train, 
    # item_features_df=anime,
    )

model = LightFMWrapperModel(LightFM(loss='warp', no_components=100), epochs=100, num_threads=12, verbose=True)
model.fit(dataset)

# Make recommendations
recos = model.recommend(
    users=test.user_id.unique(),
    dataset=dataset,
    k=NUM_OF_SAMPELS,
    filter_viewed=True
)


pd.DataFrame(recos.groupby('user_id')['item_id'].apply(set)).to_parquet('submission.parquet')
pd.DataFrame(test.groupby('user_id')['item_id'].apply(set)).to_parquet('gt.parquet')
ratings.to_parquet('train.parquet')

evaluate('gt.parquet', 'submission.parquet', 'train.parquet', NUM_OF_SAMPELS)

Epoch: 100%|██████████| 100/100 [06:03<00:00,  3.63s/it]
  item_stats = train.group_by('item_id').count()


{'ndcg': 0.04107740887693496,
 'hitrate': 0.282554149293384,
 'coverage': 0.4149289815654276,
 'novelty': 0.44159226272999375}

In [87]:
from rectools.model_selection.last_n_split import LastNSplitter
from rectools.dataset.interactions import Interactions

NUM_OF_SAMPELS = 10

splitter  = LastNSplitter(NUM_OF_SAMPELS, filter_cold_users=True, filter_cold_items=True, filter_already_seen=True)

train_ids, test_ids = [], []

for train_ids, test_ids, _ in splitter.split(Interactions(ratings)):
    train_ids, test_ids = train_ids, test_ids


train = ratings.iloc[train_ids, :]
test = ratings.iloc[test_ids, :]


dataset = Dataset.construct(
    interactions_df=train, 
    # item_features_df=anime,
    )

model = LightFMWrapperModel(LightFM(loss='warp', no_components=10), epochs=100, num_threads=12, verbose=True)
model.fit(dataset)

# Make recommendations
recos = model.recommend(
    users=test.user_id.unique(),
    dataset=dataset,
    k=NUM_OF_SAMPELS,
    filter_viewed=True
)


pd.DataFrame(recos.groupby('user_id')['item_id'].apply(set)).to_parquet('submission.parquet')
pd.DataFrame(test.groupby('user_id')['item_id'].apply(set)).to_parquet('gt.parquet')
ratings.to_parquet('train.parquet')

evaluate('gt.parquet', 'submission.parquet', 'train.parquet', NUM_OF_SAMPELS)

Epoch: 100%|██████████| 100/100 [02:22<00:00,  1.43s/it]
  item_stats = train.group_by('item_id').count()


{'ndcg': 0.028134504467381565,
 'hitrate': 0.20034316076897532,
 'coverage': 0.2026795607937947,
 'novelty': 0.4116600815170103}

In [88]:
from rectools.model_selection.last_n_split import LastNSplitter
from rectools.dataset.interactions import Interactions

NUM_OF_SAMPELS = 10

splitter  = LastNSplitter(NUM_OF_SAMPELS, filter_cold_users=True, filter_cold_items=True, filter_already_seen=True)

train_ids, test_ids = [], []

for train_ids, test_ids, _ in splitter.split(Interactions(ratings)):
    train_ids, test_ids = train_ids, test_ids


train = ratings.iloc[train_ids, :]
test = ratings.iloc[test_ids, :]


dataset = Dataset.construct(
    interactions_df=train, 
    # item_features_df=anime,
    )

model = LightFMWrapperModel(LightFM(loss='warp-kos', no_components=100), epochs=100, num_threads=12, verbose=True)
model.fit(dataset)

# Make recommendations
recos = model.recommend(
    users=test.user_id.unique(),
    dataset=dataset,
    k=NUM_OF_SAMPELS,
    filter_viewed=True
)


pd.DataFrame(recos.groupby('user_id')['item_id'].apply(set)).to_parquet('submission.parquet')
pd.DataFrame(test.groupby('user_id')['item_id'].apply(set)).to_parquet('gt.parquet')
ratings.to_parquet('train.parquet')

evaluate('gt.parquet', 'submission.parquet', 'train.parquet', NUM_OF_SAMPELS)

Epoch: 100%|██████████| 100/100 [09:29<00:00,  5.70s/it]
  item_stats = train.group_by('item_id').count()


{'ndcg': 0.04567057675582652,
 'hitrate': 0.3211874100586694,
 'coverage': 0.33534804069708873,
 'novelty': 0.43926556934212113}

In [89]:
from rectools.model_selection.last_n_split import LastNSplitter
from rectools.dataset.interactions import Interactions

NUM_OF_SAMPELS = 10

splitter  = LastNSplitter(NUM_OF_SAMPELS, filter_cold_users=True, filter_cold_items=True, filter_already_seen=True)

train_ids, test_ids = [], []

for train_ids, test_ids, _ in splitter.split(Interactions(ratings)):
    train_ids, test_ids = train_ids, test_ids


train = ratings.iloc[train_ids, :]
test = ratings.iloc[test_ids, :]


dataset = Dataset.construct(
    interactions_df=train, 
    # item_features_df=anime,
    )

model = LightFMWrapperModel(LightFM(loss='logistic', no_components=100), epochs=100, num_threads=12, verbose=True)
model.fit(dataset)

# Make recommendations
recos = model.recommend(
    users=test.user_id.unique(),
    dataset=dataset,
    k=NUM_OF_SAMPELS,
    filter_viewed=True
)


pd.DataFrame(recos.groupby('user_id')['item_id'].apply(set)).to_parquet('submission.parquet')
pd.DataFrame(test.groupby('user_id')['item_id'].apply(set)).to_parquet('gt.parquet')
ratings.to_parquet('train.parquet')

evaluate('gt.parquet', 'submission.parquet', 'train.parquet', NUM_OF_SAMPELS)

Epoch: 100%|██████████| 100/100 [04:55<00:00,  2.96s/it]
  item_stats = train.group_by('item_id').count()


{'ndcg': 0.010745014083999723,
 'hitrate': 0.09960887052138298,
 'coverage': 0.013599274705349048,
 'novelty': 0.3804586473313905}

In [91]:
from rectools.model_selection.last_n_split import LastNSplitter
from rectools.dataset.interactions import Interactions

NUM_OF_SAMPELS = 40

splitter  = LastNSplitter(NUM_OF_SAMPELS, filter_cold_users=True, filter_cold_items=True, filter_already_seen=True)

train_ids, test_ids = [], []

for train_ids, test_ids, _ in splitter.split(Interactions(ratings)):
    train_ids, test_ids = train_ids, test_ids


train = ratings.iloc[train_ids, :]
test = ratings.iloc[test_ids, :]


dataset = Dataset.construct(
    interactions_df=train, 
    # item_features_df=anime,
    )

model = LightFMWrapperModel(LightFM(loss='warp-kos', no_components=100), epochs=100, num_threads=12, verbose=True)
model.fit(dataset)

# Make recommendations
recos = model.recommend(
    users=test.user_id.unique(),
    dataset=dataset,
    k=NUM_OF_SAMPELS,
    filter_viewed=True
)


pd.DataFrame(recos.groupby('user_id')['item_id'].apply(set)).to_parquet('submission.parquet')
pd.DataFrame(test.groupby('user_id')['item_id'].apply(set)).to_parquet('gt.parquet')
ratings.to_parquet('train.parquet')

evaluate('gt.parquet', 'submission.parquet', 'train.parquet', NUM_OF_SAMPELS)

Epoch: 100%|██████████| 100/100 [07:09<00:00,  4.29s/it]
  item_stats = train.group_by('item_id').count()


{'ndcg': 0.0679179745378838,
 'hitrate': 0.7867503907428227,
 'coverage': 0.42379369396595146,
 'novelty': 0.457048397850819}

In [92]:
from rectools.model_selection.last_n_split import LastNSplitter
from rectools.dataset.interactions import Interactions

NUM_OF_SAMPELS = 40

splitter  = LastNSplitter(NUM_OF_SAMPELS, filter_cold_users=True, filter_cold_items=True, filter_already_seen=True)

train_ids, test_ids = [], []

for train_ids, test_ids, _ in splitter.split(Interactions(ratings)):
    train_ids, test_ids = train_ids, test_ids


train = ratings.iloc[train_ids, :]
test = ratings.iloc[test_ids, :]


dataset = Dataset.construct(
    interactions_df=train, 
    # item_features_df=anime,
    )

model = LightFMWrapperModel(LightFM(loss='warp-kos', no_components=1000), epochs=100, num_threads=12, verbose=True)
model.fit(dataset)

# Make recommendations
recos = model.recommend(
    users=test.user_id.unique(),
    dataset=dataset,
    k=NUM_OF_SAMPELS,
    filter_viewed=True
)


pd.DataFrame(recos.groupby('user_id')['item_id'].apply(set)).to_parquet('submission.parquet')
pd.DataFrame(test.groupby('user_id')['item_id'].apply(set)).to_parquet('gt.parquet')
ratings.to_parquet('train.parquet')

evaluate('gt.parquet', 'submission.parquet', 'train.parquet', NUM_OF_SAMPELS)

Epoch: 100%|██████████| 100/100 [50:10<00:00, 30.11s/it]
  item_stats = train.group_by('item_id').count()


{'ndcg': 0.06307275178928959,
 'hitrate': 0.7779758150758178,
 'coverage': 0.507504784929989,
 'novelty': 0.4764956090517431}