In [1]:
from typing import Mapping, Sequence

import numpy as np
import pandas as pd
import polars as pl

K = 10
ndcg_weights = 1.0 / np.log2(np.arange(0, K) + 2)
ndcg_idcg = ndcg_weights.cumsum()


def evaluate(ref_path, pred_path, train_path):

    submission = pl.read_parquet(pred_path)
    ref_df = pl.read_parquet(ref_path)
    train = pl.read_parquet(train_path)

    submission = (
        submission
        .select(
            pl.col("user_id").cast(pl.Int64),
            pl.col("item_id").cast(pl.List(pl.Int64)).alias("predicted"),
        )
        .unique(subset="user_id")
        .with_columns(
            pl.col("predicted").list.unique(maintain_order=True)
        )
    )

    ground_truth = ref_df.with_columns(pl.col("item_id").alias("ground_truth"))

    submission_with_gt = ground_truth.join(submission, on="user_id", how="left")

    metrics_per_user = submission_with_gt.select(
        pl.col("user_id"),
        pl.struct("predicted", "ground_truth").apply(ndcg_per_user).alias("ndcg"),
    )
    mean_ndcg = metrics_per_user.select(pl.col("ndcg").mean())["ndcg"][0]
    
    metrics_per_user = submission_with_gt.select(
        pl.col("user_id"),
        pl.struct("predicted", "ground_truth").apply(hitrate_per_user).alias("hitrate"),
    )
    mean_hitrate = metrics_per_user.select(pl.col("hitrate").mean())["hitrate"][0]
    
    coverage = compute_coverage(submission, train)
    novelty = compute_novelty(submission, train)

    return {'ndcg': mean_ndcg,
            'hitrate': mean_hitrate,
            'coverage': coverage,
            'novelty': novelty}


def ndcg_per_user(pl_struct: Mapping[str, Sequence[int]]) -> float:

    predicted = pl_struct["predicted"]
    ground_truth = pl_struct["ground_truth"]

    if predicted is None:
        return 0.0

    assert ground_truth is not None
    assert len(ground_truth) > 0

    predicted_np = np.array(predicted[:K])
    ground_truth_np = np.array(ground_truth)

    predicted_count = min(len(predicted_np), K)
    gt_count = min(len(ground_truth_np), K)

    hits = (predicted_np.reshape(-1, 1) == ground_truth_np.reshape(1, -1)).sum(axis=1)
    dcg = (hits * ndcg_weights[:predicted_count]).sum()
    idcg = ndcg_idcg[gt_count - 1]
    ndcg = dcg / idcg
    return ndcg


def hitrate_per_user(pl_struct: Mapping[str, Sequence[int]]) -> float:

    predicted = pl_struct["predicted"]
    ground_truth = pl_struct["ground_truth"]

    if predicted is None:
        return 0.0

    assert ground_truth is not None
    assert len(ground_truth) > 0

    predicted_np = np.array(predicted[:K])
    ground_truth_np = np.array(ground_truth)

    hitrate = int(len(np.intersect1d(predicted_np, ground_truth_np)) > 0)

    return hitrate


def compute_coverage(submission, train):
    
    list_of_lists = submission.select('predicted').to_series().to_list()
    all_pred_items = [x for xs in list_of_lists for x in xs]
    all_pred_items = set(all_pred_items)
    
    all_train_items = train.select('item_id').unique().to_series().to_list()
    
    coverage = len(all_pred_items.intersection(all_train_items)) / len(all_train_items)
    
    return coverage


def compute_novelty(submission, train):
    
    num_interactions = len(train)
    item_stats = train.groupby('item_id').count()
    item_stats = item_stats.with_columns(-np.log2(pl.col('count') / num_interactions).alias('item_novelty'))
    item_stats = item_stats.with_columns((pl.col('item_novelty') / np.log2(num_interactions)))
    item_stats = item_stats.select('item_id', 'item_novelty').to_pandas()
    
    list_of_lists = submission.select('predicted').to_series().to_list()
    all_pred_items = [x for xs in list_of_lists for x in xs]
    num_recommendations = len(all_pred_items)
    
    recs_items = pd.Series(all_pred_items).value_counts().reset_index()
    recs_items.columns = ['item_id', 'item_count']
    recs_items = pd.merge(recs_items, item_stats)
    recs_items['product'] = recs_items['item_count'] * recs_items['item_novelty']

    novelty = recs_items['product'].sum() / num_recommendations

    return novelty

In [2]:
prediction_path = "predict.parquet"
ground_truth_path = "test_gt.parquet"
train_path = "train.parquet"

In [None]:
evaluate(ground_truth_path, prediction_path, train_path)

In [14]:
! pip install lightfm
! pip install pandas numpy pyarrow RecTools

Collecting RecTools
  Downloading rectools-0.8.0-py3-none-any.whl.metadata (12 kB)
Collecting attrs<24.0.0,>=19.1.0 (from RecTools)
  Using cached attrs-23.2.0-py3-none-any.whl.metadata (9.5 kB)
Collecting implicit<0.8.0,>=0.7.1 (from RecTools)
  Using cached implicit-0.7.2-cp310-cp310-manylinux2014_x86_64.whl.metadata (6.1 kB)
Collecting numpy
  Using cached numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting scipy<1.13,>=1.10.1 (from RecTools)
  Downloading scipy-1.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting tqdm<5.0.0,>=4.27.0 (from RecTools)
  Using cached tqdm-4.67.0-py3-none-any.whl.metadata (57 kB)
Collecting typeguard<5.0.0,>=4.1.0 (from RecTools)
  Downloading typeguard-4.4.1-py3-none-any.whl.metadata (3.7 kB)
Downloading rectools-0.8.0-py3-none-any.whl (143 kB)
Using cached numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
Using cached attrs-23.2.0-p

In [78]:
from lightfm import LightFM
import pandas as pd
import numpy as np 
import rectools
from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import ImplicitItemKNNWrapperModel
from implicit.nearest_neighbours import TFIDFRecommender

smm_train_data = pd.read_parquet('train_smm.parquet').drop_duplicates()
smm_test_data = pd.read_parquet('test_smm.parquet').drop_duplicates()

zvuk_train_data = pd.read_parquet('train_zvuk.parquet').drop_duplicates()
zvuk_test_data = pd.read_parquet('test_zvuk.parquet').drop_duplicates()

smm_train_data.columns = [Columns.User, Columns.Item, Columns.Datetime,  Columns.Weight]
smm_test_data.columns = [Columns.User, Columns.Item, Columns.Datetime, Columns.Weight]
zvuk_train_data.columns = [Columns.User, Columns.Datetime, Columns.Item, Columns.Weight]
zvuk_test_data.columns = [Columns.User, Columns.Datetime, Columns.Item, Columns.Weight]

In [79]:

def delete_data(zvuk_train_data, smm_train_data):
    smm_train_data['datetime'] = pd.to_datetime(smm_train_data['datetime']).dt.date
    daily_counts = smm_train_data.groupby('datetime').size().reset_index(name='purchase_count')
    filtered_dates = daily_counts[daily_counts['purchase_count'] <= 5000]['datetime']
    filtered_data = smm_train_data[smm_train_data['datetime'].isin(filtered_dates)]

    zvuk_train_data['datetime'] = pd.to_datetime(zvuk_train_data['datetime']).dt.date
    daily_counts = zvuk_train_data.groupby('datetime').size().reset_index(name='purchase_count')
    filtered_dates = daily_counts[daily_counts['purchase_count'] <= 20000]['datetime']
    filtered_data = zvuk_train_data[zvuk_train_data['datetime'].isin(filtered_dates)]


    g_zvuk = zvuk_train_data.item_id.value_counts(True).reset_index()
    g_zvuk = g_zvuk[g_zvuk.proportion >= 0.000002]
    g_smm = smm_train_data.item_id.value_counts(True).reset_index()
    g_smm = g_smm[g_smm.proportion >= 0.0000025]
    smm_train_data[smm_train_data.item_id.isin(g_smm.item_id)], zvuk_train_data[zvuk_train_data.item_id.isin(g_zvuk.item_id)]
    smm_train_data = smm_train_data[smm_train_data['weight'] >= 2]
    zvuk_train_data = zvuk_train_data[zvuk_train_data['weight'] >= 3]
    zvuk_train_data = zvuk_train_data.groupby('item_id').filter(lambda x: len(x) >= 1000)
    smm_train_data = smm_train_data.groupby('item_id').filter(lambda x: len(x) >= 300)

    return smm_train_data, zvuk_train_data

zvuk_train_data, smm_train_data = delete_data(zvuk_train_data, smm_train_data)

In [80]:
def sparsing(features, column):
    features = pd.get_dummies(features)
    features_frames = []
    for feature in features.columns[1:]:
        feature_frame = features.reindex(columns=[column, feature])
        feature_frame.columns = ["id", "value"]
        feature_frame["feature"] = feature
        features_frames.append(feature_frame)
    return pd.concat(features_frames)


zvuk_user_features_df = sparsing(pd.read_parquet('features/zvuk_user_features.parquet').reset_index().drop(columns=['user_first_interaction', 'user_last_interaction']), "user_id")
zvuk_item_features_df = sparsing(pd.read_parquet('features/zvuk_item_features.parquet').reset_index(), "item_id")
smm_user_features_df = sparsing(pd.read_parquet('features/smm_user_features.parquet').reset_index().drop(columns=['user_first_interaction', 'user_last_interaction']), "user_id")
smm_item_features_df = sparsing(pd.read_parquet('features/smm_item_features.parquet').reset_index(), "item_id")


In [81]:
zvuk_train_data.shape, smm_train_data.shape

((3536879, 4), (1894774, 4))

In [82]:
zvuk_dataset = Dataset.construct(
    interactions_df=zvuk_train_data[zvuk_train_data.user_id.isin(zvuk_test_data.user_id.unique())], 
    user_features_df=zvuk_user_features_df, 
    item_features_df=zvuk_item_features_df,
    )
smm_dataset = Dataset.construct(
    interactions_df=smm_train_data[smm_train_data.user_id.isin(smm_test_data.user_id.unique())], 
    user_features_df=smm_user_features_df, 
    item_features_df=smm_item_features_df,
    )


In [91]:
from rectools.models.lightfm import LightFMWrapperModel

model = LightFMWrapperModel(LightFM(loss='warp', no_components=500), epochs=1, num_threads=12, verbose=True)
model.fit(zvuk_dataset)

# Make recommendations
recos = model.recommend(
    users=zvuk_test_data[Columns.User].unique(),
    dataset=zvuk_dataset,
    k=10,
    filter_viewed=True,
    on_unsupported_targets='ignore'
)
answer = pd.DataFrame(recos.groupby('user_id')['item_id'].apply(list))
answer.to_parquet('submission_zvuk.parquet')

model.fit(smm_dataset)

# Make recommendations
recos = model.recommend(
    users=smm_test_data[Columns.User].unique(),
    dataset=smm_dataset,
    k=10,
    filter_viewed=True,
    on_unsupported_targets='ignore'
)

answer = pd.DataFrame(recos.groupby('user_id')['item_id'].apply(list))
answer.to_parquet('submission_smm.parquet')

Epoch: 100%|██████████| 1/1 [00:05<00:00,  5.23s/it]
Epoch: 100%|██████████| 1/1 [00:12<00:00, 12.60s/it]
