In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
from sklearn.model_selection import train_test_split

In [2]:
np.random.seed(42)
K = 10
N_BOOT = 20       
NEG_RATIO = 1.0
ALPHA_DECAY = 0.01
TIME_WINDOW = 30
SAMPLE_USERS = 10000

In [3]:
def ndcg_at_k(r, k):
    r = np.array(r)[:k]
    dcg = np.sum((2**r - 1) / np.log2(np.arange(2, len(r) + 2)))
    ideal_r = np.sort(r)[::-1]
    idcg = np.sum((2**ideal_r - 1) / np.log2(np.arange(2, len(ideal_r) + 2)))
    return dcg / idcg if idcg > 0 else 0.0

def apk(actual, predicted, k):
    if len(predicted) > k:
        predicted = predicted[:k]
    score = 0.0
    num_hits = 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    return score / min(len(actual), k) if actual else 0.0

In [4]:
def add_negative_samples(df, all_items, negative_ratio=1.0, seed=42):
    rng = np.random.default_rng(seed)
    rows = []
    for u, grp in df.groupby("user_id"):
        pos_items = set(grp[grp["rating"] > 0]["item_id"])
        n_neg = int(len(pos_items) * negative_ratio)
        if n_neg > 0:
            neg_items = rng.choice(list(set(all_items) - pos_items), size=n_neg, replace=False)
            for i in neg_items:
                rows.append({"user_id": u, "item_id": i, "rating": 0})
    neg_df = pd.DataFrame(rows)
    return pd.concat([df, neg_df]).reset_index(drop=True)

In [5]:
def make_random_recommender(items, k=10, seed=42):
    rng = np.random.default_rng(seed)
    return lambda users: {u: rng.choice(items, size=k, replace=False).tolist() for u in users}

def make_popular_recommender(train_df, k=10):
    top_items = train_df[train_df["rating"]>0]["item_id"].value_counts().index[:k].tolist()
    return lambda users: {u: top_items for u in users}

def make_popular_time_recommender(train_df, k=10, alpha=0.01):
    now = train_df["timestamp"].max()
    train_df["weight"] = train_df["rating"] * np.exp(-alpha * (now - train_df["timestamp"]).dt.days)
    top_items = train_df.groupby("item_id")["weight"].sum().sort_values(ascending=False).index[:k].tolist()
    return lambda users: {u: top_items for u in users}

def make_als_recommender(train_df, k=10, factors=32, reg=0.1, iters=15, seed=42):
    users = train_df["user_id"].unique()
    items = train_df["item_id"].unique()
    user_to_idx = {u:i for i,u in enumerate(users)}
    idx_to_item = {i:u for u,i in enumerate({item:i for i,item in enumerate(items)})}

    row = train_df["user_id"].map(user_to_idx)
    col = train_df["item_id"].map({i:j for j,i in enumerate(items)})
    data_vals = train_df["rating"]
    user_item_csr = csr_matrix((data_vals, (row, col)), shape=(len(users), len(items)))

    model = AlternatingLeastSquares(factors=factors, regularization=reg, iterations=iters, random_state=seed)
    model.fit(user_item_csr)

    def recommend(users_list):
        recs = {}
        for u in users_list:
            if u not in user_to_idx:
                recs[u] = train_df["item_id"].value_counts().index[:k].tolist()
                continue
            uid = user_to_idx[u]
            recommended, scores = model.recommend(uid, user_item_csr[uid], N=k, filter_already_liked_items=True)
            recs[u] = [items[i] for i in recommended]
        return recs
    return recommend

In [6]:
def evaluate_bootstrap(recommender, test_df, users, k=10, n_boot=20, seed=42):
    rng = np.random.default_rng(seed)
    scores_ndcg, scores_map = [], []
    for _ in range(n_boot):
        sampled_users = rng.choice(users, size=min(len(users), 500), replace=True)
        recs = recommender(sampled_users)
        ndcgs, maps = [], []
        for u in sampled_users:
            actual_items = test_df[(test_df["user_id"]==u) & (test_df["rating"]>0)]["item_id"].tolist()
            predicted = recs[u]
            r = [1 if i in actual_items else 0 for i in predicted]
            ndcgs.append(ndcg_at_k(r, k))
            maps.append(apk(actual_items, predicted, k))
        scores_ndcg.append(np.mean(ndcgs))
        scores_map.append(np.mean(maps))
    return {
        "ndcg_mean": np.mean(scores_ndcg),
        "ndcg_std": np.std(scores_ndcg),
        "map_mean": np.mean(scores_map),
        "map_std": np.std(scores_map),
        "n_boot": n_boot
    }

def print_results(name, res):
    print(f"{name}: nDCG@{K} = {res['ndcg_mean']:.4f} ± {res['ndcg_std']:.4f}, "
          f"MAP@{K} = {res['map_mean']:.4f} ± {res['map_std']:.4f} (n_boot={res['n_boot']})")

In [8]:
data = pd.read_csv("../data/ratings.csv")
data = data.rename(columns={'book_id': 'item_id'})
data["user_id"] = data["user_id"].astype(str)
data["item_id"] = data["item_id"].astype(str)
data["rating"] = data["rating"].astype(int)

if "timestamp" not in data.columns:
    start_date = datetime(2021,1,1)
    data["timestamp"] = [start_date + timedelta(days=random.randint(0, 365*2)) for _ in range(len(data))]
else:
    data["timestamp"] = pd.to_datetime(data["timestamp"], errors="coerce")
    mask = data["timestamp"].isna()
    if mask.any():
        start_date = datetime(2021,1,1)
        data.loc[mask, "timestamp"] = [start_date + timedelta(days=random.randint(0, 365*2)) for _ in range(mask.sum())]


print("Уникальные пользователи:", data["user_id"].nunique())
print("Уникальные айтемы:", data["item_id"].nunique())

Уникальные пользователи: 53424
Уникальные айтемы: 10000


In [9]:
def temporal_split(df, test_size=0.2):
    train, test = [], []
    for user_id in df['user_id'].unique():
        user_data = df[df['user_id'] == user_id].sort_values('timestamp')
        split_idx = int(len(user_data) * (1 - test_size))
        train.append(user_data.iloc[:split_idx])
        test.append(user_data.iloc[split_idx:])
    return pd.concat(train), pd.concat(test)

In [10]:
sample_users = np.random.choice(data["user_id"].unique(), size=SAMPLE_USERS, replace=False)
data = data[data["user_id"].isin(sample_users)].reset_index(drop=True)

# Простые модели
# train_simple, test_simple = train_test_split(data, test_size=0.2, random_state=42)
train_simple, test_simple = temporal_split(data)

# Сложные модели (time-based)
data_sorted = data.sort_values("timestamp").reset_index(drop=True)
split_idx = int(len(data_sorted) * 0.8)
train_time = data_sorted.iloc[:split_idx].reset_index(drop=True)
test_time  = data_sorted.iloc[split_idx:].reset_index(drop=True)

In [11]:
all_items = data["item_id"].unique()
train_simple_ns = add_negative_samples(train_simple, all_items, negative_ratio=NEG_RATIO)
test_simple_ns  = add_negative_samples(test_simple,  all_items, negative_ratio=NEG_RATIO)
train_time_ns   = add_negative_samples(train_time,  all_items, negative_ratio=NEG_RATIO)
test_time_ns    = add_negative_samples(test_time,   all_items, negative_ratio=NEG_RATIO)


rec_random   = make_random_recommender(all_items, k=K)
rec_pop      = make_popular_recommender(train_simple_ns, k=K)
rec_pop_time = make_popular_time_recommender(train_time_ns, k=K, alpha=ALPHA_DECAY)
rec_als      = make_als_recommender(train_time_ns, k=K, factors=32, reg=0.1, iters=15, seed=7)

  check_blas_config()


  0%|          | 0/15 [00:00<?, ?it/s]

In [12]:
eval_users_simple = test_simple_ns["user_id"].unique()
eval_users_time   = test_time_ns["user_id"].unique()

res_random   = evaluate_bootstrap(rec_random,   test_simple_ns, eval_users_simple, k=K, n_boot=N_BOOT)
res_pop      = evaluate_bootstrap(rec_pop,      test_simple_ns, eval_users_simple, k=K, n_boot=N_BOOT)
res_pop_time = evaluate_bootstrap(rec_pop_time, test_time_ns,   eval_users_time,   k=K, n_boot=N_BOOT)
res_als      = evaluate_bootstrap(rec_als,      test_time_ns,   eval_users_time,   k=K, n_boot=N_BOOT)

In [14]:
print_results("Random", res_random)
print_results("Popular", res_pop)
print_results("Popular+Time", res_pop_time)
print_results("ALS(SVD)", res_als)

Random: nDCG@10 = 0.0097 ± 0.0035, MAP@10 = 0.0006 ± 0.0003 (n_boot=20)
Popular: nDCG@10 = 0.2420 ± 0.0161, MAP@10 = 0.0266 ± 0.0024 (n_boot=20)
Popular+Time: nDCG@10 = 0.2237 ± 0.0125, MAP@10 = 0.0266 ± 0.0028 (n_boot=20)
ALS(SVD): nDCG@10 = 0.6306 ± 0.0127, MAP@10 = 0.1766 ± 0.0102 (n_boot=20)
