In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Настройки графиков
sns.set_theme(style="whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# Пути к файлам (замени на свои при необходимости)
data_path = Path("C:/Users/lelik/mine/e-commerce-recommender/data")
events_path = data_path / "events.csv"

# Загрузка данных
events = pd.read_csv(events_path)
events.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,


In [2]:
# Оставим только нужные столбцы
events = events[['timestamp', 'visitorid', 'itemid', 'event']]

# Преобразуем время
events['timestamp'] = pd.to_datetime(events['timestamp'], unit='ms')

# Оставим только события "transaction"
transactions = events[events['event'] == 'transaction'].copy()

# Переименуем для удобства
transactions = transactions.rename(columns={
    'visitorid': 'user_id',
    'itemid': 'item_id',
    'timestamp': 'datetime'
})

# Сбросим индекс
transactions.reset_index(drop=True, inplace=True)
transactions.head()


Unnamed: 0,datetime,user_id,item_id,event
0,2015-06-02 05:17:56.276,599528,356475,transaction
1,2015-06-01 21:18:20.981,121688,15335,transaction
2,2015-06-01 21:25:15.008,552148,81345,transaction
3,2015-06-01 16:38:56.375,102019,150318,transaction
4,2015-06-01 16:01:58.180,189384,310791,transaction


In [None]:
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Таблица для item-based моделей
user_item_df = (
    transactions
    .groupby(['user_id', 'item_id'])
    .size()
    .unstack(fill_value=0)
)

# Преобразуем в разреженную матрицу
user_item_sparse = csr_matrix(user_item_df.values)

# Получим список item_ids
item_ids = user_item_df.columns


item_id,15,19,25,42,147,168,199,212,233,304,...,466319,466321,466342,466443,466464,466526,466603,466614,466710,466861
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
172,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
186,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
264,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
419,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
539,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1406787,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1406981,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1407070,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1407110,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Косинусное сходство между товарами (по колонкам)
item_similarity = cosine_similarity(user_item_sparse.T)

# Оборачиваем в DataFrame
item_sim_df = pd.DataFrame(item_similarity, index=item_ids, columns=item_ids)


item_sim_df[15335].sort_values(ascending=False)[1:6]

item_id
105792    1.0
80582     1.0
25353     1.0
237753    1.0
302422    1.0
Name: 15335, dtype: float64

In [29]:
from collections import defaultdict
from itertools import combinations

# Словарь: item_id -> (другой item -> число co-purchases)
co_matrix = defaultdict(lambda: defaultdict(int))

# Группируем покупки по user_id
user_groups = transactions.groupby('user_id')['item_id'].apply(list)

# Заполняем co-occurrence матрицу
for items in user_groups:
    unique_items = list(set(items))  # Убираем дубли
    for i, j in combinations(unique_items, 2):
        co_matrix[i][j] += 1
        co_matrix[j][i] += 1
co_matrix

defaultdict(<function __main__.<lambda>()>,
            {465522: defaultdict(int,
                         {10034: 1,
                          348160: 1,
                          56323: 1,
                          350724: 1,
                          332295: 1,
                          271887: 1,
                          11279: 1,
                          257040: 1,
                          133647: 1,
                          150547: 1,
                          294419: 1,
                          21013: 1,
                          284180: 1,
                          104468: 1,
                          432152: 1,
                          345104: 1,
                          117788: 1,
                          176669: 1,
                          450082: 1,
                          171555: 1,
                          7716: 1,
                          458277: 1,
                          414755: 1,
                          235559: 1,
                          258600: 1,

In [31]:
def recommend_similar_items_co(item_id, top_n=5):
    if item_id not in co_matrix:
        return []
    
    # Сортируем по co-occurrence score
    similar_items = sorted(co_matrix[item_id].items(), key=lambda x: x[1], reverse=True)
    return [item for item, score in similar_items[:top_n]]

# Пример
recommend_similar_items_co(15335)


[105792, 200793, 12836, 80582, 380775]

In [36]:
def recommend_for_user_co(user_id, top_n=5):
    if user_id not in user_item_df.index:
        return []

    user_items = user_item_df.loc[user_id]
    bought_items = user_items[user_items > 0].index.tolist()
    
    recs = defaultdict(int)
    for item in bought_items:
        for sim_item, score in co_matrix[item].items():
            if sim_item not in bought_items:
                recs[sim_item] += score

    # Топ-N рекомендаций
    sorted_recs = sorted(recs.items(), key=lambda x: x[1], reverse=True)
    return [item for item, _ in sorted_recs[:top_n]]

# Пример
recommend_for_user_co(741178)


[10572, 171878, 218794, 32581, 320130]

In [41]:
# Выбираем дату как границу
cutoff_time = transactions['datetime'].quantile(0.9)

# Train: до cutoff
train_data = transactions[transactions['datetime'] < cutoff_time]

# Test: после cutoff
test_data = transactions[transactions['datetime'] >= cutoff_time]

print(f"Train: {train_data.shape}, Test: {test_data.shape}")


Train: (20211, 4), Test: (2246, 4)


In [42]:
# User-item для train
train_user_item_df = (
    train_data
    .groupby(['user_id', 'item_id'])
    .size()
    .unstack(fill_value=0)
)

# Обновим co_matrix на основе train
from collections import defaultdict
from itertools import combinations

co_matrix_train = defaultdict(lambda: defaultdict(int))
user_groups = train_data.groupby('user_id')['item_id'].apply(list)

for items in user_groups:
    unique_items = list(set(items))
    for i, j in combinations(unique_items, 2):
        co_matrix_train[i][j] += 1
        co_matrix_train[j][i] += 1


In [43]:
def precision_at_k(user_id, co_matrix, train_df, test_df, k=5):
    # Что пользователь купил в train
    if user_id not in train_df.index:
        return None
    train_items = train_df.loc[user_id]
    bought_train = train_items[train_items > 0].index.tolist()
    
    # Что он купил в test
    bought_test = test_df[test_df['user_id'] == user_id]['item_id'].unique().tolist()
    if not bought_test:
        return None

    # Рекомендации
    recs = defaultdict(int)
    for item in bought_train:
        for sim_item, score in co_matrix[item].items():
            if sim_item not in bought_train:
                recs[sim_item] += score
    
    top_k = [item for item, _ in sorted(recs.items(), key=lambda x: x[1], reverse=True)[:k]]
    
    # Precision
    hits = len(set(top_k) & set(bought_test))
    return hits / k


In [69]:
# Тестовые пользователи
test_users = test_data['user_id'].unique()

scores = []
for user in test_users:
    score = precision_at_k(user, co_matrix_train, train_user_item_df, test_data, k=10)
    if score is not None:
        scores.append(score)

print(f"Average Precision@5: {sum(scores)/len(scores):.7f}")


Average Precision@5: 0.0000000


  from .autonotebook import tqdm as notebook_tqdm
  check_blas_config()


Sparse matrix created for Implicit.
Shape: (10552, 11063)


100%|██████████| 20/20 [00:00<00:00, 27.12it/s, loss=0.00151]

ALS Model Trained.

Starting ALS Evaluation...
Evaluating on 72 users present in both train and test.





ValueError: too many values to unpack (expected 2)