In [68]:
!pip install implicit



In [103]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from tqdm import tqdm
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder
from implicit.als import AlternatingLeastSquares

In [70]:
train = pd.read_csv('/kaggle/input/dz5fdf/hse_train.csv')
train.head()

Unnamed: 0,user_id,item_id,timestamp
0,258671,74254,1511701649
1,258671,115615,1511841435
2,258671,176624,1512105022
3,240498,45484,1511605442
4,240498,39504,1511756830


In [71]:
train.shape

(4842338, 3)

In [72]:
train.timestamp = pd.to_datetime(train["timestamp"], unit='s')

In [73]:
train.timestamp.min(), train.timestamp.max()

(Timestamp('2017-11-24 16:00:00'), Timestamp('2017-12-02 23:59:59'))

То есть собраны данные за неделю

# Топ 20 рекомендаций. Бейзлайн 1

In [6]:
popular_items = train['item_id'].value_counts().index.tolist()
top_k = 20
top_items = popular_items[:top_k]

In [19]:
# Получаем уникальных пользователей из теста
users = train['user_id'].unique()

# Составим рекомендации
recommendations = []

for user in users:
    for item in top_items:
        recommendations.append({'user_id': user, 'items': item})

df_submission = pd.DataFrame(recommendations)

In [10]:
df_submission.to_csv('top_20.csv', index=False)

# ALS. Бейзлайн 2

In [29]:
train_users = train['user_id'].unique()
train_items = train['item_id'].unique()
max_timestamp = train['timestamp'].max()
train['days_ago'] = (max_timestamp - train['timestamp']).dt.days
train['weight'] = 1 / (1 + train['days_ago'] / 30)

In [30]:
train.head()

Unnamed: 0,user_id,item_id,timestamp,days_ago,weight
0,258671,74254,2017-11-26 13:07:29,6,0.833333
1,258671,115615,2017-11-28 03:57:15,4,0.882353
2,258671,176624,2017-12-01 05:10:22,1,0.967742
3,240498,45484,2017-11-25 10:24:02,7,0.810811
4,240498,39504,2017-11-27 04:27:10,5,0.857143


In [31]:
#Кодируем user_id и item_id
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()
train['user_id'] = user_encoder.fit_transform(train['user_id'])
train['item_id'] = item_encoder.fit_transform(train['item_id'])

In [32]:
train['user_id'].nunique(), train['user_id'].max(), train['item_id'].nunique(), train['item_id'].max()

(701981, 701980, 180599, 180598)

In [36]:
interactions = csr_matrix(
    (train['weight'].values, 
     (train['user_id'], train['item_id'])),
    shape=(train['user_id'].max()+1, train['item_id'].max()+1)
)

In [154]:
model = AlternatingLeastSquares(factors=100, iterations=15, regularization=0.01)
model.fit(interactions.T)



  0%|          | 0/15 [00:00<?, ?it/s]

In [155]:
item_counts = train['item_id'].value_counts()
popular_items = item_encoder.transform(item_counts.head(100).index.values)

recommendations = []

for user_id in tqdm(range(interactions.shape[0])):
    # Получаем рекомендации
    recommended = model.recommend(user_id, interactions[user_id], N=100)[0]
    top_items = set(popular_items)
    filtered = [item for item in recommended if item in top_items][:20]
    # Преобразование обратно к оригинальным ID
    original_items = item_encoder.inverse_transform(filtered)
    original_user_id = user_encoder.inverse_transform([user_id])[0]
    
    # Сохранение результатов
    for item_id in original_items:
        recommendations.append({'user_id': original_user_id, 'items': item_id})
df_submission = pd.DataFrame(recommendations)

  0%|          | 2799/701981 [01:28<6:07:32, 31.71it/s]


KeyboardInterrupt: 

In [64]:
item_counts = train['item_id'].value_counts()
popular_items = item_encoder.transform(item_counts.head(100).index.values)

recommendations = []

for user_id in tqdm(range(interactions.shape[0])):
    # Получаем рекомендации
    recommended = model.recommend(user_id, interactions[user_id], N=20, filter_already_liked_items=True)[0]
    # Преобразование обратно к оригинальным ID
    original_items = item_encoder.inverse_transform(recommended[:20])
    original_user_id = user_encoder.inverse_transform([user_id])[0]
    
    # Сохранение результатов
    for item_id in original_items:
        recommendations.append({'user_id': original_user_id, 'items': item_id})
df_submission = pd.DataFrame(recommendations)

100%|██████████| 701981/701981 [4:01:08<00:00, 48.52it/s]  


In [65]:
df_submission

Unnamed: 0,user_id,items
0,0,137568
1,0,3665
2,0,28020
3,0,102173
4,0,177158
...,...,...
14039615,701980,58593
14039616,701980,171750
14039617,701980,154143
14039618,701980,110938


In [66]:
df_submission.to_csv('als.csv', index=False)

# ALS plus Бейзлайн 3

In [75]:
train.head()

Unnamed: 0,user_id,item_id,timestamp
0,258671,74254,2017-11-26 13:07:29
1,258671,115615,2017-11-28 03:57:15
2,258671,176624,2017-12-01 05:10:22
3,240498,45484,2017-11-25 10:24:02
4,240498,39504,2017-11-27 04:27:10


In [78]:
train_users = train['user_id'].unique()
train_items = train['item_id'].unique()
train['weekday'] = train.timestamp.dt.weekday
train = train.sort_values('timestamp')

In [79]:
user_ids = train['user_id'].unique()
item_ids = train['item_id'].unique()
user_to_idx = {user: idx for idx, user in enumerate(user_ids)}
idx_to_user = {idx: user for user, idx in user_to_idx.items()}
item_to_idx = {item: idx for idx, item in enumerate(item_ids)}
idx_to_item = {idx: item for item, idx in item_to_idx.items()}

In [92]:
train.shape

(4842338, 4)

In [91]:
train.head()

Unnamed: 0,user_id,item_id,timestamp,weekday
3520194,340541,147655,2017-11-24 16:00:00,4
196612,381423,169172,2017-11-24 16:00:00,4
196613,381423,174050,2017-11-24 16:00:00,4
649097,442924,85153,2017-11-24 16:00:00,4
1612937,601531,33994,2017-11-24 16:00:00,4


In [94]:
joined = train.groupby('user_id').apply(
    lambda x: [(t1, t2, t3) for t1, t2, t3 in sorted(zip(x.item_id, 
                                                 x.timestamp,
                                                 x.weekday), key=lambda x: x[1])]
).reset_index()
joined.rename({0:'train'}, axis=1, inplace=True)

  joined = train.groupby('user_id').apply(


In [95]:
joined.head()

Unnamed: 0,user_id,train
0,0,"[(144433, 2017-11-24 16:44:54, 4), (90536, 201..."
1,1,"[(153245, 2017-11-30 14:28:32, 3)]"
2,2,"[(131868, 2017-11-26 12:40:38, 6), (28437, 201..."
3,3,"[(147375, 2017-11-29 02:51:05, 2), (165134, 20..."
4,4,"[(11870, 2017-11-24 20:47:32, 4), (11870, 2017..."


In [133]:
class ALSRecommender:
    def __init__(self, factors=50, regularization=0.01, iterations=15, default_score=0.0):
        self.model = AlternatingLeastSquares(factors=factors, regularization=regularization, iterations=iterations)
        self.user_mapping = {}
        self.item_mapping = {}
        self.user_inv_mapping = {}
        self.item_inv_mapping = {}
        self.trained = False
        self.default_score = default_score

    def fit(self, df, col='train'):
        interactions = []
        user_set, item_set = set(), set()

        for _, row in df.iterrows():
            user = row['user_id']
            for item, _, _ in row[col]:
                interactions.append((user, item, 1))
                user_set.add(user)
                item_set.add(item)

        self.user_mapping = {user: idx for idx, user in enumerate(sorted(user_set))}
        self.item_mapping = {item: idx for idx, item in enumerate(sorted(item_set))}
        self.user_inv_mapping = {idx: user for user, idx in self.user_mapping.items()}
        self.item_inv_mapping = {idx: item for item, idx in self.item_mapping.items()}

        user_ids = [self.user_mapping[u] for u, i, v in interactions]
        item_ids = [self.item_mapping[i] for u, i, v in interactions]
        values = [v for u, i, v in interactions]

        user_item_matrix = coo_matrix((values, (user_ids, item_ids)))
        self.model.fit(user_item_matrix)
        self.trained = True
        self.user_item_matrix = user_item_matrix.tocsr()

    def predict(self, df, topn=20) -> list:
        assert self.trained
        recs = []

        for _, row in tqdm(df.iterrows()):
            uid = row['user_id']
            if uid not in self.user_mapping:
                recs.append([])
                continue

            user_idx = self.user_mapping[uid]
            user_row = self.user_item_matrix[user_idx]

            item_ids, _ = self.model.recommend(
                userid=user_idx,
                user_items=user_row,
                N=topn
            )

            rec_items = [self.item_inv_mapping[iid] for iid in item_ids]
            recs.append(rec_items)

        return recs

In [134]:
als = ALSRecommender()
als.fit(joined)



  0%|          | 0/15 [00:00<?, ?it/s]

In [135]:
als_rec = als.predict(joined)

701981it [26:53, 434.96it/s]


In [137]:
joined['als_rec'] = als_rec

In [141]:
joined.head()

Unnamed: 0,user_id,train,als_rec
0,0,"[(144433, 2017-11-24 16:44:54, 4), (90536, 201...","[137568, 1316, 28020, 27625, 54441, 35996, 180..."
1,1,"[(153245, 2017-11-30 14:28:32, 3)]","[35996, 51169, 96724, 55251, 134733, 40824, 12..."
2,2,"[(131868, 2017-11-26 12:40:38, 6), (28437, 201...","[165835, 31832, 23893, 147358, 91220, 124359, ..."
3,3,"[(147375, 2017-11-29 02:51:05, 2), (165134, 20...","[28020, 122136, 90324, 153054, 46940, 69764, 1..."
4,4,"[(11870, 2017-11-24 20:47:32, 4), (11870, 2017...","[12738, 71828, 304, 154253, 38613, 79111, 1164..."


In [148]:
recommendations = []
for _, row in joined[['user_id', 'als_rec']].iterrows():
    user_id = row['user_id']
    for item_id in row['als_rec']:
        recommendations.append({'user_id': user_id, 'items': item_id})
        
df_submission = pd.DataFrame(recommendations)

In [149]:
df_submission

Unnamed: 0,user_id,items
0,0,137568
1,0,1316
2,0,28020
3,0,27625
4,0,54441
...,...,...
14039615,701980,127318
14039616,701980,166649
14039617,701980,110741
14039618,701980,54408


In [150]:
df_submission.to_csv('als2.csv', index=False)

In [151]:
als1 = pd.read_csv('/kaggle/working/als.csv')
als1.head()

Unnamed: 0,user_id,items
0,0,137568
1,0,3665
2,0,28020
3,0,102173
4,0,177158


# Бейзлайн 4