In [None]:
import os

In [None]:
!apt-get -qq update && apt-get -qq install cuda-11-8 > /dev/null

os.environ['PATH'] = '/usr/local/cuda-11.8/bin:' + os.environ['PATH']
os.environ['LD_LIBRARY_PATH'] = '/usr/local/cuda-11.8/lib64:' + os.environ.get('LD_LIBRARY_PATH', '')

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Extracting templates from packages: 100%


In [None]:
!pip install implicit

Collecting implicit
  Downloading implicit-0.7.2-cp311-cp311-manylinux2014_x86_64.whl.metadata (6.1 kB)
Downloading implicit-0.7.2-cp311-cp311-manylinux2014_x86_64.whl (8.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m71.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: implicit
Successfully installed implicit-0.7.2


In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from scipy.sparse import coo_matrix, csr_matrix
from implicit.als import AlternatingLeastSquares

In [None]:
train = pd.read_csv('hse_train.csv')
train.head()

Unnamed: 0,user_id,item_id,timestamp
0,258671,74254,1511701649
1,258671,115615,1511841435
2,258671,176624,1512105022
3,240498,45484,1511605442
4,240498,39504,1511756830


In [None]:
train.shape

(4842338, 3)

In [None]:
train.timestamp = pd.to_datetime(train["timestamp"], unit='s')

In [None]:
train.timestamp.min(), train.timestamp.max()

(Timestamp('2017-11-24 16:00:00'), Timestamp('2017-12-02 23:59:59'))

То есть собраны данные за неделю

In [None]:
max_timestamp = train['timestamp'].max()
train['days_ago'] = (max_timestamp - train['timestamp']).dt.days
train['weight'] = 1 / (1 + train['days_ago'])
train['weight'] = 1 + np.log1p(train['weight'])
train.head()

Unnamed: 0,user_id,item_id,timestamp,days_ago,weight
0,258671,74254,2017-11-26 13:07:29,6,1.133531
1,258671,115615,2017-11-28 03:57:15,4,1.182322
2,258671,176624,2017-12-01 05:10:22,1,1.405465
3,240498,45484,2017-11-25 10:24:02,7,1.117783
4,240498,39504,2017-11-27 04:27:10,5,1.154151


In [None]:
ALL_USERS = train['user_id'].unique().tolist()
ALL_ITEMS = train['item_id'].unique().tolist()

user_ids = dict(list(enumerate(ALL_USERS)))
item_ids = dict(list(enumerate(ALL_ITEMS)))

user_map = {u: uidx for uidx, u in user_ids.items()}
item_map = {i: iidx for iidx, i in item_ids.items()}

reverse_user_map = {v: k for k, v in user_map.items()}
reverse_item_map = {v: k for k, v in item_map.items()}

train['user_ids'] = train['user_id'].map(user_map)
train['item_ids'] = train['item_id'].map(item_map)

In [None]:
row = train['user_ids'].values
col = train['item_ids'].values
data = train['weight'].values

coo_train = coo_matrix((data, (row, col)), shape=(len(ALL_USERS), len(ALL_ITEMS)))
train_csr = coo_train.tocsr()

In [18]:
factors = 400
iterations = 60
alpha = 200
regularization = 0.1

weighted_interactions = (train_csr * alpha).astype(np.float32)

model = AlternatingLeastSquares(factors=factors,
                                iterations=iterations,
                                regularization=regularization,
                                random_state=42,
                                use_gpu=True)

model.fit(weighted_interactions)

  0%|          | 0/60 [00:00<?, ?it/s]

In [19]:
recommendations = []

for user_ids in tqdm(range(train_csr.shape[0])):
    # Получаем рекомендации для каждого юзера
    recommended = model.recommend(
        userid=user_ids,
        user_items=train_csr[user_ids],
        N=20,
        filter_already_liked_items=False
    )[0]
    original_user_id = reverse_user_map[user_ids]
    original_item_ids = [reverse_item_map[item_idx] for item_idx in recommended[:20]]
    for item_id in original_item_ids:
        recommendations.append({'user_id': original_user_id, 'items': item_id})
df_submission = pd.DataFrame(recommendations)

100%|██████████| 701981/701981 [16:44<00:00, 698.60it/s]


In [20]:
df_submission.to_csv('als.csv', index=False)