<a href="https://colab.research.google.com/github/PhilBurub/ML_course_MSc/blob/main/HW5_recsys/competition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Pin NumPy to 1.23.5 before installing the other packages
# (presumably for binary compatibility with them -- TODO confirm the exact reason).
!pip install numpy==1.23.5



In [None]:
%%capture
# Install the recommender (implicit) and gradient-boosting (catboost) libraries;
# the %%capture magic above keeps pip's log out of the notebook output.
!pip install implicit
!pip install catboost

In [None]:
# Mount Google Drive: every dataset, cached model and intermediate file below
# is read from / written to /content/drive/MyDrive/recsys_contest.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np

from sklearn.metrics import ndcg_score
from pathlib import Path

from implicit.cpu.als import AlternatingLeastSquares
from implicit.cpu.lmf import LogisticMatrixFactorization
from implicit.cpu.bpr import BayesianPersonalizedRanking

from scipy.sparse import csr_matrix

from catboost import CatBoostClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from tqdm import tqdm
import torch

# 0. Unzip

```python
import zipfile

path = '/content/drive/MyDrive/recsys_contest/hse-25-rec-sys-course-competition.zip'
# The context manager closes the archive handle once extraction is done, and
# the name `archive` avoids shadowing the built-in `zip`.
with zipfile.ZipFile(path) as archive:
  archive.extractall('/content/drive/MyDrive/recsys_contest')
```

# 1. Разделение данных

In [None]:
# Read only the three interaction columns. The file also carries a leading
# unnamed column (it shows up as "Unnamed: 0" and appears to be a saved row
# index); skipping it keeps the frame smaller and free of noise.
train = pd.read_csv(
    '/content/drive/MyDrive/recsys_contest/hse_train.csv',
    usecols=['user_id', 'item_id', 'timestamp']
)
train.head()

Unnamed: 0,user_id,item_id,timestamp
0,258671,74254,1511701649
1,258671,115615,1511841435
2,258671,176624,1512105022
3,240498,45484,1511605442
4,240498,39504,1511756830


In [None]:
# Number of interaction rows in the training log.
train.shape[0]

4842338

In [None]:
# Sorted distinct raw user ids; an id's position in `uids` is its dense index.
uids = np.sort(train.user_id.unique())

# raw user id -> dense index
user2id = dict(zip(uids, range(len(uids))))

In [None]:
# Sorted distinct raw item ids; an id's position in `iids` is its dense index.
iids = np.sort(train.item_id.unique())

# raw item id -> dense index
item2id = dict(zip(iids, range(len(iids))))

In [None]:
# Replace raw ids with their dense indices so they can be used directly as
# matrix row / column positions by the factorisation models.
train = train.assign(
    user_id=train.user_id.map(user2id),
    item_id=train.item_id.map(item2id),
)

In [None]:
# Time-based split: the earliest 75% of interactions train the first-level
# models, the remaining 25% provide positives for the second-level model.
threshold = train.timestamp.quantile(0.75)
# Explicit copies: both parts get new columns later on, and writing into an
# un-copied slice of `train` triggers pandas' SettingWithCopyWarning.
train_1 = train[train.timestamp <= threshold].copy()
val_1 = train[train.timestamp > threshold].copy()

# 2. Модели первого уровня

## 2.1. Эвристические модели

### 2.1.1. TopPop

In [None]:
class TopPopVanila:
  """Non-personalised baseline: items are ranked by their interaction count.

  After `fit`, `self.preds` is a one-column frame indexed by `item_id` and
  sorted by descending score. Subclasses only override `fit` and reuse
  `score` / `top_n`.
  """

  def fit(self, df):
    """Count interactions per item.

    Args:
      df: interactions frame with `item_id` and `timestamp` columns. It is
        only read (groupby builds a new frame), so no defensive copy is made.
    """
    self.preds = df.groupby('item_id').agg({'timestamp': 'count'})\
      .sort_values(by='timestamp', ascending=False)

  def score(self, item_ids, user_ids=(None,)):
    """Return the popularity score of every requested item.

    Args:
      item_ids: sequence of item ids (duplicates allowed).
      user_ids: ignored; accepted so all models share one call signature.

    Returns:
      A list of scores aligned with `item_ids`. Items that were absent from
      the fit data score 0 instead of raising `KeyError`.
    """
    return self.preds.iloc[:, 0].reindex(item_ids, fill_value=0).tolist()

  def top_n(self, n, user_ids=(None,)):
    """Return the same `n` most popular items for every user.

    Returns:
      One independent list per entry of `user_ids`, so mutating one user's
      recommendations cannot affect another's.
    """
    top_items = self.preds.index[:n].tolist()
    return [list(top_items) for _ in range(len(user_ids))]

### 2.1.2. TopPop window

In [None]:
class TopPopWindow(TopPopVanila):
  """Popularity baseline that only counts interactions from a recent window."""

  def fit(self, df, window=50000):
    """Count, per item, the interactions newer than `max(timestamp) - window`.

    Args:
      df: interactions frame with `item_id` and `timestamp` columns.
      window: look-back length, in the same units as `timestamp`.
    """
    cutoff = df.timestamp.max() - window
    # `assign` works on a new frame, so the caller's data is left untouched.
    flagged = df.assign(value=df.timestamp > cutoff)
    recent_counts = flagged.groupby('item_id').agg({'value': 'sum'})
    self.preds = recent_counts.sort_values(by='value', ascending=False)

### 2.1.3. TopPop with smoothing

In [None]:
class TopPopularSmooth(TopPopVanila):
  """Popularity baseline where recent interactions weigh more than old ones."""

  def fit(self, df):
    """Sum a linear recency weight per item.

    Each interaction contributes `(timestamp - first + 1) / (last - first + 1)`,
    i.e. a value in (0, 1] that grows with recency.

    Args:
      df: interactions frame with `item_id` and `timestamp` columns
        (left unmodified).
    """
    first = df.timestamp.min()
    last = df.timestamp.max()
    delta = last - first + 1
    # Vectorised arithmetic: same values as a per-row Python lambda, without
    # one interpreter call per interaction.
    weights = (df.timestamp - first + 1) / delta

    self.preds = df.assign(value=weights).groupby('item_id').agg({'value': 'sum'})\
      .sort_values(by='value', ascending=False)

## 2.2. Матричное разложение

### 2.2.1. iALS

In [None]:
class iALS:
  """Implicit-feedback ALS matrix factorisation on recency-weighted interactions.

  Attributes set by `fit`:
    user_item_matrix: CSR matrix (users x items) of interaction weights.
    mf: the trained or cache-loaded `AlternatingLeastSquares` model.
    userf, itemf: latent factors as torch tensors (on CUDA when available).
  """

  def fit(self, df, cache_path=None):
    """Build the interaction matrix and train (or load) the factorisation.

    Args:
      df: interactions with integer `user_id` / `item_id` columns (used
        directly as row / column indices) and a numeric `timestamp`.
        The frame is only read, never modified.
      cache_path: optional location of the saved model. By default the file
        name embeds the row count and the last timestamp of `df`, so factors
        trained on one split are not silently reused for a different one.
    """
    shape = (
      df.user_id.max() + 1,
      df.item_id.max() + 1
    )

    # Linear recency weight in (0, 1]; computed on the side instead of being
    # written into `df` (which may be a slice owned by the caller).
    first = df.timestamp.min()
    last = df.timestamp.max()
    delta = last - first + 1
    weights = ((df.timestamp - first + 1) / delta).to_numpy()

    self.user_item_matrix = csr_matrix(
        (weights, (df.user_id.to_numpy(), df.item_id.to_numpy())),
        shape=shape
    )

    if cache_path is None:
      cache_path = (
        '/content/drive/MyDrive/recsys_contest/'
        f'ials_{len(df)}_{int(last)}.npz'
      )
    path = Path(cache_path)
    if path.exists():
      self.mf = AlternatingLeastSquares.load(path)
    else:
      self.mf = AlternatingLeastSquares(factors=15, calculate_training_loss=True)
      self.mf.fit(self.user_item_matrix)
      self.mf.save(path)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    self.userf = torch.tensor(self.mf.user_factors).to(device)
    self.itemf = torch.tensor(self.mf.item_factors).to(device)

  def score(self, item_ids, user_ids, batch_size=25000):
    """Dot-product score for each aligned (user, item) pair.

    Args:
      item_ids: item indices, one per pair.
      user_ids: user indices, aligned with `item_ids`.
      batch_size: number of pairs scored per step.

    Returns:
      1-D CPU tensor with `len(item_ids)` scores.
    """
    # `as_tensor` also accepts factors that a subclass left as NumPy arrays.
    userf = torch.as_tensor(self.userf)
    device = userf.device
    itemf = torch.as_tensor(self.itemf).to(device)

    chunks = []
    for start in tqdm(range(0, len(item_ids), batch_size)):
      stop = start + batch_size
      u_idx = torch.as_tensor(
          np.asarray(user_ids[start:stop]), dtype=torch.long, device=device
      )
      i_idx = torch.as_tensor(
          np.asarray(item_ids[start:stop]), dtype=torch.long, device=device
      )
      # Row-wise dot product: O(batch) memory, unlike building the full
      # batch x batch product just to read its diagonal.
      chunks.append((userf[u_idx] * itemf[i_idx]).sum(dim=1))

    if not chunks:
      return torch.empty(0, dtype=userf.dtype)
    return torch.cat(chunks).cpu()

  def top_n(self, n, user_ids):
    """Return the model's top-`n` item indices for each user in `user_ids`.

    The users' rows of `user_item_matrix` are passed to `recommend` together
    with the ids; the first element of its result is converted to nested lists.
    """
    return self.mf.recommend(user_ids, self.user_item_matrix[user_ids], N=n)[0].tolist()

### 2.2.2. LMF

In [None]:
class LMF(iALS):
  """Logistic matrix factorisation; `score` and `top_n` come from `iALS`."""

  def fit(self, df, cache_path=None):
    """Build the interaction matrix and train (or load) the LMF model.

    Args:
      df: interactions with integer `user_id` / `item_id` columns (used
        directly as row / column indices) and a numeric `timestamp`.
        The frame is only read, never modified.
      cache_path: optional location of the saved model. By default the file
        name embeds the row count and the last timestamp of `df`, so factors
        trained on one split are not silently reused for a different one.
    """
    shape = (
      df.user_id.max() + 1,
      df.item_id.max() + 1
    )

    # Linear recency weight in (0, 1], kept out of the caller's frame.
    first = df.timestamp.min()
    last = df.timestamp.max()
    delta = last - first + 1
    weights = ((df.timestamp - first + 1) / delta).to_numpy()

    self.user_item_matrix = csr_matrix(
        (weights, (df.user_id.to_numpy(), df.item_id.to_numpy())),
        shape=shape
    )

    if cache_path is None:
      cache_path = (
        '/content/drive/MyDrive/recsys_contest/'
        f'lmf_{len(df)}_{int(last)}.npz'
      )
    path = Path(cache_path)
    if path.exists():
      self.mf = LogisticMatrixFactorization.load(path)
    else:
      self.mf = LogisticMatrixFactorization()
      self.mf.fit(self.user_item_matrix)
      self.mf.save(path)

    # Always torch tensors: the inherited `score` uses tensor-only operations,
    # so leaving NumPy arrays on CPU-only machines would break it.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    self.userf = torch.tensor(self.mf.user_factors).to(device)
    self.itemf = torch.tensor(self.mf.item_factors).to(device)

### 2.2.3. BPR

In [None]:
class BPR(iALS):
  """Bayesian Personalized Ranking; `score` and `top_n` come from `iALS`."""

  def fit(self, df, cache_path=None):
    """Build the interaction matrix and train (or load) the BPR model.

    Args:
      df: interactions with integer `user_id` / `item_id` columns (used
        directly as row / column indices) and a numeric `timestamp`.
        The frame is only read, never modified.
      cache_path: optional location of the saved model. By default the file
        name embeds the row count and the last timestamp of `df`, so factors
        trained on one split are not silently reused for a different one.
    """
    shape = (
      df.user_id.max() + 1,
      df.item_id.max() + 1
    )

    # Linear recency weight in (0, 1], kept out of the caller's frame.
    first = df.timestamp.min()
    last = df.timestamp.max()
    delta = last - first + 1
    weights = ((df.timestamp - first + 1) / delta).to_numpy()

    self.user_item_matrix = csr_matrix(
        (weights, (df.user_id.to_numpy(), df.item_id.to_numpy())),
        shape=shape
    )

    if cache_path is None:
      cache_path = (
        '/content/drive/MyDrive/recsys_contest/'
        f'bpr_{len(df)}_{int(last)}.npz'
      )
    path = Path(cache_path)
    if path.exists():
      self.mf = BayesianPersonalizedRanking.load(path)
    else:
      self.mf = BayesianPersonalizedRanking(factors=15)
      self.mf.fit(self.user_item_matrix)
      self.mf.save(path)

    # Always torch tensors: the inherited `score` uses tensor-only operations,
    # so leaving NumPy arrays on CPU-only machines would break it.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    self.userf = torch.tensor(self.mf.user_factors).to(device)
    self.itemf = torch.tensor(self.mf.item_factors).to(device)

## 2.3. Нейросетевые модели

### 2.3.1. BERT4Rec

In [None]:
# from rectools.models.nn.transformers.bert4rec import BERT4RecModel
# from rectools.dataset import Dataset

In [None]:
# class Bert:
#   def fit(self, df):
#     interactions_df = df[['user_id', 'item_id']]
#     interactions_df['datetime'] = df['timestamp']
#     interactions_df['weight'] = 1
#     self.dataset = Dataset.construct(interactions_df)

#     path = Path('/content/drive/MyDrive/recsys_contest/bert')
#     if path.exists():
#       self.bert = BERT4RecModel.load(path)
#     else:
#       self.bert = BERT4RecModel(n_heads=2, n_factors=32, epochs=1, batch_size=64)
#       self.bert.fit(self.dataset)
#       self.bert.save(path)

#   def score(self, item_id, user_id):
#     df = self.bert.recommend(
#           [user_id],
#           self.dataset,
#           k=len(self.dataset.item_id_map.external_ids),
#           filter_viewed=False
#     )
#     item_row = df[df.item_id == item_id]
#     if len(item_row) != 1:
#       return 0
#     return item_row.score.item()

#   def top_n(self, n, user_ids):
#     bert_preds = self.bert.recommend(user_ids, self.dataset, top_n=n, filter_viewed=True)
#     return bert_preds.groupby('user_id').agg({'item_id': list}).loc[user_ids].tolist()

### 2.3.2. SASRec

In [None]:
# from rectools.models.nn.transformers.sasrec import SASRecModel

In [None]:
# class SAS:
#   def fit(self, df):
#     interactions_df = df[['user_id', 'item_id']]
#     interactions_df['datetime'] = df['timestamp']
#     interactions_df['weight'] = 1
#     self.dataset = Dataset.construct(interactions_df)

#     path = Path('/content/drive/MyDrive/recsys_contest/sasrec')
#     if path.exists():
#       self.sasrec = SASRecModel.load(path)
#     else:
#       self.sasrec = SASRecModel(n_heads=2, n_factors=32, epochs=1, batch_size=64)
#       self.sasrec.fit(self.dataset)
#       self.sasrec.save(path)

#   def score(self, item_id, user_id):
#     df = self.sasrec.recommend(
#           [user_id],
#           self.dataset,
#           k=len(self.dataset.item_id_map.external_ids),
#           filter_viewed=False
#     )
#     item_row = df[df.item_id == item_id]
#     if len(item_row) != 1:
#       return 0
#     return item_row.score.item()

#   def top_n(self, n, user_ids):
#     sas_preds = self.sasrec.recommend(user_ids, self.dataset, top_n=n, filter_viewed=True)
#     return sas_preds.groupby('user_id').agg({'item_id': list}).loc[user_ids].tolist()

# 3. Обучение модели второго уровня

## 3.1. Сбор датасета

### 3.1.1. Генерация кандидатов

In [None]:
# First-level models; each key is the prefix of that model's candidate and
# score columns further down. (BERT4Rec / SASRec stay disabled together with
# their commented-out classes.)
models = dict(
    toppop=TopPopVanila(),
    toppop_window=TopPopWindow(),
    toppop_smooth=TopPopularSmooth(),
    ials=iALS(),
    lmf=LMF(),
    bpr=BPR(),
)

In [None]:
# Fit every first-level model on the early (train_1) part of the log.
for name in models:
  model = models[name]
  print(name)
  model.fit(train_1)

toppop
toppop_window
toppop_smooth
ials


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['value'] = df.timestamp.apply(lambda x: (x - first + 1) / delta)


lmf


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['value'] = df.timestamp.apply(lambda x: (x - first + 1) / delta)


bpr


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['value'] = df.timestamp.apply(lambda x: (x - first + 1) / delta)


In [None]:
# One row per known user; candidate columns are appended in the next cell.
pred_table = pd.DataFrame(train.user_id.unique(), columns=['user_id'])

In [None]:
# Top-10 candidates per user from every first-level model.
for name in models:
  model = models[name]
  print(name)
  pred_table[f'{name}_items'] = model.top_n(10, pred_table.user_id.to_list())

toppop
toppop_window
toppop_smooth
ials
lmf
bpr


In [None]:
# Concatenate the per-model candidate lists of each user (list + list),
# keeping the same model order as the `models` dict.
combined = pred_table['toppop_items']
for source in ('toppop_window', 'toppop_smooth', 'ials', 'lmf', 'bpr'):
  combined = combined + pred_table[source + '_items']
pred_table['combined'] = combined

In [None]:
# Drop items proposed by more than one model.
pred_table['combined'] = pred_table['combined'].apply(lambda items: set(items))

In [None]:
# Flatten (user_id, {items}) into one row per (user_id, item) pair. `explode`
# does this in a single vectorised pass instead of constructing one tiny
# DataFrame per user.
candidate_pairs = (
    pred_table[['user_id', 'combined']]
    .assign(combined=pred_table['combined'].apply(list))
    .explode('combined', ignore_index=True)
)

# The following cell consumes `train_df` through `pd.concat(train_df.tolist())`,
# so the result is kept as a Series of frames -- here holding a single frame.
# The object array is filled by element assignment so NumPy never tries to
# unpack the frame itself.
frames = np.empty(1, dtype=object)
frames[0] = candidate_pairs
train_df = pd.Series(frames)

In [None]:
# Stack the per-user candidate frames and persist them, so the dataset
# assembly below can restart from this file.
candidate_frames = train_df.tolist()
pd.concat(candidate_frames).to_csv(
    '/content/drive/MyDrive/recsys_contest/light_models_pred.csv',
    index=False
)

### 3.1.2. Доливка валидационных данных

In [None]:
# Generated candidates are the negative pool: every pair starts with target 0.
train_df = (
    pd.read_csv('/content/drive/MyDrive/recsys_contest/light_models_pred.csv')
    .rename(columns={'combined': 'item_id'})
    .assign(target=0)
)

In [None]:
# Multiplier used to pack (user_id, item_id) into one number: a power of ten
# with one more digit than the item count, so item ids never overflow into the
# user part. Pure integer arithmetic gives the same exponent as
# floor(log10(n)) + 2 but keeps the keys exact integers instead of floats.
iid_digits = len(str(len(iids))) + 1
coef = 10 ** iid_digits

In [None]:
# Composite (user, item) key for both frames, used to detect overlapping pairs.
train_df['key'] = (train_df.user_id * coef) + train_df.item_id
# `assign` returns a new frame, so nothing is written into a slice of `train`
# (the in-place version raises SettingWithCopyWarning).
val_1 = val_1.assign(key=(val_1.user_id * coef) + val_1.item_id)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_1['key'] = (val_1.user_id * coef) + val_1.item_id


In [None]:
# Negatives must not contain pairs that actually occur in the validation period.
train_df = train_df[~train_df['key'].isin(val_1['key'])]
# Positives are limited to users and items the first-level models were fit on,
# since those models are asked to score them later. `Series.isin` replaces
# the deprecated `np.in1d`; `.copy()` makes the result safe to extend.
val_1 = val_1[
    val_1.user_id.isin(train_1.user_id) & val_1.item_id.isin(train_1.item_id)
].copy()

In [None]:
# Real validation interactions are the positive class.
val_1 = val_1.assign(target=1)

# Balanced dataset: as many sampled negatives as positives, capped at the size
# of the negative pool so `sample` cannot raise. The fixed seed makes the
# dataset reproducible between runs.
n_negatives = min(len(val_1), len(train_df))
train_df = pd.concat(
    (
        val_1,
        train_df.sample(n_negatives, random_state=42)
    )
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_1['target'] = 1


In [None]:
# Persist only the pair identifiers and the label; features are computed later.
dataset_columns = ['user_id', 'item_id', 'target']
train_df.loc[:, dataset_columns].to_csv(
    '/content/drive/MyDrive/recsys_contest/catboost_dataset.csv',
    index=False
)

### 3.1.3. Получение оценок моделей

In [None]:
# Reload the (user_id, item_id, target) dataset written by the previous section.
train_df = pd.read_csv('/content/drive/MyDrive/recsys_contest/catboost_dataset.csv')

In [None]:
# One feature column per first-level model: its score for each (user, item) pair.
for name in models:
  model = models[name]
  print(name)
  pair_scores = model.score(train_df.item_id.values, train_df.user_id.values)
  train_df[name + '_score'] = pair_scores

toppop
toppop_window
toppop_smooth
ials
lmf
bpr


## 3.2. Обучение модели

In [None]:
cboost = CatBoostClassifier()

# Features are the six first-level scores; 10% of the pairs are held out.
score_columns = [
    prefix + '_score'
    for prefix in ('toppop', 'toppop_window', 'toppop_smooth', 'ials', 'lmf', 'bpr')
]
X_tr, X_te, y_tr, y_te = train_test_split(
    train_df[score_columns],
    train_df.target,
    test_size=0.1,
    random_state=42
)

In [None]:
# Standardise features with statistics estimated on the training part only.
scale = StandardScaler().fit(X_tr)
X_tr = scale.transform(X_tr)
X_te = scale.transform(X_te)

In [None]:
# Train the second-level classifier on the standardised first-level scores.
cboost.fit(X_tr, y_tr)

In [None]:
# ROC AUC measures ranking quality, so it needs continuous scores: use the
# positive-class probability rather than the hard 0/1 labels from `predict`.
roc_auc_score(y_te, cboost.predict_proba(X_te)[:, 1])

0.9663376928452899

In [None]:
# Persist the trained classifier; section 4.3 loads it from this file.
cboost.save_model('/content/drive/MyDrive/recsys_contest/catboost.cbm')

# 4. Подготовка рекомендаций

## 4.1. Генерация кандидатов

In [None]:
# Fresh first-level model instances, to be refit on the full training log.
models = dict(
    toppop=TopPopVanila(),
    toppop_window=TopPopWindow(),
    toppop_smooth=TopPopularSmooth(),
    ials=iALS(),
    lmf=LMF(),
    bpr=BPR(),
)

In [None]:
!export OPENBLAS_NUM_THREADS=1

In [None]:
# Refit every first-level model on the complete training log.
# NOTE(review): the matrix-factorisation classes load previously saved factors
# from Drive when the cache file exists -- make sure any cached file was
# produced from the full `train`, not from `train_1`.
for name in models:
  model = models[name]
  print(name)
  model.fit(train)

toppop
toppop_window
toppop_smooth


In [None]:
all_user_ids = train.user_id.unique()
pred_table = pd.DataFrame(
    {
        'user_id': all_user_ids,
        # One distinct empty list per user: `[[]] * n` would store the *same*
        # list object in every row, so an in-place change would hit all users.
        'item_ids': [[] for _ in range(len(all_user_ids))]
    }
)

In [None]:
# Append each model's top-20 items to the user's running candidate list
# (element-wise list concatenation between the two Series).
for name in models:
  model = models[name]
  print(name)
  model_top = pd.Series(model.top_n(20, pred_table.user_id.to_list()))
  pred_table['item_ids'] = pred_table['item_ids'] + model_top

toppop
toppop_window
toppop_smooth
ials
lmf
bpr


In [None]:
# Remove duplicate candidates while keeping each cell a plain list.
pred_table.item_ids = pred_table.item_ids.apply(lambda items: list(set(items)))

In [None]:
# One row per (user, candidate item). `explode` replaces the row-by-row
# `iterrows` loop (about a minute for ~700k users) with a single vectorised pass.
# Users with an empty candidate list would yield a NaN row; drop those so the
# result matches what the loop produced.
candidate_pairs = (
    pred_table[['user_id', 'item_ids']]
    .explode('item_ids')
    .dropna(subset=['item_ids'])
)

# Plain Python lists so the next cell can serialise the dict with `json.dump`.
test_df = {
    'user_id': candidate_pairs['user_id'].tolist(),
    'item_id': candidate_pairs['item_ids'].tolist()
}

100%|██████████| 701981/701981 [00:53<00:00, 13229.93it/s]


In [None]:
import json

# Persist the flattened (user_id, item_id) candidate lists so scoring and
# re-ranking can restart from this file.
with open(
    '/content/drive/MyDrive/recsys_contest/light_models_candidates.json',
    'w',
    encoding='utf-8'
) as f:
    json.dump(test_df, f)

## 4.2. Получение оценок моделей

In [None]:
# Reload the candidates as a frame with `user_id` and `item_id` columns
# (note: this rebinds `pred_table`, replacing the per-user table built above).
pred_table = pd.read_json('/content/drive/MyDrive/recsys_contest/light_models_candidates.json')

In [None]:
# Score every candidate pair with each first-level model.
for name in models:
  model = models[name]
  print(name)
  pair_scores = model.score(pred_table.item_id.values, pred_table.user_id.values)
  pred_table[name + '_score'] = pair_scores

toppop
toppop_window
toppop_smooth


In [None]:
# Save each popularity-model score column as its own .npy file.
for prefix in ('toppop', 'toppop_window', 'toppop_smooth'):
  np.save(
      f'/content/drive/MyDrive/recsys_contest/{prefix}_score.npy',
      pred_table[f'{prefix}_score'].values
  )

In [None]:
# Save each matrix-factorisation score column as its own .npy file.
for prefix in ('ials', 'lmf', 'bpr'):
  np.save(
      f'/content/drive/MyDrive/recsys_contest/{prefix}_score.npy',
      pred_table[f'{prefix}_score'].values
  )

## 4.3. Получение предсказаний модели второго уровня

In [None]:
# Restore the second-level classifier saved in section 3.2.
# Note: this rebinds `model`, which the loops above used as their loop variable.
model = CatBoostClassifier()
model.load_model('/content/drive/MyDrive/recsys_contest/catboost.cbm')

<catboost.core.CatBoostClassifier at 0x79c66f20e590>

In [None]:
# Feature matrix: one column per first-level model, in the same order as the
# columns the classifier was trained on.
X = np.column_stack(
    [
        np.load(f'/content/drive/MyDrive/recsys_contest/{prefix}_score.npy')
        for prefix in ('toppop', 'toppop_window', 'toppop_smooth', 'ials', 'lmf', 'bpr')
    ]
)

In [None]:
# Standardise the inference features.
# NOTE(review): a *new* scaler is fitted on the inference data here instead of
# reusing the one fitted on X_tr in section 3.2, so the classifier sees features
# normalised with different statistics than during training. This may be a
# deliberate way to absorb the shift between train_1-based and full-train-based
# scores -- confirm the intent.
scale = StandardScaler()
X = scale.fit_transform(X)

In [None]:
# Cache the standardised feature matrix on Drive.
np.save('/content/drive/MyDrive/recsys_contest/scaled_scores.npy', X)

In [None]:
# Class probabilities for every candidate pair (the positive-class column is
# selected later via `[:, 1]`).
preds = model.predict_proba(X)

In [None]:
# Cache the probabilities so candidate selection can restart from this file.
np.save('/content/drive/MyDrive/recsys_contest/preds.npy', preds)

## 4.4. Выбор кандидатов

In [None]:
# Reload the candidate pairs; the probabilities attached in the next cell are
# assumed to be in the same row order -- TODO confirm nothing reorders the file.
pred_table = pd.read_json('/content/drive/MyDrive/recsys_contest/light_models_candidates.json')

In [None]:
# Attach the positive-class probability (column 1) to every candidate pair.
proba = np.load('/content/drive/MyDrive/recsys_contest/preds.npy')
pred_table['proba'] = proba[:, 1]

In [None]:
# Keep the `top_k` most probable items per user, best first.
# The previous slice `[-1:-10:-1]` stopped one element early and returned only
# 9 items. Selecting the two needed columns explicitly also avoids pandas'
# deprecation warning about `apply` operating on the grouping column.
top_k = 10
submit = pred_table.groupby('user_id')[['item_id', 'proba']].apply(
    lambda group: group.item_id.values[group.proba.values.argsort()[::-1][:top_k]]
)

  submit = pred_table.groupby('user_id').apply(


In [None]:
from tqdm import tqdm

# Flatten the per-user arrays of selected items into two aligned lists.
user_column, item_column = [], []

for user_id, picked in tqdm(submit.items(), total=len(submit)):
    user_column += [user_id] * len(picked)
    item_column += list(picked)

submit_df = {
    'user_id': user_column,
    'item_id': item_column
}

100%|██████████| 701981/701981 [00:01<00:00, 397942.62it/s]


In [None]:
# Long-format submission table: one (user_id, item_id) row per recommendation.
# Note: this rebinds `submit`, replacing the grouped Series built above.
submit = pd.DataFrame(submit_df)

In [None]:
# Preview: dense user indices translated back to the original user ids.
uids[submit.user_id.values]

array([     0,      0,      0, ..., 701980, 701980, 701980])

In [None]:
# Translate dense indices back to the original ids (position in `uids` / `iids`).
submit = submit.assign(
    user_id=uids[submit.user_id.values],
    item_id=iids[submit.item_id.values],
)

In [None]:
# Write the submission file; the item column is renamed to `items` on the way out.
submit.rename(columns={'item_id': 'items'}).to_csv('/content/drive/MyDrive/recsys_contest/submission.csv', index=False)