In [1]:
# !pip install -U ipywidgets

In [1]:
!pip install polars colorama torch ray recbole kmeans_pytorch tqdm

Collecting recbole
  Obtaining dependency information for recbole from https://files.pythonhosted.org/packages/1e/d1/81756635abf971deeaa8180dae167e6ee867f9ffe13dc128a51fb9efe710/recbole-1.2.0-py3-none-any.whl.metadata
  Downloading recbole-1.2.0-py3-none-any.whl.metadata (1.4 kB)
Collecting kmeans_pytorch
  Downloading kmeans_pytorch-0.3-py3-none-any.whl (4.4 kB)
Collecting colorlog==4.7.2 (from recbole)
  Downloading colorlog-4.7.2-py2.py3-none-any.whl (10 kB)
Collecting colorama
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Collecting thop>=0.1.1.post2207130030 (from recbole)
  Obtaining dependency information for thop>=0.1.1.post2207130030 from https://files.pythonhosted.org/packages/bb/0f/72beeab4ff5221dc47127c80f8834b4bcd0cb36f6ba91c0b1d04a1233403/thop-0.1.1.post2209072238-py3-none-any.whl.metadata
  Downloading thop-0.1.1.post2209072238-py3-none-any.whl.metadata (2.7 kB)
Downloading recbole-1.2.0-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import polars as pl
import numpy as np
import torch
import datetime
import os
import gc
from tqdm.auto import tqdm

import logging
from logging import getLogger

from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.quick_start import run_recbole
from recbole.quick_start.quick_start import load_data_and_model
from recbole.model.sequential_recommender import SASRecF
from recbole.trainer import Trainer
from recbole.utils.utils import get_trainer
from recbole.utils import init_seed, init_logger
from recbole.utils.case_study import full_sort_topk

def conv_date(source_date):
    return datetime.datetime.timestamp(source_date)


RANDOM_STATE = 42
N_PREDICTIONS = 100
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
DATASET_NAME = "hh_recsys"

2024-02-22 12:12:36,474	INFO util.py:129 -- Outdated packages:
  ipywidgets==7.7.1 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2024-02-22 12:12:36,980	INFO util.py:129 -- Outdated packages:
  ipywidgets==7.7.1 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [3]:
train_path = '/kaggle/input/boosters-hh-recsys/hh_recsys_train_hh.pq'
train = pl.read_parquet(train_path, low_memory=True)

vacancies_path = '/kaggle/input/boosters-hh-recsys/hh_recsys_vacancies.pq'
vacancies = pl.read_parquet(vacancies_path, low_memory=True)

vacancies = vacancies.with_columns(pl.col("compensation.from").fill_null(strategy="min"))
vacancies = vacancies.with_columns(pl.when(pl.col("compensation.to").is_null()).
                                   then(pl.col("compensation.from")).
                                   otherwise(pl.col("compensation.to")).
                                   alias("compensation.to"))
vacancies = vacancies.with_columns(pl.col("compensation.currencyCode").fill_null("RUR"))

vacancies = vacancies.with_columns(pl.col("description").
                                   str.replace_all("<.*?>", "").  # убираем html-тэги
                                   str.replace_all("&[A-Za-z0-9#]+;", "").   # убираем entity вроде  &quot;
                                   str.replace_all("[[:punct:]]", "").   # убираем знаки препинания
                                   str.to_lowercase())                       # к нижнему регистру

# ключевые навыки записываем в строку через пробел и в нижнем регистре
vacancies = vacancies.with_columns(pl.col("keySkills.keySkill").list.join(' ').str.to_lowercase())
vacancies = vacancies.with_columns(pl.col("keySkills.keySkill").fill_null("NONE"))

pairs = train.select(['user_id', 'vacancy_id', 'action_type', 'action_dt']).explode(['vacancy_id', 'action_type', 'action_dt'])

In [14]:
# # Преобразуем зарплаты в соответствии с курсом ЦБ (на 09.01.2023)
# currency_rates = {"UZS": 0.007269,
#                   "KGS": 1.01,
#                   "USD": 89.69,
#                   "GEL": 33.3,
#                   "BYR": 28.23,
#                   "AZN": 52.76,
#                   "KZT": 0.197708,
#                   "UAH": 2.36,
#                   "RUR": 1.0,
#                   "EUR": 99.19}

# vacancies = vacancies.rename({"compensation.currencyCode": "currencyRate", 
#                               "compensation.from": "compensation_from", 
#                               "compensation.to": "compensation_to"})

# vacancies = vacancies.with_columns(
#     currencyRate=pl.col("currencyRate").replace(currency_rates, default=0.0))  # для версии 0.20
# # vacancies = vacancies.with_columns(
# #     currencyRate=pl.col("currencyRate").map_dict(currency_rates, default=0.0))  # для версии 0.19

# vacancies = vacancies.with_columns(
#     compensation_from = pl.col('compensation_from') * pl.col('currencyRate'))
# vacancies = vacancies.with_columns(
#     compensation_to = pl.col('compensation_to') * pl.col('currencyRate'))

# # Переведем зарплату в символьный вид для SASRecF 
# vacancies = vacancies.with_columns(
#     salary_from=(pl.col("compensation_from").log10()*10).floor().cast(pl.Int64).cast(pl.Utf8)
# ).with_columns(
#     salary_to=(pl.col("compensation_to").log10()*10).floor().cast(pl.Int64).cast(pl.Utf8))

In [4]:
gc.collect()

10

---
# SASRec
https://www.recbole.io/docs/user_guide/model/sequential/sasrecf.html

In [5]:
if not os.path.exists(DATASET_NAME):
    os.mkdir(DATASET_NAME)

# для версии 0.20
pairs.with_columns(
    action_dt=pl.col("action_dt").map_elements(conv_date)
).rename({"user_id": "user_id:token", 
          "vacancy_id": "vacancy_id:token", 
          "action_type": "action_type:float", 
          "action_dt": "timestamp:float"}
        ).write_csv("./" + DATASET_NAME + "/hh_recsys.inter", include_header=True, separator="\t")

vacancies.with_columns(pl.col("keySkills.keySkill").cast(pl.List(pl.Utf8)).list.join(", ")
).rename({"vacancy_id": "vacancy_id:token", 
        "name": "name:token_seq", 
        "company.id": "company_id:token",
        "description": "description:token_seq",
        "keySkills.keySkill": "keySkills:token_seq", 
        "compensation.from": "compensation_from:float",
        "compensation.to": "compensation_to:float", 
        "compensation.currencyCode": "compensation.currencyCode:token",
        "area.id": "area_id:token", 
        "area.regionId": "area_regionId:token",
        "employment": "employment:token", 
        "workSchedule": "workSchedule:token", 
        "workExperience": "workExperience:token"}
    ).write_csv("./" + DATASET_NAME + "/hh_recsys.item", include_header=True, separator="\t")

In [6]:
gc.collect()

0

In [7]:
config_dict = {
    "data_path": "/kaggle/working",
    "USER_ID_FIELD": "user_id",
    "ITEM_ID_FIELD": "vacancy_id",
    "RATING_FIELD": "action_type",
    "TIME_FIELD": "timestamp",
    "user_inter_num_interval": "[1, inf)",
    "item_inter_num_interval": "[25, inf)",
    "load_col": {
        "inter": ['user_id', 'vacancy_id', 'action_type', 'timestamp'],
        "item": ['vacancy_id', 'name', 'company_id', 'keySkills', 
                 'area_id', 'area_regionId', 'employment', 'workSchedule', 
                 'workExperience']
    }, 
    "selected_features": ['name', 
                          'company_id', 
                          'keySkills', 
                          'area_id', 
                          'area_regionId', 
                          'employment', 
                          'workSchedule', 
                          'workExperience'
                         ],
    'train_batch_size': 1024,
    'eval_batch_size': 1024,
    'hidden_size': 64,  
    'inner_size': 256,  
    "neg_sampling": None,
    'train_neg_sample_args': None,
    "eval_args": {
        "split": {"RS": [0.95, 0.03, 0.02]},
        "group_by": "user",
        "order": "TO",
        "mode": "full"
    },
    "metrics":  ['Recall', 'MRR'], 
    "topk": 100,
    "epochs": 2,
    "stopping_step": 2, 
    "show_progress": True,
    "valid_metric": 'MRR@100',
    "learning_rate": 0.003,
}

In [None]:
run_recbole(model='SASRecF', dataset=DATASET_NAME, config_dict=config_dict)

  split_point = np.cumsum(feat[field].agg(len))[:-1]


In [18]:
# parameter_dict = {
#     'data_path': './',
#     'USER_ID_FIELD': 'user_id',
#     'ITEM_ID_FIELD': 'vacancy_id',
#     'RATING_FIELD': 'action_type',
#     'TIME_FIELD': 'timestamp', 
#     'user_inter_num_interval': "[10,inf)",
#     'item_inter_num_interval': "[15,inf)",
# #     'seq_len': {'vacancy_id': 10},
#     'MAX_ITEM_LIST_LENGTH': 10,    
#     'load_col': {'inter': ['user_id', 'vacancy_id', 'action_type', 'timestamp'],
#                  'item': ['vacancy_id', 'name', 'company_id', 'keySkills', 
#                           'salary_from', 'salary_to', 'area_id', 'area_regionId', 
#                           'employment', 'workSchedule', 'workExperience']},
#     'selected_features': ['name', 'company_id', 'keySkills', 'salary_from', 
#                           'salary_to', 'area_id', 'area_regionId', 'employment', 
#                           'workSchedule', 'workExperience'],
#     'neg_sampling': None,
#     'train_neg_sample_args': None,
#     'train_batch_size': 1024,
#     'eval_batch_size': 1024,
#     'epochs': 25,
#     'metrics': ['Recall', 'MRR'],
#     'loss_type': 'CE',
#     'topk': 100,
#     'valid_metric': 'MRR@100',
#     "stopping_step": 2,
# #     'hidden_size': 64,
# #     'inner_size': 256,
#     'hidden_dropout_prob': 0.3,
#     'attn_dropout_prob': 0.3,
#     'eval_args': {'split': {'RS': [0.95, 0.03, 0.02]},
#                   'group_by': 'user',
#                   'order': 'TO',
#                   'mode': 'full'},
#     'seed': 42,
#     'reproducibility': True,
#     "device": DEVICE,
# }

In [None]:
# parameter_dict = {
#     'data_path': './',
#     'USER_ID_FIELD': 'user_id',
#     'ITEM_ID_FIELD': 'vacancy_id',
#     'RATING_FIELD': 'action_type',
#     'TIME_FIELD': 'timestamp', 
#     'user_inter_num_interval': "[10,inf)",
#     'item_inter_num_interval': "[15,inf)",
#     'seq_len': {'vacancy_id': 10},
#     'MAX_ITEM_LIST_LENGTH': 10,    
#     'load_col': {'inter': ['user_id', 'vacancy_id', 'action_type', 'timestamp'],
#                  'item': ['vacancy_id', 'name', 'company_id', 'keySkills', 
#                           'salary_from', 'salary_to', 'area_id', 'area_regionId', 
#                           'employment', 'workSchedule', 'workExperience']},
#     'selected_features': ['name', 'company_id', 'keySkills', 'salary_from', 
#                           'salary_to', 'area_id', 'area_regionId', 'employment', 
#                           'workSchedule', 'workExperience'],
#     'neg_sampling': None,
#     'train_neg_sample_args': None,
#     'train_batch_size': 1024,
#     'eval_batch_size': 1024,
#     'epochs': 20,
#     'metrics': ['Recall', 'MRR'],
#     'loss_type': 'CE',
#     'topk': 100,
#     'valid_metric': 'MRR@100',
#     "stopping_step": 2,
#     'hidden_size': 64,
#     'inner_size': 256,
#     'hidden_dropout_prob': 0.3,
#     'attn_dropout_prob': 0.3,
#     'eval_args': {'split': {'RS': [0.95, 0.03, 0.02]},
#                   'group_by': 'user',
#                   'order': 'TO',
#                   'mode': 'full'},
#     'seed': 42,
#     'reproducibility': True,
#     "device": DEVICE,
# }

# Trainable parameters: 28041408
# epoch 0 training [time: 2182.38s, train loss: 105236.6074]
# epoch 0 evaluating [time: 61.79s, valid_score: 0.118000]
# valid result: 
# recall@100 : 0.4858    mrr@100 : 0.118
# Saving current: saved/SASRecF-Feb-19-2024_15-47-30.pth
# epoch 1 training [time: 2185.21s, train loss: 91502.9345]
# epoch 1 evaluating [time: 64.06s, valid_score: 0.133400]
# valid result: 
# recall@100 : 0.5184    mrr@100 : 0.1334
# Saving current: saved/SASRecF-Feb-19-2024_15-47-30.pth
# epoch 2 training [time: 2194.06s, train loss: 88985.4760]
# epoch 2 evaluating [time: 64.37s, valid_score: 0.141400]
# valid result: 
# recall@100 : 0.5325    mrr@100 : 0.1414
# Saving current: saved/SASRecF-Feb-19-2024_15-47-30.pth
# epoch 3 training [time: 2197.01s, train loss: 87588.3414]
# epoch 3 evaluating [time: 64.04s, valid_score: 0.143900]
# valid result: 
# recall@100 : 0.5398    mrr@100 : 0.1439
# Saving current: saved/SASRecF-Feb-19-2024_15-47-30.pth
# epoch 4 training [time: 2197.31s, train loss: 86684.4105]
# epoch 4 evaluating [time: 64.08s, valid_score: 0.146500]
# valid result: 
# recall@100 : 0.544    mrr@100 : 0.1465
# Saving current: saved/SASRecF-Feb-19-2024_15-47-30.pth
# epoch 5 training [time: 2193.55s, train loss: 86042.4307]
# epoch 5 evaluating [time: 64.01s, valid_score: 0.148200]
# valid result: 
# recall@100 : 0.5468    mrr@100 : 0.1482
# Saving current: saved/SASRecF-Feb-19-2024_15-47-30.pth
# epoch 6 training [time: 2193.16s, train loss: 85564.8586]
# epoch 6 evaluating [time: 64.82s, valid_score: 0.149500]
# valid result: 
# recall@100 : 0.5483    mrr@100 : 0.1495
# Saving current: saved/SASRecF-Feb-19-2024_15-47-30.pth
# epoch 7 training [time: 2183.19s, train loss: 85195.1310]
# epoch 7 evaluating [time: 63.33s, valid_score: 0.149800]
# valid result: 
# recall@100 : 0.5509    mrr@100 : 0.1498
# Saving current: saved/SASRecF-Feb-19-2024_15-47-30.pth

In [None]:
# config = Config(model='SASRecF', dataset=DATASET_NAME, config_dict=parameter_dict)

# init_seed(config['seed'], config['reproducibility'])

# # инициализируем логгеры для вывода информации
# logger = getLogger()
# logger.setLevel(logging.INFO)

# c_handler = logging.StreamHandler()
# c_handler.setLevel(logging.INFO)

# logger.addHandler(c_handler)
# logger.info(config)

# # Создаём объекты тренировочной выборки и валидацонной
# dataset = create_dataset(config)
# logger.info(dataset)
# train_data, valid_data, test_data = data_preparation(config, dataset)

# # Инициализируем модель и обучаем
# model = SASRecF(config, train_data.dataset).to(config['device'])
# logger.info(model)

# # инициализируем "тренера" модели
# trainer = Trainer(config, model)

# # сохраняем лучшие результаты
# best_valid_score, best_valid_result = trainer.fit(train_data, valid_data)

In [None]:
# trainer.evaluate(test_data)

In [29]:
# del dataset
# del train_data
# del valid_data
# del test_data

gc.collect()

962

In [5]:
!pip install gdown

Collecting gdown
  Obtaining dependency information for gdown from https://files.pythonhosted.org/packages/cb/56/f4845ed78723a4eb8eb22bcfcb46e1157a462c78c0a5ed318c68c98f9a79/gdown-5.1.0-py3-none-any.whl.metadata
  Downloading gdown-5.1.0-py3-none-any.whl.metadata (5.7 kB)
Downloading gdown-5.1.0-py3-none-any.whl (17 kB)
Installing collected packages: gdown
Successfully installed gdown-5.1.0


saved/SASRecF-Feb-20-2024_14-30-24.pth

https://drive.google.com/file/d/1JCvq-LLztH8CCFWsU-kH2LM58Yb-BBNZ/view?usp=sharing

In [6]:
!gdown 1JCvq-LLztH8CCFWsU-kH2LM58Yb-BBNZ
!unzip -q saved.zip

Downloading...
From (original): https://drive.google.com/uc?id=1JCvq-LLztH8CCFWsU-kH2LM58Yb-BBNZ
From (redirected): https://drive.google.com/uc?id=1JCvq-LLztH8CCFWsU-kH2LM58Yb-BBNZ&confirm=t&uuid=4a17c6d8-e4c6-4f85-ac59-3b49193a2c68
To: /kaggle/working/saved.zip
100%|█████████████████████████████████████████| 294M/294M [00:01<00:00, 233MB/s]


hh_recsys

https://drive.google.com/file/d/1hGs8jBChC9BkSOZlBzzU1WlYxbKpAUZ-/view?usp=sharing

In [9]:
!gdown 1hGs8jBChC9BkSOZlBzzU1WlYxbKpAUZ-
!unzip -q hh_recsys.zip

Downloading...
From (original): https://drive.google.com/uc?id=1hGs8jBChC9BkSOZlBzzU1WlYxbKpAUZ-
From (redirected): https://drive.google.com/uc?id=1hGs8jBChC9BkSOZlBzzU1WlYxbKpAUZ-&confirm=t&uuid=6026eb42-fb28-4703-8d9b-5196eb2ce370
To: /kaggle/working/hh_recsys.zip
100%|███████████████████████████████████████| 1.79G/1.79G [00:10<00:00, 167MB/s]


In [14]:
gc.collect()

499

In [13]:
from recbole.utils import init_seed, init_logger, get_model

model_file_path = '/kaggle/working/saved/SASRecF-Feb-20-2024_14-30-24.pth'
checkpoint = torch.load(model_file_path)
# config = checkpoint["config"]
model = get_model(config["model"])(config, train_data.dataset).to(config['device'])
model.load_state_dict(checkpoint["state_dict"])
model.load_other_parameter(checkpoint.get("other_parameter"))

NameError: name 'train_data' is not defined

In [None]:
train_data, valid_data, test_data = data_preparation(config, dataset)
model = SASRecF(config, train_data.dataset).to(config['device'])

In [11]:
config = Config(model='SASRecF', dataset=DATASET_NAME, config_dict=config_dict)
dataset = create_dataset(config)

  split_point = np.cumsum(feat[field].agg(len))[:-1]


In [8]:
model_path = '/kaggle/working/saved/SASRecF-Feb-20-2024_14-30-24.pth'
model.load_state_dict(torch.load(model_path))

NameError: name 'model' is not defined

---
# ALS

In [8]:
!pip install implicit

Collecting implicit
  Obtaining dependency information for implicit from https://files.pythonhosted.org/packages/cd/cc/deac70cae8cc32c9885d0cd73bc66e1b3cbea36ae7080b8c83995eaf5322/implicit-0.7.2-cp310-cp310-manylinux2014_x86_64.whl.metadata
  Downloading implicit-0.7.2-cp310-cp310-manylinux2014_x86_64.whl.metadata (6.1 kB)
Downloading implicit-0.7.2-cp310-cp310-manylinux2014_x86_64.whl (8.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m68.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: implicit
Successfully installed implicit-0.7.2


In [9]:
from scipy.sparse import csr_matrix
import implicit

In [10]:
unique_users = train['user_id'].unique().to_list()
unique_vacancies = train['vacancy_id'].explode().unique().to_list()

user2idx = {user_id: idx for idx, user_id in enumerate(unique_users)}
vac2idx = {vac_id: idx for idx, vac_id in enumerate(unique_vacancies)}
idx2vac = {idx: vac_id for vac_id, idx in vac2idx.items()}

action_weights = {
    1: 4.0,
    2: 1.0,
    3: 2.0
}

# для версии 0.20
users_list = pairs['user_id'].replace(user2idx, default=None).to_numpy()
vacancies_list = pairs['vacancy_id'].replace(vac2idx, default=None).to_numpy()
preferences = pairs['action_type'].replace(action_weights, default=None).to_numpy()

In [11]:
uv_mat = csr_matrix((preferences, (users_list, vacancies_list)))

als_model = implicit.als.AlternatingLeastSquares(
    factors=200,   # 150
    random_state=RANDOM_STATE,
    iterations=100,
    alpha=3.0,
    calculate_training_loss=True,
    regularization=0.001,
    num_threads=8
)
als_model.fit(uv_mat)

  0%|          | 0/100 [00:00<?, ?it/s]

In [14]:
import pickle
with open('ALS-200-fulldata.pkl', 'wb') as f:
    pickle.dump(als_model, f)

In [19]:
del train
del vacancies
del pairs

In [30]:
test = pl.read_parquet("/kaggle/input/boosters-hh-recsys/hh_recsys_test_hh.pq", low_memory=True)
test_users = test['user_id'].to_list()
test_vacancies = test.select(pl.col('vacancy_id').list.unique(maintain_order=True))['vacancy_id'].to_list()

In [None]:
# Предсказание ALS
predictions = []

for user, vacs in tqdm(zip(test_users, test_vacancies), total=len(test_users)):
    if user not in user2idx:
        predictions.append(vacs)
        continue
    
    cuser = user2idx[user]

    recommendations = als_model.recommend(cuser, 
                                          uv_mat[cuser], 
                                          N=N_PREDICTIONS, 
                                          filter_already_liked_items=False)[0]
    recommendations = [idx2vac[cv] for cv in recommendations]
    predictions.append(recommendations)

In [None]:
test = test.with_columns(pl.lit(pl.Series(predictions)).alias('predictions'))
test.select(['user_id', 'session_id', 'predictions']).write_parquet('als_submission.pq')

---
# LightFM

In [5]:
!pip install lightfm

Collecting lightfm
  Downloading lightfm-1.17.tar.gz (316 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25ldone
[?25h  Created wheel for lightfm: filename=lightfm-1.17-cp310-cp310-linux_x86_64.whl size=464219 sha256=481e01c09004cfb299b3586311cbdd4f963eb4704b417e81b0006df386f10c4a
  Stored in directory: /root/.cache/pip/wheels/4f/9b/7e/0b256f2168511d8fa4dae4fae0200fdbd729eb424a912ad636
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.17


In [7]:
from lightfm import LightFM
from lightfm.data import Dataset
from scipy.sparse import csr_matrix

In [8]:
unique_users = train['user_id'].unique().to_list()
unique_vacancies = train['vacancy_id'].explode().unique().to_list()

user2idx = {user_id: idx for idx, user_id in enumerate(unique_users)}
vac2idx = {vac_id: idx for idx, vac_id in enumerate(unique_vacancies)}
idx2vac = {idx: vac_id for vac_id, idx in vac2idx.items()}

action_weights = {
    1: 4.0,
    2: 1.0,
    3: 2.0
}

# для версии 0.20
users_list = pairs['user_id'].replace(user2idx, default=None).to_numpy()
vacancies_list = pairs['vacancy_id'].replace(vac2idx, default=None).to_numpy()
preferences = pairs['action_type'].replace(action_weights, default=None).to_numpy()

uv_mat = csr_matrix((preferences, (users_list, vacancies_list)))

In [None]:
lfm_params = {
    'no_components': 64,
    'learning_rate': 0.01,
    'max_sampled': 5,
    'loss': 'warp',
    'random_state': 42
    }
lfm_model = LightFM(**lfm_params)

num_epochs = 50
for _ in tqdm(range(num_epochs)):
    lfm_model.fit_partial(uv_mat)

  0%|          | 0/50 [00:00<?, ?it/s]

In [19]:
del train
del vacancies
del pairs

In [30]:
test = pl.read_parquet("/kaggle/input/boosters-hh-recsys/hh_recsys_test_hh.pq", low_memory=True)
test_users = test['user_id'].to_list()
test_vacancies = test.select(pl.col('vacancy_id').list.unique(maintain_order=True))['vacancy_id'].to_list()

In [None]:
# Предсказание ALS
predictions = []

for user, vacs in tqdm(zip(test_users, test_vacancies), total=len(test_users)):
    if user not in user2idx:
        predictions.append(vacs)
        continue
    
    cuser = user2idx[user]

    recommendations = als_model.recommend(cuser, 
                                          uv_mat[cuser], 
                                          N=N_PREDICTIONS, 
                                          filter_already_liked_items=False)[0]
    recommendations = [idx2vac[cv] for cv in recommendations]
    predictions.append(recommendations)

In [None]:
test = test.with_columns(pl.lit(pl.Series(predictions)).alias('predictions'))
test.select(['user_id', 'session_id', 'predictions']).write_parquet('als_submission.pq')

---
# ALS + SASRecF

In [31]:
def add_last_vacancy(old_interaction, last_vacancy_id, max_len=10):   # изначально max_len=50
    new_seq_vacancies = old_interaction['vacancy_id_list'][-1]
    if old_interaction['item_length'][-1].item() < max_len:
        new_seq_vacancies[old_interaction['item_length'][-1].item()] = last_vacancy_id
    else:
        new_seq_vacancies = torch.roll(new_seq_vacancies, -1)
        new_seq_vacancies[-1] = last_vacancy_id
    return new_seq_vacancies.view(1, len(new_seq_vacancies))


def predict_for_all_item(external_user_id, dataset, model):
    model.eval()
    with torch.no_grad():
        uid_series = dataset.token2id(dataset.uid_field, [external_user_id])
        index = np.isin(dataset.inter_feat[dataset.uid_field].numpy(), uid_series)
        input_interaction = dataset[np.nonzero(index)[0]]
#         input_interaction = dataset[index]  # 9 секунд на полном датасете
        test = {'vacancy_id_list': add_last_vacancy(
            input_interaction, input_interaction['vacancy_id'][-1].item(), model.max_seq_length),
                'item_length': torch.tensor(
                    [input_interaction['item_length'][-1].item() + 1 
                     if input_interaction['item_length'][-1].item() < model.max_seq_length else model.max_seq_length])
        }
        new_inter = Interaction(test)
        new_inter = new_inter.to(config['device'])
        new_scores = model.full_sort_predict(new_inter)
        new_scores = new_scores.view(-1, test_data.dataset.item_num)
        new_scores[:, 0] = -np.inf  # set scores of [pad] to -inf
    return torch.topk(new_scores, N_PREDICTIONS)

In [None]:
# predictions = []
# num_not_found = 0
# zero_topk = 0

# dataset_uids = dataset.field2token_id[dataset.uid_field]

# for user, vacs in tqdm(zip(test_users, test_vacancies), total=len(test_users)):
# #    if user not in dataset.field2token_id[dataset.uid_field]:   # либо в тесте есть такой юзер, которого не было в трейне, либо в трейне он отсеялся по порогу (?)
#     if user not in dataset_uids:   # либо в тесте есть такой юзер, которого не было в трейне, либо в трейне он отсеялся по порогу (?)
#         predictions.append(vacs)
#         num_not_found += 1
#         continue

# #    cuser = dataset.token2id(dataset.uid_field, user)   # получили id очередного юзера по его имени
# #    _, topk_iid_list = full_sort_topk([cuser], model, test_data, k=N_PREDICTIONS, device='cpu')
#     _, topk_iid_list = predict_for_all_item(user, dataset, model)
#     if len(topk_iid_list) == 0:
#         predictions.append(vacs)
#         zero_topk += 1
#         continue

#     last_topk_iid_list = topk_iid_list[-1]
#     recommendations = dataset.id2token(dataset.iid_field, last_topk_iid_list.cpu()).tolist()

#     predictions.append(recommendations)

# print("Not found: ", num_not_found)
# print("Zero topk: ", zero_topk)

In [32]:
predictions = []
num_not_found = 0
num_not_found_als = 0
zero_topk = 0

dataset_uids = dataset.field2token_id[dataset.uid_field]

for user, vacs in tqdm(zip(test_users, test_vacancies), total=len(test_users)):
#    if user not in dataset.field2token_id[dataset.uid_field]:
    if user not in dataset_uids:
        if user not in user2idx:
             predictions.append(vacs)
             num_not_found_als += 1
             continue
        cuser = user2idx[user]
        
        recommendations = als_model.recommend(cuser, uv_mat[cuser], N=N_PREDICTIONS, filter_already_liked_items=False)[0]
        recommendations = [idx2vac[cv] for cv in recommendations]
        predictions.append(recommendations)

        num_not_found += 1
        continue

#    cuser = dataset.token2id(dataset.uid_field, user)   # получили id очередного юзера по его имени
#    _, topk_iid_list = full_sort_topk([cuser], model, test_data, k=N_PREDICTIONS, device='cpu')
    _, topk_iid_list = predict_for_all_item(user, dataset, model)
    if len(topk_iid_list) == 0:
        if user not in user2idx:
             predictions.append(vacs)
             num_not_found_als += 1
             continue
        cuser = user2idx[user]
        
        recommendations = als_model.recommend(cuser, uv_mat[cuser], N=N_PREDICTIONS, filter_already_liked_items=False)[0]
        recommendations = [idx2vac[cv] for cv in recommendations]
        predictions.append(recommendations)

        zero_topk += 1
        continue

    last_topk_iid_list = topk_iid_list[-1]
    recommendations = dataset.id2token(dataset.iid_field, last_topk_iid_list.cpu()).tolist()

    predictions.append(recommendations)

print("Not found: ", num_not_found)
print("Not found by ALS: ", num_not_found_als)
print("Zero topk: ", zero_topk)

  0%|          | 0/83189 [00:00<?, ?it/s]

AttributeError: 'dict' object has no attribute 'eval'

In [None]:
test = test.with_columns(pl.lit(pl.Series(predictions)).alias('predictions'))
test.select(['user_id', 'session_id', 'predictions']).write_parquet('sasrecf_als_submission.pq')

In [None]:
test = test.with_columns(pl.lit(pl.Series(predictions)).alias('predictions'))
test.select(['user_id', 'session_id', 'predictions']).write_parquet('als_submission.pq')

In [None]:
predictions = test.select(pl.col('vacancy_id').list.unique(maintain_order=True).list.tail(2))['vacancy_id'].to_list()

test = test.with_columns(pl.lit(pl.Series(predictions)).alias('predictions'))
test.select(['user_id', 'session_id', 'predictions']).write_parquet('submission.pq')

In [None]:
test

In [35]:
gc.collect()
torch.cuda.empty_cache()

---

# Optuna

In [None]:
# !pip install optuna

In [None]:
# # Импортируем необходимые библиотеки
# import optuna
# from recbole.quick_start import run_recbole
# from recbole.config import Config
# from recbole.utils import init_seed

# # Задаем конфигурацию модели и данных
# base_config_dict = {
#     'data_path': './',
#     'USER_ID_FIELD': 'user_id',
#     'ITEM_ID_FIELD': 'vacancy_id',
#     'RATING_FIELD': 'action_type',
#     'TIME_FIELD': 'timestamp', 
#     'user_inter_num_interval': "[10,inf)",
#     'item_inter_num_interval': "[15,inf)",
# #     'seq_len': {'vacancy_id': 10},
#     'MAX_ITEM_LIST_LENGTH': 10,    
#     'load_col': {'inter': ['user_id', 'vacancy_id', 'action_type', 'timestamp'],
#                  'item': ['vacancy_id', 'name', 'company_id', 'keySkills', 
#                           'salary_from', 'salary_to', 'area_id', 'area_regionId', 
#                           'employment', 'workSchedule', 'workExperience']},
#     'selected_features': ['name', 'company_id', 'keySkills', 'salary_from', 
#                           'salary_to', 'area_id', 'area_regionId', 'employment', 
#                           'workSchedule', 'workExperience'],
#     'neg_sampling': None,
#     'train_neg_sample_args': None,
#     'train_batch_size': 1024,
#     'eval_batch_size': 1024,
#     'epochs': 25,
#     'metrics': ['MRR'],
#     'loss_type': 'CE',
#     'topk': 100,
#     'valid_metric': 'MRR@100',
#     "stopping_step": 2,
# #     'hidden_size': 64,
# #     'inner_size': 256,
#     'hidden_dropout_prob': 0.3,
#     'attn_dropout_prob': 0.3,
#     'eval_args': {'split': {'RS': [0.95, 0.03, 0.02]},
#                   'group_by': 'user',
#                   'order': 'TO',
#                   'mode': 'full'},
#     'seed': 42,
#     'reproducibility': True,
#     "device": DEVICE,
# }

# # Определяем функцию для оптимизации гиперпараметров
# def objective(trial):
#     # Случайным образом выбираем значения гиперпараметров из заданных диапазонов
#     learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True)
#     # l2_reg = trial.suggest_float("l2_reg", 1e-5, 1e-3, log=True)
# #     hidden_size = trial.suggest_int("hidden_size", 128, 256, step=64)
#     num_layers = trial.suggest_int("num_layers", 1, 4)
#     dropout_prob = trial.suggest_float("dropout_prob", 0.1, 0.5)
#     MAX_ITEM_LIST_LENGTH = trial.suggest_int("MAX_ITEM_LIST_LENGTH", 80, 160, step=20)

#     parameter_overwrite = {
# #         "learning_rate": learning_rate,
# #         "reg_weight": l2_reg,
#         "hidden_size": hidden_size,
#         "num_layers": num_layers,
#         "dropout_prob": dropout_prob,
#         "MAX_ITEM_LIST_LENGTH": MAX_ITEM_LIST_LENGTH
#     }

#     # Combine base config with dynamic parameters
#     combined_config = {**base_config_dict, **parameter_overwrite}

#     # Инициализируем случайный сид для воспроизводимости
#     init_seed(combined_config["seed"], combined_config["reproducibility"])

#     # Specify the model and dataset
#     model = 'GRU4Rec'
#     dataset = 'recbole_data'  # Ensure this matches your dataset's name

#     # Run the experiment
#     result_dict = run_recbole(model=model, dataset=dataset, config_dict=combined_config)

#     # Extract the best validation score
#     best_valid_score = result_dict['best_valid_score']
#     return best_valid_score

# # Создаем объект студии optuna
# study = optuna.create_study(direction="maximize")

# # Запускаем оптимизацию гиперпараметров с заданным количеством итераций
# study.optimize(objective, n_trials=10)

# # Выводим лучшие значения гиперпараметров и метрику
# print("Best hyperparameters: ", study.best_params)
# print("Best valid score: ", study.best_value)