# **Import packages and open data**

In [1]:
import polars as pl
import numpy as np
import re
from tqdm import tqdm
from scipy.sparse import csr_matrix
from torch import nn
import implicit
from transformers import BertModel, BertTokenizer

# Fixed seed value; the code that consumes it is not visible in this part of the notebook.
RANDOM_STATE = 42
# NOTE(review): presumably the number of vacancies to recommend per user — confirm where it is used.
N_PREDICTIONS = 100

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the three competition tables (train sessions, test sessions, vacancies).
# Raw strings keep the Windows backslashes literal: "\l" and "\h" are invalid
# escape sequences, which newer Python versions report with a warning.
# NOTE(review): these absolute paths are machine-specific; consider a configurable data directory.
train_df = pl.read_parquet(r"C:\labs\hh_hack\hh_recsys_train_hh.pq")
test_df = pl.read_parquet(r"C:\labs\hh_hack\hh_recsys_test_hh.pq")
vac_df = pl.read_parquet(r"C:\labs\hh_hack\hh_recsys_vacancies.pq")

In [3]:
# (rows, columns) of the full vacancy table as loaded.
vac_df.shape

(2734129, 13)

In [4]:
# Keep only the first tenth of the vacancy rows.
# NOTE: this rebinds vac_df, so re-running the cell shrinks the frame again.
vac_df = vac_df.head(vac_df.shape[0] // 10)

# **Work with vacancies**

In [5]:
# Preview the first rows of the (subsampled) vacancy table.
vac_df.head()

vacancy_id,name,company.id,description,keySkills.keySkill,compensation.from,compensation.to,compensation.currencyCode,area.id,area.regionId,employment,workSchedule,workExperience
str,str,str,str,list[str],i64,i64,str,str,str,str,str,str
"""v_862116""","""Смотритель муз…","""c_162972""","""<strong>Обязан…","[""Пользователь ПК"", ""Работа в команде"", … ""PR-консультирование""]",16500,,"""RUR""","""a_4761""","""ar_33""","""full""","""fullDay""","""noExperience"""
"""v_288642""","""Ведущий менедж…","""c_208672""","""<p><strong>Воз…","[""Активные продажи"", ""Холодные продажи"", … ""Организация мероприятий""]",50000,,"""RUR""","""a_744""","""ar_2""","""full""","""fullDay""","""noExperience"""
"""v_1840054""","""Бухгалтер (по …","""c_198109""","""<strong>Обязан…",,50000,65000.0,"""RUR""","""a_6223""","""ar_78""","""full""","""fullDay""","""between3And6"""
"""v_2346232""","""Пекарь (Токсов…","""c_6137""","""<p><strong>Для…",,38500,42000.0,"""RUR""","""a_4795""","""ar_51""","""full""","""fullDay""","""noExperience"""
"""v_312507""","""Торговый предс…","""c_206699""","""<p>Компания ТД…","[""Продуктивность"", ""Клиентоориентированность"", … ""Развитие продаж""]",60000,,"""RUR""","""a_6837""","""ar_4""","""full""","""fullDay""","""between1And3"""


In [6]:
# Summary statistics per column, including null counts.
vac_df.describe()

statistic,vacancy_id,name,company.id,description,keySkills.keySkill,compensation.from,compensation.to,compensation.currencyCode,area.id,area.regionId,employment,workSchedule,workExperience
str,str,str,str,str,f64,f64,f64,str,str,str,str,str,str
"""count""","""273412""","""273412""","""273412""","""273412""",150175.0,206248.0,123408.0,"""221587""","""273412""","""271270""","""273412""","""273412""","""273412"""
"""null_count""","""0""","""0""","""0""","""0""",123237.0,67164.0,150004.0,"""51825""","""0""","""2142""","""0""","""0""","""0"""
"""mean""",,,,,,72191.681102,102458.696681,,,,,,
"""std""",,,,,,2185000.0,2827500.0,,,,,,
"""min""","""v_1000005""","""""Оператор call…","""c_1""",""" <p align=""ce…",,1.0,10.0,"""BYR""","""a_1""","""ar_0""","""full""","""flexible""","""between1And3"""
"""25%""",,,,,,40000.0,50000.0,,,,,,
"""50%""",,,,,,55000.0,70000.0,,,,,,
"""75%""",,,,,,80000.0,120000.0,,,,,,
"""max""","""v_999983""","""财务总监 Финансовы…","""c_99996""","""❗️❗️ Требуются…",,991788366.0,991788366.0,"""UZS""","""a_999""","""ar_99""","""volunteer""","""shift""","""noExperience"""


In [7]:
# (number of full 25,000-row chunks, rows left over) for the vacancy table.
divmod(vac_df.shape[0], 25_000)

(10, 23412)

In [10]:
# Integer codes for the categorical columns: every distinct value is mapped to
# its position in the sorted list of unique values of that column.
area_id = sorted(vac_df["area.id"].unique().to_list())
empl = sorted(vac_df["employment"].unique().to_list())
work_sch = sorted(vac_df["workSchedule"].unique().to_list())
work_exp = sorted(vac_df["workExperience"].unique().to_list())
area2idx = {value: idx for idx, value in enumerate(area_id)}
empl2idx = {value: idx for idx, value in enumerate(empl)}
sch2idx = {value: idx for idx, value in enumerate(work_sch)}
exp2idx = {value: idx for idx, value in enumerate(work_exp)}

# Build the numeric vacancy features for the WHOLE frame in one vectorised pass.
# Hard-coded chunk bounds (9 x 25,000 + 23,412 rows) would cover only 248,412 of
# the 273,412 rows, so no manual chunking is used here; every row of vac_df gets
# a feature row, and the salary rule below is applied uniformly to all of them.
features = (
    vac_df
    .drop("name", "compensation.currencyCode", "area.regionId", "description", "keySkills.keySkill")
    # Missing salary bounds become 0 first ...
    .with_columns(
        pl.col("compensation.from").fill_null(0),
        pl.col("compensation.to").fill_null(0),
    )
    .with_columns(
        # ... then an absent (0) upper bound falls back to the lower bound.
        pl.when(pl.col("compensation.to") != 0)
        .then(pl.col("compensation.to"))
        .otherwise(pl.col("compensation.from"))
        .alias("compensation.to"),
        # Replace categorical strings by their integer codes.
        pl.col("area.id").replace(area2idx).cast(int),
        pl.col("employment").replace(empl2idx).cast(int),
        pl.col("workSchedule").replace(sch2idx).cast(int),
        pl.col("workExperience").replace(exp2idx).cast(int),
    )
)
features.head()


  0%|          | 0/9 [00:00<?, ?it/s]

100%|██████████| 9/9 [00:00<00:00, 11.55it/s]


vacancy_id,company.id,compensation.from,compensation.to,area.id,employment,workSchedule,workExperience
str,str,i64,i64,i64,i64,i64,i64
"""v_862116""","""c_162972""",16500,16500,2596,0,2,3
"""v_288642""","""c_208672""",50000,50000,4183,0,2,3
"""v_1840054""","""c_198109""",50000,65000,3606,0,2,1
"""v_2346232""","""c_6137""",38500,42000,2621,0,2,3
"""v_312507""","""c_206699""",60000,60000,4029,0,2,0


# **Preparing data**

In [None]:
# NOTE(review): presumably the minimum number of interactions per user (sequence length) — confirm.
min_seq_len = 8
# Minimum interaction count used as a threshold when filtering `train`.
min_item_cnt = 8

In [None]:
# Rows left over after splitting train_df into chunks of 50,000 rows.
train_df.shape[0]%50_000

13064

In [None]:
# Turn the per-session action lists into one row per action.
# The frame is processed in chunks (presumably to limit peak memory) and each
# chunk is sorted by action time. Chunk bounds are derived from the frame
# length, so every row is covered whatever the dataset size, and the pieces are
# concatenated once instead of growing the frame inside the loop.
TRAIN_CHUNK_ROWS = 50_000
exploded_chunks = [
    train_df.slice(start, TRAIN_CHUNK_ROWS)
    .explode(columns=["vacancy_id", "action_type", "action_dt"])
    .sort("action_dt")
    for start in tqdm(range(0, train_df.shape[0], TRAIN_CHUNK_ROWS))
]
train = pl.concat(exploded_chunks)
train.head()

100%|██████████| 69/69 [00:21<00:00,  3.16it/s]


user_id,session_id,vacancy_id,action_type,action_dt
str,str,str,i64,datetime[ns]
"""u_370846""","""s_24698241""","""v_697571""",1,2023-11-01 00:00:00.919
"""u_461521""","""s_7559925""","""v_2514797""",2,2023-11-01 00:00:06.973
"""u_332204""","""s_6570164""","""v_742810""",2,2023-11-01 00:00:12.594
"""u_229153""","""s_23936793""","""v_1411424""",2,2023-11-01 00:00:13.518
"""u_1128173""","""s_14266530""","""v_248154""",2,2023-11-01 00:00:16.114


In [None]:
# (rows, columns) of the exploded interaction table.
train.shape

(21516070, 5)

In [None]:
# Drop interactions whose vacancy is not present in the (subsampled) vacancy table.
known_vacancy_ids = vac_df["vacancy_id"]
train = train.filter(pl.col("vacancy_id").is_in(known_vacancy_ids))

In [None]:
# (rows, columns) of `train` at this point in the notebook.
train.shape

(2104417, 5)

In [None]:
# Keep vacancies with at least min_item_cnt interactions ...
vacancy_counts = train["vacancy_id"].value_counts()
frequent_vacancies = vacancy_counts.filter(pl.col("count") >= min_item_cnt)["vacancy_id"]
train = train.filter(pl.col("vacancy_id").is_in(frequent_vacancies))

# ... then users with at least min_seq_len interactions among the remaining rows.
# The user threshold is min_seq_len (the per-user sequence length), not the
# item threshold; both are currently 8, so the result is unchanged.
# NOTE: the filter is applied once, so a vacancy may fall below min_item_cnt
# again after users are removed.
user_counts = train["user_id"].value_counts()
active_users = user_counts.filter(pl.col("count") >= min_seq_len)["user_id"]
train = train.filter(pl.col("user_id").is_in(active_users))
train.head()

user_id,session_id,vacancy_id,action_type,action_dt
str,str,str,i64,datetime[ns]
"""u_757137""","""s_9550840""","""v_513154""",1,2023-11-01 00:03:08.382
"""u_757137""","""s_9550840""","""v_1111908""",1,2023-11-01 00:04:05.463
"""u_1063336""","""s_26465030""","""v_1447898""",2,2023-11-01 00:06:48.406
"""u_1063336""","""s_26465030""","""v_1447898""",1,2023-11-01 00:07:19.600
"""u_845515""","""s_13111736""","""v_2064498""",2,2023-11-01 00:11:14.598


In [None]:
# (rows, columns) of `train` at this point in the notebook.
train.shape

(1029706, 5)

In [None]:
# Sorted distinct ids that remain in `train`.
unique_users = sorted(train["user_id"].unique().to_list())
unique_vacancies = sorted(train["vacancy_id"].unique().to_list())

# Contiguous 0-based index for every user / vacancy id, following the sorted order.
user2idx = dict(zip(unique_users, range(len(unique_users))))
vac2idx = dict(zip(unique_vacancies, range(len(unique_vacancies))))

# **Building the user–vacancy interaction matrix**

In [None]:
# Inverse lookup: matrix column index -> vacancy id.
idx2vac = dict(zip(vac2idx.values(), vac2idx.keys()))

# Weight assigned to each action_type code when building the interaction matrix.
# NOTE(review): the meaning of codes 1/2/3 is not shown here — confirm against the dataset description.
action_weights = {1: 4.0, 2: 1.0, 3: 2.0}

In [None]:
# Only the columns needed to build the interaction matrix.
pairs = train.select("user_id", "vacancy_id", "action_type")

In [None]:
# Translate ids to matrix indices and action types to weights ...
user_codes = pairs["user_id"].replace(user2idx)
vacancy_codes = pairs["vacancy_id"].replace(vac2idx)
action_scores = pairs["action_type"].replace(action_weights)

# ... then hand them to numpy as integer arrays (one entry per interaction row).
users = user_codes.to_numpy().astype(int)
vacancies = vacancy_codes.to_numpy().astype(int)
preferences = action_scores.to_numpy().astype(int)

In [None]:
# Encoded user index of every interaction row.
users

array([46525, 46525,  3632, ..., 41151,  4966,  3149])

In [None]:
# Encoded vacancy index of every interaction row.
vacancies

array([39815,  2305,  9075, ..., 19864, 39411, 19814])

In [None]:
# Integer action weight of every interaction row.
preferences

SeriesView([4, 4, 1, ..., 1, 1, 1])

In [None]:
# Sparse user x vacancy matrix: `users` are the row indices, `vacancies` the
# column indices and `preferences` the stored values. csr_matrix sums the
# values of repeated (row, column) pairs; the shape is inferred from the
# largest indices.
interaction_coords = (users, vacancies)
uv_mat = csr_matrix((preferences, interaction_coords))
uv_mat

<60213x49703 sparse matrix of type '<class 'numpy.intc'>'
	with 629901 stored elements in Compressed Sparse Row format>

# **Working with BERT and ALS**