# **Import packages and open data**

In [2]:
import polars as pl
import numpy as np
import re
from tqdm import tqdm
from scipy.sparse import csr_matrix
from torch import nn
import implicit
from transformers import BertModel, BertTokenizer

# Notebook-wide constants.
RANDOM_STATE = 42  # presumably the seed for stochastic steps — TODO confirm where it is consumed
N_PREDICTIONS = 100  # presumably the number of recommendations produced per user — TODO confirm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the three parquet inputs: training interactions, test interactions and
# the vacancy catalogue.
# The paths are raw strings so that the Windows backslashes stay literal:
# sequences such as "\l" and "\h" are invalid escapes in a normal string
# literal (newer Python versions warn about them), and a folder name starting
# with "t" or "n" would silently become a tab or a newline.
# NOTE(review): these are absolute local paths; consider a configurable data
# directory so the notebook runs on other machines.
train_df = pl.read_parquet(r"C:\labs\hh_hack\hh_recsys_train_hh.pq")
test_df = pl.read_parquet(r"C:\labs\hh_hack\hh_recsys_test_hh.pq")
vac_df = pl.read_parquet(r"C:\labs\hh_hack\hh_recsys_vacancies.pq")

In [4]:
# (rows, columns) of the vacancies table as loaded.
vac_df.shape

(2734129, 13)

In [5]:
# Keep only the first 1% of the vacancy rows (integer division).
# NOTE: this rebinds vac_df, so re-running the cell shrinks the frame again.
one_percent_rows = vac_df.shape[0] // 100
vac_df = vac_df.head(one_percent_rows)

# **Work with vacancies**

In [6]:
# Preview the first rows of vac_df.
vac_df.head()

vacancy_id,name,company.id,description,keySkills.keySkill,compensation.from,compensation.to,compensation.currencyCode,area.id,area.regionId,employment,workSchedule,workExperience
str,str,str,str,list[str],i64,i64,str,str,str,str,str,str
"""v_862116""","""Смотритель муз…","""c_162972""","""<strong>Обязан…","[""Пользователь ПК"", ""Работа в команде"", … ""PR-консультирование""]",16500,,"""RUR""","""a_4761""","""ar_33""","""full""","""fullDay""","""noExperience"""
"""v_288642""","""Ведущий менедж…","""c_208672""","""<p><strong>Воз…","[""Активные продажи"", ""Холодные продажи"", … ""Организация мероприятий""]",50000,,"""RUR""","""a_744""","""ar_2""","""full""","""fullDay""","""noExperience"""
"""v_1840054""","""Бухгалтер (по …","""c_198109""","""<strong>Обязан…",,50000,65000.0,"""RUR""","""a_6223""","""ar_78""","""full""","""fullDay""","""between3And6"""
"""v_2346232""","""Пекарь (Токсов…","""c_6137""","""<p><strong>Для…",,38500,42000.0,"""RUR""","""a_4795""","""ar_51""","""full""","""fullDay""","""noExperience"""
"""v_312507""","""Торговый предс…","""c_206699""","""<p>Компания ТД…","[""Продуктивность"", ""Клиентоориентированность"", … ""Развитие продаж""]",60000,,"""RUR""","""a_6837""","""ar_4""","""full""","""fullDay""","""between1And3"""


In [7]:
# Per-column summary statistics (counts, null counts, quantiles) of vac_df.
vac_df.describe()

statistic,vacancy_id,name,company.id,description,keySkills.keySkill,compensation.from,compensation.to,compensation.currencyCode,area.id,area.regionId,employment,workSchedule,workExperience
str,str,str,str,str,f64,f64,f64,str,str,str,str,str,str
"""count""","""27341""","""27341""","""27341""","""27341""",14957.0,20611.0,12313.0,"""22143""","""27341""","""27128""","""27341""","""27341""","""27341"""
"""null_count""","""0""","""0""","""0""","""0""",12384.0,6730.0,15028.0,"""5198""","""0""","""213""","""0""","""0""","""0"""
"""mean""",,,,,,68558.723594,96158.224559,,,,,,
"""std""",,,,,,112842.072794,218133.210496,,,,,,
"""min""","""v_1000206""","""""Фронтенд разр…","""c_100002""",""" <p…",,20.0,30.0,"""BYR""","""a_1""","""ar_0""","""full""","""flexible""","""between1And3"""
"""25%""",,,,,,40000.0,50000.0,,,,,,
"""50%""",,,,,,55000.0,70000.0,,,,,,
"""75%""",,,,,,80000.0,120000.0,,,,,,
"""max""","""v_999892""","""​​​​​​​Purchas…","""c_99995""","""‼Компания ПРОМ…",,8000000.0,20000000.0,"""UZS""","""a_996""","""ar_99""","""volunteer""","""shift""","""noExperience"""


In [15]:
# Row count of vac_df, the number of full 25_000-row chunks that fit into it,
# and the size of the trailing partial chunk.
all_steps = vac_df.shape[0]
max_steps, last_step = divmod(all_steps, 25_000)
(all_steps, max_steps, last_step)

(27341, 1, 2341)

In [24]:
# Ordinal encodings for the categorical columns: every distinct value is
# mapped to its position in the sorted list of values present in vac_df.
area_id = sorted(vac_df["area.id"].unique().to_list())
empl = sorted(vac_df["employment"].unique().to_list())
work_sch = sorted(vac_df["workSchedule"].unique().to_list())
work_exp = sorted(vac_df["workExperience"].unique().to_list())
area2idx = {value: idx for idx, value in enumerate(area_id)}
empl2idx = {value: idx for idx, value in enumerate(empl)}
sch2idx = {value: idx for idx, value in enumerate(work_sch)}
exp2idx = {value: idx for idx, value in enumerate(work_exp)}
last_v = 0  # leftover counter of a removed chunked variant; kept in case later cells read it

# Missing salary bounds are treated as 0.
comp_from = pl.col("compensation.from").fill_null(0)
comp_to = pl.col("compensation.to").fill_null(0)

# Build the numeric feature table in one vectorised pass: drop the text /
# unused columns, normalise the salary bounds and encode the categoricals.
cur = (
    vac_df
    .drop("name", "company.id", "compensation.currencyCode", "area.regionId", "description", "keySkills.keySkill")
    .with_columns(
        comp_from.alias("compensation.from"),
        # An upper bound that is missing or 0 falls back to the lower bound.
        pl.when(comp_to != 0).then(comp_to).otherwise(comp_from).alias("compensation.to"),
        # replace() swaps each category for its index; cast(int) yields an integer column.
        pl.col("area.id").replace(area2idx).cast(int),
        pl.col("employment").replace(empl2idx).cast(int),
        pl.col("workSchedule").replace(sch2idx).cast(int),
        pl.col("workExperience").replace(exp2idx).cast(int),
    )
)
# features holds the same rows as cur, as a separate frame object.
features = cur.clone()
print(features.shape)
features.head()


(27341, 7)


vacancy_id,compensation.from,compensation.to,area.id,employment,workSchedule,workExperience
str,i64,i64,i64,i64,i64,i64
"""v_862116""",16500,16500,973,0,2,3
"""v_288642""",50000,50000,1567,0,2,3
"""v_1840054""",50000,65000,1383,0,2,1
"""v_2346232""",38500,42000,986,0,2,3
"""v_312507""",60000,60000,1515,0,2,0


In [25]:
# Summary statistics of the encoded feature table.
features.describe()

statistic,vacancy_id,compensation.from,compensation.to,area.id,employment,workSchedule,workExperience
str,str,f64,f64,f64,f64,f64,f64
"""count""","""27341""",27341.0,27341.0,27341.0,27341.0,27341.0,27341.0
"""null_count""","""0""",0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",,51682.96156,69786.711934,717.196189,0.092279,2.178962,1.293588
"""std""",,102328.510266,170885.630145,491.771363,0.35276,0.931395,1.354938
"""min""","""v_1000206""",0.0,0.0,0.0,0.0,0.0,0.0
"""25%""",,3000.0,30000.0,186.0,0.0,2.0,0.0
"""50%""",,45000.0,55000.0,630.0,0.0,2.0,1.0
"""75%""",,70000.0,90000.0,1208.0,0.0,2.0,3.0
"""max""","""v_999892""",8000000.0,20000000.0,1628.0,4.0,4.0,3.0


# **Preparing data**

In [26]:
# Activity thresholds used when pruning the interaction log.
min_seq_len = 8  # presumably the minimum number of interactions per user — TODO confirm
min_item_cnt = 8  # minimum number of interactions required to keep a vacancy

In [27]:
# Rows left over after splitting train_df into 50_000-row chunks.
train_df.shape[0]%50_000

13064

In [28]:
# Flatten the list-valued columns of train_df into one row per action.
# The work is done on fixed-size slices of train_df; the slice offsets are
# derived from the actual row count (slice() clamps the last, shorter chunk),
# and all pieces are concatenated in a single call instead of growing the
# frame inside the loop.
# NOTE(review): rows are sorted by action_dt only within each chunk, so the
# concatenated frame is not globally time-ordered — confirm that is intended.
chunk_rows = 50_000
train_parts = [
    train_df.slice(offset, chunk_rows)
    .explode(columns=["vacancy_id", "action_type", "action_dt"])
    .sort("action_dt")
    for offset in tqdm(range(0, train_df.shape[0], chunk_rows))
]
# pl.concat rejects an empty list, so an empty input yields an empty frame.
train = pl.concat(train_parts) if train_parts else pl.DataFrame()
train.head()

100%|██████████| 68/68 [00:17<00:00,  3.87it/s]


user_id,session_id,vacancy_id,action_type,action_dt
str,str,str,i64,datetime[ns]
"""u_370846""","""s_24698241""","""v_697571""",1,2023-11-01 00:00:00.919
"""u_461521""","""s_7559925""","""v_2514797""",2,2023-11-01 00:00:06.973
"""u_332204""","""s_6570164""","""v_742810""",2,2023-11-01 00:00:12.594
"""u_229153""","""s_23936793""","""v_1411424""",2,2023-11-01 00:00:13.518
"""u_1128173""","""s_14266530""","""v_248154""",2,2023-11-01 00:00:16.114


In [29]:
# (rows, columns) of the exploded interaction table.
train.shape

(21285044, 5)

In [30]:
# Drop every interaction whose vacancy is absent from vac_df.
known_vacancy_ids = vac_df["vacancy_id"]
train = train.filter(pl.col("vacancy_id").is_in(known_vacancy_ids))

In [31]:
# (rows, columns) after keeping only interactions with vacancies present in vac_df.
train.shape

(195768, 5)

In [32]:
# Two sequential activity filters; the second one sees the output of the first.

# 1) Keep vacancies that have at least min_item_cnt interactions.
vacancy_counts = train["vacancy_id"].value_counts()
frequent_vacancies = vacancy_counts.filter(pl.col("count") >= min_item_cnt)["vacancy_id"]
train = train.filter(pl.col("vacancy_id").is_in(frequent_vacancies))

# 2) Keep users that still have at least min_seq_len interactions. The user
#    threshold is min_seq_len rather than the item threshold, so the two
#    limits can be tuned independently.
user_counts = train["user_id"].value_counts()
active_users = user_counts.filter(pl.col("count") >= min_seq_len)["user_id"]
train = train.filter(pl.col("user_id").is_in(active_users))
train.head()

user_id,session_id,vacancy_id,action_type,action_dt
str,str,str,i64,datetime[ns]
"""u_347542""","""s_24879683""","""v_1337611""",2,2023-11-01 00:49:41.006
"""u_965257""","""s_17627628""","""v_1332935""",2,2023-11-01 00:50:58.748
"""u_639152""","""s_7884954""","""v_706548""",2,2023-11-01 10:58:31.505
"""u_284033""","""s_26366513""","""v_2242773""",2,2023-11-01 11:20:39.140
"""u_874519""","""s_414506""","""v_362128""",2,2023-11-01 12:10:59.042


In [33]:
# (rows, columns) after the vacancy and user activity filters.
train.shape

(20263, 5)

In [34]:
# Distinct user and vacancy ids that survived filtering, in sorted order.
unique_users = train["user_id"].unique().to_list()
unique_users.sort()
unique_vacancies = train["vacancy_id"].unique().to_list()
unique_vacancies.sort()

# Contiguous integer index for every id, assigned in that sorted order.
user2idx = dict(zip(unique_users, range(len(unique_users))))
vac2idx = dict(zip(unique_vacancies, range(len(unique_vacancies))))

# **Building the user-vacancy interaction matrix**

In [35]:
# Reverse lookup: integer index -> vacancy id.
idx2vac = dict(zip(vac2idx.values(), vac2idx.keys()))
# Interaction strength assigned to each action_type code.
action_weights = {1: 4.0, 2: 1.0, 3: 2.0}

In [36]:
# Only the three columns needed to build the interaction matrix.
pairs = train.select("user_id", "vacancy_id", "action_type")

In [37]:
# Translate each column of pairs through its lookup table and convert it to an
# integer NumPy array: user index, vacancy index and interaction weight.
# NOTE: the int cast would truncate fractional weights; the weights defined in
# action_weights are whole numbers.
users, vacancies, preferences = (
    pairs[column].replace(mapping).to_numpy().astype(int)
    for column, mapping in (
        ("user_id", user2idx),
        ("vacancy_id", vac2idx),
        ("action_type", action_weights),
    )
)

In [38]:
# Row (user) indices, one per interaction.
users

array([ 624, 1589, 1069, ...,  202,  202, 1347])

In [39]:
# Column (vacancy) indices, one per interaction.
vacancies

array([ 441,  433, 2702, ...,  988,  988, 2092])

In [40]:
# Interaction weights, one per interaction.
preferences

SeriesView([1, 1, 1, ..., 2, 1, 1])

In [41]:
# Sparse user x vacancy matrix of interaction weights. Repeated
# (user, vacancy) pairs are summed by the constructor.
# The shape is taken from the id maps rather than inferred from the largest
# index present, so the matrix dimensions always match user2idx / vac2idx.
uv_mat = csr_matrix(
    (preferences, (users, vacancies)),
    shape=(len(user2idx), len(vac2idx)),
)
uv_mat

<1649x3093 sparse matrix of type '<class 'numpy.intc'>'
	with 8487 stored elements in Compressed Sparse Row format>

# **Working with BERT and ALS**