# **Import packages and open data**

In [1]:
import polars as pl
import numpy as np
import re
from tqdm import tqdm
from scipy.sparse import csr_matrix
from torch import nn
import implicit
from transformers import BertModel, BertTokenizer

# Fixed seed value; it is not referenced in the cells shown here —
# presumably passed to the models further down (TODO confirm).
RANDOM_STATE = 42
# Presumably the number of vacancies recommended per user — TODO confirm
# against the prediction cells, which are outside this section.
N_PREDICTIONS = 100

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the three input tables: training sessions, test sessions and the vacancy catalogue.
# Raw strings keep the Windows backslashes literal: sequences such as "\l" and "\h"
# are not valid escapes, and Python warns about them in ordinary string literals.
train_df = pl.read_parquet(r"C:\labs\hh_hack\hh_recsys_train_hh.pq")
test_df = pl.read_parquet(r"C:\labs\hh_hack\hh_recsys_test_hh.pq")
vac_df = pl.read_parquet(r"C:\labs\hh_hack\hh_recsys_vacancies.pq")

# **Work with vacancies**

In [3]:
# Preview the first rows of the vacancy table to check column names and dtypes.
vac_df.head()

vacancy_id,name,company.id,description,keySkills.keySkill,compensation.from,compensation.to,compensation.currencyCode,area.id,area.regionId,employment,workSchedule,workExperience
str,str,str,str,list[str],i64,i64,str,str,str,str,str,str
"""v_862116""","""Смотритель муз…","""c_162972""","""<strong>Обязан…","[""Пользователь ПК"", ""Работа в команде"", … ""PR-консультирование""]",16500,,"""RUR""","""a_4761""","""ar_33""","""full""","""fullDay""","""noExperience"""
"""v_288642""","""Ведущий менедж…","""c_208672""","""<p><strong>Воз…","[""Активные продажи"", ""Холодные продажи"", … ""Организация мероприятий""]",50000,,"""RUR""","""a_744""","""ar_2""","""full""","""fullDay""","""noExperience"""
"""v_1840054""","""Бухгалтер (по …","""c_198109""","""<strong>Обязан…",,50000,65000.0,"""RUR""","""a_6223""","""ar_78""","""full""","""fullDay""","""between3And6"""
"""v_2346232""","""Пекарь (Токсов…","""c_6137""","""<p><strong>Для…",,38500,42000.0,"""RUR""","""a_4795""","""ar_51""","""full""","""fullDay""","""noExperience"""
"""v_312507""","""Торговый предс…","""c_206699""","""<p>Компания ТД…","[""Продуктивность"", ""Клиентоориентированность"", … ""Развитие продаж""]",60000,,"""RUR""","""a_6837""","""ar_4""","""full""","""fullDay""","""between1And3"""


In [4]:
# Summary statistics per column; the null_count row shows which fields need imputing.
vac_df.describe()

statistic,vacancy_id,name,company.id,description,keySkills.keySkill,compensation.from,compensation.to,compensation.currencyCode,area.id,area.regionId,employment,workSchedule,workExperience
str,str,str,str,str,f64,f64,f64,str,str,str,str,str,str
"""count""","""2734129""","""2734129""","""2734129""","""2734129""",1498838.0,2067784.0,1237135.0,"""2220766""","""2734129""","""2712786""","""2734129""","""2734129""","""2734129"""
"""null_count""","""0""","""0""","""0""","""0""",1235291.0,666345.0,1496994.0,"""513363""","""0""","""21343""","""0""","""0""","""0"""
"""mean""",,,,,,67860.137144,95380.003839,,,,,,
"""std""",,,,,,694490.816107,910186.501464,,,,,,
"""min""","""v_0""","""!Токарь 4,5 ра…","""c_0""","""  Требования:…",,1.0,1.0,"""AZN""","""a_0""","""ar_0""","""full""","""flexible""","""between1And3"""
"""25%""",,,,,,40000.0,50000.0,,,,,,
"""50%""",,,,,,55000.0,70000.0,,,,,,
"""75%""",,,,,,80000.0,120000.0,,,,,,
"""max""","""v_999999""","""𓆉 Менеджер по …","""c_99999""","""️Требования_<b…",,991788366.0,991788366.0,"""UZS""","""a_999""","""ar_99""","""volunteer""","""shift""","""noExperience"""


In [5]:
# Number of full 25,000-row chunks in the vacancy table and the size of the
# leftover tail; the feature-extraction cells below process the table in chunks of this size.
vac_df.shape[0]//25_000, vac_df.shape[0]%25_000

(109, 9129)

In [8]:
def _sorted_values_and_index(column_name):
    """Return the sorted distinct values of a vacancy column and a value -> position map."""
    distinct = sorted(vac_df[column_name].unique().to_list())
    lookup = {value: position for position, value in enumerate(distinct)}
    return distinct, lookup


# Integer encodings for the categorical vacancy columns (index = rank in sorted order).
area_id, area2idx = _sorted_values_and_index("area.id")
empl, empl2idx = _sorted_values_and_index("employment")
work_sch, sch2idx = _sorted_values_and_index("workSchedule")
work_exp, exp2idx = _sorted_values_and_index("workExperience")
# features = pl.DataFrame()
# for v in tqdm(range(1, 50)):
#     lb = (v-1)*25_000
#     rb = v*25_000
#     cur = vac_df[lb:rb].drop("company.id", "compensation.currencyCode", "area.regionId")
#     cur = cur.with_columns(vac_df[lb:rb]["keySkills.keySkill"].fill_null([""]).alias("keySkills.keySkill"))
#     cur = cur.with_columns(cur["description"].map_elements(lambda x: ''.join(re.split(r'</[a-z]+>|<[a-z]+>', str(x))).split()).alias("description"))
#     cur = cur.with_columns(cur["compensation.from"].fill_null(0).alias("compensation.from"))
#     cur = cur.with_columns(cur["compensation.to"].fill_null(0).alias("compensation.to")).with_columns(pl.Series("compensation.to", [cur["compensation.to"][i] if cur["compensation.to"][i]!=0 else cur["compensation.from"][i] for i in range(25_000)]).alias("compensation.to"))
#     cur = cur.with_columns(cur["area.id"].map_elements(lambda x: area2idx[x]).alias("area.id"))
#     cur = cur.with_columns(cur["employment"].map_elements(lambda x: empl2idx[x]).alias("employment"))
#     cur = cur.with_columns(cur["workSchedule"].map_elements(lambda x: sch2idx[x]).alias("workSchedule"))
#     cur = cur.with_columns(cur["workExperience"].map_elements(lambda x: exp2idx[x]).alias("workExperience"))
#     features = pl.concat([features, cur])
# features.write_parquet("C:\labs\hh_hack\extracted_features.pq")

In [9]:
# Features for chunks 1..49 were computed earlier and cached on disk; this cell
# resumes at chunk 50 and processes the rest of the vacancy table.
# NOTE: the cached rows were produced by the previous version of this logic;
# regenerate the cache if `compensation.to` must be consistent across all rows.
features = pl.read_parquet(r"C:\labs\hh_hack\extracted_features.pq")

CHUNK_SIZE = 25_000
FIRST_CHUNK = 50
# Ceiling division, so the final shorter chunk is processed instead of being dropped.
n_chunks = -(-vac_df.shape[0] // CHUNK_SIZE)

HTML_TAG = re.compile(r'</[a-z]+>|<[a-z]+>')


def _description_tokens(raw):
    """Remove simple lowercase HTML tags from a description and split it on whitespace."""
    return HTML_TAG.sub('', str(raw)).split()


# Missing salary bounds become 0; a missing (or zero) upper bound falls back to the
# lower bound. The earlier element-wise version read the column before its nulls
# were filled, so `None != 0` kept every missing upper bound null.
comp_from = pl.col("compensation.from").fill_null(0)
comp_to = pl.col("compensation.to").fill_null(0)

processed = [features]
for v in tqdm(range(FIRST_CHUNK, n_chunks + 1)):
    lb = (v - 1) * CHUNK_SIZE
    # Slicing past the end of the frame simply yields the shorter tail chunk.
    cur = vac_df[lb:lb + CHUNK_SIZE].drop("company.id", "compensation.currencyCode", "area.regionId")
    cur = cur.with_columns(
        cur["keySkills.keySkill"].fill_null([""]).alias("keySkills.keySkill"),
        cur["description"].map_elements(_description_tokens).alias("description"),
        comp_from.alias("compensation.from"),
        pl.when(comp_to != 0).then(comp_to).otherwise(comp_from).alias("compensation.to"),
    )
    # NOTE(review): Polars suggests `Series.replace` for these lookups; verify that the
    # resulting dtype stays integer on the installed version before switching.
    cur = cur.with_columns(
        cur["area.id"].map_elements(lambda x: area2idx[x]).alias("area.id"),
        cur["employment"].map_elements(lambda x: empl2idx[x]).alias("employment"),
        cur["workSchedule"].map_elements(lambda x: sch2idx[x]).alias("workSchedule"),
        cur["workExperience"].map_elements(lambda x: exp2idx[x]).alias("workExperience"),
    )
    processed.append(cur)

# Concatenate once at the end instead of growing the frame on every iteration.
features = pl.concat(processed)

Series.map_elements is significantly slower than the native series API.
Only use if you absolutely CANNOT implement your logic otherwise.
Replace this expression...
  - s.map_elements(lambda x: ...)
with this one instead:
  + s.replace(area2idx)

  cur = cur.with_columns(cur["area.id"].map_elements(lambda x: area2idx[x]).alias("area.id"))
Series.map_elements is significantly slower than the native series API.
Only use if you absolutely CANNOT implement your logic otherwise.
Replace this expression...
  - s.map_elements(lambda x: ...)
with this one instead:
  + s.replace(empl2idx)

  cur = cur.with_columns(cur["employment"].map_elements(lambda x: empl2idx[x]).alias("employment"))
Series.map_elements is significantly slower than the native series API.
Only use if you absolutely CANNOT implement your logic otherwise.
Replace this expression...
  - s.map_elements(lambda x: ...)
with this one instead:
  + s.replace(sch2idx)

  cur = cur.with_columns(cur["workSchedule"].map_elements(lambda x

: 

# **Preparing data**

In [None]:
# Turn the session-level training frame (one row per session, list columns) into
# one row per user action. The earlier loop concatenated every chunk with an
# always-empty `train`, so only the last chunk survived in `new_df`.
SESSION_CHUNK = 50_000
exploded_parts = [
    train_df[start:start + SESSION_CHUNK].explode(columns=["vacancy_id", "action_type", "action_dt"])
    # Step over the whole frame rather than a hard-coded number of chunks.
    for start in tqdm(range(0, train_df.shape[0], SESSION_CHUNK))
]
# Concatenate once, then order all actions chronologically.
train = pl.concat(exploded_parts).sort("action_dt")
# The following cell rebinds `train_df` from `new_df`, so keep that name bound.
new_df = train
train.head()

100%|██████████| 69/69 [00:27<00:00,  2.47it/s]


user_id,session_id,vacancy_id,action_type,action_dt
str,str,str,i64,datetime[ns]
"""u_370846""","""s_24698241""","""v_697571""",1,2023-11-01 00:00:00.919
"""u_461521""","""s_7559925""","""v_2514797""",2,2023-11-01 00:00:06.973
"""u_332204""","""s_6570164""","""v_742810""",2,2023-11-01 00:00:12.594
"""u_229153""","""s_23936793""","""v_1411424""",2,2023-11-01 00:00:13.518
"""u_1128173""","""s_14266530""","""v_248154""",2,2023-11-01 00:00:16.114


In [None]:
# Rebind train_df to the frame built in the previous cell.
# NOTE: this overwrites the originally loaded session-level frame, so re-running
# the previous cell afterwards would operate on this rebound data instead;
# reload train_df first if that cell must be repeated.
train_df = new_df

# **Building the user–vacancy interaction matrix**

In [None]:
# Distinct user and vacancy ids seen in the training interactions.
unique_users = train_df["user_id"].unique().to_list()
unique_vacancies = train_df["vacancy_id"].unique().to_list()

# Dense integer index for every id (position in the lists above); these become
# the row / column coordinates of the interaction matrix.
user2idx = dict(zip(unique_users, range(len(unique_users))))
vac2idx = dict(zip(unique_vacancies, range(len(unique_vacancies))))

In [None]:
# Inverse lookup: integer index -> vacancy id.
idx2vac = dict(zip(vac2idx.values(), vac2idx.keys()))
# Weight applied to each action_type code when it is turned into a preference value.
action_weights = {1: 4.0, 2: 1.0, 3: 2.0}

In [None]:
# Keep only the three columns used below to build the interaction matrix.
pairs = train_df.select(["user_id", "vacancy_id", "action_type"])

In [None]:
def _encode_column(column, mapping):
    """Replace the values of one `pairs` column via `mapping` and return them as an int array."""
    return pairs[column].replace(mapping).to_numpy().astype(int)


# Row indices, column indices and values for the sparse interaction matrix.
users = _encode_column("user_id", user2idx)
vacancies = _encode_column('vacancy_id', vac2idx)
preferences = _encode_column('action_type', action_weights)

In [None]:
# Inspect the encoded user indices (the array repr is abbreviated for long arrays).
users

array([352731, 709819,  98735, ..., 753727, 567196, 464422])

In [None]:
# Inspect the encoded vacancy indices.
vacancies

array([1226898, 1248555, 1234307, ..., 1288734,  537012,  409658])

In [None]:
# Inspect the per-interaction weights derived from action_type.
preferences

SeriesView([4, 1, 1, ..., 1, 1, 1])

In [None]:
# Sparse user x vacancy matrix: rows are user indices, columns are vacancy indices,
# values are the action weights. SciPy sums entries that share the same (row, column).
interaction_coords = (users, vacancies)
uv_mat = csr_matrix((preferences, interaction_coords))
uv_mat

<880985x1457882 sparse matrix of type '<class 'numpy.intc'>'
	with 14995101 stored elements in Compressed Sparse Row format>

# **Working with BERT and ALS**