# **Import packages and open data**

In [1]:
import polars as pl
import numpy as np
import re
from tqdm import tqdm
from scipy.sparse import csr_matrix
from torch import nn
import implicit
from transformers import BertModel, BertTokenizer

RANDOM_STATE = 42
N_PREDICTIONS = 100

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_df = pl.read_parquet("C:\labs\hh_hack\hh_recsys_train_hh.pq")
test_df = pl.read_parquet("C:\labs\hh_hack\hh_recsys_test_hh.pq")
vac_df = pl.read_parquet("C:\labs\hh_hack\hh_recsys_vacancies.pq")

In [4]:
vac_df.shape

(2734129, 13)

In [3]:
vac_df = vac_df[:vac_df.shape[0]//100]

# **Work with vacancies**

In [6]:
vac_df.head()

vacancy_id,name,company.id,description,keySkills.keySkill,compensation.from,compensation.to,compensation.currencyCode,area.id,area.regionId,employment,workSchedule,workExperience
str,str,str,str,list[str],i64,i64,str,str,str,str,str,str
"""v_862116""","""Смотритель муз…","""c_162972""","""<strong>Обязан…","[""Пользователь ПК"", ""Работа в команде"", … ""PR-консультирование""]",16500,,"""RUR""","""a_4761""","""ar_33""","""full""","""fullDay""","""noExperience"""
"""v_288642""","""Ведущий менедж…","""c_208672""","""<p><strong>Воз…","[""Активные продажи"", ""Холодные продажи"", … ""Организация мероприятий""]",50000,,"""RUR""","""a_744""","""ar_2""","""full""","""fullDay""","""noExperience"""
"""v_1840054""","""Бухгалтер (по …","""c_198109""","""<strong>Обязан…",,50000,65000.0,"""RUR""","""a_6223""","""ar_78""","""full""","""fullDay""","""between3And6"""
"""v_2346232""","""Пекарь (Токсов…","""c_6137""","""<p><strong>Для…",,38500,42000.0,"""RUR""","""a_4795""","""ar_51""","""full""","""fullDay""","""noExperience"""
"""v_312507""","""Торговый предс…","""c_206699""","""<p>Компания ТД…","[""Продуктивность"", ""Клиентоориентированность"", … ""Развитие продаж""]",60000,,"""RUR""","""a_6837""","""ar_4""","""full""","""fullDay""","""between1And3"""


In [7]:
vac_df.describe()

statistic,vacancy_id,name,company.id,description,keySkills.keySkill,compensation.from,compensation.to,compensation.currencyCode,area.id,area.regionId,employment,workSchedule,workExperience
str,str,str,str,str,f64,f64,f64,str,str,str,str,str,str
"""count""","""27341""","""27341""","""27341""","""27341""",14957.0,20611.0,12313.0,"""22143""","""27341""","""27128""","""27341""","""27341""","""27341"""
"""null_count""","""0""","""0""","""0""","""0""",12384.0,6730.0,15028.0,"""5198""","""0""","""213""","""0""","""0""","""0"""
"""mean""",,,,,,68558.723594,96158.224559,,,,,,
"""std""",,,,,,112842.072794,218133.210496,,,,,,
"""min""","""v_1000206""","""""Фронтенд разр…","""c_100002""",""" <p…",,20.0,30.0,"""BYR""","""a_1""","""ar_0""","""full""","""flexible""","""between1And3"""
"""25%""",,,,,,40000.0,50000.0,,,,,,
"""50%""",,,,,,55000.0,70000.0,,,,,,
"""75%""",,,,,,80000.0,120000.0,,,,,,
"""max""","""v_999892""","""​​​​​​​Purchas…","""c_99995""","""‼Компания ПРОМ…",,8000000.0,20000000.0,"""UZS""","""a_996""","""ar_99""","""volunteer""","""shift""","""noExperience"""


In [8]:
all_steps = vac_df.shape[0]
max_steps = vac_df.shape[0]//25_000
last_step = vac_df.shape[0]%25_000
all_steps, max_steps, last_step

(27341, 1, 2341)

In [9]:
area_id = sorted(vac_df["area.id"].unique().to_list())
empl = sorted(vac_df["employment"].unique().to_list())
work_sch = sorted(vac_df["workSchedule"].unique().to_list())
work_exp = sorted(vac_df["workExperience"].unique().to_list())
area2idx = {area_id[i] : i for i in range(len(area_id))}
empl2idx = {empl[i] : i for i in range(len(empl))}
sch2idx = {work_sch[i] : i for i in range(len(work_sch))}
exp2idx = {work_exp[i] : i for i in range(len(work_exp))}
last_v = 0
features = pl.DataFrame()
cur = vac_df.drop("name", "company.id", "compensation.currencyCode", "area.regionId", "description", "keySkills.keySkill", "area.id")
cur = cur.with_columns(cur["compensation.from"].fill_null(0).alias("compensation.from"))
cur = cur.with_columns(cur["compensation.to"].fill_null(0).alias("compensation.to"))
cur = cur.with_columns(pl.Series("compensation.to", [cur["compensation.to"][i] if cur["compensation.to"][i]!=0 else cur["compensation.from"][i] for i in range(all_steps)]))
# cur = cur.with_columns(cur["area.id"].replace(area2idx).cast(int).alias("area.id"))
cur = cur.with_columns(cur["employment"].replace(empl2idx).cast(int).map_elements(lambda x: 2 if x == 0 else x/4).alias("employment"))
cur = cur.with_columns(cur["workSchedule"].replace(sch2idx).cast(int).map_elements(lambda x: 2 if x == 0 else x/4).alias("workSchedule"))
cur = cur.with_columns(cur["workExperience"].replace(exp2idx).cast(int).map_elements(lambda x: 2 if x == 0 else x/3).alias("workExperience"))
cur = cur.with_columns(((pl.col("compensation.from") + pl.col("compensation.to"))/2/60734.836747/167.2).alias("compensation.mean"))
cur = cur.with_columns(((pl.col("employment") + pl.col("workSchedule") + pl.col("workExperience") + pl.col("compensation.mean"))/4).alias("rate"))
cur = cur.drop("compensation.from", "compensation.to", "employment", "workSchedule", "workExperience", "compensation.mean")
features = pl.concat([features, cur])
print(features.shape)
features.head()


(27341, 2)


vacancy_id,rate
str,f64
"""v_862116""",0.875406
"""v_288642""",0.876231
"""v_1840054""",0.709749
"""v_2346232""",0.875991
"""v_312507""",1.126477


# **Preparing data**

In [10]:
min_seq_len = 8
min_item_cnt = 8

In [11]:
train_df.shape[0]%50_000

13064

In [12]:
train = pl.DataFrame()
# for i in tqdm(range(1, 69)):
#     train = pl.concat([train, train_df[(i-1)*50_000:i*50_000].explode(columns=["vacancy_id", "action_type", "action_dt"]).sort("action_dt")])
# train = pl.concat([train, train_df[i*50_000:i*50_000+13064].explode(columns=["vacancy_id", "action_type", "action_dt"]).sort("action_dt")])
train = pl.concat([train, train_df.explode(columns=["vacancy_id", "action_type", "action_dt"]).sort("action_dt")])
train.head()

user_id,session_id,vacancy_id,action_type,action_dt
str,str,str,i64,datetime[ns]
"""u_229843""","""s_6902690""","""v_2519518""",2,2023-11-01 00:00:00.019
"""u_399965""","""s_20560802""","""v_665882""",2,2023-11-01 00:00:00.074
"""u_760080""","""s_6503120""","""v_896765""",2,2023-11-01 00:00:00.557
"""u_125008""","""s_26036381""","""v_1575915""",3,2023-11-01 00:00:00.739
"""u_1150559""","""s_1912334""","""v_2093333""",2,2023-11-01 00:00:00.747


In [13]:
train.shape

(21516070, 5)

In [14]:
train = train.filter(pl.col("vacancy_id").is_in(features["vacancy_id"]))

In [15]:
train.head()

user_id,session_id,vacancy_id,action_type,action_dt
str,str,str,i64,datetime[ns]
"""u_800105""","""s_14201662""","""v_703293""",2,2023-11-01 00:00:11.156
"""u_847702""","""s_34563950""","""v_1740412""",2,2023-11-01 00:00:22.942
"""u_232671""","""s_14038""","""v_2627133""",2,2023-11-01 00:00:43.167
"""u_351123""","""s_4817650""","""v_1198228""",2,2023-11-01 00:00:50.295
"""u_794918""","""s_23209024""","""v_291450""",2,2023-11-01 00:00:50.664


In [16]:
train.shape

(197841, 5)

In [17]:
train = train.filter(pl.col("vacancy_id").is_in(train["vacancy_id"].value_counts().filter(pl.col("count") >= min_item_cnt)["vacancy_id"]))
train = train.filter(pl.col("user_id").is_in(train["user_id"].value_counts().filter(pl.col("count") >= min_item_cnt)["user_id"]))
train.head()

user_id,session_id,vacancy_id,action_type,action_dt
str,str,str,i64,datetime[ns]
"""u_934685""","""s_811276""","""v_2006692""",2,2023-11-01 00:06:22.427
"""u_934685""","""s_811276""","""v_2006692""",1,2023-11-01 00:06:33.150
"""u_1142370""","""s_291772""","""v_1243207""",2,2023-11-01 00:06:46.709
"""u_1153819""","""s_10827508""","""v_2722626""",2,2023-11-01 00:12:24.688
"""u_1115024""","""s_6067860""","""v_896237""",2,2023-11-01 00:12:39.766


In [18]:
features = features.filter(pl.col("vacancy_id").is_in(train["vacancy_id"].unique()))

In [19]:
train.shape, features.shape

((20696, 5), (3123, 2))

In [20]:
unique_users = sorted(train["user_id"].unique().to_list())
unique_vacancies = sorted(train["vacancy_id"].unique().to_list())

user2idx = {user_id: idx for idx, user_id in enumerate(unique_users)}
vac2idx = {vac_id: idx for idx, vac_id in enumerate(unique_vacancies)}

# **Making matrix**

In [21]:
idx2vac = {idx: vac_id for vac_id, idx in vac2idx.items()}
action_weights = {
    1: 4.0,
    2: 1.0,
    3: 2.0
}

In [22]:
features = features.with_columns(features["vacancy_id"].replace(vac2idx).alias("vacancy_id"))
features = features.with_columns(features["rate"].fill_null(features["rate"].mean()).alias("rate"))
features.head()

vacancy_id,rate
str,f64
"""2252""",0.713011
"""2430""",1.125985
"""2972""",1.189716
"""942""",0.876662
"""2820""",0.977777


In [23]:
pairs = train.select(["user_id", "vacancy_id", "action_type"])
pairs = pairs.with_columns(pairs["user_id"].replace(user2idx).alias("user_id"))
pairs = pairs.with_columns(pairs['vacancy_id'].replace(vac2idx).alias("vacancy_id"))
pairs = pairs.with_columns(pairs['action_type'].replace(action_weights).alias("action_type"))
pairs.head()

user_id,vacancy_id,action_type
str,str,f64
"""1570""","""1304""",1.0
"""1570""","""1304""",4.0
"""232""","""311""",1.0
"""245""","""2182""",1.0
"""173""","""2996""",1.0


In [24]:
pairs["vacancy_id"].unique().shape, features["vacancy_id"].unique().shape

((3123,), (3123,))

In [25]:
pairs = pairs.filter(pl.col("vacancy_id").is_in(features["vacancy_id"]))

In [26]:
# action_w = pl.DataFrame({"action_weights": pl.Series([]).cast(float)})
l = []
for i in tqdm(range(pairs.shape[0])): 
    for j in range(features.shape[0]):
        if pairs[i]["vacancy_id"].item() == features[j]["vacancy_id"].item():
            l.append(pairs[i]["action_type"].item()*features[j]["rate"].item())
            break
# action_w = action_w["action_weights"].append(pl.Series(l))
# action_w


  0%|          | 24/20696 [00:00<04:57, 69.38it/s]

100%|██████████| 20696/20696 [07:52<00:00, 43.80it/s] 


In [27]:
action_w = pl.DataFrame({"action_weights": pl.Series([]).cast(float)})
action_w = action_w["action_weights"].append(pl.Series(l))
action_w.head()

action_weights
f64
0.938731
3.754924
0.940774
0.877216
0.9375
1.190762
1.126846
4.507386
1.125
0.708333


In [28]:
pairs = pairs.with_columns(action_type = pl.Series(l))

In [29]:
users = pairs["user_id"].to_numpy().astype(int)
vacancies = pairs['vacancy_id'].to_numpy().astype(int)
preferences = pairs['action_type'].to_numpy().astype(float)

In [30]:
users

array([1570, 1570,  232, ...,  872,  959,  741])

In [31]:
vacancies

array([1304, 1304,  311, ...,  875, 1871, 2018])

In [32]:
preferences

SeriesView([0.93873094, 3.75492375, 0.94077429, ..., 0.97777653,
            1.125     , 1.1875    ])

In [33]:
uv_mat = csr_matrix((preferences, (users, vacancies)))
uv_mat

<1682x3123 sparse matrix of type '<class 'numpy.float64'>'
	with 8674 stored elements in Compressed Sparse Row format>

# **Working with BERT and ALS**

In [34]:
als_model = implicit.als.AlternatingLeastSquares(
    factors=150,
    random_state=RANDOM_STATE,
    iterations=20,
    calculate_training_loss=True,
    regularization=0.001
)
als_model.fit(uv_mat)

  check_blas_config()
100%|██████████| 20/20 [00:02<00:00,  7.70it/s, loss=0.00186]


# **Collecting data for user**

In [None]:
idx2vac = {str(vac2idx[i]): i for i in vac2idx}
idx2vac

In [None]:
val_vacs = pairs["vacancy_id"].unique().replace(idx2vac).shuffle()[::300]
val_vacs

In [71]:
parse_data = vac_df.filter(pl.col("vacancy_id").is_in(pairs["vacancy_id"].unique().replace(idx2vac).shuffle()[::300]))

vacancy_id,name,company.id,description,keySkills.keySkill,compensation.from,compensation.to,compensation.currencyCode,area.id,area.regionId,employment,workSchedule,workExperience
str,str,str,str,list[str],i64,i64,str,str,str,str,str,str
"""v_1210394""","""Менеджер по ра…","""c_270182""","""<p>Отдел по ра…",,200000.0,250000.0,"""RUR""","""a_1756""","""ar_41""","""full""","""fullDay""","""between1And3"""
"""v_1316797""","""Оператор call-…","""c_256239""","""<p><strong>Мед…","[""Грамотная речь"", ""входящие звонки"", … ""Прием и распределение телефонных звонков""]",51000.0,,"""RUR""","""a_1756""","""ar_41""","""full""","""shift""","""between1And3"""
"""v_241838""","""Разработчик C#…","""c_50063""","""<p>Ищем в кома…","[""Разработка ПО"", "".NET Core"", … ""Git""]",,,,"""a_1756""","""ar_41""","""full""","""flexible""","""between3And6"""
"""v_1922369""","""Менеджер отдел…","""c_262513""","""<p><strong>Обя…",,120000.0,300000.0,"""RUR""","""a_1756""","""ar_41""","""full""","""fullDay""","""noExperience"""
"""v_1430957""","""Менеджер по за…","""c_256601""","""<p><strong>ООО…","[""Деловая переписка"", ""Закупка товаров и услуг"", … ""Деловая коммуникация""]",60000.0,,"""RUR""","""a_6223""","""ar_78""","""full""","""fullDay""","""between1And3"""
"""v_1106255""","""Специалист упр…","""c_62154""","""<p><strong>Обя…",,67000.0,67000.0,"""RUR""","""a_1756""","""ar_41""","""full""","""fullDay""","""between1And3"""
"""v_3304""","""Оператор интер…","""c_18877""","""<p>Привет!</p>…","[""Телефонные переговоры"", ""оформление заказов"", … ""Консультирование клиентов по телефону""]",40000.0,,"""RUR""","""a_1756""","""ar_41""","""full""","""remote""","""between1And3"""
"""v_1419670""","""Системный инже…","""c_212655""","""<p><em><strong…","[""Linux"", ""SQL"", … ""внедрения решений IT""]",,,,"""a_1756""","""ar_41""","""full""","""fullDay""","""between1And3"""
"""v_2336712""","""Делопроизводит…","""c_59453""","""<strong>Обязан…","[""Пользователь ПК"", ""Грамотная речь"", … ""работа в ЭДО""]",35000.0,43000.0,"""RUR""","""a_1756""","""ar_41""","""full""","""fullDay""","""between1And3"""
"""v_1540618""","""Финансовый дир…","""c_274595""","""<p>Мы выходим …","[""1С: Предприятие 8"", ""Бухгалтерский учет"", … ""1С: Управление Торговлей""]",150000.0,,"""RUR""","""a_1756""","""ar_41""","""full""","""fullDay""","""moreThan6"""


# **Parse data**

In [54]:
parse_vac = vac_df[0]

vacancy_id,name,company.id,description,keySkills.keySkill,compensation.from,compensation.to,compensation.currencyCode,area.id,area.regionId,employment,workSchedule,workExperience
str,str,str,str,list[str],i64,i64,str,str,str,str,str,str
"""v_1840054""","""Бухгалтер (по …","""c_198109""","""<strong>Обязан…",,50000,65000,"""RUR""","""a_6223""","""ar_78""","""full""","""fullDay""","""between3And6"""


In [81]:
cur_code2rus = {"KZT": "₸", "BYR": "Br.", "EUR": "€", "KGS": "som", "RUR": "₽", "USD": "$", "UZS": "so'm"}
sch2rus = {
    "fullDay": "Полный рабочий день",
    "remote": "Удаленная работа",
    "flexible": "Гибкий график",
    "shift": "Работа по сменам",
    "flyInFlyOut": "Работа на вылет"
}
empl2rus = {
    "part": "Частичная занятость",
    "probation": "Испытательный срок",
    "full": "Полная занятость",
    "project": "Проектная работа",
    "volunteer": "Волонтерство"
}
exp2rus = {
    "moreThan6": "более 6 лет",
    "between1And3": "от 1 до 3 лет",
    "between3And6": "от 3 до 6 лет",
    "noExperience": "без опыта работы"
}

In [82]:
name = parse_vac["name"].item()
description = parse_vac["description"].item()
compensation_from = parse_vac["compensation.from"].item() if not(parse_vac["compensation.from"].item() is None) else 0
compensation_to = parse_vac["compensation.to"].item() if not(parse_vac["compensation.to"].item() is None) else compensation_from
res_compensation = f"{compensation_from} - {compensation_to}" if compensation_from != compensation_to else compensation_from
compensation_code = cur_code2rus[parse_vac["compensation.currencyCode"].item()] if not(parse_vac["compensation.currencyCode"].item() is None) else ""
skills = ", ".join(parse_vac["keySkills.keySkill"].item().to_list()) if not(parse_vac["keySkills.keySkill"].item() is None) else "Ключевые навыки не требуются"
employment = empl2rus[parse_vac["employment"].item()]
parse_work_sch = sch2rus[parse_vac["workSchedule"].item()]
exp = exp2rus[parse_vac["workExperience"].item()]

In [83]:

from IPython.display import display, HTML
# print_formatted_text(HTML("'<strong>Обязанности:</strong> <ul> <li>Осуществлять контроль за соблюдением посетителями правил поведения в музее</li> <li>При возникновении угрозы повреждения или кражи музейных предметов информировать руководство музея</li> </ul> <strong>Требования:</strong> <ul> <li>образование средне-специальное или высшее</li> <li>музейное дело, культурология</li> </ul> <strong>Условия:</strong> <ul> <li>Выходные: 2 дня. График с 09:00 до 18:00, по четвергам с 11:00 до 20:00</li> <li>Обед скользящий</li> </ul>'"))
display(HTML(f'''
             <h3>{name}</h3>
             {description}
             <p><b>Компенсация:</b> {res_compensation} {compensation_code}</p>
             <p><b>Ключевые навыки:</b> {skills}</p>
             <p><b>Трудоустройтво:</b> {employment}</p>
             <p><b>Расписание:</b> {parse_work_sch}</p>
             <p><b>Опыт работы:</b> {exp}</p>
             '''))