In [170]:
import re
import pandas as pd
import numpy as np
import nltk
import pymorphy3
import optuna
import mlflow

from optuna.integration.mlflow import MLflowCallback
from mlflow.utils.mlflow_tags import MLFLOW_PARENT_RUN_ID
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import ndcg_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from nltk.corpus import stopwords
from tqdm import tqdm
from scipy.sparse import csr_matrix, vstack
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from gensim.utils import simple_preprocess

import warnings
warnings.simplefilter('ignore', FutureWarning)

In [105]:
RANDOM_STATE = 42

In [103]:
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

EXPERIMENT_NAME = "hr-ai-scout"

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

# Load data

In [7]:
df = pd.read_csv('/Users/user/Documents/Magistracy/yearly_project/hr-ai-scout/total_df.csv')
df.head()

Unnamed: 0,vacancy_id,vacancy_name,vacancy_area,vacancy_experience,vacancy_employment,vacancy_schedule,vacancy_salary_from,vacancy_salary_to,vacancy_salary_currency,vacancy_salary_gross,...,resume_education,resume_courses,resume_salary,resume_age,resume_total_experience,resume_experience_months,resume_location,resume_gender,resume_applicant_status,target
0,126167948,–†–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫ SAP ABAP,–ú–æ—Å–∫–≤–∞,–ë–æ–ª–µ–µ 6 –ª–µ—Ç,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,300000.0,,RUR,False,...,['–ö–∞–∑–∞–Ω—Å–∫–∏–π –ê–≤–∏–∞—Ü–∏–æ–Ω–Ω—ã–π –ò–Ω—Å—Ç–∏—Ç—É—Ç'],,,65.0,19¬†–ª–µ—Ç,228.0,–ú–æ—Å–∫–≤–∞,–ú—É–∂—á–∏–Ω–∞,–†–∞—Å—Å–º–∞—Ç—Ä–∏–≤–∞–µ—Ç –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏—è,1
1,126167948,–†–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫ SAP ABAP,–ú–æ—Å–∫–≤–∞,–ë–æ–ª–µ–µ 6 –ª–µ—Ç,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,300000.0,,RUR,False,...,"['–û–û–û ""–û—Ç–∫—Ä—ã—Ç—ã–π –£—á–µ–±–Ω—ã–π –¶–µ–Ω—Ç—Ä –°–æ—Ñ—Ç–ë–∞–ª–∞–Ω—Å"", –≥. ...","['–û–û–û ""–û—Ç–∫—Ä—ã—Ç—ã–π –£—á–µ–±–Ω—ã–π –¶–µ–Ω—Ç—Ä –°–æ—Ñ—Ç–ë–∞–ª–∞–Ω—Å"", –≥. ...",,43.0,17¬†–ª–µ—Ç 4¬†–º–µ—Å—è—Ü–∞,208.0,–ú–æ—Å–∫–≤–∞,–ú—É–∂—á–∏–Ω–∞,–†–∞—Å—Å–º–∞—Ç—Ä–∏–≤–∞–µ—Ç –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏—è,1
2,126167948,–†–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫ SAP ABAP,–ú–æ—Å–∫–≤–∞,–ë–æ–ª–µ–µ 6 –ª–µ—Ç,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,300000.0,,RUR,False,...,['–û—Ä—Å–∫–∏–π –≥–æ—Å—É–¥–∞—Ä—Å—Ç–≤–µ–Ω–Ω—ã–π –ø–µ–¥–∞–≥–æ–≥–∏—á–µ—Å–∫–∏–π –∏–Ω—Å—Ç–∏—Ç...,,200‚Äâ000¬†‚ÇΩ –Ω–∞¬†—Ä—É–∫–∏,52.0,30¬†–ª–µ—Ç,360.0,–ú–æ—Å–∫–≤–∞,–ñ–µ–Ω—â–∏–Ω–∞,,1
3,126167948,–†–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫ SAP ABAP,–ú–æ—Å–∫–≤–∞,–ë–æ–ª–µ–µ 6 –ª–µ—Ç,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,300000.0,,RUR,False,...,['–ö—Ä–∞—Å–Ω–æ—è—Ä—Å–∫–∏–π –≥–æ—Å—É–¥–∞—Ä—Å—Ç–≤–µ–Ω–Ω—ã–π —É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç'],,500‚Äâ000¬†‚ÇΩ –Ω–∞¬†—Ä—É–∫–∏,56.0,29¬†–ª–µ—Ç 8¬†–º–µ—Å—è—Ü–µ–≤,356.0,–ö—Ä–∞—Å–Ω–æ—è—Ä—Å–∫,–ú—É–∂—á–∏–Ω–∞,–†–∞—Å—Å–º–∞—Ç—Ä–∏–≤–∞–µ—Ç –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏—è,1
4,126167948,–†–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫ SAP ABAP,–ú–æ—Å–∫–≤–∞,–ë–æ–ª–µ–µ 6 –ª–µ—Ç,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,300000.0,,RUR,False,...,['–ë–µ–ª–æ—Ä—É—Å–∫–∏–π –ì–æ—Å. –£–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç –ò–Ω—Ñ–æ—Ä–º–∞—Ç–∏–∫–∏ –∏ –†–∞...,"['SAP CIS, SAP XI', '–®–∫–æ–ª–∞ –õ–æ–≥–∏—Å—Ç–∏–∫–∏ –ú–ê–î–ò', 'S...",,48.0,25¬†–ª–µ—Ç 1¬†–º–µ—Å—è—Ü,301.0,Moscow,Male,,1


# Preprocessing

<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">
–í –ø–µ—Ä–≤—É—é –æ—á–µ—Ä–µ–¥—å —É–±–µ—Ä–µ–º —Å—Ç—Ä–æ–∫–∏, –≥–¥–µ –ø—Ä–æ–ø—É—â–µ–Ω—ã –≤—Å–µ –∫–ª—é—á–µ–≤—ã–µ –∑–Ω–∞—á–µ–Ω–∏—è –≤ —Ä–µ–∑—é–º–µ:
</div>

In [8]:
t1 = df.shape[0]
df = df.dropna(subset= ["resume_education",
                        "resume_last_experience_description",
                        "resume_last_position",
                        "resume_last_company_experience_period",
                        "resume_total_experience",
                        "resume_experience_months",
                        "resume_location",
                        "resume_specialization",
                        # "resume_gender",
                        # "resume_title"
                       ], how="all")
t2 = df.shape[0]
print('–£–¥–∞–ª–µ–Ω–æ ', t1 - t2 ,' —Å—Ç—Ä–æ–∫–∏')

–£–¥–∞–ª–µ–Ω–æ  84  —Å—Ç—Ä–æ–∫–∏


<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">
–£–¥–∞–ª–∏–º –µ—â–µ —Ç–µ —Å—Ç—Ä–æ–∫–∏, –≥–¥–µ —Å–ª—É—á–∏–ª—Å—è —Ç–µ—Ö–Ω–∏—á–µ—Å–∫–∏–π —Å–±–æ–π –≤ –ø–∞—Ä—Å–∏–Ω–≥–µ, –≥–¥–µ —É –∫–∞–Ω–¥–∏–¥–∞—Ç–∞ –æ–±—â–∏–π –æ–ø—ã—Ç –µ—Å—Ç—å, –∞ –ø–æ—Å–ª–µ–¥–Ω–∏–π –æ–ø—ã—Ç –Ω–µ —É–∫–∞–∑–∞–Ω (–∏ –Ω–∞–æ–±–æ—Ä–æ—Ç):
</div>

In [9]:
t1 = df.shape[0]
df = df.loc[~(df["resume_total_experience"].notna()
        & df["resume_last_experience_description"].isna()
        & df["resume_last_position"].isna())]
t2 = df.shape[0]
print('–£–¥–∞–ª–µ–Ω–æ ', t1 - t2 ,' —Å—Ç—Ä–æ–∫')

–£–¥–∞–ª–µ–Ω–æ  1543  —Å—Ç—Ä–æ–∫


In [10]:
t1 = df.shape[0]
df = df.loc[~(df["resume_total_experience"].isna()
        & df["resume_last_experience_description"].notna()
        & df["resume_last_position"].notna())]
t2 = df.shape[0]
print('–£–¥–∞–ª–µ–Ω–æ ', t1 - t2 ,' —Å—Ç—Ä–æ–∫')

–£–¥–∞–ª–µ–Ω–æ  0  —Å—Ç—Ä–æ–∫


<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">
–ü–æ—Å–º–æ—Ç—Ä–∏–º –Ω–∞ –ø—Ä–æ–ø—É—Å–∫–∏ –æ—Ç–¥–µ–ª—å–Ω–æ –ø–æ –∫–∞—Ç–µ–≥–æ—Ä–∏–∞–ª—å–Ω—ã–º –∏ —á–∏—Å–ª–æ–≤—ã–º –ø—Ä–∏–∑–Ω–∞–∫–∞–º.
</div>

In [11]:
num_cols = df.select_dtypes(include=[np.number]).columns
cat_cols = df.select_dtypes(include=['object']).columns

In [12]:
df[cat_cols] = df[cat_cols].fillna('NDT')

In [13]:
df.loc[df['resume_experience_months'].isna(), 'resume_last_experience_description'].unique()

array(['NDT'], dtype=object)

In [14]:
df['resume_age'] = df['resume_age'].fillna(df['resume_age'].mean())
df['resume_experience_months'] = df['resume_experience_months'].fillna(0)

In [15]:
df = df.drop(['vacancy_salary_to', 'vacancy_salary_from',
              'vacancy_salary_currency', 'vacancy_salary_gross'], axis=1)

In [16]:
df.loc[df['resume_last_company_experience_period'] == 'NDT', 'resume_last_experience_description'].unique()

array(['NDT'], dtype=object)

<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">

- –û–≥—Ä–∞–Ω–∏—á–∏–º –≤—ã–±—Ä–æ—Å—ã –ø–æ –∑–∞—Ä–ø–ª–∞—Ç–µ, –ø–æ—Ç–æ–º—É —á—Ç–æ —Ä–æ–≤–Ω–æ –æ–¥–Ω–æ –∑–Ω–∞—á–µ–Ω–∏–µ –ø–æ –æ–∂–∏–¥–∞–µ–º–æ–π –∑–∞—Ä–∞–±–æ—Ç–æ–Ω–æ–π –ø–ª–∞—Ç–µ = 999,999,999 (—Å–º–µ—à–Ω–æ, –Ω–æ –Ω–µ—Ç)

- –û–≥—Ä–∞–Ω–∏—á–∏–º –æ–ø—ã—Ç –æ–±—â–∏–π –∏ –≤–Ω—É—Ç—Ä–∏ –æ–¥–Ω–æ–π –∫–æ–º–ø–∞–Ω–∏–∏ –¥–æ 720 –º–µ—Å—è—Ü–µ–≤ (60 –ª–µ—Ç, –Ω–∏—á–µ–≥–æ —Å–µ–±–µ —É–∂–µ)

- –£–±–µ—Ä–µ–º –≤–æ–∑—Ä–∞—Å—Ç > 90, –Ω–µ –∂–¥–µ–º, —á—Ç–æ —ç—Ç–∏ –∫–∞–Ω–¥–∏–¥–∞—Ç—ã –Ω–∞—Ö–æ–¥—è—Ç—Å—è –≤ –ø–æ–∏—Å–∫–µ –≤–∞–∫–∞–Ω—Å–∏–∏
</div>

In [26]:
df = df[~(df.resume_salary > 1e7)]
df.loc[df['resume_experience_months'] > 720, 'resume_experience_months'] = 720
df.loc[df['resume_last_company_experience_months'] > 720, 'resume_last_company_experience_months'] = 720
df = df[~(df.resume_age > 90)]

<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">

- –¢–∞–∫–∂–µ —É–±–µ—Ä–µ–º —Å—Ç—Ä–æ–∫–∏, –≥–¥–µ –ø–æ—Å–ª–µ–¥–Ω–∏–π –æ–ø—ã—Ç –∫–∞–Ω–¥–∏–¥–∞—Ç–∞ –±–æ–ª—å—à–µ, —á–µ–º –æ–±—â–∏–π

- –ò –≥–¥–µ –æ–±—â–∏–π –æ–ø—ã—Ç –∫–∞–Ω–¥–∏–¥–∞—Ç–∞ +16 –ª–µ—Ç –±–æ–ª—å—à–µ —á–µ–º –≤–æ–∑—Ä–∞—Å—Ç (—Ö–æ—Ç—å —Ç–∞–∫)

</div>

In [27]:
df = df[~(df.resume_experience_months < df.resume_last_company_experience_months)]
df = df[~(df.resume_age < (df.resume_experience_months // 12) + 16)]

<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">

–ó–∞–º–µ–Ω–∏–º —Ç–µ–∫—É—â–∏–π —Ñ–æ—Ä–º–∞—Ç —Ä–∞–∑–±—Ä–æ—Å–∞ –ø–æ–ª–æ–≤ –≤ –¥–∞—Ç–∞—Å–µ—Ç–µ –Ω–∞ —É–Ω–∏—Ñ–∏—Ü–∏—Ä–æ–≤–∞–Ω–Ω—ã–π

</div>

In [29]:
gender_map = {
    '–ú—É–∂—á–∏–Ω–∞': '–ú—É–∂—á–∏–Ω–∞',
    'Male': '–ú—É–∂—á–∏–Ω–∞',
    '–ñ–µ–Ω—â–∏–Ω–∞': '–ñ–µ–Ω—â–∏–Ω–∞',
    'Female': '–ñ–µ–Ω—â–∏–Ω–∞'
}

df['resume_gender'] = df['resume_gender'].apply(lambda x: gender_map[x] if x in gender_map else '–ù–µ–∏–∑–≤–µ—Å—Ç–Ω–æ')

<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">
–ü—Ä–µ–æ–±—Ä–∞–∑—É–µ–º —Å–Ω–∞—á–∞–ª–∞ –æ–∂–∏–¥–∞–µ–º—ã–µ –∑–∞—Ä–ø–ª–∞—Ç—ã
</div>

In [19]:
df['resume_salary_split'] = df['resume_salary'].apply(lambda x: x.split())

df['salary_int'] = df['resume_salary_split'].apply(
    lambda x: int(''.join(part for part in x if re.fullmatch(r'\d+', part)))
              if any(re.fullmatch(r'\d+', part) for part in x)
              else np.nan
)

currency_symbols = ['‚ÇΩ', '$', '‚Ç¨', '‚Ç¥', '‚Ç∏', '‚Çº', '‚Çæ', 'Br', "so'm"]

rates_rub = {
    "‚ÇΩ": 1.0,
    "$": 80.85,
    "‚Ç¨": 94.14,
    "‚Ç¥": 1.94,
    "‚Ç∏": 0.150,
    "‚Çº": 47.8,
    "‚Çæ": 33.5,
    "Br": 28.7,
    "so'm": 0.0068
}

df['currency_symbol'] = df['resume_salary_split'].apply(
    lambda x: next((sym for sym in x if sym in currency_symbols), np.nan)
)

df['salary_converted'] = (df['salary_int'] * df['currency_symbol'].map(rates_rub)).fillna(0)

df['resume_salary'] = df['salary_converted']

df = df.drop(['resume_salary_split', 'salary_int', 'currency_symbol', 'salary_converted'], axis=1)

<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">
–î–æ–±–∞–≤–∏–º –¥–æ–ø–æ–ª–Ω–∏—Ç–µ–ª—å–Ω—ã–π —Å—Ç–æ–ª–±–µ—Ü —Å –æ–ø—ã—Ç–æ–º —Ä–∞–±–æ—Ç—ã –≤ –ø–æ—Å–ª–µ–¥–Ω–µ–π –∫–æ–º–ø–∞–Ω–∏–∏ –≤ –º–µ—Å—è—Ü–∞—Ö –¥–ª—è —É–¥–æ–±—Å—Ç–≤–∞
</div>

In [21]:
def experience_to_months(experience_text):
    months = 0
    # –û–ø—ã—Ç –≤ –≥–æ–¥–∞—Ö
    years_match = re.search(r'(\d+)\s*–≥–æ–¥', experience_text)
    if years_match:
        months += int(years_match.group(1)) * 12

    years_match = re.search(r'(\d+)\s*–ª–µ—Ç', experience_text)
    if years_match:
        months += int(years_match.group(1)) * 12

    # –û–ø—ã—Ç –≤ –º–µ—Å—è—Ü–∞—Ö
    months_match = re.search(r'(\d+)\s*–º–µ—Å—è—Ü', experience_text)
    if months_match:
        months += int(months_match.group(1))

    return months if months > 0 else np.nan

In [22]:
df['resume_last_company_experience_months'] = df['resume_last_company_experience_period'].apply(experience_to_months)

In [23]:
df.loc[df['resume_last_company_experience_period'] == 'NDT', 'resume_last_experience_description'].unique()

array(['NDT'], dtype=object)

<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">
–¢.–∫. –≤ –Ω–∞–∑–≤–∞–Ω–∏–∏ –∫–æ–º–ø–∞–Ω–∏–∏ —Å—Ç–æ–∏—Ç NDT, –º–æ–∂–Ω–æ —Å—Ç–æ–ª–±–µ—Ü resume_last_company_experience_months –∑–∞–ø–æ–ª–Ω—è—Ç—å –Ω—É–ª–µ–≤—ã–º–∏ –∑–Ω–∞—á–µ–Ω–∏—è–º–∏.
</div>

In [24]:
df['resume_last_company_experience_months'] = df['resume_last_company_experience_months'].fillna(0)

# Base model

## Train-test split

<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">
–í—ã–±–µ—Ä–µ–º –ø—Ä–∏–∑–Ω–∞–∫–∏ –¥–ª—è –ø–µ—Ä–≤–∏—á–Ω–æ–≥–æ –æ–±—É—á–µ–Ω–∏—è

</div>

In [238]:
features = [
    'vacancy_area',
    'vacancy_experience',
    'vacancy_employment', 
    'vacancy_schedule',
    # 'resume_specialization',
    # 'resume_education', 
    # 'resume_courses', 
    'resume_salary',
    'resume_age', 
    'resume_experience_months',
    'resume_location',
    'resume_gender', 
    'resume_applicant_status', 
    'resume_last_company_experience_months'
]
df[features]

Unnamed: 0,vacancy_experience,vacancy_employment,vacancy_schedule,resume_salary,resume_age,resume_experience_months,resume_gender,resume_applicant_status,resume_last_company_experience_months,location_matching,resume_skill_count_in_vacancy,last_position_in_vacancy,similarity_score_tfidf
0,–ë–æ–ª–µ–µ 6 –ª–µ—Ç,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,0.0,65.000000,228.0,–ú—É–∂—á–∏–Ω–∞,–†–∞—Å—Å–º–∞—Ç—Ä–∏–≤–∞–µ—Ç –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏—è,76.0,1,3,0.666667,0.284047
1,–ë–æ–ª–µ–µ 6 –ª–µ—Ç,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,0.0,43.000000,208.0,–ú—É–∂—á–∏–Ω–∞,–†–∞—Å—Å–º–∞—Ç—Ä–∏–≤–∞–µ—Ç –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏—è,8.0,1,2,0.500000,0.308726
2,–ë–æ–ª–µ–µ 6 –ª–µ—Ç,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,200000.0,52.000000,360.0,–ñ–µ–Ω—â–∏–Ω–∞,NDT,136.0,1,1,0.000000,0.510093
3,–ë–æ–ª–µ–µ 6 –ª–µ—Ç,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,500000.0,56.000000,356.0,–ú—É–∂—á–∏–Ω–∞,–†–∞—Å—Å–º–∞—Ç—Ä–∏–≤–∞–µ—Ç –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏—è,135.0,0,2,0.333333,0.301062
4,–ë–æ–ª–µ–µ 6 –ª–µ—Ç,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,0.0,48.000000,301.0,–ú—É–∂—á–∏–Ω–∞,NDT,0.0,0,2,0.600000,0.075429
...,...,...,...,...,...,...,...,...,...,...,...,...,...
325538,–û—Ç 3 –¥–æ 6 –ª–µ—Ç,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,242550.0,66.000000,521.0,–ñ–µ–Ω—â–∏–Ω–∞,NDT,270.0,0,0,0.166667,0.072670
325539,–û—Ç 3 –¥–æ 6 –ª–µ—Ç,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,0.0,40.000000,213.0,–ú—É–∂—á–∏–Ω–∞,–ê–∫—Ç–∏–≤–Ω–æ –∏—â–µ—Ç —Ä–∞–±–æ—Ç—É,35.0,1,0,0.000000,0.000000
325540,–û—Ç 3 –¥–æ 6 –ª–µ—Ç,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,80000.0,44.060813,121.0,–ú—É–∂—á–∏–Ω–∞,NDT,44.0,1,0,0.200000,0.047398
325541,–û—Ç 3 –¥–æ 6 –ª–µ—Ç,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,0.0,32.000000,117.0,–ñ–µ–Ω—â–∏–Ω–∞,NDT,96.0,1,0,0.200000,0.029086


In [239]:
numeric_features = df[features].select_dtypes(include=np.number).columns
categorical_features = df[features].select_dtypes(exclude=np.number).columns

In [270]:
X = df[features]
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

In [271]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((260434, 13), (65109, 13), (260434,), (65109,))

## Column Transformer

In [273]:
col_transformer = ColumnTransformer([
        ('numeric_scaling', StandardScaler(), numeric_features),
        ('categorical_encoding', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
    ])

In [274]:
X_train_transformed = col_transformer.fit_transform(X_train)
X_test_transformed = col_transformer.transform(X_test)

## Train model

In [275]:
lr = LogisticRegression(random_state=RANDOM_STATE)
lr.fit(X_train_transformed, y_train)
y_pred_proba = lr.predict_proba(X_test_transformed)

df_test = df.loc[X_test.index]
df_test['y_pred_proba'] = y_pred_proba[:, 1]

In [272]:
def calculate_metrics(df_test: pd.DataFrame) -> pd.DataFrame:
    ndcg_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    vacancy_ids = df_test['vacancy_id'].unique()
    
    for vacancy_id in vacancy_ids:
        mask = df_test['vacancy_id'] == vacancy_id
        y_true = df_test.loc[mask, 'target'].values
        y_score = df_test.loc[mask, 'y_pred_proba'].values
        
        if len(y_true) <= 1:
            continue
        
        y_true_2d = y_true.reshape(1, -1)
        y_score_2d = y_score.reshape(1, -1)
        
        ndcg = ndcg_score(y_true_2d, y_score_2d)
        ndcg_scores.append(ndcg)
        
        y_pred_binary = (y_score >= 0.5).astype(int)
        
        precision = precision_score(y_true, y_pred_binary, zero_division=0)
        recall = recall_score(y_true, y_pred_binary, zero_division=0)
        f1 = f1_score(y_true, y_pred_binary, zero_division=0)
        
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)
    
    if ndcg_scores:
        print(f"–°—Ä–µ–¥–Ω–∏–π NDCG: {np.mean(ndcg_scores):.4f}")
        print(f"–°—Ä–µ–¥–Ω–∏–π Precision: {np.mean(precision_scores):.4f}")
        print(f"–°—Ä–µ–¥–Ω–∏–π Recall: {np.mean(recall_scores):.4f}")
        print(f"–°—Ä–µ–¥–Ω–∏–π F1-Score: {np.mean(f1_scores):.4f}")

        return np.mean(ndcg_scores), np.mean(precision_scores), np.mean(recall_scores), np.mean(f1_scores)
    else:
        print("–ù–µ–¥–æ—Å—Ç–∞—Ç–æ—á–Ω–æ –¥–∞–Ω–Ω—ã—Ö –¥–ª—è —Ä–∞—Å—á–µ—Ç–∞ –º–µ—Ç—Ä–∏–∫")

        return None, None, None, None

In [276]:
ndcg, precision, recall, f1 = calculate_metrics(df_test)
metrics_baseline = {}
metrics_baseline['ndcg'] = ndcg
metrics_baseline['precision'] = precision
metrics_baseline['recall'] = recall
metrics_baseline['f1'] = f1

–°—Ä–µ–¥–Ω–∏–π NDCG: 0.7371
–°—Ä–µ–¥–Ω–∏–π Precision: 0.6091
–°—Ä–µ–¥–Ω–∏–π Recall: 0.5684
–°—Ä–µ–¥–Ω–∏–π F1-Score: 0.5734


In [265]:
best_params = lr.get_params()

In [266]:
RUN_NAME = "base_line" 
REGISTRY_MODEL_NAME = "base_line_model"

In [268]:
signature = mlflow.models.infer_signature(X_test, y_test)
input_example = X_test_transformed[:10]
code_paths = ["linear_models.ipynb"]

try:
    experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
except:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    lr_info = mlflow.sklearn.log_model(sk_model=lr, 
                                       artifact_path='base_line_model',
                                       registered_model_name=REGISTRY_MODEL_NAME,
                                       input_example=input_example,
                                       code_paths=code_paths,
                                       await_registration_for=60
                                      )
    mlflow.log_metrics(metrics_baseline)
    mlflow.log_params(best_params)

Successfully registered model 'base_model'.
2025/11/19 16:51:04 INFO mlflow.store.model_registry.abstract_store: Waiting up to 60 seconds for model version to finish creation. Model name: base_model, version 1


üèÉ View run base_model at: http://127.0.0.1:5000/#/experiments/1/runs/dbfc86766916445bbbaf5a2fd5219e63
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1


Created version '1' of model 'base_model'.


# Feature engineering

<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">
–î–æ–±–∞–≤–∏–º –ø—Ä–∏–∑–Ω–∞–∫ –º–∞—Ç—á–∏–Ω–≥–∞ –ª–æ–∫–∞—Ü–∏–∏ –≤–∞–∫–∞–Ω—Å–∏–∏ –∏ —Ä–µ–∑—é–º–µ

</div>

In [85]:
df['location_matching'] = df.apply(lambda row: 1 if row['vacancy_area'] == row['resume_location'] else 0, axis=1)

<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">

–°–¥–µ–ª–∞–µ–º –Ω–æ–≤—ã–π –ø—Ä–∏–∑–Ω–∞–∫, –∞ –∏–º–µ–Ω–Ω–æ –ø–æ—Å—á–∏—Ç–∞–µ–º –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –Ω–∞–≤—ã–∫–æ–≤ –∫–∞–Ω–¥–∏–¥–∞—Ç–∞, –∫–æ—Ç–æ—Ä—ã–µ —É–∫–∞–∑–∞–Ω—ã –≤ –≤–∞–∫–∞–Ω—Å–∏–∏.

</div>

In [86]:
def resume_skill_count_in_vacancy(row):
    count = 0
    skill_list = row['resume_skills'].replace('[', '').replace(']', '').replace("'", "").split(', ')
    for i in skill_list:
        if i in row['vacancy_description']:
            count += 1
    return count

df['resume_skill_count_in_vacancy'] = df.apply(resume_skill_count_in_vacancy, axis=1)

<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">

–¢–∞–∫–∂–µ –ø–æ—Å—á–∏—Ç–∞–µ–º –¥–æ–ª—é —Å–ª–æ–≤ –∏–∑ –ø–æ—Å–ª–µ–¥–Ω–µ–π –¥–æ–ª–∂–Ω–æ—Å—Ç–∏ –≤ —Ä–µ–∑—é–º–µ, –∫–æ—Ç–æ—Ä—ã–µ —É–∫–∞–∑–∞–Ω—ã –≤ –≤–∞–∫–∞–Ω—Å–∏–∏.

</div>

In [76]:
def last_position_in_vacancy(row):
    bow = []
    seps = [' ', '-', '_']
    for sep in seps:
        bow += row['resume_last_position'].split(sep=sep)
        bow = list(set(bow))
    
    c = 0
    for word in bow:
        if word in row['vacancy_description']:
            c +=1
    
    return c / len(bow)

In [79]:
df['last_position_in_vacancy'] = df.apply(last_position_in_vacancy, axis=1)

<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">

–¢–µ–ø–µ—Ä—å –∑–∞–∫–æ–¥–∏—Ä—É–µ–º –æ–ø–∏—Å–∞–Ω–∏–µ –≤–∞–∫–∞–Ω—Å–∏–∏ –∏ –ø–æ—Å–ª–µ–¥–Ω–µ–≥–æ –æ–ø—ã—Ç–∞ —Ä–∞–±–æ—Ç—ã —Ä–∞–∑–Ω—ã–º–∏ —Å–ø–æ—Å–æ–±–∞–º–∏ –∏ —Å—Ä–∞–≤–Ω–∏–º —á–µ—Ä–µ–∑ –∫–æ—Å–∏–Ω—É—Å–Ω–æ–µ —Ä–∞—Å—Å—Ç–æ—è–Ω–∏–µ.

</div>

In [32]:
def preprocess_data(df):
    """–û–±—Ä–∞–±–æ—Ç–∫–∞ –ø—Ä–æ–ø—É—â–µ–Ω–Ω—ã—Ö –∑–Ω–∞—á–µ–Ω–∏–π –≤ —Ç–µ–∫—Å—Ç–æ–≤—ã—Ö –ø–æ–ª—è—Ö"""
    print("–ü—Ä–æ–≤–µ—Ä–∫–∞ –ø—Ä–æ–ø—É—â–µ–Ω–Ω—ã—Ö –∑–Ω–∞—á–µ–Ω–∏–π...")
    print(f"–ü—Ä–æ–ø—É—Å–∫–∏ –≤ vacancy_description: {df['vacancy_description'].isna().sum()}")
    print(f"–ü—Ä–æ–ø—É—Å–∫–∏ –≤ resume_last_experience_description: {df['resume_last_experience_description'].isna().sum()}")
    
    # –ó–∞–ø–æ–ª–Ω—è–µ–º –ø—Ä–æ–ø—É—Å–∫–∏ –ø—É—Å—Ç—ã–º–∏ —Å—Ç—Ä–æ–∫–∞–º–∏
    df['vacancy_description'] = df['vacancy_description'].fillna('')
    df['resume_last_experience_description'] = df['resume_last_experience_description'].fillna('')
    
    # –ü—Ä–æ–≤–µ—Ä—è–µ–º, —á—Ç–æ –≤—Å–µ –∑–Ω–∞—á–µ–Ω–∏—è —Ç–µ–ø–µ—Ä—å —Å—Ç—Ä–æ–∫–æ–≤—ã–µ
    df['vacancy_description'] = df['vacancy_description'].astype(str)
    df['resume_last_experience_description'] = df['resume_last_experience_description'].astype(str)
    
    return df

In [33]:
def save_results(df, output_file):
    """–°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–æ–≤ –≤ CSV —Ñ–∞–π–ª"""
    df.to_csv(output_file, index=False, encoding='utf-8')
    print(f"–†–µ–∑—É–ª—å—Ç–∞—Ç—ã —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤ —Ñ–∞–π–ª: {output_file}")

In [34]:
def calculate_cosine_similarity(embeddings1, embeddings2):
    """–í—ã—á–∏—Å–ª–µ–Ω–∏–µ –∫–æ—Å–∏–Ω—É—Å–Ω–æ–≥–æ —Å—Ö–æ–¥—Å—Ç–≤–∞ –º–µ–∂–¥—É –¥–≤—É–º—è –Ω–∞–±–æ—Ä–∞–º–∏ —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤"""
    similarities = []
    
    for i in tqdm(range(embeddings1.shape[0])):
        emb1_row = embeddings1[i]
        emb2_row = embeddings2[i]
        
        similarity = cosine_similarity(emb1_row, emb2_row)[0][0]
        similarities.append(similarity)
    
    return similarities

In [35]:
warnings.filterwarnings('ignore')

try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

try:
    nltk.data.find('taggers/averaged_perceptron_tagger_ru')
except LookupError:
    nltk.download('averaged_perceptron_tagger_ru')

try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')

morph = pymorphy3.MorphAnalyzer()

[nltk_data] Downloading package wordnet to /Users/user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [36]:
def lemmatize_russian(tokens):
    """–õ–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è —Ä—É—Å—Å–∫–∏—Ö —Å–ª–æ–≤"""
    lemmas = []
    for token in tokens:
        parsed = morph.parse(token)[0]  # –ë–µ—Ä–µ–º —Å–∞–º—ã–π –≤–µ—Ä–æ—è—Ç–Ω—ã–π —Ä–∞–∑–±–æ—Ä
        lemmas.append(parsed.normal_form)
    return lemmas

In [37]:
def tokenize_and_lemmatize(text):
    """–¢–æ–∫–µ–Ω–∏–∑–∞—Ü–∏—è —Ç–µ–∫—Å—Ç–∞ —Å –ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏–µ–π –∏ —É–¥–∞–ª–µ–Ω–∏–µ–º —Å—Ç–æ–ø-—Å–ª–æ–≤"""
    # –ë–∞–∑–æ–≤–∞—è —Ç–æ–∫–µ–Ω–∏–∑–∞—Ü–∏—è
    tokens = simple_preprocess(text, deacc=True, min_len=2)
    
    # –£–¥–∞–ª—è–µ–º —Å—Ç–æ–ø-—Å–ª–æ–≤–∞
    stop_words = set(stopwords.words('russian') + stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # –õ–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è –¥–ª—è —Ä—É—Å—Å–∫–∏—Ö —Å–ª–æ–≤
    lemmatized_tokens = lemmatize_russian(tokens)
    
    return lemmatized_tokens

In [44]:
def get_tfidf_embeddings(texts, vectorizer=None, fit=True):
    """–°–æ–∑–¥–∞–Ω–∏–µ TF-IDF —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤ –¥–ª—è —Å–ø–∏—Å–∫–∞ —Ç–µ–∫—Å—Ç–æ–≤ —Å –ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏–µ–π"""
    if fit:
        vectorizer = TfidfVectorizer(
            max_features=5000,
            min_df=2,
            max_df=0.8,
            ngram_range=(1, 2),
            tokenizer=tokenize_and_lemmatize,
            token_pattern=None,
            lowercase=False  # –£–∂–µ —Å–¥–µ–ª–∞–Ω–æ –≤ —Ç–æ–∫–µ–Ω–∏–∑–∞—Ü–∏–∏
        )
        embeddings = vectorizer.fit_transform(texts)
    else:
        embeddings = vectorizer.transform(texts)
    
    return embeddings, vectorizer

In [45]:
def get_tfidf_vacancy_embeddings(df, vectorizer=None):
    """–°–æ–∑–¥–∞–Ω–∏–µ —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤ –¥–ª—è —É–Ω–∏–∫–∞–ª—å–Ω—ã—Ö –≤–∞–∫–∞–Ω—Å–∏–π —Å –ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏–µ–π"""
    unique_vacancies = df[['vacancy_id', 'vacancy_description']].drop_duplicates()
    
    unique_embeddings, vectorizer = get_tfidf_embeddings(
        unique_vacancies['vacancy_description'].tolist(), 
        vectorizer=vectorizer, 
        fit=(vectorizer is None)
    )
    
    # –°–æ–∑–¥–∞–µ–º mapping: vacancy_id -> sparse row
    vacancy_embedding_dict = dict(zip(unique_vacancies['vacancy_id'], unique_embeddings))
    
    rows = []
    for vid in df['vacancy_id']:
        rows.append(vacancy_embedding_dict[vid])
    
    # –û–±—ä–µ–¥–∏–Ω—è–µ–º –≤ –æ–¥–Ω—É sparse –º–∞—Ç—Ä–∏—Ü—É
    all_vacancy_embeddings = vstack(rows)
    
    return all_vacancy_embeddings, vectorizer

In [46]:
def process_similarity_scores_tfidf(df, vectorizer=None, fit=True):
    """–§—É–Ω–∫—Ü–∏—è –¥–ª—è –≤—ã—á–∏—Å–ª–µ–Ω–∏—è —Å—Ö–æ–∂–µ—Å—Ç–∏ —Å –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ–º TF-IDF –∏ –ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏–∏"""    
    # –ü—Ä–µ–¥–æ–±—Ä–∞–±–æ—Ç–∫–∞ –¥–∞–Ω–Ω—ã—Ö
    df = preprocess_data(df)
    
    print("–°–æ–∑–¥–∞–Ω–∏–µ TF-IDF —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤ –¥–ª—è –æ–ø–∏—Å–∞–Ω–∏–π –æ–ø—ã—Ç–∞ –≤ —Ä–µ–∑—é–º–µ...")
    experience_embeddings, tfidf_vectorizer = get_tfidf_embeddings(df['resume_last_experience_description'].tolist(), vectorizer=vectorizer, fit=fit)
    
    print("–°–æ–∑–¥–∞–Ω–∏–µ TF-IDF —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤ –¥–ª—è –æ–ø–∏—Å–∞–Ω–∏–π –≤–∞–∫–∞–Ω—Å–∏–π...")
    vacancy_embeddings, _ = get_tfidf_vacancy_embeddings(df, vectorizer=tfidf_vectorizer)
    
    print("–í—ã—á–∏—Å–ª–µ–Ω–∏–µ –∫–æ—Å–∏–Ω—É—Å–Ω–æ–≥–æ —Å—Ö–æ–¥—Å—Ç–≤–∞...")
    similarity_scores = calculate_cosine_similarity(vacancy_embeddings, experience_embeddings)
    
    # –î–æ–±–∞–≤–ª—è–µ–º scores –≤ DataFrame
    df['similarity_score_tfidf'] = similarity_scores
    
    return df, tfidf_vectorizer

In [50]:
df_tfidf = process_similarity_scores_tfidf(df.copy())
save_results(df_tfidf, 'description_df_with_scores_tfidf.csv')

–ü—Ä–æ–≤–µ—Ä–∫–∞ –ø—Ä–æ–ø—É—â–µ–Ω–Ω—ã—Ö –∑–Ω–∞—á–µ–Ω–∏–π...
–ü—Ä–æ–ø—É—Å–∫–∏ –≤ vacancy_description: 0
–ü—Ä–æ–ø—É—Å–∫–∏ –≤ resume_last_experience_description: 0
–°–æ–∑–¥–∞–Ω–∏–µ TF-IDF —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤ –¥–ª—è –æ–ø–∏—Å–∞–Ω–∏–π –æ–ø—ã—Ç–∞ –≤ —Ä–µ–∑—é–º–µ...
–°–æ–∑–¥–∞–Ω–∏–µ TF-IDF —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤ –¥–ª—è –æ–ø–∏—Å–∞–Ω–∏–π –≤–∞–∫–∞–Ω—Å–∏–π...
–í—ã—á–∏—Å–ª–µ–Ω–∏–µ –∫–æ—Å–∏–Ω—É—Å–Ω–æ–≥–æ —Å—Ö–æ–¥—Å—Ç–≤–∞...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 325543/325543 [01:04<00:00, 5032.68it/s]


–†–µ–∑—É–ª—å—Ç–∞—Ç—ã —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤ —Ñ–∞–π–ª: description_df_with_scores_tfidf.csv


In [52]:
df = df.merge(df_tfidf)

In [30]:
df

Unnamed: 0,vacancy_id,vacancy_name,vacancy_area,vacancy_experience,vacancy_employment,vacancy_schedule,vacancy_description,resume_id,resume_title,resume_specialization,...,resume_education,resume_courses,resume_salary,resume_age,resume_experience_months,resume_location,resume_gender,resume_applicant_status,target,resume_last_company_experience_months
0,126167948,–†–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫ SAP ABAP,–ú–æ—Å–∫–≤–∞,–ë–æ–ª–µ–µ 6 –ª–µ—Ç,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,"–ü—Ä–∏–≤–µ—Ç!.redev ‚Äî —Ç–µ—Ö–Ω–æ–ª–æ–≥–∏—á–µ—Å–∫–∞—è –∫–æ–º–ø–∞–Ω–∏—è, —Å–æ–∑–¥...",6969174,ABAP-—Ä–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫,"['–ü—Ä–æ–≥—Ä–∞–º–º–∏—Å—Ç, —Ä–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫']",...,['–ö–∞–∑–∞–Ω—Å–∫–∏–π –ê–≤–∏–∞—Ü–∏–æ–Ω–Ω—ã–π –ò–Ω—Å—Ç–∏—Ç—É—Ç'],NDT,0.0,65.000000,228.0,–ú–æ—Å–∫–≤–∞,–ú—É–∂—á–∏–Ω–∞,–†–∞—Å—Å–º–∞—Ç—Ä–∏–≤–∞–µ—Ç –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏—è,1,76.0
1,126167948,–†–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫ SAP ABAP,–ú–æ—Å–∫–≤–∞,–ë–æ–ª–µ–µ 6 –ª–µ—Ç,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,"–ü—Ä–∏–≤–µ—Ç!.redev ‚Äî —Ç–µ—Ö–Ω–æ–ª–æ–≥–∏—á–µ—Å–∫–∞—è –∫–æ–º–ø–∞–Ω–∏—è, —Å–æ–∑–¥...",9100077,"ABAP —Ä–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫ - SAP HCM, CRM, S/4HANA ERP(F...","['–ü—Ä–æ–≥—Ä–∞–º–º–∏—Å—Ç, —Ä–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫']",...,"['–û–û–û ""–û—Ç–∫—Ä—ã—Ç—ã–π –£—á–µ–±–Ω—ã–π –¶–µ–Ω—Ç—Ä –°–æ—Ñ—Ç–ë–∞–ª–∞–Ω—Å"", –≥. ...","['–û–û–û ""–û—Ç–∫—Ä—ã—Ç—ã–π –£—á–µ–±–Ω—ã–π –¶–µ–Ω—Ç—Ä –°–æ—Ñ—Ç–ë–∞–ª–∞–Ω—Å"", –≥. ...",0.0,43.000000,208.0,–ú–æ—Å–∫–≤–∞,–ú—É–∂—á–∏–Ω–∞,–†–∞—Å—Å–º–∞—Ç—Ä–∏–≤–∞–µ—Ç –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏—è,1,8.0
2,126167948,–†–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫ SAP ABAP,–ú–æ—Å–∫–≤–∞,–ë–æ–ª–µ–µ 6 –ª–µ—Ç,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,"–ü—Ä–∏–≤–µ—Ç!.redev ‚Äî —Ç–µ—Ö–Ω–æ–ª–æ–≥–∏—á–µ—Å–∫–∞—è –∫–æ–º–ø–∞–Ω–∏—è, —Å–æ–∑–¥...",32644957,–†–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫ ABAP,"['–ü—Ä–æ–≥—Ä–∞–º–º–∏—Å—Ç, —Ä–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫']",...,['–û—Ä—Å–∫–∏–π –≥–æ—Å—É–¥–∞—Ä—Å—Ç–≤–µ–Ω–Ω—ã–π –ø–µ–¥–∞–≥–æ–≥–∏—á–µ—Å–∫–∏–π –∏–Ω—Å—Ç–∏—Ç...,NDT,200000.0,52.000000,360.0,–ú–æ—Å–∫–≤–∞,–ñ–µ–Ω—â–∏–Ω–∞,NDT,1,136.0
3,126167948,–†–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫ SAP ABAP,–ú–æ—Å–∫–≤–∞,–ë–æ–ª–µ–µ 6 –ª–µ—Ç,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,"–ü—Ä–∏–≤–µ—Ç!.redev ‚Äî —Ç–µ—Ö–Ω–æ–ª–æ–≥–∏—á–µ—Å–∫–∞—è –∫–æ–º–ø–∞–Ω–∏—è, —Å–æ–∑–¥...",27220466,ABAP-—Ä–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫,"['–ü—Ä–æ–≥—Ä–∞–º–º–∏—Å—Ç, —Ä–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫']",...,['–ö—Ä–∞—Å–Ω–æ—è—Ä—Å–∫–∏–π –≥–æ—Å—É–¥–∞—Ä—Å—Ç–≤–µ–Ω–Ω—ã–π —É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç'],NDT,500000.0,56.000000,356.0,–ö—Ä–∞—Å–Ω–æ—è—Ä—Å–∫,–ú—É–∂—á–∏–Ω–∞,–†–∞—Å—Å–º–∞—Ç—Ä–∏–≤–∞–µ—Ç –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏—è,1,135.0
4,126167948,–†–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫ SAP ABAP,–ú–æ—Å–∫–≤–∞,–ë–æ–ª–µ–µ 6 –ª–µ—Ç,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,"–ü—Ä–∏–≤–µ—Ç!.redev ‚Äî —Ç–µ—Ö–Ω–æ–ª–æ–≥–∏—á–µ—Å–∫–∞—è –∫–æ–º–ø–∞–Ω–∏—è, —Å–æ–∑–¥...",7532708,ABAP —Ä–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫. Senior ABAP Developer. SAP T...,"['Programmer, developer']",...,['–ë–µ–ª–æ—Ä—É—Å–∫–∏–π –ì–æ—Å. –£–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç –ò–Ω—Ñ–æ—Ä–º–∞—Ç–∏–∫–∏ –∏ –†–∞...,"['SAP CIS, SAP XI', '–®–∫–æ–ª–∞ –õ–æ–≥–∏—Å—Ç–∏–∫–∏ –ú–ê–î–ò', 'S...",0.0,48.000000,301.0,Moscow,–ú—É–∂—á–∏–Ω–∞,NDT,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
332325,125547086,Golang developer (IT –±–∞–Ω–∫),–ú–æ—Å–∫–≤–∞,–û—Ç 3 –¥–æ 6 –ª–µ—Ç,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,–ú—ã ‚Äî Wildberries. –ú—ã ‚Äî —ç—Ç–æ –∫—Ä—É–ø–Ω–µ–π—à–∏–π –º–∞—Ä–∫–µ—Ç–ø–ª...,255135,"–£–ø—Ä–∞–≤–ª—è—é—â–∏–π –Ω–µ–¥–≤–∏–∂–∏–º–æ—Å—Ç—å—é, –î–∏—Ä–µ–∫—Ç–æ—Ä –ø–æ —É–ø—Ä–∞–≤–ª–µ...",['–ê–≥–µ–Ω—Ç –ø–æ –Ω–µ–¥–≤–∏–∂–∏–º–æ—Å—Ç–∏'],...,['–õ–µ–Ω–∏–Ω–≥—Ä–∞–¥—Å–∫–∏–π –≠–ª–µ–∫—Ç—Ä–æ—Ç–µ—Ö–Ω–∏—á–µ—Å–∫–∏–π –ò–Ω—Å—Ç–∏—Ç—É—Ç –∏–º...,['–ù–∞—Ü–∏–æ–Ω–∞–ª—å–Ω—ã–π –ö–æ–ª–ª–µ–¥–∂ –û—Ü–µ–Ω–∫–∏ –∏ –£–ø—Ä–∞–≤–ª–µ–Ω–∏—è –ù–µ–¥...,242550.0,66.000000,521.0,–°–∞–Ω–∫—Ç-–ü–µ—Ç–µ—Ä–±—É—Ä–≥,–ñ–µ–Ω—â–∏–Ω–∞,NDT,0,270.0
332326,125547086,Golang developer (IT –±–∞–Ω–∫),–ú–æ—Å–∫–≤–∞,–û—Ç 3 –¥–æ 6 –ª–µ—Ç,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,–ú—ã ‚Äî Wildberries. –ú—ã ‚Äî —ç—Ç–æ –∫—Ä—É–ø–Ω–µ–π—à–∏–π –º–∞—Ä–∫–µ—Ç–ø–ª...,34423776,"–î–∏—Ä–µ–∫—Ç–æ—Ä, –û–ø–µ—Ä–∞—Ü–∏–æ–Ω–Ω—ã–π –¥–∏—Ä–µ–∫—Ç–æ—Ä,–¢–µ—Ö–Ω–∏—á–µ—Å–∫–∏–π –¥–∏...","['–ú–µ—Ö–∞–Ω–∏–∫', '–¢–µ—Ö–Ω–∏—á–µ—Å–∫–∏–π –¥–∏—Ä–µ–∫—Ç–æ—Ä (CTO)']",...,['–ö—É–∑–±–∞—Å—Å–∫–∏–π –≥–æ—Å—É–¥–∞—Ä—Å—Ç–≤–µ–Ω–Ω—ã–π —Ç–µ—Ö–Ω–∏—á–µ—Å–∫–∏–π —É–Ω–∏–≤–µ...,NDT,0.0,40.000000,213.0,–ú–æ—Å–∫–≤–∞,–ú—É–∂—á–∏–Ω–∞,–ê–∫—Ç–∏–≤–Ω–æ –∏—â–µ—Ç —Ä–∞–±–æ—Ç—É,0,35.0
332327,125547086,Golang developer (IT –±–∞–Ω–∫),–ú–æ—Å–∫–≤–∞,–û—Ç 3 –¥–æ 6 –ª–µ—Ç,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,–ú—ã ‚Äî Wildberries. –ú—ã ‚Äî —ç—Ç–æ –∫—Ä—É–ø–Ω–µ–π—à–∏–π –º–∞—Ä–∫–µ—Ç–ø–ª...,54564265,–ò–Ω—Ç–µ—Ä–Ω–µ—Ç-–º–∞—Ä–∫–µ—Ç–æ–ª–æ–≥,"['–ú–µ–Ω–µ–¥–∂–µ—Ä –ø–æ –º–∞—Ä–∫–µ—Ç–∏–Ω–≥—É, –∏–Ω—Ç–µ—Ä–Ω–µ—Ç-–º–∞—Ä–∫–µ—Ç–æ–ª–æ–≥']",...,['–ú–æ—Å–∫–æ–≤—Å–∫–∏–π —Ñ–∏–Ω–∞–Ω—Å–æ–≤–æ-–ø—Ä–æ–º—ã—à–ª–µ–Ω–Ω—ã–π —É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ...,['–¢–æ—á–∫–∞ –ö–æ–¥–∞ - –í—Å–µ—Ä–æ—Å—Å–∏–π—Å–∫–∞—è —à–∫–æ–ª–∞ –ø—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ...,80000.0,44.060813,121.0,–ú–æ—Å–∫–≤–∞,–ú—É–∂—á–∏–Ω–∞,NDT,0,44.0
332328,125547086,Golang developer (IT –±–∞–Ω–∫),–ú–æ—Å–∫–≤–∞,–û—Ç 3 –¥–æ 6 –ª–µ—Ç,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,–ú—ã ‚Äî Wildberries. –ú—ã ‚Äî —ç—Ç–æ –∫—Ä—É–ø–Ω–µ–π—à–∏–π –º–∞—Ä–∫–µ—Ç–ø–ª...,70588832,–ú–µ–Ω–µ–¥–∂–µ—Ä –ø–æ –ø—Ä–æ–¥—É–∫—Ç—É,"['–ú–µ–Ω–µ–¥–∂–µ—Ä –ø—Ä–æ–¥—É–∫—Ç–∞', '–†—É–∫–æ–≤–æ–¥–∏—Ç–µ–ª—å –ø—Ä–æ–µ–∫—Ç–æ–≤']",...,"['–ù–∞—Ü–∏–æ–Ω–∞–ª—å–Ω—ã–π –∏—Å—Å–ª–µ–¥–æ–≤–∞—Ç–µ–ª—å—Å–∫–∏–π —É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç ""...","['ECCN2017, Poland, Warsaw, Speaker', 'Max Pla...",0.0,32.000000,117.0,–ú–æ—Å–∫–≤–∞,–ñ–µ–Ω—â–∏–Ω–∞,NDT,0,96.0


In [238]:
features = [
    'vacancy_area',
    'vacancy_experience',
    'vacancy_employment', 
    'vacancy_schedule',
    # 'resume_specialization',
    # 'resume_education', 
    # 'resume_courses', 
    'resume_salary',
    'resume_age', 
    'resume_experience_months',
    'resume_location',
    'resume_gender', 
    'resume_applicant_status', 
    'resume_last_company_experience_months', 
    'location_matching',
    'resume_skill_count_in_vacancy',
    'last_position_in_vacancy',
    'similarity_score_tfidf'
]
df[features]

Unnamed: 0,vacancy_experience,vacancy_employment,vacancy_schedule,resume_salary,resume_age,resume_experience_months,resume_gender,resume_applicant_status,resume_last_company_experience_months,location_matching,resume_skill_count_in_vacancy,last_position_in_vacancy,similarity_score_tfidf
0,–ë–æ–ª–µ–µ 6 –ª–µ—Ç,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,0.0,65.000000,228.0,–ú—É–∂—á–∏–Ω–∞,–†–∞—Å—Å–º–∞—Ç—Ä–∏–≤–∞–µ—Ç –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏—è,76.0,1,3,0.666667,0.284047
1,–ë–æ–ª–µ–µ 6 –ª–µ—Ç,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,0.0,43.000000,208.0,–ú—É–∂—á–∏–Ω–∞,–†–∞—Å—Å–º–∞—Ç—Ä–∏–≤–∞–µ—Ç –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏—è,8.0,1,2,0.500000,0.308726
2,–ë–æ–ª–µ–µ 6 –ª–µ—Ç,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,200000.0,52.000000,360.0,–ñ–µ–Ω—â–∏–Ω–∞,NDT,136.0,1,1,0.000000,0.510093
3,–ë–æ–ª–µ–µ 6 –ª–µ—Ç,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,500000.0,56.000000,356.0,–ú—É–∂—á–∏–Ω–∞,–†–∞—Å—Å–º–∞—Ç—Ä–∏–≤–∞–µ—Ç –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏—è,135.0,0,2,0.333333,0.301062
4,–ë–æ–ª–µ–µ 6 –ª–µ—Ç,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,0.0,48.000000,301.0,–ú—É–∂—á–∏–Ω–∞,NDT,0.0,0,2,0.600000,0.075429
...,...,...,...,...,...,...,...,...,...,...,...,...,...
325538,–û—Ç 3 –¥–æ 6 –ª–µ—Ç,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,242550.0,66.000000,521.0,–ñ–µ–Ω—â–∏–Ω–∞,NDT,270.0,0,0,0.166667,0.072670
325539,–û—Ç 3 –¥–æ 6 –ª–µ—Ç,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,0.0,40.000000,213.0,–ú—É–∂—á–∏–Ω–∞,–ê–∫—Ç–∏–≤–Ω–æ –∏—â–µ—Ç —Ä–∞–±–æ—Ç—É,35.0,1,0,0.000000,0.000000
325540,–û—Ç 3 –¥–æ 6 –ª–µ—Ç,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,80000.0,44.060813,121.0,–ú—É–∂—á–∏–Ω–∞,NDT,44.0,1,0,0.200000,0.047398
325541,–û—Ç 3 –¥–æ 6 –ª–µ—Ç,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,0.0,32.000000,117.0,–ñ–µ–Ω—â–∏–Ω–∞,NDT,96.0,1,0,0.200000,0.029086


In [239]:
numeric_features = df[features].select_dtypes(include=np.number).columns
categorical_features = df[features].select_dtypes(exclude=np.number).columns

## Train-test split

In [270]:
X = df[features]
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

In [271]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((260434, 13), (65109, 13), (260434,), (65109,))

# Base pipeline

In [273]:
col_transformer = ColumnTransformer([
        ('numeric_scaling', StandardScaler(), numeric_features),
        ('categorical_encoding', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
    ])

In [274]:
X_train_transformed = col_transformer.fit_transform(X_train)
X_test_transformed = col_transformer.transform(X_test)

In [275]:
lr = LogisticRegression(random_state=RANDOM_STATE)
lr.fit(X_train_transformed, y_train)
y_pred_proba = lr.predict_proba(X_test_transformed)

df_test = df.loc[X_test.index]
df_test['y_pred_proba'] = y_pred_proba[:, 1]

In [272]:
def calculate_metrics(df_test: pd.DataFrame) -> pd.DataFrame:
    ndcg_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    vacancy_ids = df_test['vacancy_id'].unique()
    
    for vacancy_id in vacancy_ids:
        mask = df_test['vacancy_id'] == vacancy_id
        y_true = df_test.loc[mask, 'target'].values
        y_score = df_test.loc[mask, 'y_pred_proba'].values
        
        if len(y_true) <= 1:
            continue
        
        y_true_2d = y_true.reshape(1, -1)
        y_score_2d = y_score.reshape(1, -1)
        
        ndcg = ndcg_score(y_true_2d, y_score_2d)
        ndcg_scores.append(ndcg)
        
        y_pred_binary = (y_score >= 0.5).astype(int)
        
        precision = precision_score(y_true, y_pred_binary, zero_division=0)
        recall = recall_score(y_true, y_pred_binary, zero_division=0)
        f1 = f1_score(y_true, y_pred_binary, zero_division=0)
        
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)
    
    if ndcg_scores:
        print(f"–°—Ä–µ–¥–Ω–∏–π NDCG: {np.mean(ndcg_scores):.4f}")
        print(f"–°—Ä–µ–¥–Ω–∏–π Precision: {np.mean(precision_scores):.4f}")
        print(f"–°—Ä–µ–¥–Ω–∏–π Recall: {np.mean(recall_scores):.4f}")
        print(f"–°—Ä–µ–¥–Ω–∏–π F1-Score: {np.mean(f1_scores):.4f}")

        return np.mean(ndcg_scores), np.mean(precision_scores), np.mean(recall_scores), np.mean(f1_scores)
    else:
        print("–ù–µ–¥–æ—Å—Ç–∞—Ç–æ—á–Ω–æ –¥–∞–Ω–Ω—ã—Ö –¥–ª—è —Ä–∞—Å—á–µ—Ç–∞ –º–µ—Ç—Ä–∏–∫")

        return None, None, None, None

In [276]:
ndcg, precision, recall, f1 = calculate_metrics(df_test)
metrics_baseline = {}
metrics_baseline['ndcg'] = ndcg
metrics_baseline['precision'] = precision
metrics_baseline['recall'] = recall
metrics_baseline['f1'] = f1

–°—Ä–µ–¥–Ω–∏–π NDCG: 0.7371
–°—Ä–µ–¥–Ω–∏–π Precision: 0.6091
–°—Ä–µ–¥–Ω–∏–π Recall: 0.5684
–°—Ä–µ–¥–Ω–∏–π F1-Score: 0.5734


In [265]:
best_params = lr.get_params()

In [266]:
RUN_NAME = "base_model" 
REGISTRY_MODEL_NAME = "base_model"

In [268]:
signature = mlflow.models.infer_signature(X_test, y_test)
input_example = X_test_transformed[:10]
code_paths = ["linear_models.ipynb"]

try:
    experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
except:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    lr_info = mlflow.sklearn.log_model(sk_model=lr, 
                                       artifact_path='base_model',
                                       registered_model_name=REGISTRY_MODEL_NAME,
                                       input_example=input_example,
                                       code_paths=code_paths,
                                       await_registration_for=60
                                      )
    mlflow.log_metrics(metrics_baseline)
    mlflow.log_params(best_params)

Successfully registered model 'base_model'.
2025/11/19 16:51:04 INFO mlflow.store.model_registry.abstract_store: Waiting up to 60 seconds for model version to finish creation. Model name: base_model, version 1


üèÉ View run base_model at: http://127.0.0.1:5000/#/experiments/1/runs/dbfc86766916445bbbaf5a2fd5219e63
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1


Created version '1' of model 'base_model'.


In [278]:
from sklearn import set_config
set_config(display='diagram')

pipeline = Pipeline([
    ('preprocessing', ColumnTransformer([
        ('numeric_scaling', StandardScaler(), numeric_features),
        ('categorical_encoding', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
    ])),
    ('model', LogisticRegression(random_state=RANDOM_STATE))
])

pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessing', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('numeric_scaling', ...), ('categorical_encoding', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,100


In [279]:
y_pred_proba = pipeline.predict_proba(X_test)

In [280]:
y_pred_proba[:, 1]

array([0.00294784, 0.00319049, 0.00142391, ..., 0.00262649, 0.00302502,
       0.00397595], shape=(65109,))

In [281]:
df_test = df.loc[X_test.index]
df_test['y_pred_proba'] = y_pred_proba[:, 1]

In [282]:
def calculate_metrics(df_test: pd.DataFrame) -> pd.DataFrame:
    ndcg_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    vacancy_ids = df_test['vacancy_id'].unique()
    
    for vacancy_id in vacancy_ids:
        mask = df_test['vacancy_id'] == vacancy_id
        y_true = df_test.loc[mask, 'target'].values
        y_score = df_test.loc[mask, 'y_pred_proba'].values
        
        if len(y_true) <= 1:
            continue
        
        y_true_2d = y_true.reshape(1, -1)
        y_score_2d = y_score.reshape(1, -1)
        
        ndcg = ndcg_score(y_true_2d, y_score_2d)
        ndcg_scores.append(ndcg)
        
        y_pred_binary = (y_score >= 0.5).astype(int)
        
        precision = precision_score(y_true, y_pred_binary, zero_division=0)
        recall = recall_score(y_true, y_pred_binary, zero_division=0)
        f1 = f1_score(y_true, y_pred_binary, zero_division=0)
        
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)
    
    if ndcg_scores:
        print(f"–°—Ä–µ–¥–Ω–∏–π NDCG: {np.mean(ndcg_scores):.4f}")
        print(f"–°—Ä–µ–¥–Ω–∏–π Precision: {np.mean(precision_scores):.4f}")
        print(f"–°—Ä–µ–¥–Ω–∏–π Recall: {np.mean(recall_scores):.4f}")
        print(f"–°—Ä–µ–¥–Ω–∏–π F1-Score: {np.mean(f1_scores):.4f}")

        return np.mean(ndcg_scores), np.mean(precision_scores), np.mean(recall_scores), np.mean(f1_scores)
    else:
        print("–ù–µ–¥–æ—Å—Ç–∞—Ç–æ—á–Ω–æ –¥–∞–Ω–Ω—ã—Ö –¥–ª—è —Ä–∞—Å—á–µ—Ç–∞ –º–µ—Ç—Ä–∏–∫")

        return None, None, None, None

In [283]:
ndcg, precision, recall, f1 = calculate_metrics(df_test)
metrics_baseline = {}
metrics_baseline['ndcg'] = ndcg
metrics_baseline['precision'] = precision
metrics_baseline['recall'] = recall
metrics_baseline['f1'] = f1

–°—Ä–µ–¥–Ω–∏–π NDCG: 0.7371
–°—Ä–µ–¥–Ω–∏–π Precision: 0.6091
–°—Ä–µ–¥–Ω–∏–π Recall: 0.5684
–°—Ä–µ–¥–Ω–∏–π F1-Score: 0.5734


In [None]:
RUN_NAME = "base_pip" 
REGISTRY_MODEL_NAME = "base_model"