In [85]:
import re
import pandas as pd
import numpy as np
import nltk
import pymorphy3
import optuna
import mlflow
import pickle

from optuna.integration.mlflow import MLflowCallback
from mlflow.utils.mlflow_tags import MLFLOW_PARENT_RUN_ID
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import ndcg_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from nltk.corpus import stopwords
from tqdm import tqdm
from scipy.sparse import csr_matrix, vstack
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from gensim.utils import simple_preprocess

import warnings
warnings.simplefilter('ignore', FutureWarning)

from sklearn import set_config
set_config(display='diagram')

In [2]:
RANDOM_STATE = 42

In [3]:
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

EXPERIMENT_NAME = "hr-ai-scout"

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

# Load data

In [4]:
df = pd.read_csv('/Users/user/Documents/Magistracy/yearly_project/hr-ai-scout/total_df.csv')
df.head()

Unnamed: 0,vacancy_id,vacancy_name,vacancy_area,vacancy_experience,vacancy_employment,vacancy_schedule,vacancy_salary_from,vacancy_salary_to,vacancy_salary_currency,vacancy_salary_gross,...,resume_education,resume_courses,resume_salary,resume_age,resume_total_experience,resume_experience_months,resume_location,resume_gender,resume_applicant_status,target
0,126167948,Разработчик SAP ABAP,Москва,Более 6 лет,Полная занятость,Удаленная работа,300000.0,,RUR,False,...,['Казанский Авиационный Институт'],,,65.0,19 лет,228.0,Москва,Мужчина,Рассматривает предложения,1
1,126167948,Разработчик SAP ABAP,Москва,Более 6 лет,Полная занятость,Удаленная работа,300000.0,,RUR,False,...,"['ООО ""Открытый Учебный Центр СофтБаланс"", г. ...","['ООО ""Открытый Учебный Центр СофтБаланс"", г. ...",,43.0,17 лет 4 месяца,208.0,Москва,Мужчина,Рассматривает предложения,1
2,126167948,Разработчик SAP ABAP,Москва,Более 6 лет,Полная занятость,Удаленная работа,300000.0,,RUR,False,...,['Орский государственный педагогический инстит...,,200 000 ₽ на руки,52.0,30 лет,360.0,Москва,Женщина,,1
3,126167948,Разработчик SAP ABAP,Москва,Более 6 лет,Полная занятость,Удаленная работа,300000.0,,RUR,False,...,['Красноярский государственный университет'],,500 000 ₽ на руки,56.0,29 лет 8 месяцев,356.0,Красноярск,Мужчина,Рассматривает предложения,1
4,126167948,Разработчик SAP ABAP,Москва,Более 6 лет,Полная занятость,Удаленная работа,300000.0,,RUR,False,...,['Белоруский Гос. Университет Информатики и Ра...,"['SAP CIS, SAP XI', 'Школа Логистики МАДИ', 'S...",,48.0,25 лет 1 месяц,301.0,Moscow,Male,,1


# Preprocessing

<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">
В первую очередь уберем строки, где пропущены все ключевые значения в резюме:
</div>

In [5]:
t1 = df.shape[0]
df = df.dropna(subset= ["resume_education",
                        "resume_last_experience_description",
                        "resume_last_position",
                        "resume_last_company_experience_period",
                        "resume_total_experience",
                        "resume_experience_months",
                        "resume_location",
                        "resume_specialization",
                        # "resume_gender",
                        # "resume_title"
                       ], how="all")
t2 = df.shape[0]
print('Удалено ', t1 - t2 ,' строки')

Удалено  84  строки


<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">
Удалим еще те строки, где случился технический сбой в парсинге, где у кандидата общий опыт есть, а последний опыт не указан (и наоборот):
</div>

In [6]:
t1 = df.shape[0]
df = df.loc[~(df["resume_total_experience"].notna()
        & df["resume_last_experience_description"].isna()
        & df["resume_last_position"].isna())]
t2 = df.shape[0]
print('Удалено ', t1 - t2 ,' строк')

Удалено  1543  строк


In [7]:
t1 = df.shape[0]
df = df.loc[~(df["resume_total_experience"].isna()
        & df["resume_last_experience_description"].notna()
        & df["resume_last_position"].notna())]
t2 = df.shape[0]
print('Удалено ', t1 - t2 ,' строк')

Удалено  0  строк


<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">
Посмотрим на пропуски отдельно по категориальным и числовым признакам.
</div>

In [8]:
num_cols = df.select_dtypes(include=[np.number]).columns
cat_cols = df.select_dtypes(include=['object']).columns

In [9]:
df[cat_cols] = df[cat_cols].fillna('NDT')

In [10]:
df.loc[df['resume_experience_months'].isna(), 'resume_last_experience_description'].unique()

array(['NDT'], dtype=object)

In [11]:
df['resume_age'] = df['resume_age'].fillna(df['resume_age'].mean())
df['resume_experience_months'] = df['resume_experience_months'].fillna(0)

In [12]:
df = df.drop(['vacancy_salary_to', 'vacancy_salary_from',
              'vacancy_salary_currency', 'vacancy_salary_gross'], axis=1)

In [13]:
df.loc[df['resume_last_company_experience_period'] == 'NDT', 'resume_last_experience_description'].unique()

array(['NDT'], dtype=object)

<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">
Преобразуем сначала ожидаемые зарплаты
</div>

In [14]:
df['resume_salary_split'] = df['resume_salary'].apply(lambda x: x.split())

df['salary_int'] = df['resume_salary_split'].apply(
    lambda x: int(''.join(part for part in x if re.fullmatch(r'\d+', part)))
              if any(re.fullmatch(r'\d+', part) for part in x)
              else np.nan
)

currency_symbols = ['₽', '$', '€', '₴', '₸', '₼', '₾', 'Br', "so'm"]

rates_rub = {
    "₽": 1.0,
    "$": 80.85,
    "€": 94.14,
    "₴": 1.94,
    "₸": 0.150,
    "₼": 47.8,
    "₾": 33.5,
    "Br": 28.7,
    "so'm": 0.0068
}

df['currency_symbol'] = df['resume_salary_split'].apply(
    lambda x: next((sym for sym in x if sym in currency_symbols), np.nan)
)

df['salary_converted'] = (df['salary_int'] * df['currency_symbol'].map(rates_rub)).fillna(0)

df['resume_salary'] = df['salary_converted']

df = df.drop(['resume_salary_split', 'salary_int', 'currency_symbol', 'salary_converted'], axis=1)

<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">
Добавим дополнительный столбец с опытом работы в последней компании в месяцах для удобства
</div>

In [15]:
def experience_to_months(experience_text):
    months = 0
    # Опыт в годах
    years_match = re.search(r'(\d+)\s*год', experience_text)
    if years_match:
        months += int(years_match.group(1)) * 12

    years_match = re.search(r'(\d+)\s*лет', experience_text)
    if years_match:
        months += int(years_match.group(1)) * 12

    # Опыт в месяцах
    months_match = re.search(r'(\d+)\s*месяц', experience_text)
    if months_match:
        months += int(months_match.group(1))

    return months if months > 0 else np.nan

In [16]:
df['resume_last_company_experience_months'] = df['resume_last_company_experience_period'].apply(experience_to_months)

In [17]:
df.loc[df['resume_last_company_experience_period'] == 'NDT', 'resume_last_experience_description'].unique()

array(['NDT'], dtype=object)

<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">
Т.к. в названии компании стоит NDT, можно столбец resume_last_company_experience_months заполнять нулевыми значениями.
</div>

In [18]:
df['resume_last_company_experience_months'] = df['resume_last_company_experience_months'].fillna(0)

<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">

- Ограничим выбросы по зарплате, потому что ровно одно значение по ожидаемой заработоной плате = 999,999,999 (смешно, но нет)

- Ограничим опыт общий и внутри одной компании до 720 месяцев (60 лет, ничего себе уже)

- Уберем возраст > 90, не ждем, что эти кандидаты находятся в поиске вакансии
</div>

In [19]:
df = df[~(df.resume_salary > 1e7)]
df.loc[df['resume_experience_months'] > 720, 'resume_experience_months'] = 720
df.loc[df['resume_last_company_experience_months'] > 720, 'resume_last_company_experience_months'] = 720
df = df[~(df.resume_age > 90)]

<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">

- Также уберем строки, где последний опыт кандидата больше, чем общий

- И где общий опыт кандидата +16 лет больше чем возраст (хоть так)

</div>

In [20]:
df = df[~(df.resume_experience_months < df.resume_last_company_experience_months)]
df = df[~(df.resume_age < (df.resume_experience_months // 12) + 16)]

<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">

Заменим текущий формат разброса полов в датасете на унифицированный

</div>

In [21]:
gender_map = {
    'Мужчина': 'Мужчина',
    'Male': 'Мужчина',
    'Женщина': 'Женщина',
    'Female': 'Женщина'
}

df['resume_gender'] = df['resume_gender'].apply(lambda x: gender_map[x] if x in gender_map else 'Неизвестно')

# Base model

## Train-test split

<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">
Выберем признаки для первичного обучения

</div>

In [22]:
features = [
    'vacancy_area',
    'vacancy_experience',
    'vacancy_employment', 
    'vacancy_schedule',
    # 'resume_specialization',
    # 'resume_education', 
    # 'resume_courses', 
    'resume_salary',
    'resume_age', 
    'resume_experience_months',
    'resume_location',
    'resume_gender', 
    'resume_applicant_status', 
    'resume_last_company_experience_months'
]
df[features]

Unnamed: 0,vacancy_area,vacancy_experience,vacancy_employment,vacancy_schedule,resume_salary,resume_age,resume_experience_months,resume_location,resume_gender,resume_applicant_status,resume_last_company_experience_months
0,Москва,Более 6 лет,Полная занятость,Удаленная работа,0.0,65.000000,228.0,Москва,Мужчина,Рассматривает предложения,76.0
1,Москва,Более 6 лет,Полная занятость,Удаленная работа,0.0,43.000000,208.0,Москва,Мужчина,Рассматривает предложения,8.0
2,Москва,Более 6 лет,Полная занятость,Удаленная работа,200000.0,52.000000,360.0,Москва,Женщина,NDT,136.0
3,Москва,Более 6 лет,Полная занятость,Удаленная работа,500000.0,56.000000,356.0,Красноярск,Мужчина,Рассматривает предложения,135.0
4,Москва,Более 6 лет,Полная занятость,Удаленная работа,0.0,48.000000,301.0,Moscow,Мужчина,NDT,0.0
...,...,...,...,...,...,...,...,...,...,...,...
332325,Москва,От 3 до 6 лет,Полная занятость,Удаленная работа,242550.0,66.000000,521.0,Санкт-Петербург,Женщина,NDT,270.0
332326,Москва,От 3 до 6 лет,Полная занятость,Удаленная работа,0.0,40.000000,213.0,Москва,Мужчина,Активно ищет работу,35.0
332327,Москва,От 3 до 6 лет,Полная занятость,Удаленная работа,80000.0,44.060813,121.0,Москва,Мужчина,NDT,44.0
332328,Москва,От 3 до 6 лет,Полная занятость,Удаленная работа,0.0,32.000000,117.0,Москва,Женщина,NDT,96.0


In [23]:
numeric_features = df[features].select_dtypes(include=np.number).columns
categorical_features = df[features].select_dtypes(exclude=np.number).columns

In [24]:
X = df[features].copy()
y = df['target'].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

In [25]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((260434, 11), (65109, 11), (260434,), (65109,))

## Base pipeline

<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">

Построим базовый pipeline

</div>

In [26]:
%%time
pipeline = Pipeline([
    ('preprocessing', ColumnTransformer([
        ('numeric_scaling', StandardScaler(), numeric_features),
        ('categorical_encoding', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
    ])),
    ('model', LogisticRegression(random_state=RANDOM_STATE))
])

pipeline.fit(X_train, y_train)

CPU times: user 1.05 s, sys: 25.4 ms, total: 1.07 s
Wall time: 1.08 s


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1,2
,steps,"[('preprocessing', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('numeric_scaling', ...), ('categorical_encoding', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,100


In [27]:
y_pred_proba = pipeline.predict_proba(X_test)

In [28]:
y_pred_proba[:, 1]

array([0.0552009 , 0.05356376, 0.01734845, ..., 0.24757553, 0.12031918,
       0.08095766], shape=(65109,))

In [29]:
df_test = df.loc[X_test.index].copy()
df_test['y_pred_proba'] = y_pred_proba[:, 1]

<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">

Напишем функцию для расчет метрики NDCG с учетом каждой вакансии, а также добавим вспомогательные метрики.

</div>

In [30]:
def calculate_metrics(df_test: pd.DataFrame) -> pd.DataFrame:
    ndcg_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    vacancy_ids = df_test['vacancy_id'].unique()
    
    for vacancy_id in vacancy_ids:
        mask = df_test['vacancy_id'] == vacancy_id
        y_true = df_test.loc[mask, 'target'].values
        y_score = df_test.loc[mask, 'y_pred_proba'].values
        
        if len(y_true) <= 1:
            continue
        
        y_true_2d = y_true.reshape(1, -1)
        y_score_2d = y_score.reshape(1, -1)
        
        ndcg = ndcg_score(y_true_2d, y_score_2d)
        ndcg_scores.append(ndcg)
        
        y_pred_binary = (y_score >= 0.5).astype(int)
        
        precision = precision_score(y_true, y_pred_binary, zero_division=0)
        recall = recall_score(y_true, y_pred_binary, zero_division=0)
        f1 = f1_score(y_true, y_pred_binary, zero_division=0)
        
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)
    
    if ndcg_scores:
        print(f"Средний NDCG: {np.mean(ndcg_scores):.4f}")
        print(f"Средний Precision: {np.mean(precision_scores):.4f}")
        print(f"Средний Recall: {np.mean(recall_scores):.4f}")
        print(f"Средний F1-Score: {np.mean(f1_scores):.4f}")

        return np.mean(ndcg_scores), np.mean(precision_scores), np.mean(recall_scores), np.mean(f1_scores)
    else:
        print("Недостаточно данных для расчета метрик")

        return None, None, None, None

In [31]:
ndcg, precision, recall, f1 = calculate_metrics(df_test)
metrics_baseline = {}
metrics_baseline['NDCG'] = ndcg
metrics_baseline['Precision'] = precision
metrics_baseline['Recall'] = recall
metrics_baseline['F1'] = f1

Средний NDCG: 0.5043
Средний Precision: 0.0890
Средний Recall: 0.0574
Средний F1-Score: 0.0662


<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">

На дефолтных параметрах точность оставляет желать лучшего

</div>

In [32]:
best_params = pipeline.get_params()

In [33]:
RUN_NAME = "base_pipeline" 
REGISTRY_MODEL_NAME = "base_pipeline"

In [34]:
signature = mlflow.models.infer_signature(X_test, y_test)
input_example = X_test[:10]
code_paths = ["linear_models.ipynb"]

try:
    experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
except:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    lr_info = mlflow.sklearn.log_model(sk_model=pipeline, 
                                       artifact_path='base_pipeline',
                                       registered_model_name=REGISTRY_MODEL_NAME,
                                       input_example=input_example,
                                       code_paths=code_paths,
                                       await_registration_for=60
                                      )
    mlflow.log_metrics(metrics_baseline)
    mlflow.log_params(best_params)

Successfully registered model 'base_pipeline'.
2025/11/19 22:42:19 INFO mlflow.store.model_registry.abstract_store: Waiting up to 60 seconds for model version to finish creation. Model name: base_pipeline, version 1


🏃 View run base_pipeline at: http://127.0.0.1:5000/#/experiments/1/runs/c591b0e6dba740b7b92e3cae5b35581b
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


Created version '1' of model 'base_pipeline'.


# Feature engineering

## Create new feature

<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">
Добавим признак матчинга локации вакансии и резюме

</div>

In [35]:
df['location_matching'] = df.apply(lambda row: 1 if row['vacancy_area'] == row['resume_location'] else 0, axis=1)

<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">

Сделаем новый признак, а именно посчитаем количество навыков кандидата, которые указаны в вакансии.

</div>

In [36]:
def resume_skill_count_in_vacancy(row):
    count = 0
    skill_list = row['resume_skills'].replace('[', '').replace(']', '').replace("'", "").split(', ')
    for i in skill_list:
        if i in row['vacancy_description']:
            count += 1
    return count

df['resume_skill_count_in_vacancy'] = df.apply(resume_skill_count_in_vacancy, axis=1)

<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">

Также посчитаем долю слов из последней должности в резюме, которые указаны в вакансии.

</div>

In [37]:
def last_position_in_vacancy(row):
    bow = []
    seps = [' ', '-', '_']
    for sep in seps:
        bow += row['resume_last_position'].split(sep=sep)
        bow = list(set(bow))
    
    c = 0
    for word in bow:
        if word in row['vacancy_description']:
            c +=1
    
    return c / len(bow)

In [38]:
df['last_position_in_vacancy'] = df.apply(last_position_in_vacancy, axis=1)

<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">

Теперь закодируем описание вакансии и последнего опыта работы разными способами и сравним через косинусное расстояние.

</div>

In [39]:
def preprocess_data(df):
    """Обработка пропущенных значений в текстовых полях"""
    print("Проверка пропущенных значений...")
    print(f"Пропуски в vacancy_description: {df['vacancy_description'].isna().sum()}")
    print(f"Пропуски в resume_last_experience_description: {df['resume_last_experience_description'].isna().sum()}")
    
    # Заполняем пропуски пустыми строками
    df['vacancy_description'] = df['vacancy_description'].fillna('')
    df['resume_last_experience_description'] = df['resume_last_experience_description'].fillna('')
    
    # Проверяем, что все значения теперь строковые
    df['vacancy_description'] = df['vacancy_description'].astype(str)
    df['resume_last_experience_description'] = df['resume_last_experience_description'].astype(str)
    
    return df

In [40]:
def save_results(df, output_file):
    """Сохранение результатов в CSV файл"""
    df.to_csv(output_file, index=False, encoding='utf-8')
    print(f"Результаты сохранены в файл: {output_file}")

In [41]:
def calculate_cosine_similarity(embeddings1, embeddings2):
    """Вычисление косинусного сходства между двумя наборами эмбеддингов"""
    similarities = []
    
    for i in tqdm(range(embeddings1.shape[0])):
        emb1_row = embeddings1[i]
        emb2_row = embeddings2[i]
        
        similarity = cosine_similarity(emb1_row, emb2_row)[0][0]
        similarities.append(similarity)
    
    return similarities

In [42]:
warnings.filterwarnings('ignore')

try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

try:
    nltk.data.find('taggers/averaged_perceptron_tagger_ru')
except LookupError:
    nltk.download('averaged_perceptron_tagger_ru')

try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')

morph = pymorphy3.MorphAnalyzer()

[nltk_data] Downloading package wordnet to /Users/user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [43]:
def lemmatize_russian(tokens):
    """Лемматизация русских слов"""
    lemmas = []
    for token in tokens:
        parsed = morph.parse(token)[0]  # Берем самый вероятный разбор
        lemmas.append(parsed.normal_form)
    return lemmas

In [44]:
def tokenize_and_lemmatize(text):
    """Токенизация текста с лемматизацией и удалением стоп-слов"""
    # Базовая токенизация
    tokens = simple_preprocess(text, deacc=True, min_len=2)
    
    # Удаляем стоп-слова
    stop_words = set(stopwords.words('russian') + stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Лемматизация для русских слов
    lemmatized_tokens = lemmatize_russian(tokens)
    
    return lemmatized_tokens

In [45]:
def get_tfidf_embeddings(texts, vectorizer=None, fit=True):
    """Создание TF-IDF эмбеддингов для списка текстов с лемматизацией"""
    if fit:
        vectorizer = TfidfVectorizer(
            max_features=5000,
            min_df=2,
            max_df=0.8,
            ngram_range=(1, 2),
            tokenizer=tokenize_and_lemmatize,
            token_pattern=None,
            lowercase=False  # Уже сделано в токенизации
        )
        embeddings = vectorizer.fit_transform(texts)
    else:
        embeddings = vectorizer.transform(texts)
    
    return embeddings, vectorizer

In [46]:
def get_tfidf_vacancy_embeddings(df, vectorizer=None):
    """Создание эмбеддингов для уникальных вакансий с лемматизацией"""
    unique_vacancies = df[['vacancy_id', 'vacancy_description']].drop_duplicates()
    
    unique_embeddings, vectorizer = get_tfidf_embeddings(
        unique_vacancies['vacancy_description'].tolist(), 
        vectorizer=vectorizer, 
        fit=(vectorizer is None)
    )
    
    # Создаем mapping: vacancy_id -> sparse row
    vacancy_embedding_dict = dict(zip(unique_vacancies['vacancy_id'], unique_embeddings))
    
    rows = []
    for vid in df['vacancy_id']:
        rows.append(vacancy_embedding_dict[vid])
    
    # Объединяем в одну sparse матрицу
    all_vacancy_embeddings = vstack(rows)
    
    return all_vacancy_embeddings, vectorizer

In [47]:
def process_similarity_scores_tfidf(df, vectorizer=None, fit=True):
    """Функция для вычисления схожести с использованием TF-IDF и лемматизации"""    
    # Предобработка данных
    df = preprocess_data(df)
    
    print("Создание TF-IDF эмбеддингов для описаний опыта в резюме...")
    experience_embeddings, tfidf_vectorizer = get_tfidf_embeddings(df['resume_last_experience_description'].tolist(), vectorizer=vectorizer, fit=fit)
    
    print("Создание TF-IDF эмбеддингов для описаний вакансий...")
    vacancy_embeddings, _ = get_tfidf_vacancy_embeddings(df, vectorizer=tfidf_vectorizer)
    
    print("Вычисление косинусного сходства...")
    similarity_scores = calculate_cosine_similarity(vacancy_embeddings, experience_embeddings)
    
    # Добавляем scores в DataFrame
    df['similarity_score_tfidf'] = similarity_scores
    
    return df, tfidf_vectorizer

In [48]:
try:
    df_tfidf = pd.read_csv('description_df_with_scores_tfidf.csv')
except:
    df_tfidf = process_similarity_scores_tfidf(df.copy())
    save_results(df_tfidf, 'description_df_with_scores_tfidf.csv')

In [49]:
df = df.merge(df_tfidf)

In [50]:
df.head()

Unnamed: 0,vacancy_id,vacancy_name,vacancy_area,vacancy_experience,vacancy_employment,vacancy_schedule,vacancy_description,resume_id,resume_title,resume_specialization,...,resume_location,resume_gender,resume_applicant_status,target,resume_last_company_experience_months,location_matching,resume_skill_count_in_vacancy,last_position_in_vacancy,resume_skill_count,similarity_score_tfidf
0,126167948,Разработчик SAP ABAP,Москва,Более 6 лет,Полная занятость,Удаленная работа,"Привет!.redev — технологическая компания, созд...",6969174,ABAP-разработчик,"['Программист, разработчик']",...,Москва,Мужчина,Рассматривает предложения,1,76.0,1,3,0.666667,3,0.284047
1,126167948,Разработчик SAP ABAP,Москва,Более 6 лет,Полная занятость,Удаленная работа,"Привет!.redev — технологическая компания, созд...",9100077,"ABAP разработчик - SAP HCM, CRM, S/4HANA ERP(F...","['Программист, разработчик']",...,Москва,Мужчина,Рассматривает предложения,1,8.0,1,2,0.5,2,0.308726
2,126167948,Разработчик SAP ABAP,Москва,Более 6 лет,Полная занятость,Удаленная работа,"Привет!.redev — технологическая компания, созд...",32644957,Разработчик ABAP,"['Программист, разработчик']",...,Москва,Женщина,NDT,1,136.0,1,1,0.0,1,0.510093
3,126167948,Разработчик SAP ABAP,Москва,Более 6 лет,Полная занятость,Удаленная работа,"Привет!.redev — технологическая компания, созд...",27220466,ABAP-разработчик,"['Программист, разработчик']",...,Красноярск,Мужчина,Рассматривает предложения,1,135.0,0,2,0.333333,2,0.301062
4,126167948,Разработчик SAP ABAP,Москва,Более 6 лет,Полная занятость,Удаленная работа,"Привет!.redev — технологическая компания, созд...",7532708,ABAP разработчик. Senior ABAP Developer. SAP T...,"['Programmer, developer']",...,Moscow,Мужчина,NDT,1,0.0,0,2,0.6,2,0.075429


## Train-test split

<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">

Добавим новые признаки в обучение и тестирование

</div>

In [51]:
features = [
    'vacancy_area',
    'vacancy_experience',
    'vacancy_employment', 
    'vacancy_schedule',
    # 'resume_specialization',
    # 'resume_education', 
    # 'resume_courses', 
    'resume_salary',
    'resume_age', 
    'resume_experience_months',
    'resume_location',
    'resume_gender', 
    'resume_applicant_status', 
    'resume_last_company_experience_months', 
    'location_matching',
    'resume_skill_count_in_vacancy',
    'last_position_in_vacancy',
    'similarity_score_tfidf'
]
df[features]

Unnamed: 0,vacancy_area,vacancy_experience,vacancy_employment,vacancy_schedule,resume_salary,resume_age,resume_experience_months,resume_location,resume_gender,resume_applicant_status,resume_last_company_experience_months,location_matching,resume_skill_count_in_vacancy,last_position_in_vacancy,similarity_score_tfidf
0,Москва,Более 6 лет,Полная занятость,Удаленная работа,0.0,65.000000,228.0,Москва,Мужчина,Рассматривает предложения,76.0,1,3,0.666667,0.284047
1,Москва,Более 6 лет,Полная занятость,Удаленная работа,0.0,43.000000,208.0,Москва,Мужчина,Рассматривает предложения,8.0,1,2,0.500000,0.308726
2,Москва,Более 6 лет,Полная занятость,Удаленная работа,200000.0,52.000000,360.0,Москва,Женщина,NDT,136.0,1,1,0.000000,0.510093
3,Москва,Более 6 лет,Полная занятость,Удаленная работа,500000.0,56.000000,356.0,Красноярск,Мужчина,Рассматривает предложения,135.0,0,2,0.333333,0.301062
4,Москва,Более 6 лет,Полная занятость,Удаленная работа,0.0,48.000000,301.0,Moscow,Мужчина,NDT,0.0,0,2,0.600000,0.075429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
321637,Москва,От 3 до 6 лет,Полная занятость,Удаленная работа,242550.0,66.000000,521.0,Санкт-Петербург,Женщина,NDT,270.0,0,0,0.166667,0.072670
321638,Москва,От 3 до 6 лет,Полная занятость,Удаленная работа,0.0,40.000000,213.0,Москва,Мужчина,Активно ищет работу,35.0,1,0,0.000000,0.000000
321639,Москва,От 3 до 6 лет,Полная занятость,Удаленная работа,80000.0,44.060813,121.0,Москва,Мужчина,NDT,44.0,1,0,0.200000,0.047398
321640,Москва,От 3 до 6 лет,Полная занятость,Удаленная работа,0.0,32.000000,117.0,Москва,Женщина,NDT,96.0,1,0,0.200000,0.029086


In [52]:
numeric_features = df[features].select_dtypes(include=np.number).columns
categorical_features = df[features].select_dtypes(exclude=np.number).columns

In [53]:
X = df[features].copy()
y = df['target'].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

In [54]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((257313, 15), (64329, 15), (257313,), (64329,))

## Base pipeline

<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">

Строим pipeline с новыми признаками

</div>

In [55]:
%%time
pipeline_fe = Pipeline([
    ('preprocessing', ColumnTransformer([
        ('numeric_scaling', StandardScaler(), numeric_features),
        ('categorical_encoding', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
    ])),
    ('model', LogisticRegression(random_state=RANDOM_STATE))
])

pipeline_fe.fit(X_train, y_train)

CPU times: user 909 ms, sys: 12.5 ms, total: 922 ms
Wall time: 922 ms


0,1,2
,steps,"[('preprocessing', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('numeric_scaling', ...), ('categorical_encoding', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,100


In [56]:
y_pred_proba = pipeline_fe.predict_proba(X_test)

df_test = df.loc[X_test.index].copy()
df_test['y_pred_proba'] = y_pred_proba[:, 1]

In [57]:
ndcg, precision, recall, f1 = calculate_metrics(df_test)
metrics_fe_baseline = {}
metrics_fe_baseline['NDCG'] = ndcg
metrics_fe_baseline['Precision'] = precision
metrics_fe_baseline['Recall'] = recall
metrics_fe_baseline['F1'] = f1

Средний NDCG: 0.7516
Средний Precision: 0.6408
Средний Recall: 0.6015
Средний F1-Score: 0.6047


<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">

Видно, что новые признаки добавили точности модели.

</div>

In [58]:
best_params_fe = pipeline_fe.get_params()

In [59]:
RUN_NAME = "feature_engineering_with_base_model" 
REGISTRY_MODEL_NAME = "feature_engineering_with_base_model"

In [60]:
signature = mlflow.models.infer_signature(X_test, y_test)
input_example = X_test[:10]
code_paths = ["linear_models.ipynb"]

try:
    experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
except:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    lr_info = mlflow.sklearn.log_model(sk_model=pipeline_fe, 
                                       artifact_path='feature_engineering_with_base_model',
                                       registered_model_name=REGISTRY_MODEL_NAME,
                                       input_example=input_example,
                                       code_paths=code_paths,
                                       await_registration_for=60
                                      )
    mlflow.log_metrics(metrics_fe_baseline)
    mlflow.log_params(best_params_fe)

Successfully registered model 'feature_engineering_with_base_model'.
2025/11/19 22:42:57 INFO mlflow.store.model_registry.abstract_store: Waiting up to 60 seconds for model version to finish creation. Model name: feature_engineering_with_base_model, version 1


🏃 View run feature_engineering_with_base_model at: http://127.0.0.1:5000/#/experiments/1/runs/abc8ab0401444d068ffbcef1644f36af
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


Created version '1' of model 'feature_engineering_with_base_model'.


# Tuning the hyper-parameters

<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">

Воспользуемся optuna для подбора гипермараметров

</div>

In [61]:
def objective(trial: optuna.Trial) -> float:
    params = {
        'model__C': trial.suggest_float('C', 1e-4, 1e4, log=True),
        'model__penalty': trial.suggest_categorical('penalty', ['l1', 'l2']),
        'model__solver': trial.suggest_categorical('solver', ['liblinear', 'saga']),
        'model__class_weight': trial.suggest_categorical('class_weight', [None, 'balanced'])
    }
    
    pipeline_fe_optuna = Pipeline([
        ('preprocessing', ColumnTransformer([
            ('numeric_scaling', StandardScaler(), numeric_features),
            ('categorical_encoding', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
        ])),
        ('model', LogisticRegression(random_state=RANDOM_STATE))
    ])
    
    pipeline_fe_optuna.set_params(**params)
    
    kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)
    
    for train_idx, val_idx in kfold.split(X_train, y_train):
        X_fold_train, X_fold_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_fold_train, y_fold_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        
        pipeline_fe_optuna.fit(X_fold_train, y_fold_train)
        y_pred_proba = pipeline_fe_optuna.predict_proba(X_fold_val)
        

        df_val = df.loc[X_fold_val.index].copy()
        df_val['y_pred_proba'] = y_pred_proba[:, 1]
        
        ndcg, _, _, _ = calculate_metrics(df_val)
    
    return ndcg

In [63]:
try:
    experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
except:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)

RUN_NAME_OPTUNE = 'feature_engineering_optuna'

with mlflow.start_run(run_name=RUN_NAME_OPTUNE, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

🏃 View run feature_engineering_optuna at: http://127.0.0.1:5000/#/experiments/1/runs/bf6cc1d6abf9440b9ea5b719d50dd00f
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


In [64]:
STUDY_DB_NAME = "sqlite:///local.study.db"
STUDY_NAME = "feature_engineering_optuna"

mlflc = MLflowCallback(
    tracking_uri=f'http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}',
    metric_name="NDCG",
    create_experiment=False,
    mlflow_kwargs={'experiment_id': experiment_id, 'tags': {MLFLOW_PARENT_RUN_ID: run_id}}
)

In [65]:
study = optuna.create_study(direction='maximize', 
                            sampler=optuna.samplers.TPESampler(),
                            study_name=STUDY_NAME,
                            storage=STUDY_DB_NAME,
                            load_if_exists=True)
study.optimize(objective, 
               n_trials=10, 
               callbacks=[mlflc]
              )
best_params_optuna = study.best_params

print(f"Number of finished trials: {len(study.trials)}")
print(f"Best params: {best_params_optuna}")

[I 2025-11-19 22:42:57,961] Using an existing study with name 'feature_engineering_optuna' instead of creating a new one.


Средний NDCG: 0.8303
Средний Precision: 0.5699
Средний Recall: 0.7892
Средний F1-Score: 0.6313
Средний NDCG: 0.8191
Средний Precision: 0.5626
Средний Recall: 0.7794
Средний F1-Score: 0.6223


[I 2025-11-19 22:43:22,264] Trial 13 finished with value: 0.8270394176376302 and parameters: {'C': 0.5996648730630076, 'penalty': 'l2', 'solver': 'liblinear', 'class_weight': 'balanced'}. Best is trial 13 with value: 0.8270394176376302.


Средний NDCG: 0.8270
Средний Precision: 0.5716
Средний Recall: 0.7816
Средний F1-Score: 0.6305
🏃 View run 13 at: http://127.0.0.1:5000/#/experiments/1/runs/ab2b7fac34a440d592268f72a938e78b
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1
Средний NDCG: 0.8301
Средний Precision: 0.5702
Средний Recall: 0.7894
Средний F1-Score: 0.6317
Средний NDCG: 0.8191
Средний Precision: 0.5632
Средний Recall: 0.7796
Средний F1-Score: 0.6227


[I 2025-11-19 22:43:45,804] Trial 14 finished with value: 0.8272364211671691 and parameters: {'C': 0.3595330746959291, 'penalty': 'l2', 'solver': 'liblinear', 'class_weight': 'balanced'}. Best is trial 14 with value: 0.8272364211671691.


Средний NDCG: 0.8272
Средний Precision: 0.5726
Средний Recall: 0.7818
Средний F1-Score: 0.6313
🏃 View run 14 at: http://127.0.0.1:5000/#/experiments/1/runs/5c7b951689a64258b8a7b02aa25f6cd7
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1
Средний NDCG: 0.8301
Средний Precision: 0.5718
Средний Recall: 0.7890
Средний F1-Score: 0.6326
Средний NDCG: 0.8184
Средний Precision: 0.5638
Средний Recall: 0.7779
Средний F1-Score: 0.6221


[I 2025-11-19 22:44:07,876] Trial 15 finished with value: 0.8262301371597249 and parameters: {'C': 0.0347834788915487, 'penalty': 'l2', 'solver': 'liblinear', 'class_weight': 'balanced'}. Best is trial 14 with value: 0.8272364211671691.


Средний NDCG: 0.8262
Средний Precision: 0.5728
Средний Recall: 0.7832
Средний F1-Score: 0.6318
🏃 View run 15 at: http://127.0.0.1:5000/#/experiments/1/runs/be8f23b0de2f49d2a714f5b1d2bcbdd2
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1
Средний NDCG: 0.8303
Средний Precision: 0.5691
Средний Recall: 0.7891
Средний F1-Score: 0.6307
Средний NDCG: 0.8190
Средний Precision: 0.5626
Средний Recall: 0.7799
Средний F1-Score: 0.6226


[I 2025-11-19 22:44:32,911] Trial 16 finished with value: 0.8268242106145085 and parameters: {'C': 3.024832329982409, 'penalty': 'l2', 'solver': 'liblinear', 'class_weight': 'balanced'}. Best is trial 14 with value: 0.8272364211671691.


Средний NDCG: 0.8268
Средний Precision: 0.5708
Средний Recall: 0.7816
Средний F1-Score: 0.6299
🏃 View run 16 at: http://127.0.0.1:5000/#/experiments/1/runs/6f44c30796e847d3bea3d72934a7a637
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1
Средний NDCG: 0.8297
Средний Precision: 0.5677
Средний Recall: 0.7879
Средний F1-Score: 0.6293
Средний NDCG: 0.8187
Средний Precision: 0.5623
Средний Recall: 0.7792
Средний F1-Score: 0.6223


[I 2025-11-19 22:45:03,752] Trial 17 finished with value: 0.8259968763341434 and parameters: {'C': 615.5694131947703, 'penalty': 'l2', 'solver': 'liblinear', 'class_weight': 'balanced'}. Best is trial 14 with value: 0.8272364211671691.


Средний NDCG: 0.8260
Средний Precision: 0.5701
Средний Recall: 0.7807
Средний F1-Score: 0.6292
🏃 View run 17 at: http://127.0.0.1:5000/#/experiments/1/runs/d5c00b73a3c94fd7afadb94adec97f06
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1
Средний NDCG: 0.8303
Средний Precision: 0.5705
Средний Recall: 0.7895
Средний F1-Score: 0.6320
Средний NDCG: 0.8190
Средний Precision: 0.5639
Средний Recall: 0.7798
Средний F1-Score: 0.6232


[I 2025-11-19 22:45:26,636] Trial 18 finished with value: 0.8272048230790116 and parameters: {'C': 0.18479180995408886, 'penalty': 'l2', 'solver': 'liblinear', 'class_weight': 'balanced'}. Best is trial 14 with value: 0.8272364211671691.


Средний NDCG: 0.8272
Средний Precision: 0.5722
Средний Recall: 0.7825
Средний F1-Score: 0.6311
🏃 View run 18 at: http://127.0.0.1:5000/#/experiments/1/runs/922a4cb4cbfe4e48b2ce9abdd6d4bbb7
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1
Средний NDCG: 0.8303
Средний Precision: 0.5708
Средний Recall: 0.7884
Средний F1-Score: 0.6315
Средний NDCG: 0.8188
Средний Precision: 0.5637
Средний Recall: 0.7782
Средний F1-Score: 0.6223


[I 2025-11-19 22:45:49,098] Trial 19 finished with value: 0.8263315318509254 and parameters: {'C': 0.05890740948164539, 'penalty': 'l2', 'solver': 'liblinear', 'class_weight': 'balanced'}. Best is trial 14 with value: 0.8272364211671691.


Средний NDCG: 0.8263
Средний Precision: 0.5723
Средний Recall: 0.7827
Средний F1-Score: 0.6314
🏃 View run 19 at: http://127.0.0.1:5000/#/experiments/1/runs/c99aa17dbfb445cd8bc3d8f2dda313f4
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1
Средний NDCG: 0.8271
Средний Precision: 0.5673
Средний Recall: 0.7847
Средний F1-Score: 0.6282
Средний NDCG: 0.8154
Средний Precision: 0.5605
Средний Recall: 0.7742
Средний F1-Score: 0.6181


[I 2025-11-19 22:46:10,626] Trial 20 finished with value: 0.8230966436343685 and parameters: {'C': 0.004036069909472678, 'penalty': 'l2', 'solver': 'liblinear', 'class_weight': 'balanced'}. Best is trial 14 with value: 0.8272364211671691.


Средний NDCG: 0.8231
Средний Precision: 0.5681
Средний Recall: 0.7813
Средний F1-Score: 0.6276
🏃 View run 20 at: http://127.0.0.1:5000/#/experiments/1/runs/6cea893c6a044d07ac36f85f56a12939
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1
Средний NDCG: 0.8301
Средний Precision: 0.5681
Средний Recall: 0.7885
Средний F1-Score: 0.6298
Средний NDCG: 0.8189
Средний Precision: 0.5619
Средний Recall: 0.7796
Средний F1-Score: 0.6221


[I 2025-11-19 22:49:14,997] Trial 21 finished with value: 0.8267105518470379 and parameters: {'C': 5.24203195148384, 'penalty': 'l1', 'solver': 'liblinear', 'class_weight': 'balanced'}. Best is trial 14 with value: 0.8272364211671691.


Средний NDCG: 0.8267
Средний Precision: 0.5706
Средний Recall: 0.7818
Средний F1-Score: 0.6298
🏃 View run 21 at: http://127.0.0.1:5000/#/experiments/1/runs/9330a4ad0e6744ebb6de98137f1481ba
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1
Средний NDCG: 0.8302
Средний Precision: 0.5708
Средний Recall: 0.7897
Средний F1-Score: 0.6322
Средний NDCG: 0.8188
Средний Precision: 0.5645
Средний Recall: 0.7798
Средний F1-Score: 0.6235


[I 2025-11-19 22:49:37,937] Trial 22 finished with value: 0.8272064795414297 and parameters: {'C': 0.15100483937791648, 'penalty': 'l2', 'solver': 'liblinear', 'class_weight': 'balanced'}. Best is trial 14 with value: 0.8272364211671691.


Средний NDCG: 0.8272
Средний Precision: 0.5720
Средний Recall: 0.7824
Средний F1-Score: 0.6309
🏃 View run 22 at: http://127.0.0.1:5000/#/experiments/1/runs/910d8c17c0a144c4b30bbd36b97bf885
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1
Number of finished trials: 23
Best params: {'C': 0.3595330746959291, 'penalty': 'l2', 'solver': 'liblinear', 'class_weight': 'balanced'}


<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">

Обучим и протестируем модель с лучшими подобранными гиперпараметрами

</div>

In [73]:
best_params_optuna = {
    'model__C': 0.3595330746959291,
    'model__penalty': 'l2',
    'model__solver': 'liblinear',
    'model__class_weight': 'balanced'
}

In [76]:
pipeline_fe_best_optuna = Pipeline([
    ('preprocessing', ColumnTransformer([
        ('numeric_scaling', StandardScaler(), numeric_features),
        ('categorical_encoding', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
    ])),
    ('model', LogisticRegression(random_state=RANDOM_STATE))
])

pipeline_fe_best_optuna.set_params(**best_params_optuna)
pipeline_fe_best_optuna.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessing', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('numeric_scaling', ...), ('categorical_encoding', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.3595330746959291
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'liblinear'
,max_iter,100


In [79]:
y_pred_proba = pipeline_fe_best_optuna.predict_proba(X_test)

df_test = df.loc[X_test.index].copy()
df_test['y_pred_proba'] = y_pred_proba[:, 1]

In [80]:
ndcg, precision, recall, f1 = calculate_metrics(df_test)
metrics_fe_best_optuna = {}
metrics_fe_best_optuna['NDCG'] = ndcg
metrics_fe_best_optuna['Precision'] = precision
metrics_fe_best_optuna['Recall'] = recall
metrics_fe_best_optuna['F1'] = f1

Средний NDCG: 0.7530
Средний Precision: 0.5340
Средний Recall: 0.7125
Средний F1-Score: 0.5835


<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">

Метрика NDCG незначительно улучшилась, при этом метрика Recall стала выше, а Precision и F1 ниже.

</div>

In [81]:
RUN_NAME = "feature_engineering_with_best_optuna_lr" 
REGISTRY_MODEL_NAME = "feature_engineering_with_best_optuna_lr"

In [82]:
signature = mlflow.models.infer_signature(X_test, y_test)
input_example = X_test[:10]
code_paths = ["linear_models.ipynb"]

try:
    experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
except:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    lr_info = mlflow.sklearn.log_model(sk_model=pipeline_fe_best_optuna, 
                                       artifact_path='feature_engineering_with_best_optuna_lr',
                                       registered_model_name=REGISTRY_MODEL_NAME,
                                       input_example=input_example,
                                       code_paths=code_paths,
                                       await_registration_for=60
                                      )
    mlflow.log_metrics(metrics_fe_best_optuna)
    mlflow.log_params(best_params_optuna)

Successfully registered model 'feature_engineering_with_best_optuna_lr'.
2025/11/20 11:03:11 INFO mlflow.store.model_registry.abstract_store: Waiting up to 60 seconds for model version to finish creation. Model name: feature_engineering_with_best_optuna_lr, version 1


🏃 View run feature_engineering_with_best_optuna_lr at: http://127.0.0.1:5000/#/experiments/1/runs/42cd35b367b14ed7949edaa246b4826e
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


Created version '1' of model 'feature_engineering_with_best_optuna_lr'.


In [87]:
MODEL_NAME = 'feature_engineering_with_best_optuna_lr.pkl'
with open(MODEL_NAME, 'wb') as file:
    pickle.dump(pipeline_fe_best_optuna, file)

<div style="background-color: #98FB98; color: black; padding: 10px; border-radius: 5px;">

**Общиый вывод:**

Было проведено несколько экспериментов с логированием результатов и моделий в MLFLOW:
1. Построение и обучение базового pipeline с предварительной обработкой категориальных фичей с помощью OneHotEncoder и масштабированием числовых с помощью StandardScaler.
   
   Полученные метрики:
    - Средний NDCG: 0.5043
    - Средний Precision: 0.0890
    - Средний Recall: 0.0574
    - Средний F1-Score: 0.0662
3. Были добавлены кросс фичи:
- location_matching - матчинг локации вакансии и резюме
- resume_skill_count_in_vacancy - количество навыков кандидата, которые присутствуют в описании резюме
- last_position_in_vacancy - доля слов в описании последней должности кандидата, которые присутствуют в резюме
- similarity_score_tfidf - косинусное расстояние между закодированными в векторы описаниями вакансий и последнего опыта работы кандидата
 
3. Построение и обучение базового pipeline с новыми фичами
   
   Полученные метрики:
    - Средний NDCG: 0.7516
    - Средний Precision: 0.6408
    - Средний Recall: 0.6015
    - Средний F1-Score: 0.6047
4. Подбор гиперпараметров с помощью Optuna
5. Обучение модели с лучшими гиперпараметрами после подбора

   Полученные метрики
    - Средний NDCG: 0.7530
    - Средний Precision: 0.5340
    - Средний Recall: 0.7125
    - Средний F1-Score: 0.5835
6. Модель сохранена в файл feature_engineering_with_best_optuna_lr.pkl
</div>