**Установка и импорт зависимостей**

In [None]:
!pip install -U sentence-transformers
!pip install mlflow
!pip install optuna
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension

In [None]:
import os
import math
import warnings
import logging
import torch
import mlflow
import optuna
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from optuna.visualization import plot_optimization_history

In [None]:
# --- Reproducibility: one seed shared by NumPy and PyTorch ---
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)

# --- Output hygiene: hide two warning categories and widen text columns ---
for _ignored_category in (UserWarning, RuntimeWarning):
    warnings.simplefilter("ignore", _ignored_category)
pd.set_option('max_colwidth', 400)

# --- Experiment tracking: runs started later are grouped under this experiment ---
mlflow.set_experiment("logging_sbert")

# --- Logging: timestamped INFO messages written to the stream handler ---
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[logging.StreamHandler()],
)

# --- Filesystem: make sure the log directory exists ---
log_dir = './logs'
os.makedirs(log_dir, exist_ok=True)

# --- Hardware: use the GPU when one is available, otherwise the CPU ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

**Набор данных**

In [6]:
# Optional manual cleanup of the Kaggle working directory before a fresh run.
# Kept disabled on purpose: it deletes everything under /kaggle/working/.
# !rm -r /kaggle/working/ && !ls /kaggle/working/

In [None]:
# Load the job/resume dataset; later cells read its 'Job Description',
# 'Resume Description' and 'Match' columns.
df = pd.read_excel('/kaggle/input/input-dataset/resume_job_cities_fixed.xlsx')
# Show the first row as a quick sanity check of the loaded frame.
df.head(1)

In [8]:
# Hold out 10% of the rows for validation; the fixed seed keeps the split stable.
train_df, val_df = train_test_split(df, test_size=0.1, random_state=RANDOM_STATE)

# Turn every training row into a (job description, resume) text pair whose
# label is the match value cast to float.
train_data = [
    InputExample(texts=[job_text, resume_text], label=float(is_match))
    for job_text, resume_text, is_match in zip(
        train_df['Job Description'],
        train_df['Resume Description'],
        train_df['Match'],
    )
]

# Shuffled mini-batches of two examples.
train_dataloader = DataLoader(train_data, batch_size=2, shuffle=True, pin_memory=True)

**Тренировка**

In [None]:
def create_evaluator(val_df, evaluator_class, name, batch_size):
    """Instantiate ``evaluator_class`` over the job/resume pairs in ``val_df``.

    Args:
        val_df: DataFrame providing the 'Job Description', 'Resume Description'
            and 'Match' columns.
        evaluator_class: Callable that accepts the keyword arguments assembled
            below (the notebook passes ``evaluation.BinaryClassificationEvaluator``).
        name: Name forwarded to the evaluator.
        batch_size: Batch size forwarded to the evaluator.

    Returns:
        Whatever ``evaluator_class`` returns for these arguments.
    """
    job_texts = val_df['Job Description'].tolist()
    resume_texts = val_df['Resume Description'].tolist()
    match_labels = val_df['Match'].tolist()

    evaluator_kwargs = {
        'sentences1': job_texts,
        'sentences2': resume_texts,
        'labels': match_labels,
        'name': name,
        'batch_size': batch_size,
        # Progress bar and CSV output are always switched on.
        'show_progress_bar': True,
        'write_csv': True,
    }
    return evaluator_class(**evaluator_kwargs)

def train_and_evaluate(model, train_dataloader, evaluator, trial, loss_name):
    """Fine-tune ``model`` for one Optuna trial and return its evaluation result.

    Samples the learning rate, weight decay and batch size from ``trial``, logs
    them to a fresh MLflow run, trains with the loss class named ``loss_name``
    and finally scores the trained model with ``evaluator``.

    Args:
        model: SentenceTransformer to fine-tune; it is moved to the global ``device``.
        train_dataloader: DataLoader over the training examples. Only its
            ``dataset`` and ``pin_memory`` setting are reused: the data is
            re-batched with the batch size sampled for this trial.
        evaluator: Evaluator passed to ``model.fit`` and called once more on
            the trained model.
        trial: Optuna trial used to sample the hyperparameters.
        loss_name: Name of a loss class in ``sentence_transformers.losses``
            that can be constructed as ``loss_class(model=model)``.

    Returns:
        The value returned by ``evaluator(model)`` after training.
    """
    model = model.to(device)
    loss_class = getattr(losses, loss_name)
    train_loss = loss_class(model=model)
    # Close any run that is still active so that start_run below opens a new one.
    mlflow.end_run()

    with mlflow.start_run(run_name="ml_sbert"):
        lr = trial.suggest_float('lr', 1e-6, 1e-2, log=True)
        weight_decay = trial.suggest_float('weight_decay', 1e-5, 1e-1, log=True)
        batch_size = trial.suggest_int('batch_size', 2, 32, log=True)
        epochs = 3
        checkpoint_path = './checkpoints'
        mlflow.log_params({'lr': lr, 'weight_decay': weight_decay, 'batch_size': batch_size, 'loss_name': loss_name})

        # Actually apply the sampled batch size: the incoming loader was built
        # with a fixed batch size, which made this hyperparameter a no-op.
        trial_dataloader = DataLoader(
            train_dataloader.dataset,
            shuffle=True,
            batch_size=batch_size,
            pin_memory=train_dataloader.pin_memory,
        )

        # Warm up over 10% of the optimizer steps (batches per epoch * epochs),
        # not 10% of the number of training rows.
        warmup_steps = math.ceil(len(trial_dataloader) * epochs * 0.1)

        # A single fit() call already runs all epochs; wrapping it in a
        # `for epoch in range(epochs)` loop trained epochs**2 passes and
        # restarted the optimizer and warmup schedule on every iteration.
        # NOTE(review): no output_path is passed, so confirm where (and whether)
        # save_best_model=True writes the best model.
        model.fit(
            train_objectives=[(trial_dataloader, train_loss)],
            evaluator=evaluator,
            optimizer_class=torch.optim.AdamW,
            epochs=epochs,
            warmup_steps=warmup_steps,
            optimizer_params={'lr': lr},
            weight_decay=weight_decay,
            show_progress_bar=True,
            save_best_model=True,
            # Mixed precision only makes sense on a CUDA device.
            use_amp=(device.type == 'cuda'),
            checkpoint_path=checkpoint_path,
            checkpoint_save_steps=1000,
        )
        metrics = evaluator(model)
        print(metrics)

        # Release cached GPU memory before the next trial loads a new model.
        torch.cuda.empty_cache()

    return metrics

def objective(trial):
    """Optuna objective: train one model configuration and return its score.

    Loads a fresh copy of the base model, samples the loss function, builds a
    binary-classification evaluator over ``val_df`` and delegates training to
    ``train_and_evaluate``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The evaluation score as a ``float`` (the study maximizes it).

    Raises:
        ValueError: If the evaluator returned a dict of metrics and no primary
            metric can be identified in it.
    """
    model_name = "sberbank-ai/sbert_large_nlu_ru"
    model = SentenceTransformer(model_name)
    loss_name = trial.suggest_categorical('loss_name', ['CosineSimilarityLoss', 'ContrastiveLoss'])
    evaluator = create_evaluator(val_df, evaluation.BinaryClassificationEvaluator, "binary_classification_evaluation", batch_size=32)
    metrics = train_and_evaluate(model, train_dataloader, evaluator, trial, loss_name)

    # Optuna needs a single number. Depending on the installed
    # sentence-transformers version the evaluator returns either a float or a
    # dict of named metrics; in the latter case use the evaluator's primary metric.
    if isinstance(metrics, dict):
        primary_metric = getattr(evaluator, 'primary_metric', None)
        if primary_metric not in metrics:
            raise ValueError(
                f"Cannot pick an objective value: primary metric {primary_metric!r} "
                f"not found among {sorted(metrics)}"
            )
        return float(metrics[primary_metric])

    return float(metrics)

# Seed the sampler so the hyperparameter search is reproducible, in line with
# the NumPy/PyTorch seeding done with the same RANDOM_STATE.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study.optimize(objective, n_trials=2)
best_params = study.best_params
print("Лучшие гиперпараметры:", best_params)

# NOTE(review): this is called outside any `with mlflow.start_run(...)` block,
# so the parameters presumably land in an implicitly started run - confirm
# that is the intended place for them.
mlflow.log_params(best_params)
print("Тренировка завершена!")

# Plot the objective value per trial with the matplotlib backend.
fig = optuna.visualization.matplotlib.plot_optimization_history(study)
plt.show()

**TEST**

In [None]:
# Load the best model from disk and log it to MLflow with the PyTorch flavor.
# NOTE(review): the training cell passes checkpoint_path='./checkpoints' but no
# output_path to model.fit, so confirm that './checkpoints/best_model' is
# actually written before relying on this cell.
best_model_path = "./checkpoints/best_model"
best_model = SentenceTransformer(best_model_path)
mlflow.pytorch.log_model(best_model, "ml_sbert")

In [None]:
# Keep only the raw vacancy text and expose it under the 'Job Description'
# column name that the rest of the notebook works with.
df_t = (
    pd.read_csv('/kaggle/input/yandex-jobs/vacancies.csv', sep=',')[['Raw text']]
    .rename(columns={'Raw text': 'Job Description'})
)

In [None]:
# List what is currently stored in the checkpoints directory.
!ls /kaggle/working/checkpoints

Последний сохранённый чекпоинт (с наибольшим номером шага)

In [None]:
CHECKPOINTS_DIR = '/kaggle/working/checkpoints'


def find_latest_checkpoint(checkpoints_dir):
    """Return the path of the highest-numbered checkpoint in ``checkpoints_dir``.

    Only entries whose name consists of decimal digits are considered, so other
    files or folders in the directory no longer break the integer comparison.

    Args:
        checkpoints_dir: Directory that contains the numbered checkpoints.

    Returns:
        ``os.path.join(checkpoints_dir, name)`` for the largest numeric name,
        or ``None`` when the directory is missing or has no numeric entries.
    """
    if not os.path.isdir(checkpoints_dir):
        return None
    numbered = [entry for entry in os.listdir(checkpoints_dir) if entry.isdecimal()]
    if not numbered:
        return None
    return os.path.join(checkpoints_dir, max(numbered, key=int))


checkpoint_filename = find_latest_checkpoint(CHECKPOINTS_DIR)
# Keep the bare entry name available under its original variable name.
latest_checkpoint = os.path.basename(checkpoint_filename) if checkpoint_filename else None
if checkpoint_filename is None:
    print(f"No numbered checkpoints found in {CHECKPOINTS_DIR}")
else:
    # Later cells load such a path with SentenceTransformer(path).
    print(checkpoint_filename)

Без резюме айтишников

In [None]:
from scipy.spatial.distance import cosine

# Checkpoint used for ranking.
# NOTE(review): the step number is hard-coded - confirm this checkpoint exists
# for the current run.
best_model_path = "/kaggle/working/checkpoints/1090"
best_model = SentenceTransformer(best_model_path)
job_descriptions = df_t['Job Description'].tolist()

user_resume = input('Введите своё резюме')

user_resume_embedding = best_model.encode(user_resume)
# Encode all vacancies in one batched call instead of one encode() call per
# vacancy, which ran the model with an effective batch size of 1.
job_embeddings = best_model.encode(job_descriptions)
# scipy's `cosine` is a distance, so 1 - distance gives the cosine similarity.
similarity_scores = [1 - cosine(user_resume_embedding, job_embedding) for job_embedding in job_embeddings]
result_df = pd.DataFrame({'Job Description': job_descriptions, 'Similarity': similarity_scores})
top_10_vacancies = result_df.nlargest(10, 'Similarity')
print("Топ-10 вакансий для пользовательского резюме:")
print(top_10_vacancies[['Job Description', 'Similarity']])

С резюме айтишников

In [None]:
# Checkpoint used for the labelled evaluation and for the ranking below.
# NOTE(review): the step number is hard-coded - confirm this checkpoint exists
# for the current run.
best_model_path = "/kaggle/working/checkpoints/1090"
best_model = SentenceTransformer(best_model_path)

# The labelled evaluation needs resume texts and match labels next to the
# vacancies. Skip it with a message, instead of failing with KeyError, when
# df_t does not provide those columns.
evaluation_columns = ['Job Description', 'Resume Description', 'Match']
if all(column in df_t.columns for column in evaluation_columns):
    test_data = [
        InputExample(texts=[job_desc, resume_desc], label=float(match))
        for job_desc, resume_desc, match in zip(df_t['Job Description'], df_t['Resume Description'], df_t['Match'])
    ]
    test_dataloader = DataLoader(test_data, shuffle=False, batch_size=2, pin_memory=True)

    test_evaluator = create_evaluator(df_t, evaluation.BinaryClassificationEvaluator, "test_evaluation", batch_size=32)
    # Score the checkpoint loaded above with the evaluator built for this data;
    # `evaluator` and `model` are not defined at notebook scope.
    test_metrics = test_evaluator(best_model)
    print("Test Metrics:", test_metrics)
else:
    print("Skipping labelled evaluation: df_t lacks the 'Resume Description'/'Match' columns.")

user_resume = input('Введите своё резюме')

# encode() takes the texts as its single positional input: embed the resume
# and the vacancies separately, then compare them.
user_resume_embedding = best_model.encode(user_resume)
new_data_embeddings = best_model.encode(df_t['Job Description'].tolist())

# Cosine similarity = dot product of the L2-normalised vectors.
resume_unit = user_resume_embedding / np.linalg.norm(user_resume_embedding)
job_units = new_data_embeddings / np.linalg.norm(new_data_embeddings, axis=1, keepdims=True)

# Work on a copy so that df_t itself is not modified by this cell.
new_data = df_t.copy()
new_data['Similarity'] = job_units @ resume_unit
top_10_new_data = new_data.nlargest(10, 'Similarity')
print("Топ-10 вакансий для пользовательского резюме:")
print(top_10_new_data[['Job Description', 'Similarity']])