In [1]:
!pip install rank-bm25



In [2]:
import pandas as pd
import numpy as np
import re
import json
from tqdm.autonotebook import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import optuna
from rank_bm25 import BM25Okapi
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import random

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

  from tqdm.autonotebook import tqdm
2026-01-16 18:27:39.971663: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1768588059.993423    1729 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1768588059.999887    1729 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1768588060.016959    1729 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768588060.016976    1729 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768588060.016979    1729 computation_

In [3]:
def clean_text(text):
    if pd.isna(text):
        return ""
    text = re.sub(r"[\r\n]+", " ", str(text))
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [4]:
def preprocess_data(df, min_category_size=3):
    df = df.copy()
    missing_idx_map = {
        0: "Interesting things",
        39: "General",
        40: "Interesting things",
        56: "Interesting things",
        74: "General",
        75: "Interesting things"
    }
    for idx, cat in missing_idx_map.items():
        if idx in df.index and pd.isna(df.loc[idx, 'category']):
            df.loc[idx, 'category'] = cat

    df["title"] = df["title"].apply(clean_text)
    df["text"] = df["text"].apply(clean_text)
    df["content"] = df["title"] + " " + df["text"]

    category_counts = df["category"].value_counts()
    valid_cats = category_counts[category_counts >= min_category_size].index
    df = df[df["category"].isin(valid_cats)].reset_index(drop=True)
    return df


In [5]:
def tokenize(text):
    return text.lower().split()

In [6]:
def get_w2v_doc_vector(tokens, model):
    vecs = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(model.vector_size)

In [7]:
def vectorize_texts(docs, models_to_use=["sbert", "tfidf", "w2v", "bge"]):
    embeddings = {}
    if "sbert" in models_to_use:
        sbert_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
        embeddings["sbert"] = sbert_model.encode(docs, show_progress_bar=False)

    if "tfidf" in models_to_use:
        tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
        embeddings["tfidf"] = tfidf.fit_transform(docs).toarray()

    if "w2v" in models_to_use:
        tokenized_docs = [tokenize(doc) for doc in docs]
        w2v_model = Word2Vec(tokenized_docs, vector_size=100, window=5, min_count=1, seed=42, workers=1)
        w2v_embeddings = np.array([
            get_w2v_doc_vector(tokens, w2v_model) for tokens in tokenized_docs
        ])
        embeddings["w2v"] = w2v_embeddings

    if "bge" in models_to_use:
        bge_model = SentenceTransformer("BAAI/bge-m3")
        embeddings["bge"] = bge_model.encode(docs, normalize_embeddings=True, show_progress_bar=False)

    return embeddings

In [8]:
def hybrid_recommend(query_idx, df, embeddings, w_sem=0.7, w_cat=0.3, w_speaker=0.0, w_companies=0.0, top_k=5):
    n = len(df)
    scores = np.zeros(n)
    query_emb = embeddings[query_idx]
    query_cat = df.iloc[query_idx]["category"]
    query_speaker = df.iloc[query_idx]["speaker"]
    query_companies = df.iloc[query_idx]["companies"]

    for i in range(n):
        if i == query_idx:
            continue
        sem_sim = cosine_similarity([query_emb], [embeddings[i]])[0][0]
        cat_match = float(df.iloc[i]["category"] == query_cat)
        speaker_match = float(df.iloc[i]["speaker"] == query_speaker)
        companies_match = float(df.iloc[i]["companies"] == query_companies)
        scores[i] = (
            w_sem * sem_sim +
            w_cat * cat_match +
            w_speaker * speaker_match +
            w_companies * companies_match
        )

    top_idxs = np.argsort(-scores)[:top_k + 1]
    return [i for i in top_idxs if i != query_idx][:top_k]

In [9]:
def recommend_by_similarity(query_idx, embeddings, top_k=5):
    sims = cosine_similarity([embeddings[query_idx]], embeddings).flatten()
    sims[query_idx] = -1
    top_idxs = np.argsort(-sims)[:top_k]
    return top_idxs.tolist()

In [10]:
def recommend_by_bm25(query_idx, docs, top_k=5):
    tokenized_docs = [doc.lower().split() for doc in docs]
    bm25 = BM25Okapi(tokenized_docs)
    query_tokens = tokenized_docs[query_idx]
    scores = bm25.get_scores(query_tokens)
    scores[query_idx] = -np.inf
    top_idxs = np.argsort(-scores)[:top_k]
    return top_idxs.tolist()

In [11]:
def load_llm_cache(cache_path):
    with open(cache_path, "r", encoding="utf-8") as f:
        return json.load(f)

In [12]:
def get_llm_ground_truth_for_talk(query_talk_id, df, cache, min_score=0.5):
    relevant_talks = []
    for _, row in df.iterrows():
        other_id = row["talk_id"]
        if other_id == query_talk_id:
            continue
        key = f"{min(query_talk_id, other_id)}_{max(query_talk_id, other_id)}"
        score = cache.get(key, 0.0)
        if score >= min_score:
            relevant_talks.append((other_id, score))
    relevant_talks.sort(key=lambda x: x[1], reverse=True)
    return [tid for tid, _ in relevant_talks]

In [13]:
def llm_rerank_recommend(query_idx, df, cache, top_k=5):
    query_talk_id = df.iloc[query_idx]["talk_id"]
    scores = []
    for idx, row in df.iterrows():
        if idx == query_idx:
            continue
        other_talk_id = row["talk_id"]
        key = f"{min(query_talk_id, other_talk_id)}_{max(query_talk_id, other_talk_id)}"
        llm_score = cache.get(key, 0.0)
        scores.append((idx, llm_score))
    scores.sort(key=lambda x: x[1], reverse=True)
    return [idx for idx, _ in scores[:top_k]]

In [14]:
def evaluate_model(
    model_name,
    recommend_fn,
    df,
    embeddings_for_diversity,
    gt_type="category",
    cache=None,
    min_score=0.5,
    top_k=5
):
    relevance_scores = []
    diversity_scores = []

    for idx in range(len(df)):
        try:
            recs = recommend_fn(idx)
            if not recs:
                continue
        except Exception:
            continue

        # Relevance 
        if gt_type == "category":
            query_cat = df.iloc[idx]["category"]
            relevance = sum(1 for i in recs if df.iloc[i]["category"] == query_cat) / len(recs)
        elif gt_type == "llm":
            if cache is None:
                continue
            query_talk_id = df.iloc[idx]["talk_id"]
            llm_gt = get_llm_ground_truth_for_talk(query_talk_id, df, cache, min_score=min_score)
            if not llm_gt:
                continue
            rec_talk_ids = [df.iloc[i]["talk_id"] for i in recs[:top_k]]
            recall = len(set(llm_gt) & set(rec_talk_ids)) / len(llm_gt)
            relevance = recall
        else:
            continue

        # Diversity (по эмбеддингам)
        if len(recs) == 1:
            diversity = 1.0
        else:
            valid_embs = []
            for i in recs:
                if i < len(embeddings_for_diversity):
                    valid_embs.append(embeddings_for_diversity[i])
            if len(valid_embs) < 2:
                diversity = 1.0
            else:
                rec_embs = np.array(valid_embs)
                sim_mat = cosine_similarity(rec_embs)
                np.fill_diagonal(sim_mat, 0)
                avg_sim = sim_mat.sum() / (len(rec_embs) * (len(rec_embs) - 1))
                diversity = 1.0 - avg_sim

        relevance_scores.append(relevance)
        diversity_scores.append(diversity)

    avg_rel = np.mean(relevance_scores) if relevance_scores else 0.0
    avg_div = np.mean(diversity_scores) if diversity_scores else 0.0
    composite = 0.7 * avg_rel + 0.3 * avg_div

    metric_name = "Relevance@5" if gt_type == "category" else f"Recall@{top_k} (LLM-GT)"
    return {
        "Model": model_name,
        metric_name: round(avg_rel, 3),
        "Diversity": round(avg_div, 3),
        "Composite": round(composite, 3)
    }

In [15]:
def objective(trial, df, embeddings, gt_type="category", cache=None, min_score=0.5):
    w_sem = trial.suggest_float("w_sem", 0.5, 1.0)
    w_cat = trial.suggest_float("w_cat", 0.0, 0.5)
    w_speaker = trial.suggest_float("w_speaker", 0.0, 0.3)
    w_companies = trial.suggest_float("w_companies", 0.0, 0.3)

    composite_scores = []
    for idx in range(len(df)):
        recs = hybrid_recommend(idx, df, embeddings, w_sem, w_cat, w_speaker, w_companies, top_k=5)
        if not recs:
            continue

        if gt_type == "category":
            query_cat = df.iloc[idx]["category"]
            relevance = sum(1 for i in recs if df.iloc[i]["category"] == query_cat) / len(recs)
        elif gt_type == "llm":
            if cache is None:
                continue
            query_talk_id = df.iloc[idx]["talk_id"]
            llm_gt = get_llm_ground_truth_for_talk(query_talk_id, df, cache, min_score=min_score)
            if not llm_gt:
                continue
            rec_talk_ids = [df.iloc[i]["talk_id"] for i in recs]
            recall = len(set(llm_gt) & set(rec_talk_ids)) / len(llm_gt)
            relevance = recall
        else:
            continue

        if len(recs) == 1:
            diversity = 1.0
        else:
            rec_embs = embeddings[recs]
            sim_mat = cosine_similarity(rec_embs)
            np.fill_diagonal(sim_mat, 0)
            avg_sim = sim_mat.sum() / (len(recs) * (len(recs) - 1))
            diversity = 1.0 - avg_sim

        composite = 0.7 * relevance + 0.3 * diversity
        composite_scores.append(composite)

    return np.mean(composite_scores) if composite_scores else 0.0


In [16]:
def run_optimization(df, embeddings, gt_type="category", cache=None, n_trials=10, min_score=0.5):
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=42)
    )
    study.optimize(
        lambda trial: objective(trial, df, embeddings, gt_type, cache, min_score),
        n_trials=n_trials,
        show_progress_bar=False
    )
    return study.best_params, round(study.best_value, 3)

In [None]:
# Загрузка данных
df = pd.read_csv("dataset_jug.csv")
df = preprocess_data(df, min_category_size=3)
df = df.reset_index(drop=True)
df['talk_id'] = df.index
docs = df["content"].tolist()

In [18]:
# Векторизация
embeddings_dict = vectorize_texts(docs, models_to_use=["sbert", "tfidf", "w2v", "bge"])

In [None]:
# Загрузка LLM-кэша
cache_path = "similarity_cache.json"
llm_cache = load_llm_cache(cache_path)

In [20]:
print("Оценка по совпадению категории (Relevance@5)")

results_category = []

# Dense и sparse модели
for name, emb_key in [("TF-IDF", "tfidf"), ("Word2Vec", "w2v"), ("Sentence-BERT", "sbert"), ("BGE-M3", "bge")]:
    if emb_key in embeddings_dict:
        results_category.append(
            evaluate_model(
                model_name=name,
                recommend_fn=lambda idx, key=emb_key: recommend_by_similarity(idx, embeddings_dict[key]),
                df=df,
                embeddings_for_diversity=embeddings_dict[emb_key],
                gt_type="category"
            )
        )

# BM25
results_category.append(
    evaluate_model(
        model_name="BM25",
        recommend_fn=lambda idx: recommend_by_bm25(idx, docs),
        df=df,
        embeddings_for_diversity=embeddings_dict["tfidf"],
        gt_type="category"
    )
)

best_weights_cat, _ = run_optimization(df, embeddings_dict["bge"], gt_type="category", n_trials=10)
results_category.append(
    evaluate_model(
        model_name="Hybrid (BGE + Optuna)",
        recommend_fn=lambda idx: hybrid_recommend(idx, df, embeddings_dict["bge"], **best_weights_cat),
        df=df,
        embeddings_for_diversity=embeddings_dict["bge"],
        gt_type="category"
    )
)

print(pd.DataFrame(results_category).to_string(index=False))

Оценка по совпадению категории (Relevance@5)


[32m[I 2026-01-16 18:27:56,094][0m A new study created in memory with name: no-name-6cbd6e09-4865-49a3-9d88-e277a966f1d7[0m
[32m[I 2026-01-16 18:27:58,934][0m Trial 0 finished with value: 0.6882157325744629 and parameters: {'w_sem': 0.6872700594236812, 'w_cat': 0.4753571532049581, 'w_speaker': 0.21959818254342153, 'w_companies': 0.17959754525911098}. Best is trial 0 with value: 0.6882157325744629.[0m
[32m[I 2026-01-16 18:28:01,727][0m Trial 1 finished with value: 0.5956175923347473 and parameters: {'w_sem': 0.5780093202212182, 'w_cat': 0.07799726016810132, 'w_speaker': 0.017425083650459836, 'w_companies': 0.2598528437324805}. Best is trial 0 with value: 0.6882157325744629.[0m
[32m[I 2026-01-16 18:28:04,547][0m Trial 2 finished with value: 0.6861491799354553 and parameters: {'w_sem': 0.8005575058716043, 'w_cat': 0.35403628889802274, 'w_speaker': 0.006175348288740734, 'w_companies': 0.29097295564859826}. Best is trial 0 with value: 0.6882157325744629.[0m
[32m[I 2026-01-16 18

                Model  Relevance@5  Diversity  Composite
               TF-IDF        0.169      0.957      0.405
             Word2Vec        0.097      0.106      0.100
        Sentence-BERT        0.186      0.509      0.283
               BGE-M3        0.206      0.463      0.283
                 BM25        0.123      0.959      0.374
Hybrid (BGE + Optuna)        0.777      0.481      0.688


In [21]:
print("Оценка по LLM-based ground truth (Recall@5, min_score=0.5)")

results_llm = []

# Dense и sparse модели
for name, emb_key in [("TF-IDF", "tfidf"), ("Word2Vec", "w2v"), ("Sentence-BERT", "sbert"), ("BGE-M3", "bge")]:
    if emb_key in embeddings_dict:
        results_llm.append(
            evaluate_model(
                model_name=name,
                recommend_fn=lambda idx, key=emb_key: recommend_by_similarity(idx, embeddings_dict[key]),
                df=df,
                embeddings_for_diversity=embeddings_dict[emb_key],
                gt_type="llm",
                cache=llm_cache,
                min_score=0.5
            )
        )

# BM25
results_llm.append(
    evaluate_model(
        model_name="BM25",
        recommend_fn=lambda idx: recommend_by_bm25(idx, docs),
        df=df,
        embeddings_for_diversity=embeddings_dict["tfidf"],
        gt_type="llm",
        cache=llm_cache,
        min_score=0.5
    )
)

# Hybrid (BGE + Optuna)
best_weights_llm_bge, _ = run_optimization(
    df, embeddings_dict["bge"], gt_type="llm", cache=llm_cache, n_trials=20, min_score=0.5
)
results_llm.append(
    evaluate_model(
        model_name="Hybrid (BGE + Optuna)",
        recommend_fn=lambda idx: hybrid_recommend(idx, df, embeddings_dict["bge"], **best_weights_llm_bge),
        df=df,
        embeddings_for_diversity=embeddings_dict["bge"],
        gt_type="llm",
        cache=llm_cache,
        min_score=0.5
    )
)

# Hybrid (SBERT + Optuna)
best_weights_llm_sbert_base, _ = run_optimization(
    df, embeddings_dict["sbert"], gt_type="llm", cache=llm_cache, n_trials=20, min_score=0.5
)
results_llm.append(
    evaluate_model(
        model_name="Hybrid (SBERT + Optuna)",
        recommend_fn=lambda idx: hybrid_recommend(idx, df, embeddings_dict["sbert"], **best_weights_llm_sbert_base),
        df=df,
        embeddings_for_diversity=embeddings_dict["sbert"],
        gt_type="llm",
        cache=llm_cache,
        min_score=0.5
    )
)

# LLM Oracle
results_llm.append(
    evaluate_model(
        model_name="LLM Oracle (Reranked)",
        recommend_fn=lambda idx: llm_rerank_recommend(idx, df, llm_cache, top_k=5),
        df=df,
        embeddings_for_diversity=embeddings_dict["bge"],
        gt_type="llm",
        cache=llm_cache,
        min_score=0.5
    )
)

print(pd.DataFrame(results_llm).to_string(index=False))

Оценка по LLM-based ground truth (Recall@5, min_score=0.5)


[32m[I 2026-01-16 18:28:28,986][0m A new study created in memory with name: no-name-9d5e45fa-d463-4db0-a030-98659e0e595b[0m
[32m[I 2026-01-16 18:28:32,068][0m Trial 0 finished with value: 0.31171026825904846 and parameters: {'w_sem': 0.6872700594236812, 'w_cat': 0.4753571532049581, 'w_speaker': 0.21959818254342153, 'w_companies': 0.17959754525911098}. Best is trial 0 with value: 0.31171026825904846.[0m
[32m[I 2026-01-16 18:28:35,159][0m Trial 1 finished with value: 0.3192254304885864 and parameters: {'w_sem': 0.5780093202212182, 'w_cat': 0.07799726016810132, 'w_speaker': 0.017425083650459836, 'w_companies': 0.2598528437324805}. Best is trial 1 with value: 0.3192254304885864.[0m
[32m[I 2026-01-16 18:28:38,122][0m Trial 2 finished with value: 0.31171026825904846 and parameters: {'w_sem': 0.8005575058716043, 'w_cat': 0.35403628889802274, 'w_speaker': 0.006175348288740734, 'w_companies': 0.29097295564859826}. Best is trial 1 with value: 0.3192254304885864.[0m
[32m[I 2026-01-16

                  Model  Recall@5 (LLM-GT)  Diversity  Composite
                 TF-IDF              0.208      0.958      0.433
               Word2Vec              0.048      0.108      0.066
          Sentence-BERT              0.256      0.511      0.333
                 BGE-M3              0.281      0.459      0.335
                   BM25              0.109      0.960      0.365
  Hybrid (BGE + Optuna)              0.361      0.469      0.394
Hybrid (SBERT + Optuna)              0.329      0.533      0.390
  LLM Oracle (Reranked)              1.000      0.499      0.850


In [22]:
print("Fine tuning SBERT под LLM-GT")

# Подготовка пар
train_pairs = []
texts = df["content"].tolist()
talk_ids = df["talk_id"].tolist()

for i in range(len(df)):
    for j in range(i + 1, len(df)):
        tid_i, tid_j = talk_ids[i], talk_ids[j]
        key = f"{min(tid_i, tid_j)}_{max(tid_i, tid_j)}"
        llm_score = llm_cache.get(key, 0.0)
        if llm_score >= 0.3:
            train_pairs.append((texts[i], texts[j], float(llm_score)))

print(f"Создано {len(train_pairs)} обучающих пар.")

Fine tuning SBERT под LLM-GT
Создано 265 обучающих пар.


In [23]:
class PairDataset(Dataset):
    def __init__(self, pairs):
        self.pairs = pairs
    def __len__(self):
        return len(self.pairs)
    def __getitem__(self, idx):
        return self.pairs[idx]

def collate_fn(batch):
    a, b, labels = zip(*batch)
    labels = torch.tensor(labels, dtype=torch.float)
    return list(a), list(b), labels



In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2").to(device)


In [25]:
g = torch.Generator()
g.manual_seed(42)

train_loader = DataLoader(
    PairDataset(train_pairs),
    batch_size=8,
    shuffle=True,
    collate_fn=collate_fn,
    num_workers=0, 
    generator=g
)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = nn.MSELoss()
model.train()

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [26]:
for epoch in range(5):
    total_loss = 0.0
    for texts_a, texts_b, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        optimizer.zero_grad()
        labels = labels.to(device)
        
        features_a = model.tokenize(texts_a)
        features_b = model.tokenize(texts_b)
        for key in features_a:
            features_a[key] = features_a[key].to(device)
        for key in features_b:
            features_b[key] = features_b[key].to(device)
        
        emb1 = model(features_a)['sentence_embedding']
        emb2 = model(features_b)['sentence_embedding']
        
        emb1 = torch.nn.functional.normalize(emb1, p=2, dim=1)
        emb2 = torch.nn.functional.normalize(emb2, p=2, dim=1)
        
        cos_sim = torch.sum(emb1 * emb2, dim=1)
        cos_sim_norm = (cos_sim + 1) / 2
        
        loss = criterion(cos_sim_norm, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f"Epoch {epoch+1}, Avg Loss: {total_loss / len(train_loader):.4f}")



Epoch 1:   0%|          | 0/34 [00:00<?, ?it/s]

Epoch 1, Avg Loss: 0.0532


Epoch 2:   0%|          | 0/34 [00:00<?, ?it/s]

Epoch 2, Avg Loss: 0.0277


Epoch 3:   0%|          | 0/34 [00:00<?, ?it/s]

Epoch 3, Avg Loss: 0.0196


Epoch 4:   0%|          | 0/34 [00:00<?, ?it/s]

Epoch 4, Avg Loss: 0.0151


Epoch 5:   0%|          | 0/34 [00:00<?, ?it/s]

Epoch 5, Avg Loss: 0.0119


In [27]:
# Получение эмбеддингов 
fine_tuned_embs_sbert = model.encode(df["content"].tolist(), convert_to_numpy=True)



In [28]:
# Оценка fine-tuned SBERT
result_finetuned_sbert = evaluate_model(
    model_name="SBERT (Fine-tuned on LLM-GT)",
    recommend_fn=lambda idx: recommend_by_similarity(idx, fine_tuned_embs_sbert),
    df=df,
    embeddings_for_diversity=fine_tuned_embs_sbert,
    gt_type="llm",
    cache=llm_cache,
    min_score=0.5
)
results_llm.append(result_finetuned_sbert)

In [31]:
print("Оптимизация гибридной модели на Fine-tuned SBERT")
best_weights_hybrid_sbert_ft, _ = run_optimization(
    df, fine_tuned_embs_sbert, gt_type="llm", cache=llm_cache, n_trials=30, min_score=0.5
)

result_hybrid_sbert_ft = evaluate_model(
    model_name="Hybrid (Fine-tuned SBERT + Optuna)",
    recommend_fn=lambda idx: hybrid_recommend(idx, df, fine_tuned_embs_sbert, **best_weights_hybrid_sbert_ft),
    df=df,
    embeddings_for_diversity=fine_tuned_embs_sbert,
    gt_type="llm",
    cache=llm_cache,
    min_score=0.5
)
results_llm.append(result_hybrid_sbert_ft)



[32m[I 2026-01-16 18:32:26,839][0m A new study created in memory with name: no-name-9db19529-87e7-4576-ac30-5bd63d2201da[0m


Оптимизация гибридной модели на Fine-tuned SBERT


[32m[I 2026-01-16 18:32:29,921][0m Trial 0 finished with value: 0.5545468330383301 and parameters: {'w_sem': 0.6872700594236812, 'w_cat': 0.4753571532049581, 'w_speaker': 0.21959818254342153, 'w_companies': 0.17959754525911098}. Best is trial 0 with value: 0.5545468330383301.[0m
[32m[I 2026-01-16 18:32:33,044][0m Trial 1 finished with value: 0.5485032796859741 and parameters: {'w_sem': 0.5780093202212182, 'w_cat': 0.07799726016810132, 'w_speaker': 0.017425083650459836, 'w_companies': 0.2598528437324805}. Best is trial 0 with value: 0.5545468330383301.[0m
[32m[I 2026-01-16 18:32:36,195][0m Trial 2 finished with value: 0.5569313168525696 and parameters: {'w_sem': 0.8005575058716043, 'w_cat': 0.35403628889802274, 'w_speaker': 0.006175348288740734, 'w_companies': 0.29097295564859826}. Best is trial 2 with value: 0.5569313168525696.[0m
[32m[I 2026-01-16 18:32:39,245][0m Trial 3 finished with value: 0.5773707628250122 and parameters: {'w_sem': 0.9162213204002109, 'w_cat': 0.106169

In [32]:
print("Финальное сравнение всех моделей(по LLM-GT)")

final_results_df = pd.DataFrame(results_llm)
print(final_results_df.to_string(index=False))

Финальное сравнение всех моделей(по LLM-GT)
                             Model  Recall@5 (LLM-GT)  Diversity  Composite
                            TF-IDF              0.208      0.958      0.433
                          Word2Vec              0.048      0.108      0.066
                     Sentence-BERT              0.256      0.511      0.333
                            BGE-M3              0.281      0.459      0.335
                              BM25              0.109      0.960      0.365
             Hybrid (BGE + Optuna)              0.361      0.469      0.394
           Hybrid (SBERT + Optuna)              0.329      0.533      0.390
             LLM Oracle (Reranked)              1.000      0.499      0.850
      SBERT (Fine-tuned on LLM-GT)              0.373      0.846      0.515
Hybrid (Fine-tuned SBERT + Optuna)              0.459      0.886      0.587
Hybrid (Fine-tuned SBERT + Optuna)              0.487      0.867      0.601
