In [None]:
# Install the packages this notebook depends on.
# NOTE(review): no versions are pinned here; consider pinning them so a re-run resolves the same packages.
%pip -q install torch datasets transformers scikit-learn pandas tqdm python-Levenshtein

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2025.3.2 requires fsspec==2025.3.2, but you have fsspec 2025.3.0 which is incompatible.[0m[31m
[0m

In [None]:
# Importar bibliotecas
import logging
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import List, Dict
from datasets import load_dataset
import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer
from google.colab import files
import json
from itertools import product
import re
from Levenshtein import distance as lev_distance

# Configurar logging para depuración
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Verificar GPU
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

# Verificar que todo esté listo
print("Bibliotecas importadas correctamente!")

# Configurar dispositivo (GPU si está disponible)
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Dispositivo:", device)

# Definir constantes para columnas
DOCUMENT_ID_COL = "document_id"
SPANS_COL = "spans"
RANK_COL = "rank"
SAMPLE_PK_COL = "pk"
TRUE_CUI_COL = "UMLS_CUI"
PREDICTION_COL = "prediction"

# Crear directorios para resultados
output_dir = '../data/results'
subdirs = ['general', 'sapbert', 'sapbert_reranked']
for subdir in subdirs:
    os.makedirs(os.path.join(output_dir, subdir), exist_ok=True)

# Cargar datos desde Hugging Face
en_data_dev = load_dataset("andorei/BioNNE-L", "English", split="dev").to_pandas()
en_data_test = load_dataset("andorei/BioNNE-L", "English", split="test").to_pandas()
vocab = load_dataset("andorei/BioNNE-L", "Vocabulary", split="train").to_pandas()

# Sanitizar datos
en_data_dev['text'] = en_data_dev['text'].fillna('').astype(str)
en_data_dev['text'] = en_data_dev['text'].fillna('').astype(str)
en_data_test['text'] = en_data_test['text'].fillna('').astype(str)
vocab['concept_name'] = vocab['concept_name'].fillna('').astype(str)

def clean_text(text):
    """Normalize a string for matching.

    Lower-cases ``text``, deletes every character that is not ``a-z``, ``0-9``
    or whitespace, and strips leading/trailing whitespace.
    """
    lowered = text.lower()
    # Characters are deleted, not replaced by a space, so "x-ray" becomes "xray".
    alphanumeric_only = re.sub(r'[^a-z0-9\s]', '', lowered)
    return alphanumeric_only.strip()

# Apply the cleaning to the mention texts and to the vocabulary concept names
en_data_dev["text"] = en_data_dev["text"].apply(clean_text)
en_data_test["text"] = en_data_test["text"].apply(clean_text)
vocab["concept_name"] = vocab["concept_name"].apply(clean_text)

# Save the datasets to CSV.
# Note: the text columns were already passed through clean_text above, so these files are not the raw data.
dev_path = os.path.join(output_dir, 'general', 'en_data_dev.csv')
en_data_dev.to_csv(dev_path, index=False)
print(f"Datos de desarrollo guardados en: {dev_path}")
#files.download(dev_path)

test_path = os.path.join(output_dir, 'general', 'en_data_test.csv')
en_data_test.to_csv(test_path, index=False)
print(f"Datos de test guardados en: {test_path}")
#files.download(test_path)

vocab_path = os.path.join(output_dir, 'general', 'vocab.csv')
vocab.to_csv(vocab_path, index=False)
print(f"Vocabulario completo guardado en: {vocab_path}")
#files.download(vocab_path)

# Keep only the English vocabulary rows and the three columns used downstream
en_vocab = vocab[vocab["lang"] == "ENG"][["CUI", "semantic_type", "concept_name"]]
print("Vocabulario en inglés:", en_vocab.shape)

# Save the filtered vocabulary
en_vocab_path = os.path.join(output_dir, 'general', 'en_vocab.csv')
en_vocab.to_csv(en_vocab_path, index=False)
print(f"Vocabulario en inglés guardado en: {en_vocab_path}")
#files.download(en_vocab_path)

# Measure mention lengths in tokens (the count includes whatever special tokens the tokenizer adds).
# NOTE(review): lengths are measured with the "andorei/gebert_eng_gat" tokenizer, while the later
# cells encode with the SapBERT tokenizer; confirm both tokenize alike before using these
# numbers to choose max_length.
tokenizer = AutoTokenizer.from_pretrained("andorei/gebert_eng_gat")
en_data_dev["text_length"] = en_data_dev["text"].apply(lambda x: len(tokenizer(x)["input_ids"]))
en_data_test["text_length"] = en_data_test["text"].apply(lambda x: len(tokenizer(x)["input_ids"]))
print("Longitud promedio de menciones (dev):", en_data_dev["text_length"].mean())
print("Longitud máxima de menciones (dev):", en_data_dev["text_length"].max())
print("Longitud promedio de menciones (test):", en_data_test["text_length"].mean())
print("Longitud máxima de menciones (test):", en_data_test["text_length"].max())

# Save the datasets again, now including the text_length column
dev_with_length_path = os.path.join(output_dir, 'general', 'en_data_dev_with_length.csv')
en_data_dev.to_csv(dev_with_length_path, index=False)
print(f"Datos de desarrollo con longitudes guardados en: {dev_with_length_path}")
#files.download(dev_with_length_path)

test_with_length_path = os.path.join(output_dir, 'general', 'en_data_test_with_length.csv')
en_data_test.to_csv(test_with_length_path, index=False)
print(f"Datos de test con longitudes guardados en: {test_with_length_path}")
#files.download(test_with_length_path)

Tue May  6 01:42:33 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   32C    P0             47W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/4.51k [00:00<?, ?B/s]

bionnel_en_train.parquet:   0%|          | 0.00/55.1k [00:00<?, ?B/s]

bionnel_en_dev.parquet:   0%|          | 0.00/50.3k [00:00<?, ?B/s]

bionnel_en_test.parquet:   0%|          | 0.00/95.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2690 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/2494 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6661 [00:00<?, ? examples/s]

bionnel_vocab_bilingual.parquet:   0%|          | 0.00/70.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4047990 [00:00<?, ? examples/s]

Datos de desarrollo guardados en: ./predictions/general/en_data_dev.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Datos de test guardados en: ./predictions/general/en_data_test.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Vocabulario completo guardado en: ./predictions/general/vocab.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Vocabulario en inglés: (3902187, 3)
Vocabulario en inglés guardado en: ./predictions/general/en_vocab.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

tokenizer_config.json:   0%|          | 0.00/476 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/706k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Longitud promedio de menciones (dev): 3.9931836407377705
Longitud máxima de menciones (dev): 15
Longitud promedio de menciones (test): 3.9791322624230596
Longitud máxima de menciones (test): 16
Datos de desarrollo con longitudes guardados en: ./predictions/general/en_data_dev_with_length.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Datos de test con longitudes guardados en: ./predictions/general/en_data_test_with_length.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# ------------------------------
# Funciones Auxiliares
# ------------------------------

def jaccard_similarity(str1, str2):
    """Return the Jaccard overlap of the whitespace-separated, lower-cased tokens of two values.

    Null values (as judged by ``pd.notnull``) are treated as empty strings.
    The result is ``0.0`` when either side is empty or whitespace-only.
    """
    left = str(str1) if pd.notnull(str1) else ""
    right = str(str2) if pd.notnull(str2) else ""
    if not left.strip() or not right.strip():
        return 0.0
    left_tokens = set(left.lower().split())
    right_tokens = set(right.lower().split())
    union_size = len(left_tokens | right_tokens)
    if union_size <= 0:
        return 0.0
    shared = len(left_tokens & right_tokens)
    return shared / union_size

def levenshtein_similarity(str1, str2):
    """Return ``1 - distance / max_len`` for two values, comparing them case-insensitively.

    ``distance`` comes from ``lev_distance`` on the lower-cased strings and
    ``max_len`` is the length of the longer input (before lower-casing).
    Null values are treated as empty strings; the result is ``0.0`` when
    either side is empty or whitespace-only.
    """
    first = str(str1) if pd.notnull(str1) else ""
    second = str(str2) if pd.notnull(str2) else ""
    if not first.strip() or not second.strip():
        return 0.0
    edit_count = lev_distance(first.lower(), second.lower())
    longest = max(len(first), len(second))
    if longest <= 0:
        return 0.0
    return 1.0 - (edit_count / longest)

def encode_names(names, bert_encoder, tokenizer, max_length, device, batch_size=256, show_progress=False):
    """Embed a list of strings with ``bert_encoder`` and return the embeddings on the CPU.

    Args:
        names: list (or NumPy array) of strings to encode.
        bert_encoder: model called with ``input_ids`` / ``attention_mask``; its
            output must expose ``last_hidden_state``.
        tokenizer: callable tokenizer matching the encoder.
        max_length: every name is padded/truncated to this many tokens.
        device: device the batches are moved to before the forward pass.
        batch_size: number of names per forward pass.
        show_progress: wrap the batch loop in a tqdm progress bar.

    Returns:
        A CPU tensor with one row per name, taken from position 0 of
        ``last_hidden_state``.
    """
    bert_encoder.eval()
    name_list = names.tolist() if isinstance(names, np.ndarray) else names
    # All names are tokenized up front; only the forward pass is batched.
    encoded = tokenizer(name_list, padding="max_length", max_length=max_length, truncation=True, return_tensors="pt")
    all_input_ids = encoded["input_ids"]
    all_attention_masks = encoded["attention_mask"]
    total = len(name_list)
    batch_starts = range(0, total, batch_size)
    if show_progress:
        batch_starts = tqdm(batch_starts, desc="Encoding names", unit="batch")
    chunks = []
    with torch.no_grad():
        for start in batch_starts:
            stop = start + batch_size
            ids_on_device = all_input_ids[start:stop].to(device)
            mask_on_device = all_attention_masks[start:stop].to(device)
            model_out = bert_encoder(input_ids=ids_on_device, attention_mask=mask_on_device)
            # Keep only the vector at sequence position 0 and move it off the device.
            chunks.append(model_out.last_hidden_state[:, 0].detach().cpu())
            del ids_on_device, mask_on_device, model_out
            torch.cuda.empty_cache()
    stacked = torch.cat(chunks, dim=0).detach()
    assert stacked.size(0) == total
    return stacked

def create_repeated_tensor(range_bound, repeat_times):
    """Return ``[0, 0, ..., 1, 1, ..., range_bound - 1, ...]`` with each value repeated ``repeat_times`` times."""
    return torch.arange(range_bound).repeat_interleave(repeat_times)

def get_torch_query_dict_score_matrix(query_names, tokenizer, bert_encoder, vocab_names, base_k, device,
                                     query_batch_size, max_length, show_progress, vocab_batch_size=256):
    """Find, for every query, the vocabulary entries with the highest cosine similarity.

    The vocabulary is encoded and scored in chunks of ``vocab_batch_size`` so the
    full query-by-vocabulary score matrix is never materialized; a running top-k
    is merged with each chunk's top-k.

    Args:
        query_names: strings to look up.
        tokenizer, bert_encoder: passed through to ``encode_names``.
        vocab_names: sliceable sequence of vocabulary strings.
        base_k: number of candidates to keep per query.
        device: device used for encoding and scoring.
        query_batch_size: batch size for encoding the queries.
        max_length: token length passed to ``encode_names``.
        show_progress: show a progress bar while encoding the queries.
        vocab_batch_size: number of vocabulary entries scored at a time.

    Returns:
        ``{"best_scores": Tensor, "best_indices": Tensor}``, both of shape
        ``(num_queries, min(base_k, len(vocab_names)))`` and sorted by descending
        score; indices are positions in ``vocab_names``. Both values are ``None``
        when ``vocab_names`` is empty.
    """
    bert_encoder.eval()
    num_queries = len(query_names)
    vocab_length = len(vocab_names)
    query_embs = encode_names(names=query_names, bert_encoder=bert_encoder, tokenizer=tokenizer,
                              max_length=max_length, device=device, batch_size=query_batch_size,
                              show_progress=show_progress).unsqueeze(1).to(device).detach()
    assert num_queries == len(query_embs)
    overall_max = None
    overall_max_indices = None
    with torch.no_grad():
        for vocab_start_pos in tqdm(range(0, vocab_length, vocab_batch_size), desc="Processing vocab batches"):
            vocab_end_pos = min(vocab_start_pos + vocab_batch_size, vocab_length)
            batch_vocab_names = vocab_names[vocab_start_pos:vocab_end_pos]
            batch_vocab_embeddings = encode_names(names=batch_vocab_names, bert_encoder=bert_encoder,
                                                 tokenizer=tokenizer, max_length=max_length,
                                                 device=device, batch_size=vocab_batch_size, show_progress=False).to(device)
            batch_score_matrix = F.cosine_similarity(query_embs, batch_vocab_embeddings.unsqueeze(0), dim=-1)
            assert batch_score_matrix.shape == (num_queries, vocab_end_pos - vocab_start_pos)
            k = min(base_k, vocab_end_pos - vocab_start_pos)
            b_max, b_indices = torch.topk(batch_score_matrix, k=k, dim=1)
            # Convert chunk-local positions into positions in the full vocabulary.
            b_indices = b_indices + vocab_start_pos
            if overall_max is None:
                # First chunk: its top-k IS the running top-k. Merging it with itself
                # (as the previous code did) duplicated every candidate and pushed
                # half of the first chunk's candidates out of the pool.
                overall_max, overall_max_indices = b_max, b_indices
            else:
                concat_max = torch.cat((overall_max, b_max), dim=1)
                concat_indices = torch.cat((overall_max_indices, b_indices), dim=1)
                # The pool can be smaller than base_k (tiny vocabulary or small chunks).
                merged_k = min(base_k, concat_max.size(1))
                overall_max, local_indices = torch.topk(concat_max, k=merged_k, dim=1)
                # Pick, row by row, the vocabulary index that belongs to each kept score.
                overall_max_indices = torch.gather(concat_indices, 1, local_indices)
                del concat_max, concat_indices, local_indices
            del batch_vocab_embeddings, batch_score_matrix, b_max, b_indices
            torch.cuda.empty_cache()
    return {"best_scores": overall_max, "best_indices": overall_max_indices}

def create_row_primary_key(row):
    """Build the ``"<document_id>|<spans>"`` key that identifies one mention row."""
    key_parts = (row[DOCUMENT_ID_COL], row[SPANS_COL])
    for part in key_parts:
        # '|' is the separator, so it must not occur inside either component.
        assert '|' not in str(part)
    return f"{key_parts[0]}|{key_parts[1]}"

def create_sample_pk2_true_cui_map(df: pd.DataFrame, true_cui_column) -> Dict[str, str]:
    """Map each row's primary key to the value in ``true_cui_column``.

    Rows sharing a primary key overwrite one another; the last row wins.
    """
    pk_to_cui = {}
    for _index, record in df.iterrows():
        mention_pk = create_row_primary_key(record)
        gold_cui = record[true_cui_column]
        # Same separator rule as for the key components.
        assert '|' not in gold_cui
        pk_to_cui.update({mention_pk: gold_cui})
    return pk_to_cui

def calculate_metrics(pred_df: pd.DataFrame, sample_pk2true_cui: Dict[str, str]):
    """Compute Acc@1, Acc@5 and MRR of ranked predictions against gold CUIs.

    Args:
        pred_df: one row per (mention, candidate) holding the rank and the
            predicted CUI; the primary-key column is derived when missing.
        sample_pk2true_cui: gold CUI for every mention to be scored. Predictions
            for keys outside this mapping are ignored; mapped mentions without a
            correct prediction count as misses.

    Returns:
        Dict with keys ``"Acc@1"``, ``"Acc@5"`` and ``"MRR"`` (all ``0.0`` when
        the mapping is empty).
    """
    if SAMPLE_PK_COL not in pred_df.columns:
        pred_df = pred_df.copy()
        pred_df[SAMPLE_PK_COL] = pred_df.apply(create_row_primary_key, axis=1)

    # Best (smallest) rank at which the gold CUI was predicted, per mention.
    best_hit_rank = {}
    for _, pred_row in pred_df.iterrows():
        mention_pk = pred_row[SAMPLE_PK_COL]
        rank = pred_row[RANK_COL]
        predicted_cui = pred_row[PREDICTION_COL]
        if mention_pk in sample_pk2true_cui and predicted_cui == sample_pk2true_cui[mention_pk]:
            best_hit_rank[mention_pk] = min(best_hit_rank.get(mention_pk, rank), rank)

    top1_hits = top5_hits = reciprocal_rank_sum = 0.
    total = len(sample_pk2true_cui)
    for mention_pk in sample_pk2true_cui:
        rank = best_hit_rank.get(mention_pk, -1)
        # Ranks are 1-based; -1 marks "gold CUI never predicted".
        assert rank != 0
        if rank == -1:
            continue
        top1_hits += 1. if rank == 1 else 0.
        top5_hits += 1. if rank <= 5 else 0.
        reciprocal_rank_sum += 1. / rank

    if total <= 0:
        return {"Acc@1": 0.0, "Acc@5": 0.0, "MRR": 0.0}
    return {
        "Acc@1": top1_hits / total,
        "Acc@5": top5_hits / total,
        "MRR": reciprocal_rank_sum / total
    }

def make_predictions(entities_df, tokenizer, bert_encoder, vocab, max_length, k, device, query_batch_size, vocab_batch_size, model_name):
    """Retrieve the top-``k`` vocabulary CUIs for every mention, one entity type at a time.

    Mentions of type DISO, CHEM and ANATOMY are matched only against vocabulary
    rows whose ``semantic_type`` equals the mention's ``entity_type``; mentions of
    any other type produce no rows.

    Args:
        entities_df: mentions with columns ``document_id``, ``spans``, ``text``
            and ``entity_type``.
        tokenizer, bert_encoder: encoder pair passed to the retrieval helper.
        vocab: vocabulary with columns ``CUI``, ``semantic_type`` and ``concept_name``.
        max_length: token length used when encoding names.
        k: number of candidates per mention.
        device: device used for encoding/scoring.
        query_batch_size, vocab_batch_size: batch sizes for the retrieval helper.
        model_name: value written to the ``model`` column.

    Returns:
        DataFrame with columns ``document_id``, ``spans``, ``rank`` (1-based),
        ``prediction``, ``cosine_score`` and ``model``. The columns are present
        even when there are no rows.
    """
    output_columns = ["document_id", "spans", "rank", "prediction", "cosine_score", "model"]
    predictions_list = []
    for entity_type in ("DISO", "CHEM", "ANATOMY"):
        subset_df = entities_df[entities_df["entity_type"] == entity_type]
        if len(subset_df) == 0:
            # Nothing to look up for this type; encoding an empty query list would fail.
            continue
        document_ids = subset_df["document_id"].values
        query_names = subset_df["text"].values
        spans = subset_df["spans"].values
        subset_vocab = vocab[vocab["semantic_type"] == entity_type]
        vocab_names = subset_vocab["concept_name"].values
        vocab_cuis = subset_vocab["CUI"].values
        if len(vocab_names) == 0:
            # No candidates exist for this type: emit k placeholder predictions per mention.
            for doc_id, sp in zip(document_ids, spans):
                for rank in range(1, k + 1):
                    predictions_list.append({
                        "document_id": doc_id,
                        "spans": sp,
                        "rank": rank,
                        "prediction": "CUILESS",
                        # Explicit NaN keeps the schema identical to the scored branch.
                        "cosine_score": float("nan"),
                        "model": model_name
                    })
            continue
        torch.cuda.empty_cache()
        pred_d = get_torch_query_dict_score_matrix(
            query_names=query_names,
            tokenizer=tokenizer,
            bert_encoder=bert_encoder,
            vocab_names=vocab_names,
            base_k=k,
            device=device,
            query_batch_size=query_batch_size,
            max_length=max_length,
            show_progress=True,
            vocab_batch_size=vocab_batch_size
        )
        # One bulk transfer to Python lists instead of an .item() call per element.
        pred_indices = pred_d["best_indices"].tolist()
        pred_scores = pred_d["best_scores"].tolist()
        assert len(pred_indices) == len(query_names) == len(spans) == len(document_ids)
        for doc_id, index_row, score_row, sp in zip(document_ids, pred_indices, pred_scores, spans):
            for rank, (vocab_index, score) in enumerate(zip(index_row, score_row), start=1):
                predictions_list.append({
                    "document_id": doc_id,
                    "spans": sp,
                    "rank": rank,
                    "prediction": vocab_cuis[vocab_index],
                    "cosine_score": score,
                    "model": model_name
                })
    return pd.DataFrame(predictions_list, columns=output_columns)

def rerank_predictions(predictions_df, entities_df, vocab, k_final=5, cosine_weight=0.7, jaccard_weight=0.2, levenshtein_weight=0.1, model_name="reranked"):
    """Re-rank retrieved candidates with a weighted mix of cosine, Jaccard and Levenshtein scores.

    Each mention (``document_id`` + ``spans``) is processed exactly once, using the
    vocabulary of its own ``entity_type``. Previously predictions were selected per
    type by ``document_id`` alone, so mentions in documents containing several
    entity types were re-ranked once per type (against the wrong vocabulary) and
    emitted as duplicate rows.

    Args:
        predictions_df: candidates with columns ``document_id``, ``spans``,
            ``prediction`` and ``cosine_score``.
        entities_df: mentions with columns ``document_id``, ``spans``, ``text``
            and ``entity_type``.
        vocab: vocabulary with columns ``CUI``, ``semantic_type`` and ``concept_name``.
        k_final: number of candidates kept per mention.
        cosine_weight, jaccard_weight, levenshtein_weight: weights of the three scores.
        model_name: used to derive the sub-directory of ``output_dir`` for the debug log.

    Returns:
        DataFrame with columns ``document_id``, ``spans``, ``rank`` (1-based) and
        ``prediction``.

    Side effects:
        Writes ``rerank_debug_log.json`` under ``output_dir`` and prints its path.
    """
    entity_types = ("DISO", "CHEM", "ANATOMY")
    reranked_list = []
    debug_log = []

    # CUI -> concept name, per entity type.
    # NOTE(review): a CUI with several names keeps only the last one seen, so the
    # lexical scores may be computed against a different synonym than the one that
    # was retrieved - confirm whether this is intended.
    vocab_dict = {}
    for entity_type in entity_types:
        subset = vocab[vocab["semantic_type"] == entity_type]
        vocab_dict[entity_type] = dict(zip(subset["CUI"], subset["concept_name"]))

    # (document_id, spans) -> (entity_type, text); built once instead of filtering
    # entities_df for every mention. The first matching row wins.
    mention_info = {}
    for doc_id, sp, entity_type, text in zip(entities_df["document_id"], entities_df["spans"],
                                             entities_df["entity_type"], entities_df["text"]):
        mention_info.setdefault((doc_id, sp), (entity_type, text))

    pred_keys = list(zip(predictions_df["document_id"], predictions_df["spans"]))
    pred_entity_types = pd.Series(
        [mention_info.get(key, (None, None))[0] for key in pred_keys],
        index=predictions_df.index, dtype=object
    )

    # Report predictions that cannot be tied to any mention (they are skipped).
    for doc_id, sp in dict.fromkeys(key for key in pred_keys if key not in mention_info):
        print(f"No se encontró query_text para doc_id: {doc_id}, spans: {sp}")

    for entity_type in entity_types:
        type_mask = (pred_entity_types == entity_type).to_numpy()
        type_preds = predictions_df[type_mask]
        type_vocab = vocab_dict[entity_type]

        for (doc_id, sp), subgroup in type_preds.groupby(["document_id", "spans"]):
            query_text = mention_info[(doc_id, sp)][1]
            if pd.isna(query_text) or not isinstance(query_text, str):
                print(f"query_text inválido para doc_id: {doc_id}, spans: {sp}, query_text: {query_text}")
                query_text = ""

            candidates = subgroup[["prediction", "cosine_score"]].copy()
            concept_names = [type_vocab.get(cui, "") for cui in candidates["prediction"]]
            candidates["jaccard_score"] = [jaccard_similarity(query_text, name) for name in concept_names]
            candidates["levenshtein_score"] = [levenshtein_similarity(query_text, name) for name in concept_names]
            candidates["combined_score"] = (
                cosine_weight * candidates["cosine_score"] +
                jaccard_weight * candidates["jaccard_score"] +
                levenshtein_weight * candidates["levenshtein_score"]
            )

            # Stable sort: ties keep their incoming (retrieval) order, so results are reproducible.
            candidates = candidates.sort_values("combined_score", ascending=False, kind="stable").head(k_final)

            for rank, row in enumerate(candidates.itertuples(), 1):
                reranked_list.append({
                    "document_id": doc_id,
                    "spans": sp,
                    "rank": rank,
                    "prediction": row.prediction
                })

            debug_log.append({
                "doc_id": doc_id,
                "spans": sp,
                "query_text": query_text,
                "candidates": candidates[["prediction", "cosine_score", "jaccard_score", "levenshtein_score", "combined_score"]].to_dict()
            })

    reranked_df = pd.DataFrame(reranked_list, columns=["document_id", "spans", "rank", "prediction"])
    debug_dir = os.path.join(output_dir, model_name.lower().replace('_reranked', ''))
    # The target directory is not guaranteed to exist (e.g. with the default model_name).
    os.makedirs(debug_dir, exist_ok=True)
    debug_log_path = os.path.join(debug_dir, 'rerank_debug_log.json')
    with open(debug_log_path, 'w') as f:
        json.dump(debug_log, f)
    print(f"Log de depuración guardado en: {debug_log_path}")
    #files.download(debug_log_path)
    return reranked_df

def grid_search_reranking(predictions_df, entities_df, vocab, k_final, sample_pk2true_cui, cosine_weights, jaccard_weights, levenshtein_weights):
    """Grid-search the three re-ranking weights and return the combination with the best Acc@1.

    Only combinations whose weights sum to 1 (within 1e-6) are evaluated. Every
    evaluated combination is written to
    ``<output_dir>/sapbert_reranked/reranking_grid_search.csv`` together with the
    overall metrics and the per-entity-type Acc@1.

    Args:
        predictions_df, entities_df, vocab, k_final: forwarded to ``rerank_predictions``.
        sample_pk2true_cui: gold CUI per mention primary key; defines which
            mentions are scored, both overall and per entity type.
        cosine_weights, jaccard_weights, levenshtein_weights: candidate values
            for each weight.

    Returns:
        ``(cosine_weight, jaccard_weight, levenshtein_weight)`` of the best row.

    Raises:
        ValueError: if no weight combination sums to 1.
    """
    valid_combinations = [
        (cw, jw, lw)
        for cw, jw, lw in product(cosine_weights, jaccard_weights, levenshtein_weights)
        if abs(cw + jw + lw - 1.0) < 1e-6
    ]
    if not valid_combinations:
        raise ValueError("No combination of cosine/jaccard/levenshtein weights sums to 1.0")

    entity_types = ("DISO", "CHEM", "ANATOMY")
    # The per-type gold maps do not depend on the weights, so build them once.
    # They are restricted to the mentions in sample_pk2true_cui so that the
    # per-type scores use the same evaluation set as the overall metrics.
    type_pk2true_cui = {}
    for entity_type in entity_types:
        subset_df = entities_df[entities_df["entity_type"] == entity_type]
        full_map = create_sample_pk2_true_cui_map(subset_df, "UMLS_CUI")
        type_pk2true_cui[entity_type] = {
            pk: cui for pk, cui in full_map.items() if pk in sample_pk2true_cui
        }

    results = []
    for cw, jw, lw in tqdm(valid_combinations, desc="Grid Search"):
        reranked_df = rerank_predictions(
            predictions_df=predictions_df,
            entities_df=entities_df,
            vocab=vocab,
            k_final=k_final,
            cosine_weight=cw,
            jaccard_weight=jw,
            levenshtein_weight=lw,
            model_name="SapBERT_reranked"
        )
        if SAMPLE_PK_COL not in reranked_df.columns:
            reranked_df[SAMPLE_PK_COL] = reranked_df.apply(create_row_primary_key, axis=1)
        metrics = calculate_metrics(reranked_df, sample_pk2true_cui)
        subset_metrics = {}
        for entity_type in entity_types:
            pk2true_cui = type_pk2true_cui[entity_type]
            # Only rows for this type's mentions can contribute to its score.
            subset_preds = reranked_df[reranked_df[SAMPLE_PK_COL].isin(set(pk2true_cui))]
            subset_metrics[entity_type] = calculate_metrics(subset_preds, pk2true_cui)
        results.append({
            "cosine_weight": cw,
            "jaccard_weight": jw,
            "levenshtein_weight": lw,
            "Acc@1": metrics["Acc@1"],
            "Acc@5": metrics["Acc@5"],
            "MRR": metrics["MRR"],
            "DISO_Acc@1": subset_metrics["DISO"]["Acc@1"],
            "CHEM_Acc@1": subset_metrics["CHEM"]["Acc@1"],
            "ANATOMY_Acc@1": subset_metrics["ANATOMY"]["Acc@1"]
        })
    results_df = pd.DataFrame(results)
    results_df.to_csv(os.path.join(output_dir, 'sapbert_reranked', 'reranking_grid_search.csv'), index=False)
    #files.download(os.path.join(output_dir, 'sapbert_reranked', 'reranking_grid_search.csv'))
    # idxmax returns the first row with the maximum Acc@1.
    best_result = results_df.loc[results_df["Acc@1"].idxmax()]
    print(f"Mejor combinación: cosine_weight={best_result['cosine_weight']}, jaccard_weight={best_result['jaccard_weight']}, levenshtein_weight={best_result['levenshtein_weight']}")
    print(f"Métricas: Acc@1={best_result['Acc@1']:.4f}, Acc@5={best_result['Acc@5']:.4f}, MRR={best_result['MRR']:.4f}")
    return best_result["cosine_weight"], best_result["jaccard_weight"], best_result["levenshtein_weight"]


In [None]:
# ------------------------------
# Block 2: SapBERT on the development set
# ------------------------------

print("\nCargando SapBERT...")
sapbert_encoder = AutoModel.from_pretrained("cambridgeltl/SapBERT-from-PubMedBERT-fulltext").to(device)
sapbert_tokenizer = AutoTokenizer.from_pretrained("cambridgeltl/SapBERT-from-PubMedBERT-fulltext")

# Retrieval parameters
query_batch_size = 128
vocab_batch_size = 200
max_length = 16   # token length every name is padded/truncated to
k_initial = 50    # candidates retrieved per mention
k_final = 5       # candidates kept after re-ranking

print("\nGenerando predicciones con SapBERT para dev...")
sapbert_predictions_df = make_predictions(
    entities_df=en_data_dev,
    tokenizer=sapbert_tokenizer,
    bert_encoder=sapbert_encoder,
    vocab=en_vocab,
    max_length=max_length,
    k=k_initial,
    device=device,
    query_batch_size=query_batch_size,
    vocab_batch_size=vocab_batch_size,
    model_name="SapBERT"
)

sapbert_predictions_path = os.path.join(output_dir, 'sapbert', 'en_predictions_sapbert_dev.tsv')
sapbert_predictions_df.to_csv(sapbert_predictions_path, sep='\t', index=False)
print(f"Predicciones SapBERT para dev guardadas en: {sapbert_predictions_path}")
#files.download(sapbert_predictions_path)

# Add the "<document_id>|<spans>" primary key to both frames so they can be joined.
sapbert_predictions_df["pk"] = sapbert_predictions_df.apply(create_row_primary_key, axis=1)
en_data_dev["pk"] = en_data_dev.apply(create_row_primary_key, axis=1)

# Join gold mentions with their predictions (one row per mention and candidate) and
# drop mentions whose gold label is CUILESS so they are not scored.
# NOTE(review): merge() defaults to an inner join, so mentions without any prediction
# are presumably excluded from the evaluation set as well - confirm this is intended.
sapbert_eval_data = en_data_dev.merge(sapbert_predictions_df[["pk", "rank", "prediction", "model"]], on="pk")
sapbert_eval_data = sapbert_eval_data[sapbert_eval_data["UMLS_CUI"] != "CUILESS"]

sapbert_eval_data_path = os.path.join(output_dir, 'sapbert', 'eval_data_sapbert.csv')
sapbert_eval_data.to_csv(sapbert_eval_data_path, index=False)
print(f"Datos de evaluación SapBERT guardados en: {sapbert_eval_data_path}")
#files.download(sapbert_eval_data_path)

# Gold CUI per mention primary key; this mapping defines the evaluation set.
sapbert_pk2true_cui = create_sample_pk2_true_cui_map(sapbert_eval_data, "UMLS_CUI")

sapbert_pk2true_cui_path = os.path.join(output_dir, 'sapbert', 'pk2true_cui_sapbert.json')
with open(sapbert_pk2true_cui_path, 'w') as f:
    json.dump(sapbert_pk2true_cui, f)
print(f"Diccionario pk2true_cui SapBERT guardado en: {sapbert_pk2true_cui_path}")
#files.download(sapbert_pk2true_cui_path)

# Baseline metrics (cosine ranking only, before re-ranking)
sapbert_metrics = calculate_metrics(sapbert_predictions_df, sapbert_pk2true_cui)
print("Resultados de SapBERT:", sapbert_metrics)


Cargando SapBERT...


config.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]


Generando predicciones con SapBERT para dev...


Encoding names: 100%|██████████| 9/9 [00:00<00:00, 13.29batch/s]
Processing vocab batches: 100%|██████████| 9126/9126 [08:08<00:00, 18.69it/s]
Encoding names: 100%|██████████| 5/5 [00:00<00:00, 41.32batch/s]
Processing vocab batches: 100%|██████████| 8661/8661 [07:22<00:00, 19.56it/s]
Encoding names: 100%|██████████| 8/8 [00:00<00:00, 40.45batch/s]
Processing vocab batches: 100%|██████████| 1726/1726 [01:31<00:00, 18.97it/s]


Predicciones SapBERT para dev guardadas en: ./predictions/sapbert/en_predictions_sapbert_dev.tsv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Datos de evaluación SapBERT guardados en: ./predictions/sapbert/eval_data_sapbert.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Diccionario pk2true_cui SapBERT guardado en: ./predictions/sapbert/pk2true_cui_sapbert.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Resultados de SapBERT: {'Acc@1': 0.6835073068893528, 'Acc@5': 0.7974947807933194, 'MRR': 0.7354674925823039}


In [None]:
# ------------------------------
# Block 3: Re-ranking SapBERT candidates (cosine + Jaccard + Levenshtein) on the development set
# ------------------------------

print("\nRe-ranking SapBERT para dev...")
# Candidate weights; the grid search only evaluates combinations that sum to 1.
cosine_weights = [0.5, 0.6, 0.7, 0.8]
jaccard_weights = [0.1, 0.2, 0.3]
levenshtein_weights = [0.1, 0.2]
# NOTE(review): the weights are selected on the dev split and the metrics printed at the
# end of this cell are computed on that same split, so they are optimistic.
best_cosine_weight, best_jaccard_weight, best_levenshtein_weight = grid_search_reranking(
    predictions_df=sapbert_predictions_df,
    entities_df=en_data_dev,
    vocab=en_vocab,
    k_final=k_final,
    sample_pk2true_cui=sapbert_pk2true_cui,
    cosine_weights=cosine_weights,
    jaccard_weights=jaccard_weights,
    levenshtein_weights=levenshtein_weights
)

# Re-rank once more with the selected weights to obtain the final dev predictions.
sapbert_reranked_df = rerank_predictions(
    predictions_df=sapbert_predictions_df,
    entities_df=en_data_dev,
    vocab=en_vocab,
    k_final=k_final,
    cosine_weight=best_cosine_weight,
    jaccard_weight=best_jaccard_weight,
    levenshtein_weight=best_levenshtein_weight,
    model_name="SapBERT_reranked"
)

sapbert_reranked_df["pk"] = sapbert_reranked_df.apply(create_row_primary_key, axis=1)
sapbert_reranked_path = os.path.join(output_dir, 'sapbert_reranked', 'en_predictions_sapbert_reranked_dev.tsv')
sapbert_reranked_df.to_csv(sapbert_reranked_path, sep='\t', index=False)
print(f"Predicciones SapBERT reranked para dev guardadas en: {sapbert_reranked_path}")
#files.download(sapbert_reranked_path)

sapbert_reranked_metrics = calculate_metrics(sapbert_reranked_df, sapbert_pk2true_cui)
print("Resultados de SapBERT reranked:", sapbert_reranked_metrics)


Re-ranking SapBERT para dev...


Grid Search:   0%|          | 0/6 [00:00<?, ?it/s]

Log de depuración guardado en: ./predictions/sapbert/rerank_debug_log.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Grid Search:  17%|█▋        | 1/6 [00:34<02:53, 34.80s/it]

Log de depuración guardado en: ./predictions/sapbert/rerank_debug_log.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Grid Search:  33%|███▎      | 2/6 [01:09<02:18, 34.67s/it]

Log de depuración guardado en: ./predictions/sapbert/rerank_debug_log.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Grid Search:  50%|█████     | 3/6 [01:44<01:43, 34.66s/it]

Log de depuración guardado en: ./predictions/sapbert/rerank_debug_log.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Grid Search:  67%|██████▋   | 4/6 [02:18<01:09, 34.69s/it]

Log de depuración guardado en: ./predictions/sapbert/rerank_debug_log.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Grid Search:  83%|████████▎ | 5/6 [02:53<00:34, 34.62s/it]

Log de depuración guardado en: ./predictions/sapbert/rerank_debug_log.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Grid Search: 100%|██████████| 6/6 [03:28<00:00, 34.71s/it]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Mejor combinación: cosine_weight=0.7, jaccard_weight=0.1, levenshtein_weight=0.2
Métricas: Acc@1=0.7203, Acc@5=0.8054, MRR=0.7535
Log de depuración guardado en: ./predictions/sapbert/rerank_debug_log.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Predicciones SapBERT reranked para dev guardadas en: ./predictions/sapbert_reranked/en_predictions_sapbert_reranked_dev.tsv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Resultados de SapBERT reranked: {'Acc@1': 0.7202505219206681, 'Acc@5': 0.8054279749478079, 'MRR': 0.7534725121781489}


In [None]:
# ------------------------------
# Block 4: SapBERT on the test set
# ------------------------------

# Reuses the encoder, tokenizer and retrieval parameters defined in the dev cell.
print("\nGenerando predicciones con SapBERT para test...")
sapbert_predictions_test_df = make_predictions(
    entities_df=en_data_test,
    tokenizer=sapbert_tokenizer,
    bert_encoder=sapbert_encoder,
    vocab=en_vocab,
    max_length=max_length,
    k=k_initial,
    device=device,
    query_batch_size=query_batch_size,
    vocab_batch_size=vocab_batch_size,
    model_name="SapBERT"
)

sapbert_predictions_test_path = os.path.join(output_dir, 'sapbert', 'en_predictions_sapbert_test.tsv')
sapbert_predictions_test_df.to_csv(sapbert_predictions_test_path, sep='\t', index=False)
print(f"Predicciones SapBERT para test guardadas en: {sapbert_predictions_test_path}")
#files.download(sapbert_predictions_test_path)


Generando predicciones con SapBERT para test...


Encoding names: 100%|██████████| 24/24 [00:00<00:00, 35.66batch/s]
Processing vocab batches: 100%|██████████| 9126/9126 [22:40<00:00,  6.71it/s]
Encoding names: 100%|██████████| 11/11 [00:00<00:00, 33.56batch/s]
Processing vocab batches: 100%|██████████| 8661/8661 [09:44<00:00, 14.81it/s]
Encoding names: 100%|██████████| 18/18 [00:00<00:00, 38.14batch/s]
Processing vocab batches: 100%|██████████| 1726/1726 [03:08<00:00,  9.14it/s]


Predicciones SapBERT para test guardadas en: ./predictions/sapbert/en_predictions_sapbert_test.tsv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# ------------------------------
# Block 5: Re-ranking SapBERT candidates (cosine + Jaccard + Levenshtein) on the test set
# ------------------------------

# Applies the weights selected by the dev grid search; no metrics are computed here.
print("\nRe-ranking SapBERT para test...")
sapbert_reranked_test_df = rerank_predictions(
    predictions_df=sapbert_predictions_test_df,
    entities_df=en_data_test,
    vocab=en_vocab,
    k_final=k_final,
    cosine_weight=best_cosine_weight,
    jaccard_weight=best_jaccard_weight,
    levenshtein_weight=best_levenshtein_weight,
    model_name="SapBERT_reranked"
)

sapbert_reranked_test_df["pk"] = sapbert_reranked_test_df.apply(create_row_primary_key, axis=1)
sapbert_reranked_test_path = os.path.join(output_dir, 'sapbert_reranked', 'en_predictions_sapbert_reranked_test.tsv')
sapbert_reranked_test_df.to_csv(sapbert_reranked_test_path, sep='\t', index=False)
print(f"Predicciones SapBERT reranked para test guardadas en: {sapbert_reranked_test_path}")
#files.download(sapbert_reranked_test_path)


Re-ranking SapBERT para test...
Log de depuración guardado en: ./predictions/sapbert/rerank_debug_log.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Predicciones SapBERT reranked para test guardadas en: ./predictions/sapbert_reranked/en_predictions_sapbert_reranked_test.tsv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Point the reader to the re-ranked test predictions file, i.e. the one to submit.
# NOTE(review): the message mentions a downloaded file, but the files.download(...) calls
# in the earlier cells are commented out, so the file only exists at the path printed below.
print("\nPredicciones listas para subir. Revisa el archivo descargado:")
print(f"- {sapbert_reranked_test_path}")


Predicciones listas para subir. Revisa el archivo descargado:
- ./predictions/sapbert_reranked/en_predictions_sapbert_reranked_test.tsv
