In [None]:
# ------------------------------
# Phase 1: Environment setup and data loading
# ------------------------------

# Install the required dependencies (-q keeps pip quiet; note that no versions are pinned)
%pip -q install torch datasets transformers scikit-learn pandas tqdm

In [None]:
import os

# Create the results directory tree: one sub-folder per model / pipeline stage.
# exist_ok=True makes this cell safe to re-run.
output_dir = '../data'
subdirs = ['general', 'sapbert', 'pubmedbert', 'sapbert_reranked', 'pubmedbert_reranked', 'stanford', 'stanford_reranked']
for subdir in subdirs:
    os.makedirs(os.path.join(output_dir, subdir), exist_ok=True)

In [None]:
# Importar bibliotecas
import logging
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import List, Dict
from datasets import load_dataset
import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from google.colab import files
import json
from itertools import product
import re

# Configure logging for debugging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Check for a GPU: capture the output of `nvidia-smi` (IPython shell capture returns a list of lines)
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

# Confirm that the imports above succeeded
print("Bibliotecas importadas correctamente!")

# Select the compute device (GPU when available, otherwise CPU)
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Dispositivo:", device)

# Column-name constants shared by the helper functions defined later in the notebook
DOCUMENT_ID_COL = "document_id"
SPANS_COL = "spans"
RANK_COL = "rank"
SAMPLE_PK_COL = "pk"
TRUE_CUI_COL = "UMLS_CUI"
PREDICTION_COL = "prediction"

# Create the results directory tree (repeats the previous cell so this cell also works on its own)
output_dir = '../data'
subdirs = ['general', 'sapbert', 'pubmedbert', 'sapbert_reranked', 'pubmedbert_reranked', 'stanford', 'stanford_reranked']
for subdir in subdirs:
    os.makedirs(os.path.join(output_dir, subdir), exist_ok=True)

# Load the BioNNE-L English train/dev splits and the concept vocabulary from the Hugging Face Hub
en_data_train = load_dataset("andorei/BioNNE-L", "English", split="train").to_pandas()
en_data_dev = load_dataset("andorei/BioNNE-L", "English", split="dev").to_pandas()
vocab = load_dataset("andorei/BioNNE-L", "Vocabulary", split="train").to_pandas()

# Sanitize the text columns: replace missing values with '' and force the str type
en_data_train['text'] = en_data_train['text'].fillna('').astype(str)
en_data_dev['text'] = en_data_dev['text'].fillna('').astype(str)
vocab['concept_name'] = vocab['concept_name'].fillna('').astype(str)


def clean_text(text):
    """Normalize a string for matching.

    The text is lowercased, every character that is not an ASCII letter
    (a-z), a digit or whitespace is deleted (not replaced by a space), and
    leading/trailing whitespace is trimmed.
    """
    return re.sub(r'[^a-z0-9\s]', '', text.lower()).strip()

# Apply clean_text to the mention texts and to the vocabulary concept names (overwrites the columns in place)
en_data_train["text"] = en_data_train["text"].apply(clean_text)
en_data_dev["text"] = en_data_dev["text"].apply(clean_text)
vocab["concept_name"] = vocab["concept_name"].apply(clean_text)

# Save the datasets. NOTE: at this point the text columns have already been cleaned above,
# so these files do not contain the raw text.
train_path = os.path.join(output_dir, 'general', 'en_data_train.csv')
en_data_train.to_csv(train_path, index=False)
print(f"Datos de entrenamiento guardados en: {train_path}")
#files.download(train_path)

dev_path = os.path.join(output_dir, 'general', 'en_data_dev.csv')
en_data_dev.to_csv(dev_path, index=False)
print(f"Datos de desarrollo guardados en: {dev_path}")
#files.download(dev_path)

vocab_path = os.path.join(output_dir, 'general', 'vocab.csv')
vocab.to_csv(vocab_path, index=False)
print(f"Vocabulario completo guardado en: {vocab_path}")
#files.download(vocab_path)

# Explore the data: shapes and first rows of each table
print("\nExploración de datos:")
print("Entrenamiento (en):", en_data_train.shape)
print(en_data_train.head())
print("Desarrollo (en):", en_data_dev.shape)
print(en_data_dev.head())
print("Vocabulario completo:", vocab.shape)
print(vocab.head())

# Keep only the English vocabulary rows (lang == "ENG") and the three columns used downstream
en_vocab = vocab[vocab["lang"] == "ENG"][["CUI", "semantic_type", "concept_name"]]
print("Vocabulario en inglés:", en_vocab.shape)

# Save the filtered vocabulary
en_vocab_path = os.path.join(output_dir, 'general', 'en_vocab.csv')
en_vocab.to_csv(en_vocab_path, index=False)
print(f"Vocabulario en inglés guardado en: {en_vocab_path}")
#files.download(en_vocab_path)

# Measure mention lengths in tokens (len of input_ids as produced by this tokenizer).
# NOTE: this tokenizer is only used for these statistics; the model cells load their own tokenizers.
tokenizer = AutoTokenizer.from_pretrained("andorei/gebert_eng_gat")
en_data_dev["text_length"] = en_data_dev["text"].apply(lambda x: len(tokenizer(x)["input_ids"]))
print("Longitud promedio de menciones:", en_data_dev["text_length"].mean())
print("Longitud máxima de menciones:", en_data_dev["text_length"].max())

# Save the dev set again, now including the text_length column
dev_with_length_path = os.path.join(output_dir, 'general', 'en_data_dev_with_length.csv')
en_data_dev.to_csv(dev_with_length_path, index=False)
print(f"Datos de desarrollo con longitudes guardados en: {dev_with_length_path}")
#files.download(dev_with_length_path)

In [None]:
# ------------------------------
# Funciones Auxiliares
# ------------------------------

def jaccard_similarity(str1, str2):
    """Return the Jaccard similarity of the whitespace-token sets of two values.

    Null values (None / NaN) are treated as empty strings; other values are
    converted with str(). Tokens are compared case-insensitively. If either
    side is empty or whitespace-only the result is 0.0.
    """
    left = "" if pd.isnull(str1) else str(str1)
    right = "" if pd.isnull(str2) else str(str2)
    if not left.strip() or not right.strip():
        return 0.0
    left_tokens = set(left.lower().split())
    right_tokens = set(right.lower().split())
    combined = left_tokens.union(right_tokens)
    if not combined:
        return 0.0
    shared = left_tokens.intersection(right_tokens)
    return len(shared) / len(combined)

def encode_names(names, bert_encoder, tokenizer, max_length, device, batch_size=256, show_progress=False):
    """Encode strings into one embedding per string.

    All names are tokenized at once (padded/truncated to `max_length`), then
    fed through `bert_encoder` in batches of `batch_size` on `device` without
    gradients. The embedding of each name is the encoder's last hidden state
    at the first token position. The result is a CPU tensor with one row per
    input name.
    """
    bert_encoder.eval()
    if isinstance(names, np.ndarray):
        names = names.tolist()
    encoded = tokenizer(names, padding="max_length", max_length=max_length, truncation=True, return_tensors="pt")
    all_ids, all_masks = encoded["input_ids"], encoded["attention_mask"]
    total = len(names)
    batch_starts = range(0, total, batch_size)
    if show_progress:
        batch_starts = tqdm(batch_starts, desc="Encoding names", unit="batch")
    chunks = []
    with torch.no_grad():
        for start in batch_starts:
            stop = start + batch_size
            ids = all_ids[start:stop].to(device)
            mask = all_masks[start:stop].to(device)
            model_out = bert_encoder(input_ids=ids, attention_mask=mask)
            # Keep only the first-token vector and move it off the device right away
            chunks.append(model_out.last_hidden_state[:, 0].detach().cpu())
            del ids, mask, model_out
            torch.cuda.empty_cache()
    stacked = torch.cat(chunks, dim=0).detach()
    assert stacked.size(0) == total
    return stacked

def create_repeated_tensor(range_bound, repeat_times):
    """Return [0, 0, ..., 1, 1, ..., range_bound-1, ...] with each value repeated `repeat_times` times."""
    return torch.arange(range_bound).repeat_interleave(repeat_times)

def get_torch_query_dict_score_matrix(query_names, tokenizer, bert_encoder, vocab_names, base_k, device,
                                     query_batch_size, max_length, show_progress, vocab_batch_size=256):
    """Retrieve, for every query, the top-k vocabulary entries by cosine similarity.

    Queries are encoded once; the vocabulary is encoded and scored in batches
    of `vocab_batch_size`, and a running top-k is merged after each batch so
    the full score matrix is never materialized.

    Args:
        query_names: sequence of query strings.
        tokenizer, bert_encoder: passed through to `encode_names`.
        vocab_names: sequence of vocabulary strings (sliceable).
        base_k: number of candidates to keep per query.
        device: torch device used for encoding and scoring.
        query_batch_size: encoder batch size for the queries.
        max_length: tokenizer max length.
        show_progress: show a progress bar while encoding the queries.
        vocab_batch_size: number of vocabulary entries scored per step.

    Returns:
        dict with "best_scores" and "best_indices", both of shape
        (num_queries, min(base_k, len(vocab_names))), sorted by decreasing
        score; indices refer to positions in `vocab_names`. Both values are
        None when `vocab_names` is empty.
    """
    bert_encoder.eval()
    num_queries = len(query_names)
    vocab_length = len(vocab_names)
    query_embs = encode_names(names=query_names, bert_encoder=bert_encoder, tokenizer=tokenizer,
                              max_length=max_length, device=device, batch_size=query_batch_size,
                              show_progress=show_progress).unsqueeze(1).to(device).detach()
    assert num_queries == len(query_embs)
    overall_max = None
    overall_max_indices = None
    with torch.no_grad():
        for vocab_start_pos in tqdm(range(0, vocab_length, vocab_batch_size)):
            vocab_end_pos = min(vocab_start_pos + vocab_batch_size, vocab_length)
            batch_vocab_names = vocab_names[vocab_start_pos:vocab_end_pos]
            batch_vocab_embeddings = encode_names(names=batch_vocab_names, bert_encoder=bert_encoder,
                                                 tokenizer=tokenizer, max_length=max_length,
                                                 device=device, batch_size=vocab_batch_size, show_progress=False).to(device)
            # (Q, 1, D) vs (1, V_batch, D) broadcasts to a (Q, V_batch) score matrix
            batch_score_matrix = F.cosine_similarity(query_embs, batch_vocab_embeddings.unsqueeze(0), dim=-1)
            assert batch_score_matrix.shape == (num_queries, vocab_end_pos - vocab_start_pos)
            k = min(base_k, vocab_end_pos - vocab_start_pos)
            b_max, b_indices = torch.topk(batch_score_matrix, k=k, dim=1)
            b_indices += vocab_start_pos  # batch-local positions -> global vocabulary positions
            if overall_max is None:
                # First batch: it IS the running top-k. (Merging it with itself, as was done
                # before, put every candidate into the running top-k twice.)
                overall_max = b_max
                overall_max_indices = b_indices
            else:
                concat_max = torch.cat((overall_max, b_max), dim=1)
                concat_indices = torch.cat((overall_max_indices, b_indices), dim=1)
                # Fewer than base_k candidates may exist so far (small vocabulary)
                merged_k = min(base_k, concat_max.size(1))
                overall_max, local_indices = torch.topk(concat_max, k=merged_k, dim=1)
                # Pick, per row, the global indices belonging to the surviving scores
                overall_max_indices = torch.gather(concat_indices, 1, local_indices)
            del batch_vocab_embeddings, batch_score_matrix, b_max, b_indices
            torch.cuda.empty_cache()
    return {"best_scores": overall_max, "best_indices": overall_max_indices}

def create_row_primary_key(row):
    """Build the mention key "<document_id>|<spans>" for one row.

    Asserts that neither component contains the '|' separator, so the key
    stays unambiguous.
    """
    doc_id, spans = row[DOCUMENT_ID_COL], row[SPANS_COL]
    for part in (doc_id, spans):
        assert '|' not in str(part)
    return f"{doc_id}|{spans}"

def create_sample_pk2_true_cui_map(df: pd.DataFrame, true_cui_column) -> Dict[str, str]:
    """Map each sample key (SAMPLE_PK_COL) to its value in `true_cui_column`.

    Later rows overwrite earlier ones when a key repeats. Asserts that no
    CUI contains the '|' character.
    """
    sample_pk2cui = {}
    if len(df) > 0:
        for sample_pk, true_cui in zip(df[SAMPLE_PK_COL], df[true_cui_column]):
            assert '|' not in true_cui
            sample_pk2cui[sample_pk] = true_cui
    return sample_pk2cui

def calculate_metrics(pred_df: pd.DataFrame, sample_pk2true_cui: Dict[str, str]):
    """Compute Acc@1, Acc@5 and MRR of ranked predictions against the gold CUIs.

    For each sample in `sample_pk2true_cui` the best (smallest) rank at which
    the gold CUI was predicted is looked up in `pred_df`; samples whose gold
    CUI never appears contribute 0 to every metric. Prediction rows whose key
    is not in `sample_pk2true_cui` are ignored. Ranks are expected to start
    at 1. All metrics are 0.0 when there are no samples.
    """
    best_hit_rank = {}
    if len(pred_df) > 0:
        rows = zip(pred_df[SAMPLE_PK_COL], pred_df[RANK_COL], pred_df[PREDICTION_COL])
        for sample_pk, rank, pred_cui in rows:
            if sample_pk not in sample_pk2true_cui:
                continue
            if pred_cui == sample_pk2true_cui[sample_pk]:
                previous = best_hit_rank.get(sample_pk)
                best_hit_rank[sample_pk] = rank if previous is None else min(previous, rank)

    num_samples = len(sample_pk2true_cui)
    acc_1_sum = acc_5_sum = mrr_sum = 0.
    for sample_id in sample_pk2true_cui:
        rank = best_hit_rank.get(sample_id, -1)
        assert rank != 0
        if rank == -1:
            # Gold CUI was never predicted for this sample
            continue
        acc_1_sum += 1. if rank == 1 else 0.
        acc_5_sum += 1. if rank <= 5 else 0.
        mrr_sum += 1. / rank

    if num_samples > 0:
        return {
            "Acc@1": acc_1_sum / num_samples,
            "Acc@5": acc_5_sum / num_samples,
            "MRR": mrr_sum / num_samples
        }
    return {"Acc@1": 0.0, "Acc@5": 0.0, "MRR": 0.0}

def make_predictions(entities_df, tokenizer, bert_encoder, vocab, max_length, k, device, query_batch_size, vocab_batch_size, model_name):
    """Retrieve the top-k candidate CUIs for every mention, per entity type.

    Mentions of type DISO, CHEM and ANATOMY are each matched only against
    vocabulary entries of the same `semantic_type`, using
    `get_torch_query_dict_score_matrix`.

    Args:
        entities_df: mentions with columns "entity_type", "document_id", "text", "spans".
        tokenizer, bert_encoder: encoder pair used to embed names.
        vocab: vocabulary with columns "semantic_type", "concept_name", "CUI".
        max_length: tokenizer max length.
        k: number of candidates per mention.
        device: torch device.
        query_batch_size, vocab_batch_size: batch sizes for encoding/scoring.
        model_name: value written to the "model" column.

    Returns:
        DataFrame with columns document_id, spans, rank (1-based), prediction,
        cosine_score, model. When a type has no vocabulary, k "CUILESS" rows
        with a NaN cosine_score are emitted per mention.
    """
    columns = ["document_id", "spans", "rank", "prediction", "cosine_score", "model"]
    predictions_list = []
    for chem_type in ("DISO", "CHEM", "ANATOMY"):
        subset_df = entities_df[entities_df["entity_type"] == chem_type]
        if subset_df.empty:
            # No mention of this type: nothing to encode (an empty batch would break the encoder step)
            continue
        document_ids = subset_df["document_id"].values
        query_names = subset_df["text"].values
        spans = subset_df["spans"].values
        subset_vocab = vocab[vocab["semantic_type"] == chem_type]
        vocab_names = subset_vocab["concept_name"].values
        vocab_cuis = subset_vocab["CUI"].values
        if len(vocab_names) == 0:
            for doc_id, sp in zip(document_ids, spans):
                for rank in range(1, k + 1):
                    predictions_list.append({
                        "document_id": doc_id,
                        "spans": sp,
                        "rank": rank,
                        "prediction": "CUILESS",
                        # Explicit NaN keeps the schema identical to the scored rows
                        "cosine_score": float("nan"),
                        "model": model_name
                    })
            continue
        torch.cuda.empty_cache()
        pred_d = get_torch_query_dict_score_matrix(
            query_names=query_names,
            tokenizer=tokenizer,
            bert_encoder=bert_encoder,
            vocab_names=vocab_names,
            base_k=k,
            device=device,
            query_batch_size=query_batch_size,
            max_length=max_length,
            show_progress=True,
            vocab_batch_size=vocab_batch_size
        )
        # One device-to-host transfer per tensor instead of one .item() call per candidate
        pred_indices = pred_d["best_indices"].cpu().tolist()
        pred_scores = pred_d["best_scores"].cpu().tolist()
        assert len(pred_indices) == len(query_names) == len(spans) == len(document_ids)
        for doc_id, sp, idx_row, score_row in zip(document_ids, spans, pred_indices, pred_scores):
            for rank, (vocab_idx, score) in enumerate(zip(idx_row, score_row), start=1):
                predictions_list.append({
                    "document_id": doc_id,
                    "spans": sp,
                    "rank": rank,
                    "prediction": vocab_cuis[vocab_idx],
                    "cosine_score": score,
                    "model": model_name
                })
    return pd.DataFrame(predictions_list, columns=columns)

def rerank_predictions(predictions_df, entities_df, vocab, k_final=5, cosine_weight=0.7, jaccard_weight=0.3, model_name="reranked"):
    """Re-rank retrieved candidates with a weighted sum of cosine and Jaccard scores.

    Every mention (document_id, spans) found in `predictions_df` is re-ranked
    exactly once, using the CUI -> concept-name dictionary of the mention's
    own entity type. (Previously candidates were selected per document, so a
    mention was re-ranked once for every entity type present in its document,
    producing duplicate rows scored against the wrong vocabulary.)

    Args:
        predictions_df: candidates with columns document_id, spans, prediction, cosine_score.
        entities_df: mentions with columns document_id, spans, entity_type, text.
        vocab: vocabulary with columns semantic_type, CUI, concept_name.
        k_final: number of candidates kept per mention.
        cosine_weight, jaccard_weight: weights of the two scores.
        model_name: value of the "model" column; also selects the debug-log folder.

    Returns:
        DataFrame with columns document_id, spans, rank (1-based), prediction, model.

    Side effects:
        Writes rerank_debug_log.json under output_dir/<model_name lowercased,
        without the "_reranked" suffix>/ and prints its path.
    """
    # CUI -> concept name, one dictionary per semantic type
    vocab_dict = {}
    for chem_type in ("DISO", "CHEM", "ANATOMY"):
        subset = vocab[vocab["semantic_type"] == chem_type]
        vocab_dict[chem_type] = dict(zip(subset["CUI"], subset["concept_name"]))

    # (document_id, spans) -> (entity_type, text). Built once instead of scanning
    # entities_df for every mention; the first matching row wins.
    mention_info = {}
    for doc_id, sp, entity_type, text in zip(
        entities_df["document_id"], entities_df["spans"], entities_df["entity_type"], entities_df["text"]
    ):
        mention_info.setdefault((doc_id, sp), (entity_type, text))

    reranked_list = []
    debug_log = []
    for (doc_id, sp), subgroup in predictions_df.groupby(["document_id", "spans"]):
        info = mention_info.get((doc_id, sp))
        if info is None:
            print(f"No se encontró query_text para doc_id: {doc_id}, spans: {sp}")
            continue
        entity_type, query_text = info
        if entity_type not in vocab_dict:
            # Only the three supported entity types are re-ranked
            continue
        chem_vocab_dict = vocab_dict[entity_type]

        if pd.isna(query_text) or not isinstance(query_text, str):
            print(f"query_text inválido para doc_id: {doc_id}, spans: {sp}, query_text: {query_text}")
            query_text = ""

        candidates = subgroup[["prediction", "cosine_score"]].copy()
        candidates["jaccard_score"] = [
            jaccard_similarity(query_text, chem_vocab_dict.get(cui, ""))
            for cui in candidates["prediction"]
        ]
        candidates["combined_score"] = (
            cosine_weight * candidates["cosine_score"] +
            jaccard_weight * candidates["jaccard_score"]
        )

        # Stable sort: candidates with equal combined score keep their incoming order
        candidates = candidates.sort_values("combined_score", ascending=False, kind="mergesort").head(k_final)

        for rank, row in enumerate(candidates.itertuples(), 1):
            reranked_list.append({
                "document_id": doc_id,
                "spans": sp,
                "rank": rank,
                "prediction": row.prediction,
                "model": model_name
            })

        debug_log.append({
            "doc_id": doc_id,
            "spans": sp,
            "query_text": query_text,
            "candidates": candidates[["prediction", "cosine_score", "jaccard_score", "combined_score"]].to_dict()
        })

    reranked_df = pd.DataFrame(reranked_list, columns=["document_id", "spans", "rank", "prediction", "model"])

    # Save the debug log; create the folder first because it may not exist for arbitrary model names
    debug_log_path = os.path.join(output_dir, model_name.lower().replace('_reranked', ''), 'rerank_debug_log.json')
    os.makedirs(os.path.dirname(debug_log_path), exist_ok=True)
    with open(debug_log_path, 'w') as f:
        # default=str keeps the dump from failing on non-JSON scalar types (e.g. NumPy ids)
        json.dump(debug_log, f, default=str)
    print(f"Log de depuración guardado en: {debug_log_path}")
    #files.download(debug_log_path)

    return reranked_df

# Grid search over the re-ranking weights
def grid_search_reranking(predictions_df, entities_df, vocab, k_final, sample_pk2true_cui, cosine_weights, jaccard_weights,
                          model_name="SapBERT_reranked"):
    """Grid-search the cosine/Jaccard re-ranking weights and return the best pair by Acc@1.

    Only (cosine, jaccard) pairs that sum to 1 (within floating-point
    tolerance) are evaluated. Each pair is scored with `calculate_metrics`
    overall and per entity type; per-type scores use the same evaluated
    samples as the overall score, i.e. the keys of `sample_pk2true_cui`.

    Args:
        predictions_df, entities_df, vocab, k_final: forwarded to `rerank_predictions`.
            `entities_df` must contain the SAMPLE_PK_COL and "entity_type" columns.
        sample_pk2true_cui: gold CUI per sample key.
        cosine_weights, jaccard_weights: candidate values for each weight.
        model_name: label passed to `rerank_predictions`; its lowercase form is
            the sub-folder of output_dir where reranking_grid_search.csv is written.

    Returns:
        (best_cosine_weight, best_jaccard_weight)

    Raises:
        ValueError: if no weight pair sums to 1.
    """
    import math  # local import: only needed for the tolerance-based weight check

    entity_types = ("DISO", "CHEM", "ANATOMY")
    # Gold labels per entity type, restricted to the samples that are actually evaluated
    truth_by_type = {}
    for chem_type in entity_types:
        type_pks = set(entities_df.loc[entities_df["entity_type"] == chem_type, SAMPLE_PK_COL])
        truth_by_type[chem_type] = {pk: cui for pk, cui in sample_pk2true_cui.items() if pk in type_pks}

    results = []
    for cw, jw in product(cosine_weights, jaccard_weights):
        # Exact float equality (cw + jw != 1.0) silently drops pairs that miss 1.0 by rounding
        if not math.isclose(cw + jw, 1.0, abs_tol=1e-9):
            continue
        reranked_df = rerank_predictions(
            predictions_df=predictions_df,
            entities_df=entities_df,
            vocab=vocab,
            k_final=k_final,
            cosine_weight=cw,
            jaccard_weight=jw,
            model_name=model_name
        )
        reranked_df[SAMPLE_PK_COL] = reranked_df.apply(create_row_primary_key, axis=1)
        metrics = calculate_metrics(reranked_df, sample_pk2true_cui)
        result_row = {
            "cosine_weight": cw,
            "jaccard_weight": jw,
            "Acc@1": metrics["Acc@1"],
            "Acc@5": metrics["Acc@5"],
            "MRR": metrics["MRR"],
        }
        # Per-category Acc@1
        for chem_type in entity_types:
            subset_truth = truth_by_type[chem_type]
            subset_preds = reranked_df[reranked_df[SAMPLE_PK_COL].isin(list(subset_truth))]
            result_row[f"{chem_type}_Acc@1"] = calculate_metrics(subset_preds, subset_truth)["Acc@1"]
        results.append(result_row)

    if not results:
        raise ValueError("No (cosine_weight, jaccard_weight) pair sums to 1.0; nothing to evaluate")

    results_df = pd.DataFrame(results)
    results_dir = os.path.join(output_dir, model_name.lower())
    os.makedirs(results_dir, exist_ok=True)
    results_df.to_csv(os.path.join(results_dir, 'reranking_grid_search.csv'), index=False)
    #files.download(os.path.join(results_dir, 'reranking_grid_search.csv'))
    best_result = results_df.loc[results_df["Acc@1"].idxmax()]
    print(f"Mejor combinación: cosine_weight={best_result['cosine_weight']}, jaccard_weight={best_result['jaccard_weight']}")
    print(f"Métricas: Acc@1={best_result['Acc@1']:.4f}, Acc@5={best_result['Acc@5']:.4f}, MRR={best_result['MRR']:.4f}")
    return best_result["cosine_weight"], best_result["jaccard_weight"]

In [None]:
# ------------------------------
# Block 2: SapBERT
# ------------------------------

print("\nCargando SapBERT...")
sapbert_encoder = AutoModel.from_pretrained("cambridgeltl/SapBERT-from-PubMedBERT-fulltext").to(device)
sapbert_tokenizer = AutoTokenizer.from_pretrained("cambridgeltl/SapBERT-from-PubMedBERT-fulltext")

# Retrieval parameters. These globals are reused, without being redefined, by the
# PubMedBERT, Stanford and re-ranking cells below.
query_batch_size = 128
vocab_batch_size = 200
max_length = 16
k_initial = 50
k_final = 5

# Retrieve the top-k_initial candidate CUIs for every dev mention
print("\nGenerando predicciones con SapBERT...")
sapbert_predictions_df = make_predictions(
    entities_df=en_data_dev,
    tokenizer=sapbert_tokenizer,
    bert_encoder=sapbert_encoder,
    vocab=en_vocab,
    max_length=max_length,
    k=k_initial,
    device=device,
    query_batch_size=query_batch_size,
    vocab_batch_size=vocab_batch_size,
    model_name="SapBERT"
)

# Save the raw predictions (written before the "pk" column is added)
sapbert_predictions_path = os.path.join(output_dir, 'sapbert', 'en_predictions_sapbert.tsv')
sapbert_predictions_df.to_csv(sapbert_predictions_path, sep='\t', index=False)
print(f"Predicciones SapBERT guardadas en: {sapbert_predictions_path}")
#files.download(sapbert_predictions_path)

# Add the "<document_id>|<spans>" key to predictions and to the dev set.
# The later model cells rely on en_data_dev["pk"] created here.
sapbert_predictions_df["pk"] = sapbert_predictions_df.apply(create_row_primary_key, axis=1)
en_data_dev["pk"] = en_data_dev.apply(create_row_primary_key, axis=1)

# Join gold annotations with predictions (one row per mention and rank) and drop CUILESS mentions
sapbert_eval_data = en_data_dev.merge(sapbert_predictions_df[["pk", "rank", "prediction", "model"]], on="pk")
sapbert_eval_data = sapbert_eval_data[sapbert_eval_data["UMLS_CUI"] != "CUILESS"]

sapbert_eval_data_path = os.path.join(output_dir, 'sapbert', 'eval_data_sapbert.csv')
sapbert_eval_data.to_csv(sapbert_eval_data_path, index=False)
print(f"Datos de evaluación SapBERT guardados en: {sapbert_eval_data_path}")
#files.download(sapbert_eval_data_path)

# Gold CUI per evaluated mention key
sapbert_pk2true_cui = create_sample_pk2_true_cui_map(sapbert_eval_data, "UMLS_CUI")

sapbert_pk2true_cui_path = os.path.join(output_dir, 'sapbert', 'pk2true_cui_sapbert.json')
with open(sapbert_pk2true_cui_path, 'w') as f:
    json.dump(sapbert_pk2true_cui, f)
print(f"Diccionario pk2true_cui SapBERT guardado en: {sapbert_pk2true_cui_path}")
#files.download(sapbert_pk2true_cui_path)

# Overall Acc@1 / Acc@5 / MRR
sapbert_metrics = calculate_metrics(sapbert_predictions_df, sapbert_pk2true_cui)
print("Resultados de SapBERT:", sapbert_metrics)

sapbert_metrics_path = os.path.join(output_dir, 'sapbert', 'metrics_sapbert.txt')
with open(sapbert_metrics_path, 'w') as f:
    f.write(str(sapbert_metrics))
print(f"Métricas SapBERT guardadas en: {sapbert_metrics_path}")
#files.download(sapbert_metrics_path)

# Top-1 error analysis. sapbert_eval_data holds one row per (mention, rank), so without the
# rank filter every non-gold candidate at any rank would be listed as an "error".
sapbert_incorrect = sapbert_eval_data[
    (sapbert_eval_data["rank"] == 1)
    & (sapbert_eval_data["prediction"] != sapbert_eval_data["UMLS_CUI"])
]
sapbert_errors_path = os.path.join(output_dir, 'sapbert', 'errors_sapbert.csv')
sapbert_incorrect[["text", "UMLS_CUI", "prediction", "model"]].to_csv(sapbert_errors_path, index=False)
print(f"Errores SapBERT guardados en: {sapbert_errors_path}")
#files.download(sapbert_errors_path)

# Metrics per entity type
sapbert_subset_metrics = {}
for chem_type in ("DISO", "CHEM", "ANATOMY"):
    subset_df = sapbert_eval_data[sapbert_eval_data["entity_type"] == chem_type]
    subset_df_path = os.path.join(output_dir, 'sapbert', f'subset_{chem_type.lower()}_sapbert.csv')
    subset_df.to_csv(subset_df_path, index=False)
    print(f"Subset {chem_type} SapBERT guardado en: {subset_df_path}")
    #files.download(subset_df_path)
    # Select predictions by mention key: filtering by document_id would also keep the
    # predictions of other entity types that occur in the same documents.
    subset_preds = sapbert_predictions_df[sapbert_predictions_df["pk"].isin(subset_df["pk"])]
    subset_preds_path = os.path.join(output_dir, 'sapbert', f'subset_preds_{chem_type.lower()}_sapbert.csv')
    subset_preds.to_csv(subset_preds_path, index=False)
    print(f"Subset preds {chem_type} SapBERT guardado en: {subset_preds_path}")
    #files.download(subset_preds_path)
    subset_pk2true_cui = create_sample_pk2_true_cui_map(subset_df, "UMLS_CUI")
    subset_pk2true_cui_path = os.path.join(output_dir, 'sapbert', f'pk2true_cui_{chem_type.lower()}_sapbert.json')
    with open(subset_pk2true_cui_path, 'w') as f:
        json.dump(subset_pk2true_cui, f)
    print(f"Diccionario pk2true_cui {chem_type} SapBERT guardado en: {subset_pk2true_cui_path}")
    #files.download(subset_pk2true_cui_path)
    subset_metrics = calculate_metrics(subset_preds, subset_pk2true_cui)
    sapbert_subset_metrics[chem_type] = subset_metrics
    print(f"Métricas para {chem_type} (SapBERT):", subset_metrics)

sapbert_subset_metrics_path = os.path.join(output_dir, 'sapbert', 'subset_metrics_sapbert.txt')
with open(sapbert_subset_metrics_path, 'w') as f:
    for chem_type, metrics in sapbert_subset_metrics.items():
        f.write(f"{chem_type}: {metrics}\n")
print(f"Métricas por tipo de entidad SapBERT guardadas en: {sapbert_subset_metrics_path}")
#files.download(sapbert_subset_metrics_path)

In [None]:
# ------------------------------
# Block 3: PubMedBERT
# ------------------------------
# NOTE: this cell reuses max_length, k_initial, query_batch_size and vocab_batch_size from the
# SapBERT cell, and the en_data_dev["pk"] column created there.

print("\nCargando PubMedBERT...")
pubmedbert_encoder = AutoModel.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract").to(device)
pubmedbert_tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")

# Retrieve the top-k_initial candidate CUIs for every dev mention
print("\nGenerando predicciones con PubMedBERT...")
pubmedbert_predictions_df = make_predictions(
    entities_df=en_data_dev,
    tokenizer=pubmedbert_tokenizer,
    bert_encoder=pubmedbert_encoder,
    vocab=en_vocab,
    max_length=max_length,
    k=k_initial,
    device=device,
    query_batch_size=query_batch_size,
    vocab_batch_size=vocab_batch_size,
    model_name="PubMedBERT"
)

# Save the raw predictions (written before the "pk" column is added)
pubmedbert_predictions_path = os.path.join(output_dir, 'pubmedbert', 'en_predictions_pubmedbert.tsv')
pubmedbert_predictions_df.to_csv(pubmedbert_predictions_path, sep='\t', index=False)
print(f"Predicciones PubMedBERT guardadas en: {pubmedbert_predictions_path}")
#files.download(pubmedbert_predictions_path)

# Key the predictions, join them with the gold annotations and drop CUILESS mentions
pubmedbert_predictions_df["pk"] = pubmedbert_predictions_df.apply(create_row_primary_key, axis=1)
pubmedbert_eval_data = en_data_dev.merge(pubmedbert_predictions_df[["pk", "rank", "prediction", "model"]], on="pk")
pubmedbert_eval_data = pubmedbert_eval_data[pubmedbert_eval_data["UMLS_CUI"] != "CUILESS"]

pubmedbert_eval_data_path = os.path.join(output_dir, 'pubmedbert', 'eval_data_pubmedbert.csv')
pubmedbert_eval_data.to_csv(pubmedbert_eval_data_path, index=False)
print(f"Datos de evaluación PubMedBERT guardados en: {pubmedbert_eval_data_path}")
#files.download(pubmedbert_eval_data_path)

# Gold CUI per evaluated mention key
pubmedbert_pk2true_cui = create_sample_pk2_true_cui_map(pubmedbert_eval_data, "UMLS_CUI")

pubmedbert_pk2true_cui_path = os.path.join(output_dir, 'pubmedbert', 'pk2true_cui_pubmedbert.json')
with open(pubmedbert_pk2true_cui_path, 'w') as f:
    json.dump(pubmedbert_pk2true_cui, f)
print(f"Diccionario pk2true_cui PubMedBERT guardado en: {pubmedbert_pk2true_cui_path}")
#files.download(pubmedbert_pk2true_cui_path)

# Overall Acc@1 / Acc@5 / MRR
pubmedbert_metrics = calculate_metrics(pubmedbert_predictions_df, pubmedbert_pk2true_cui)
print("Resultados de PubMedBERT:", pubmedbert_metrics)

pubmedbert_metrics_path = os.path.join(output_dir, 'pubmedbert', 'metrics_pubmedbert.txt')
with open(pubmedbert_metrics_path, 'w') as f:
    f.write(str(pubmedbert_metrics))
print(f"Métricas PubMedBERT guardadas en: {pubmedbert_metrics_path}")
#files.download(pubmedbert_metrics_path)

# Top-1 error analysis. pubmedbert_eval_data holds one row per (mention, rank), so without the
# rank filter every non-gold candidate at any rank would be listed as an "error".
pubmedbert_incorrect = pubmedbert_eval_data[
    (pubmedbert_eval_data["rank"] == 1)
    & (pubmedbert_eval_data["prediction"] != pubmedbert_eval_data["UMLS_CUI"])
]
pubmedbert_errors_path = os.path.join(output_dir, 'pubmedbert', 'errors_pubmedbert.csv')
pubmedbert_incorrect[["text", "UMLS_CUI", "prediction", "model"]].to_csv(pubmedbert_errors_path, index=False)
print(f"Errores PubMedBERT guardados en: {pubmedbert_errors_path}")
#files.download(pubmedbert_errors_path)

# Metrics per entity type
pubmedbert_subset_metrics = {}
for chem_type in ("DISO", "CHEM", "ANATOMY"):
    subset_df = pubmedbert_eval_data[pubmedbert_eval_data["entity_type"] == chem_type]
    subset_df_path = os.path.join(output_dir, 'pubmedbert', f'subset_{chem_type.lower()}_pubmedbert.csv')
    subset_df.to_csv(subset_df_path, index=False)
    print(f"Subset {chem_type} PubMedBERT guardado en: {subset_df_path}")
    #files.download(subset_df_path)
    # Select predictions by mention key: filtering by document_id would also keep the
    # predictions of other entity types that occur in the same documents.
    subset_preds = pubmedbert_predictions_df[pubmedbert_predictions_df["pk"].isin(subset_df["pk"])]
    subset_preds_path = os.path.join(output_dir, 'pubmedbert', f'subset_preds_{chem_type.lower()}_pubmedbert.csv')
    subset_preds.to_csv(subset_preds_path, index=False)
    print(f"Subset preds {chem_type} PubMedBERT guardado en: {subset_preds_path}")
    #files.download(subset_preds_path)
    subset_pk2true_cui = create_sample_pk2_true_cui_map(subset_df, "UMLS_CUI")
    subset_pk2true_cui_path = os.path.join(output_dir, 'pubmedbert', f'pk2true_cui_{chem_type.lower()}_pubmedbert.json')
    with open(subset_pk2true_cui_path, 'w') as f:
        json.dump(subset_pk2true_cui, f)
    print(f"Diccionario pk2true_cui {chem_type} PubMedBERT guardado en: {subset_pk2true_cui_path}")
    #files.download(subset_pk2true_cui_path)
    subset_metrics = calculate_metrics(subset_preds, subset_pk2true_cui)
    pubmedbert_subset_metrics[chem_type] = subset_metrics
    print(f"Métricas para {chem_type} (PubMedBERT):", subset_metrics)

pubmedbert_subset_metrics_path = os.path.join(output_dir, 'pubmedbert', 'subset_metrics_pubmedbert.txt')
with open(pubmedbert_subset_metrics_path, 'w') as f:
    for chem_type, metrics in pubmedbert_subset_metrics.items():
        f.write(f"{chem_type}: {metrics}\n")
print(f"Métricas por tipo de entidad PubMedBERT guardadas en: {pubmedbert_subset_metrics_path}")
#files.download(pubmedbert_subset_metrics_path)

In [None]:
# ------------------------------
# Block 4: Stanford
# ------------------------------
# NOTE: the checkpoint loaded below is "dmis-lab/biobert-base-cased-v1.2"; "Stanford" is the
# label this notebook uses for it in variable names, file names and the "model" column.
# NOTE(review): the checkpoint name says "cased" while clean_text lowercases all text — confirm
# that this is intended.
# This cell reuses max_length, k_initial, query_batch_size and vocab_batch_size from the
# SapBERT cell, and the en_data_dev["pk"] column created there.

print("\nCargando Stanford BioBERT...")
stanford_encoder = AutoModel.from_pretrained("dmis-lab/biobert-base-cased-v1.2").to(device)
stanford_tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.2")

# Retrieve the top-k_initial candidate CUIs for every dev mention
print("\nGenerando predicciones con Stanford BioBERT...")
stanford_predictions_df = make_predictions(
    entities_df=en_data_dev,
    tokenizer=stanford_tokenizer,
    bert_encoder=stanford_encoder,
    vocab=en_vocab,
    max_length=max_length,
    k=k_initial,
    device=device,
    query_batch_size=query_batch_size,
    vocab_batch_size=vocab_batch_size,
    model_name="Stanford"
)

# Save the raw predictions (written before the "pk" column is added)
stanford_predictions_path = os.path.join(output_dir, 'stanford', 'en_predictions_stanford.tsv')
stanford_predictions_df.to_csv(stanford_predictions_path, sep='\t', index=False)
print(f"Predicciones Stanford guardadas en: {stanford_predictions_path}")
#files.download(stanford_predictions_path)

# Key the predictions, join them with the gold annotations and drop CUILESS mentions
stanford_predictions_df["pk"] = stanford_predictions_df.apply(create_row_primary_key, axis=1)
stanford_eval_data = en_data_dev.merge(stanford_predictions_df[["pk", "rank", "prediction", "model"]], on="pk")
stanford_eval_data = stanford_eval_data[stanford_eval_data["UMLS_CUI"] != "CUILESS"]

stanford_eval_data_path = os.path.join(output_dir, 'stanford', 'eval_data_stanford.csv')
stanford_eval_data.to_csv(stanford_eval_data_path, index=False)
print(f"Datos de evaluación Stanford guardados en: {stanford_eval_data_path}")
#files.download(stanford_eval_data_path)

# Gold CUI per evaluated mention key
stanford_pk2true_cui = create_sample_pk2_true_cui_map(stanford_eval_data, "UMLS_CUI")

stanford_pk2true_cui_path = os.path.join(output_dir, 'stanford', 'pk2true_cui_stanford.json')
with open(stanford_pk2true_cui_path, 'w') as f:
    json.dump(stanford_pk2true_cui, f)
print(f"Diccionario pk2true_cui Stanford guardado en: {stanford_pk2true_cui_path}")
#files.download(stanford_pk2true_cui_path)

# Overall Acc@1 / Acc@5 / MRR
stanford_metrics = calculate_metrics(stanford_predictions_df, stanford_pk2true_cui)
print("Resultados de Stanford:", stanford_metrics)

stanford_metrics_path = os.path.join(output_dir, 'stanford', 'metrics_stanford.txt')
with open(stanford_metrics_path, 'w') as f:
    f.write(str(stanford_metrics))
print(f"Métricas Stanford guardadas en: {stanford_metrics_path}")
#files.download(stanford_metrics_path)

# Top-1 error analysis. stanford_eval_data holds one row per (mention, rank), so without the
# rank filter every non-gold candidate at any rank would be listed as an "error".
stanford_incorrect = stanford_eval_data[
    (stanford_eval_data["rank"] == 1)
    & (stanford_eval_data["prediction"] != stanford_eval_data["UMLS_CUI"])
]
stanford_errors_path = os.path.join(output_dir, 'stanford', 'errors_stanford.csv')
stanford_incorrect[["text", "UMLS_CUI", "prediction", "model"]].to_csv(stanford_errors_path, index=False)
print(f"Errores Stanford guardados en: {stanford_errors_path}")
#files.download(stanford_errors_path)

# Metrics per entity type
stanford_subset_metrics = {}
for chem_type in ("DISO", "CHEM", "ANATOMY"):
    subset_df = stanford_eval_data[stanford_eval_data["entity_type"] == chem_type]
    subset_df_path = os.path.join(output_dir, 'stanford', f'subset_{chem_type.lower()}_stanford.csv')
    subset_df.to_csv(subset_df_path, index=False)
    print(f"Subset {chem_type} Stanford guardado en: {subset_df_path}")
    #files.download(subset_df_path)
    # Select predictions by mention key: filtering by document_id would also keep the
    # predictions of other entity types that occur in the same documents.
    subset_preds = stanford_predictions_df[stanford_predictions_df["pk"].isin(subset_df["pk"])]
    subset_preds_path = os.path.join(output_dir, 'stanford', f'subset_preds_{chem_type.lower()}_stanford.csv')
    subset_preds.to_csv(subset_preds_path, index=False)
    print(f"Subset preds {chem_type} Stanford guardado en: {subset_preds_path}")
    #files.download(subset_preds_path)
    subset_pk2true_cui = create_sample_pk2_true_cui_map(subset_df, "UMLS_CUI")
    subset_pk2true_cui_path = os.path.join(output_dir, 'stanford', f'pk2true_cui_{chem_type.lower()}_stanford.json')
    with open(subset_pk2true_cui_path, 'w') as f:
        json.dump(subset_pk2true_cui, f)
    print(f"Diccionario pk2true_cui {chem_type} Stanford guardado en: {subset_pk2true_cui_path}")
    #files.download(subset_pk2true_cui_path)
    subset_metrics = calculate_metrics(subset_preds, subset_pk2true_cui)
    stanford_subset_metrics[chem_type] = subset_metrics
    print(f"Métricas para {chem_type} (Stanford):", subset_metrics)

stanford_subset_metrics_path = os.path.join(output_dir, 'stanford', 'subset_metrics_stanford.txt')
with open(stanford_subset_metrics_path, 'w') as f:
    for chem_type, metrics in stanford_subset_metrics.items():
        f.write(f"{chem_type}: {metrics}\n")
print(f"Métricas por tipo de entidad Stanford guardadas en: {stanford_subset_metrics_path}")
#files.download(stanford_subset_metrics_path)

In [None]:
# ------------------------------
# Block 5: SapBERT re-ranking
# ------------------------------
# Requires from earlier cells: sapbert_predictions_df, sapbert_pk2true_cui,
# en_data_dev, en_vocab, k_final, output_dir and the helper functions
# grid_search_reranking, rerank_predictions, create_row_primary_key,
# create_sample_pk2_true_cui_map and calculate_metrics.

print("\nRe-ranking SapBERT...")
# Candidate weight values handed to the grid search.
cosine_weights = [0.5, 0.6, 0.7, 0.8]
jaccard_weights = [0.2, 0.3, 0.4]
# NOTE(review): the weights are selected with en_data_dev / sapbert_pk2true_cui,
# and the metrics reported below use the same map, so those figures may be
# optimistic — confirm whether a held-out split should be used for tuning.
best_cosine_weight, best_jaccard_weight = grid_search_reranking(
    predictions_df=sapbert_predictions_df,
    entities_df=en_data_dev,
    vocab=en_vocab,
    k_final=k_final,
    sample_pk2true_cui=sapbert_pk2true_cui,
    cosine_weights=cosine_weights,
    jaccard_weights=jaccard_weights
)

# Produce the final predictions with the best weights found above.
sapbert_reranked_df = rerank_predictions(
    predictions_df=sapbert_predictions_df,
    entities_df=en_data_dev,
    vocab=en_vocab,
    k_final=k_final,
    cosine_weight=best_cosine_weight,
    jaccard_weight=best_jaccard_weight,
    model_name="SapBERT_reranked"
)

# Add the primary key used to join predictions with the gold annotations.
sapbert_reranked_df["pk"] = sapbert_reranked_df.apply(create_row_primary_key, axis=1)
sapbert_reranked_path = os.path.join(output_dir, 'sapbert_reranked', 'en_predictions_sapbert_reranked.tsv')
sapbert_reranked_df.to_csv(sapbert_reranked_path, sep='\t', index=False)
print(f"Predicciones SapBERT reranked guardadas en: {sapbert_reranked_path}")
#files.download(sapbert_reranked_path)

# Join gold rows with predictions on "pk" and drop rows labelled "CUILESS".
sapbert_reranked_eval_data = en_data_dev.merge(sapbert_reranked_df[["pk", "rank", "prediction", "model"]], on="pk")
sapbert_reranked_eval_data = sapbert_reranked_eval_data[sapbert_reranked_eval_data["UMLS_CUI"] != "CUILESS"]

sapbert_reranked_eval_data_path = os.path.join(output_dir, 'sapbert_reranked', 'eval_data_sapbert_reranked.csv')
sapbert_reranked_eval_data.to_csv(sapbert_reranked_eval_data_path, index=False)
print(f"Datos de evaluación SapBERT reranked guardados en: {sapbert_reranked_eval_data_path}")
#files.download(sapbert_reranked_eval_data_path)

sapbert_reranked_metrics = calculate_metrics(sapbert_reranked_df, sapbert_pk2true_cui)
print("Resultados de SapBERT reranked:", sapbert_reranked_metrics)

sapbert_reranked_metrics_path = os.path.join(output_dir, 'sapbert_reranked', 'metrics_sapbert_reranked.txt')
with open(sapbert_reranked_metrics_path, 'w', encoding='utf-8') as f:
    f.write(str(sapbert_reranked_metrics))
print(f"Métricas SapBERT reranked guardadas en: {sapbert_reranked_metrics_path}")
#files.download(sapbert_reranked_metrics_path)

# Error analysis: every merged row whose prediction differs from the gold CUI.
# NOTE(review): if the predictions hold several ranked rows per pk, lower-ranked
# candidates also land here even when the top-ranked one is correct — confirm.
sapbert_reranked_incorrect = sapbert_reranked_eval_data[sapbert_reranked_eval_data["prediction"] != sapbert_reranked_eval_data["UMLS_CUI"]]
sapbert_reranked_errors_path = os.path.join(output_dir, 'sapbert_reranked', 'errors_sapbert_reranked.csv')
sapbert_reranked_incorrect[["text", "UMLS_CUI", "prediction", "model"]].to_csv(sapbert_reranked_errors_path, index=False)
print(f"Errores SapBERT reranked guardados en: {sapbert_reranked_errors_path}")
#files.download(sapbert_reranked_errors_path)

# Metrics per entity type.
sapbert_reranked_subset_metrics = {}
for chem_type in ("DISO", "CHEM", "ANATOMY"):
    subset_df = sapbert_reranked_eval_data[sapbert_reranked_eval_data["entity_type"] == chem_type]
    subset_df_path = os.path.join(output_dir, 'sapbert_reranked', f'subset_{chem_type.lower()}_sapbert_reranked.csv')
    subset_df.to_csv(subset_df_path, index=False)
    print(f"Subset {chem_type} SapBERT reranked guardado en: {subset_df_path}")
    #files.download(subset_df_path)
    # Select predictions by "pk" rather than by document_id: a document-level
    # filter would also keep predictions for mentions of other entity types
    # that occur in the same documents.
    subset_preds = sapbert_reranked_df[sapbert_reranked_df["pk"].isin(subset_df["pk"])]
    subset_preds_path = os.path.join(output_dir, 'sapbert_reranked', f'subset_preds_{chem_type.lower()}_sapbert_reranked.csv')
    subset_preds.to_csv(subset_preds_path, index=False)
    print(f"Subset preds {chem_type} SapBERT reranked guardado en: {subset_preds_path}")
    #files.download(subset_preds_path)
    subset_pk2true_cui = create_sample_pk2_true_cui_map(subset_df, "UMLS_CUI")
    subset_pk2true_cui_path = os.path.join(output_dir, 'sapbert_reranked', f'pk2true_cui_{chem_type.lower()}_sapbert_reranked.json')
    with open(subset_pk2true_cui_path, 'w', encoding='utf-8') as f:
        json.dump(subset_pk2true_cui, f)
    print(f"Diccionario pk2true_cui {chem_type} SapBERT reranked guardado en: {subset_pk2true_cui_path}")
    #files.download(subset_pk2true_cui_path)
    subset_metrics = calculate_metrics(subset_preds, subset_pk2true_cui)
    sapbert_reranked_subset_metrics[chem_type] = subset_metrics
    print(f"Métricas para {chem_type} (SapBERT reranked):", subset_metrics)

# Write one "<entity type>: <metrics>" line per entity type.
sapbert_reranked_subset_metrics_path = os.path.join(output_dir, 'sapbert_reranked', 'subset_metrics_sapbert_reranked.txt')
with open(sapbert_reranked_subset_metrics_path, 'w', encoding='utf-8') as f:
    for chem_type, metrics in sapbert_reranked_subset_metrics.items():
        f.write(f"{chem_type}: {metrics}\n")
print(f"Métricas por tipo de entidad SapBERT reranked guardadas en: {sapbert_reranked_subset_metrics_path}")
#files.download(sapbert_reranked_subset_metrics_path)

In [None]:
# ------------------------------
# Block 6: PubMedBERT re-ranking
# ------------------------------
# Requires from earlier cells: pubmedbert_predictions_df, pubmedbert_pk2true_cui,
# en_data_dev, en_vocab, k_final, output_dir and the helper functions.
# best_cosine_weight / best_jaccard_weight are not tuned here; they must already
# be defined (the SapBERT re-ranking cell assigns them from its grid search).

print("\nRe-ranking PubMedBERT...")
pubmedbert_reranked_df = rerank_predictions(
    predictions_df=pubmedbert_predictions_df,
    entities_df=en_data_dev,
    vocab=en_vocab,
    k_final=k_final,
    cosine_weight=best_cosine_weight,
    jaccard_weight=best_jaccard_weight,
    model_name="PubMedBERT_reranked"
)

# Add the primary key used to join predictions with the gold annotations.
pubmedbert_reranked_df["pk"] = pubmedbert_reranked_df.apply(create_row_primary_key, axis=1)
pubmedbert_reranked_path = os.path.join(output_dir, 'pubmedbert_reranked', 'en_predictions_pubmedbert_reranked.tsv')
pubmedbert_reranked_df.to_csv(pubmedbert_reranked_path, sep='\t', index=False)
print(f"Predicciones PubMedBERT reranked guardadas en: {pubmedbert_reranked_path}")
#files.download(pubmedbert_reranked_path)

# Join gold rows with predictions on "pk" and drop rows labelled "CUILESS".
pubmedbert_reranked_eval_data = en_data_dev.merge(pubmedbert_reranked_df[["pk", "rank", "prediction", "model"]], on="pk")
pubmedbert_reranked_eval_data = pubmedbert_reranked_eval_data[pubmedbert_reranked_eval_data["UMLS_CUI"] != "CUILESS"]

pubmedbert_reranked_eval_data_path = os.path.join(output_dir, 'pubmedbert_reranked', 'eval_data_pubmedbert_reranked.csv')
pubmedbert_reranked_eval_data.to_csv(pubmedbert_reranked_eval_data_path, index=False)
print(f"Datos de evaluación PubMedBERT reranked guardados en: {pubmedbert_reranked_eval_data_path}")
#files.download(pubmedbert_reranked_eval_data_path)

pubmedbert_reranked_metrics = calculate_metrics(pubmedbert_reranked_df, pubmedbert_pk2true_cui)
print("Resultados de PubMedBERT reranked:", pubmedbert_reranked_metrics)

pubmedbert_reranked_metrics_path = os.path.join(output_dir, 'pubmedbert_reranked', 'metrics_pubmedbert_reranked.txt')
with open(pubmedbert_reranked_metrics_path, 'w', encoding='utf-8') as f:
    f.write(str(pubmedbert_reranked_metrics))
print(f"Métricas PubMedBERT reranked guardadas en: {pubmedbert_reranked_metrics_path}")
#files.download(pubmedbert_reranked_metrics_path)

# Error analysis: every merged row whose prediction differs from the gold CUI.
# NOTE(review): if the predictions hold several ranked rows per pk, lower-ranked
# candidates also land here even when the top-ranked one is correct — confirm.
pubmedbert_reranked_incorrect = pubmedbert_reranked_eval_data[pubmedbert_reranked_eval_data["prediction"] != pubmedbert_reranked_eval_data["UMLS_CUI"]]
pubmedbert_reranked_errors_path = os.path.join(output_dir, 'pubmedbert_reranked', 'errors_pubmedbert_reranked.csv')
pubmedbert_reranked_incorrect[["text", "UMLS_CUI", "prediction", "model"]].to_csv(pubmedbert_reranked_errors_path, index=False)
print(f"Errores PubMedBERT reranked guardados en: {pubmedbert_reranked_errors_path}")
#files.download(pubmedbert_reranked_errors_path)

# Metrics per entity type.
pubmedbert_reranked_subset_metrics = {}
for chem_type in ("DISO", "CHEM", "ANATOMY"):
    subset_df = pubmedbert_reranked_eval_data[pubmedbert_reranked_eval_data["entity_type"] == chem_type]
    subset_df_path = os.path.join(output_dir, 'pubmedbert_reranked', f'subset_{chem_type.lower()}_pubmedbert_reranked.csv')
    subset_df.to_csv(subset_df_path, index=False)
    print(f"Subset {chem_type} PubMedBERT reranked guardado en: {subset_df_path}")
    #files.download(subset_df_path)
    # Select predictions by "pk" rather than by document_id: a document-level
    # filter would also keep predictions for mentions of other entity types
    # that occur in the same documents.
    subset_preds = pubmedbert_reranked_df[pubmedbert_reranked_df["pk"].isin(subset_df["pk"])]
    subset_preds_path = os.path.join(output_dir, 'pubmedbert_reranked', f'subset_preds_{chem_type.lower()}_pubmedbert_reranked.csv')
    subset_preds.to_csv(subset_preds_path, index=False)
    print(f"Subset preds {chem_type} PubMedBERT reranked guardado en: {subset_preds_path}")
    #files.download(subset_preds_path)
    subset_pk2true_cui = create_sample_pk2_true_cui_map(subset_df, "UMLS_CUI")
    subset_pk2true_cui_path = os.path.join(output_dir, 'pubmedbert_reranked', f'pk2true_cui_{chem_type.lower()}_pubmedbert_reranked.json')
    with open(subset_pk2true_cui_path, 'w', encoding='utf-8') as f:
        json.dump(subset_pk2true_cui, f)
    print(f"Diccionario pk2true_cui {chem_type} PubMedBERT reranked guardado en: {subset_pk2true_cui_path}")
    #files.download(subset_pk2true_cui_path)
    subset_metrics = calculate_metrics(subset_preds, subset_pk2true_cui)
    pubmedbert_reranked_subset_metrics[chem_type] = subset_metrics
    print(f"Métricas para {chem_type} (PubMedBERT reranked):", subset_metrics)

# Write one "<entity type>: <metrics>" line per entity type.
pubmedbert_reranked_subset_metrics_path = os.path.join(output_dir, 'pubmedbert_reranked', 'subset_metrics_pubmedbert_reranked.txt')
with open(pubmedbert_reranked_subset_metrics_path, 'w', encoding='utf-8') as f:
    for chem_type, metrics in pubmedbert_reranked_subset_metrics.items():
        f.write(f"{chem_type}: {metrics}\n")
print(f"Métricas por tipo de entidad PubMedBERT reranked guardadas en: {pubmedbert_reranked_subset_metrics_path}")
#files.download(pubmedbert_reranked_subset_metrics_path)

In [None]:
# ------------------------------
# Block 7: Stanford re-ranking
# ------------------------------
# Requires from earlier cells: stanford_predictions_df, stanford_pk2true_cui,
# en_data_dev, en_vocab, k_final, output_dir and the helper functions.
# best_cosine_weight / best_jaccard_weight are not tuned here; they must already
# be defined (the SapBERT re-ranking cell assigns them from its grid search).

print("\nRe-ranking Stanford BioBERT...")
stanford_reranked_df = rerank_predictions(
    predictions_df=stanford_predictions_df,
    entities_df=en_data_dev,
    vocab=en_vocab,
    k_final=k_final,
    cosine_weight=best_cosine_weight,
    jaccard_weight=best_jaccard_weight,
    model_name="Stanford_reranked"
)

# Add the primary key used to join predictions with the gold annotations.
stanford_reranked_df["pk"] = stanford_reranked_df.apply(create_row_primary_key, axis=1)
stanford_reranked_path = os.path.join(output_dir, 'stanford_reranked', 'en_predictions_stanford_reranked.tsv')
stanford_reranked_df.to_csv(stanford_reranked_path, sep='\t', index=False)
print(f"Predicciones Stanford reranked guardadas en: {stanford_reranked_path}")
#files.download(stanford_reranked_path)

# Join gold rows with predictions on "pk" and drop rows labelled "CUILESS".
stanford_reranked_eval_data = en_data_dev.merge(stanford_reranked_df[["pk", "rank", "prediction", "model"]], on="pk")
stanford_reranked_eval_data = stanford_reranked_eval_data[stanford_reranked_eval_data["UMLS_CUI"] != "CUILESS"]

stanford_reranked_eval_data_path = os.path.join(output_dir, 'stanford_reranked', 'eval_data_stanford_reranked.csv')
stanford_reranked_eval_data.to_csv(stanford_reranked_eval_data_path, index=False)
print(f"Datos de evaluación Stanford reranked guardados en: {stanford_reranked_eval_data_path}")
#files.download(stanford_reranked_eval_data_path)

stanford_reranked_metrics = calculate_metrics(stanford_reranked_df, stanford_pk2true_cui)
print("Resultados de Stanford reranked:", stanford_reranked_metrics)

stanford_reranked_metrics_path = os.path.join(output_dir, 'stanford_reranked', 'metrics_stanford_reranked.txt')
with open(stanford_reranked_metrics_path, 'w', encoding='utf-8') as f:
    f.write(str(stanford_reranked_metrics))
print(f"Métricas Stanford reranked guardadas en: {stanford_reranked_metrics_path}")
#files.download(stanford_reranked_metrics_path)

# Error analysis: every merged row whose prediction differs from the gold CUI.
# NOTE(review): if the predictions hold several ranked rows per pk, lower-ranked
# candidates also land here even when the top-ranked one is correct — confirm.
stanford_reranked_incorrect = stanford_reranked_eval_data[stanford_reranked_eval_data["prediction"] != stanford_reranked_eval_data["UMLS_CUI"]]
stanford_reranked_errors_path = os.path.join(output_dir, 'stanford_reranked', 'errors_stanford_reranked.csv')
stanford_reranked_incorrect[["text", "UMLS_CUI", "prediction", "model"]].to_csv(stanford_reranked_errors_path, index=False)
print(f"Errores Stanford reranked guardados en: {stanford_reranked_errors_path}")
#files.download(stanford_reranked_errors_path)

# Metrics per entity type.
stanford_reranked_subset_metrics = {}
for chem_type in ("DISO", "CHEM", "ANATOMY"):
    subset_df = stanford_reranked_eval_data[stanford_reranked_eval_data["entity_type"] == chem_type]
    subset_df_path = os.path.join(output_dir, 'stanford_reranked', f'subset_{chem_type.lower()}_stanford_reranked.csv')
    subset_df.to_csv(subset_df_path, index=False)
    print(f"Subset {chem_type} Stanford reranked guardado en: {subset_df_path}")
    #files.download(subset_df_path)
    # Select predictions by "pk" rather than by document_id: a document-level
    # filter would also keep predictions for mentions of other entity types
    # that occur in the same documents.
    subset_preds = stanford_reranked_df[stanford_reranked_df["pk"].isin(subset_df["pk"])]
    subset_preds_path = os.path.join(output_dir, 'stanford_reranked', f'subset_preds_{chem_type.lower()}_stanford_reranked.csv')
    subset_preds.to_csv(subset_preds_path, index=False)
    print(f"Subset preds {chem_type} Stanford reranked guardado en: {subset_preds_path}")
    #files.download(subset_preds_path)
    subset_pk2true_cui = create_sample_pk2_true_cui_map(subset_df, "UMLS_CUI")
    subset_pk2true_cui_path = os.path.join(output_dir, 'stanford_reranked', f'pk2true_cui_{chem_type.lower()}_stanford_reranked.json')
    with open(subset_pk2true_cui_path, 'w', encoding='utf-8') as f:
        json.dump(subset_pk2true_cui, f)
    print(f"Diccionario pk2true_cui {chem_type} Stanford reranked guardado en: {subset_pk2true_cui_path}")
    #files.download(subset_pk2true_cui_path)
    subset_metrics = calculate_metrics(subset_preds, subset_pk2true_cui)
    stanford_reranked_subset_metrics[chem_type] = subset_metrics
    print(f"Métricas para {chem_type} (Stanford reranked):", subset_metrics)

# Write one "<entity type>: <metrics>" line per entity type.
stanford_reranked_subset_metrics_path = os.path.join(output_dir, 'stanford_reranked', 'subset_metrics_stanford_reranked.txt')
with open(stanford_reranked_subset_metrics_path, 'w', encoding='utf-8') as f:
    for chem_type, metrics in stanford_reranked_subset_metrics.items():
        f.write(f"{chem_type}: {metrics}\n")
print(f"Métricas por tipo de entidad Stanford reranked guardadas en: {stanford_reranked_subset_metrics_path}")
#files.download(stanford_reranked_subset_metrics_path)

In [None]:
# ------------------------------
# Block 8: Final comparison (Baseline model)
# ------------------------------
# Requires from earlier cells: en_data_dev, en_vocab, max_length, k_final,
# device, query_batch_size, vocab_batch_size, output_dir and the helper
# functions make_predictions, create_row_primary_key,
# create_sample_pk2_true_cui_map and calculate_metrics.

# Generate Baseline predictions
print("\nGenerando predicciones Baseline...")
baseline_predictions_df = make_predictions(
    entities_df=en_data_dev,
    tokenizer=AutoTokenizer.from_pretrained("andorei/gebert_eng_gat"),
    bert_encoder=AutoModel.from_pretrained("andorei/gebert_eng_gat").to(device),
    vocab=en_vocab,
    max_length=max_length,
    k=k_final,
    device=device,
    query_batch_size=query_batch_size,
    vocab_batch_size=vocab_batch_size,
    model_name="Baseline"
)

# Save Baseline predictions. The "pk" column is added only afterwards, so the
# saved TSV does not contain it.
baseline_predictions_path = os.path.join(output_dir, 'general', 'en_predictions_baseline.tsv')
baseline_predictions_df.to_csv(baseline_predictions_path, sep='\t', index=False)
print(f"Predicciones Baseline guardadas en: {baseline_predictions_path}")
#files.download(baseline_predictions_path)

# Evaluate Baseline: add the primary key, join with the gold rows on "pk" and
# drop rows labelled "CUILESS".
baseline_predictions_df["pk"] = baseline_predictions_df.apply(create_row_primary_key, axis=1)
baseline_eval_data = en_data_dev.merge(baseline_predictions_df[["pk", "rank", "prediction", "model"]], on="pk")
baseline_eval_data = baseline_eval_data[baseline_eval_data["UMLS_CUI"] != "CUILESS"]

# Save Baseline eval_data
baseline_eval_data_path = os.path.join(output_dir, 'general', 'eval_data_baseline.csv')
baseline_eval_data.to_csv(baseline_eval_data_path, index=False)
print(f"Datos de evaluación Baseline guardados en: {baseline_eval_data_path}")
#files.download(baseline_eval_data_path)

baseline_pk2true_cui = create_sample_pk2_true_cui_map(baseline_eval_data, "UMLS_CUI")

# Save Baseline pk2true_cui
baseline_pk2true_cui_path = os.path.join(output_dir, 'general', 'pk2true_cui_baseline.json')
with open(baseline_pk2true_cui_path, 'w', encoding='utf-8') as f:
    json.dump(baseline_pk2true_cui, f)
print(f"Diccionario pk2true_cui Baseline guardado en: {baseline_pk2true_cui_path}")
#files.download(baseline_pk2true_cui_path)

baseline_metrics = calculate_metrics(baseline_predictions_df, baseline_pk2true_cui)
print("Resultados de Baseline:", baseline_metrics)

# Save Baseline metrics
baseline_metrics_path = os.path.join(output_dir, 'general', 'metrics_baseline.txt')
with open(baseline_metrics_path, 'w', encoding='utf-8') as f:
    f.write(str(baseline_metrics))
print(f"Métricas Baseline guardadas en: {baseline_metrics_path}")
#files.download(baseline_metrics_path)

# Baseline error analysis: every merged row whose prediction differs from the
# gold CUI.
# NOTE(review): if the predictions hold several ranked rows per pk, lower-ranked
# candidates also land here even when the top-ranked one is correct — confirm.
baseline_incorrect = baseline_eval_data[baseline_eval_data["prediction"] != baseline_eval_data["UMLS_CUI"]]
baseline_errors_path = os.path.join(output_dir, 'general', 'errors_baseline.csv')
baseline_incorrect[["text", "UMLS_CUI", "prediction", "model"]].to_csv(baseline_errors_path, index=False)
print(f"Errores Baseline guardados en: {baseline_errors_path}")
#files.download(baseline_errors_path)

# Baseline metrics per entity type
baseline_subset_metrics = {}
for chem_type in ("DISO", "CHEM", "ANATOMY"):
    subset_df = baseline_eval_data[baseline_eval_data["entity_type"] == chem_type]

    # Save subset_df
    subset_df_path = os.path.join(output_dir, 'general', f'subset_{chem_type.lower()}_baseline.csv')
    subset_df.to_csv(subset_df_path, index=False)
    print(f"Subset {chem_type} Baseline guardado en: {subset_df_path}")
    #files.download(subset_df_path)

    # Select predictions by "pk" rather than by document_id: a document-level
    # filter would also keep predictions for mentions of other entity types
    # that occur in the same documents.
    subset_preds = baseline_predictions_df[baseline_predictions_df["pk"].isin(subset_df["pk"])]

    # Save subset_preds
    subset_preds_path = os.path.join(output_dir, 'general', f'subset_preds_{chem_type.lower()}_baseline.csv')
    subset_preds.to_csv(subset_preds_path, index=False)
    print(f"Subset preds {chem_type} Baseline guardado en: {subset_preds_path}")
    #files.download(subset_preds_path)

    subset_pk2true_cui = create_sample_pk2_true_cui_map(subset_df, "UMLS_CUI")

    # Save subset_pk2true_cui
    subset_pk2true_cui_path = os.path.join(output_dir, 'general', f'pk2true_cui_{chem_type.lower()}_baseline.json')
    with open(subset_pk2true_cui_path, 'w', encoding='utf-8') as f:
        json.dump(subset_pk2true_cui, f)
    print(f"Diccionario pk2true_cui {chem_type} Baseline guardado en: {subset_pk2true_cui_path}")
    #files.download(subset_pk2true_cui_path)

    subset_metrics = calculate_metrics(subset_preds, subset_pk2true_cui)
    baseline_subset_metrics[chem_type] = subset_metrics
    print(f"Métricas para {chem_type} (Baseline):", subset_metrics)

# Save Baseline metrics per entity type, one "<entity type>: <metrics>" line each
baseline_subset_metrics_path = os.path.join(output_dir, 'general', 'subset_metrics_baseline.txt')
with open(baseline_subset_metrics_path, 'w', encoding='utf-8') as f:
    for chem_type, metrics in baseline_subset_metrics.items():
        f.write(f"{chem_type}: {metrics}\n")
print(f"Métricas por tipo de entidad Baseline guardadas en: {baseline_subset_metrics_path}")
#files.download(baseline_subset_metrics_path)

In [None]:
# Compare all results.
# Collects the overall metrics of every model variant (computed in the earlier
# cells) under a display label, prints them and writes them to a text file.
results = {
    # Label corrected from the misspelled "Basline" so the printed and saved
    # comparison uses the same name as the rest of the notebook.
    "Baseline": baseline_metrics,
    "SapBERT": sapbert_metrics,
    "PubMedBERT": pubmedbert_metrics,
    "Stanford": stanford_metrics,
    "SapBERT_reranked": sapbert_reranked_metrics,
    "PubMedBERT_reranked": pubmedbert_reranked_metrics,
    "Stanford_reranked": stanford_reranked_metrics
}

print("\nComparación de resultados:")
for model_name, metrics in results.items():
    print(f"{model_name}: {metrics}")

# Save the comparison, one "<label>: <metrics>" line per model.
comparison_path = os.path.join(output_dir, 'general', 'results_comparison.txt')
with open(comparison_path, 'w', encoding='utf-8') as f:
    for model_name, metrics in results.items():
        f.write(f"{model_name}: {metrics}\n")
print(f"Comparación guardada en: {comparison_path}")
#files.download(comparison_path)