In [1]:
import evaluate
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import numpy as np
from datasets import load_dataset
from transformers import set_seed
from tqdm import tqdm

In [2]:

seed = 42
set_seed(seed)


# Corrección: cargar JSON local correctamente
dataset = load_dataset('json', data_files='./data/discourse_qa.json')
#remove columns that are not needed "question" and "answer"
dataset = dataset.remove_columns(["question", "answer"])
#split train and test
dataset = dataset["train"].train_test_split(0.1)

In [3]:
import evaluate

bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
perplexity = evaluate.load("perplexity")

# ejemplos planos
# predictions = ["respuesta generada por el modelo 1", "respuesta generada 2"]
# reales = ["respuesta real 1", "respuesta real 2"]

reales = [i['text'] for i in dataset['test']]

---

In [4]:
model_name = "PabloCano1/llama1b-entreno"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             attn_implementation='eager',
                                             device_map="cuda",dtype=torch.bfloat16)
# model.to("cuda")

In [5]:
predictions = []

reales_only_response = []
for real in tqdm(reales):
    real_no_response = real[:real.find('### Expected Response:') + len("### Expected Response:")].strip()
    inputs = tokenizer(real_no_response, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=None,
            do_sample=True,
            temperature=0.5,
            top_p=0.90,
            pad_token_id=tokenizer.eos_token_id
        )
    output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # print(f"{output=}")
    # print(f"{real=}")
    reales_only_response.append(real[real.find('### Expected Response:') + len('### Expected Response:'):].strip())
    predictions.append(output)
    # predictions.append(output[output.find('### Expected Response:')+len('### Expected Response:'):].strip())
    # print(f"{reales_only_response[-1]=}")
    # print(f"{predictions[-1]=}")


100%|██████████| 884/884 [10:20<00:00,  1.42it/s]


In [6]:

# Para BLEU la librería espera referencias como lista de listas (cada pred puede tener varias refs)
references_for_bleu = [[r] for r in reales_only_response]   # [["respuesta real 1"], ["respuesta real 2"]]

# Opción 1: pasar cadenas y dejar que evaluate haga el tokenizado internamente
bleu_result = bleu.compute(predictions=predictions, references=references_for_bleu)
print("BLEU:", bleu_result["bleu"])

# ROUGE normalmente recibe listas de strings
rouge_result = rouge.compute(predictions=predictions, references=reales_only_response)
print("ROUGE:", rouge_result)

# Calcular perplexity usando el módulo ya cargado
perplexities = []
model.eval()
with torch.no_grad():
    for text in tqdm(predictions, desc="Calculando perplexities",leave=False):
        enc = tokenizer(text, return_tensors="pt", truncation=True).to(model.device)
        # Para modelos causales: pasar labels iguales a input_ids calcula la loss por token
        outputs = model(**enc, labels=enc["input_ids"])
        loss = outputs.loss.item()  # loss media por token (cross-entropy)
        ppl = float(np.exp(loss))
        perplexities.append(ppl)

# print("Perplexities (primeros 10):", perplexities[:10])
print("Mean perplexity:", float(np.mean(perplexities)))
#

BLEU: 0.0551982777072815
ROUGE: {'rouge1': np.float64(0.14818577530685856), 'rouge2': np.float64(0.02640262530774247), 'rougeL': np.float64(0.09756200040479551), 'rougeLsum': np.float64(0.14008163406230378)}


                                                                          

Mean perplexity: 2.845599216118297




In [8]:
tokenizer.pad_token = tokenizer.eos_token

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

def embed_texts(texts):
    encoded_input = tokenizer(texts, return_tensors="pt", padding=True, truncation=True).to(model.device)
    with torch.no_grad():
        # Activar salida de todos los hidden states para obtener embeddings
        outputs = model(**encoded_input, output_hidden_states=True)
        hidden_states = outputs.hidden_states  # tuple: layers hidden states
        # Usar la última capa oculta
        last_hidden = hidden_states[-1]
        # Promediar embeddings de tokens por cada secuencia (dim=1)
        embeddings = last_hidden.mean(dim=1)
        # Convertir a float32 y mover a CPU para compatibilidad con sklearn/numpy
        embeddings = embeddings.to(torch.float32).cpu()
    return embeddings

def batch_semantic_similarity(reference_texts, generated_texts):

    similarities = []
    for i in tqdm(range(0,200,1)):
        ref_emb = embed_texts(reference_texts[i:i+4])
        gen_emb = embed_texts(generated_texts[i:i+4])
        # convertir a numpy antes de usar sklearn
        ref_np = [j.unsqueeze(0).numpy() for j in ref_emb]
        gen_np = [j.unsqueeze(0).numpy() for j in gen_emb]
        sim = [float(cosine_similarity(r, g)[0][0]) for r, g in zip(ref_np, gen_np)]
        similarities.extend(sim)
    return similarities
# Ejemplo
referencias = [
    "Mi amigo tiene esquizofrenia y alucinaciones.",
    "El paciente habla con ideas delirantes."
]
generados = [
    "Mi amigo con esquizofrenia habla de escuchar voces.",
    "El paciente expresa pensamientos desconectados."
]

sim_scores = batch_semantic_similarity(reales, predictions)
print('Similitudes semánticas por texto:', sim_scores)

  2%|▏         | 4/200 [00:15<12:27,  3.81s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 366.00 MiB. GPU 0 has a total capacity of 3.94 GiB of which 109.75 MiB is free. Including non-PyTorch memory, this process has 3.50 GiB memory in use. Of the allocated memory 3.08 GiB is allocated by PyTorch, and 367.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

---

In [11]:
model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [None]:
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             attn_implementation='eager',
                                             device_map="cuda",dtype=torch.bfloat16)

In [6]:
predictions = []
reales_only_response = []
for real in tqdm(reales):
    real_no_response = real[:real.find('### Expected Response:') + len("### Expected Response:")].strip()
    inputs = tokenizer(real_no_response, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=None,
            do_sample=True,
            # temperature=0.2,
            # top_p=0.90,
            pad_token_id=tokenizer.eos_token_id
        )
    output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # print(f"{output=}")
    # print(f"{real=}")
    predictions.append(output)
    reales_only_response.append(real[real.find('### Expected Response:') + len('### Expected Response:'):].strip())
    # predictions.append(output[output.find('### Expected Response:')+len('### Expected Response:'):].strip())
    # print(f"{reales_only_response[-1]=}")

100%|██████████| 884/884 [13:13<00:00,  1.11it/s]


In [8]:

# Para BLEU la librería espera referencias como lista de listas (cada pred puede tener varias refs)
references_for_bleu = [[r] for r in reales_only_response]   # [["respuesta real 1"], ["respuesta real 2"]]

# Opción 1: pasar cadenas y dejar que evaluate haga el tokenizado internamente
bleu_result = bleu.compute(predictions=predictions, references=references_for_bleu)
print("BLEU:", bleu_result["bleu"])

# ROUGE normalmente recibe listas de strings
rouge_result = rouge.compute(predictions=predictions, references=reales_only_response)
print("ROUGE:", rouge_result)

# Calcular perplexity usando el módulo ya cargado
perplexities = []
model.eval()
with torch.no_grad():
    for text in tqdm(predictions, desc="Calculando perplexities"):
        enc = tokenizer(text, return_tensors="pt", truncation=True).to(model.device)
        # Para modelos causales: pasar labels iguales a input_ids calcula la loss por token
        outputs = model(**enc, labels=enc["input_ids"])
        loss = outputs.loss.item()  # loss media por token (cross-entropy)
        ppl = float(np.exp(loss))
        perplexities.append(ppl)

print("Perplexities (primeros 10):", perplexities[:10])
print("Mean perplexity:", float(np.mean(perplexities)))

BLEU: 0.00697797889791655
ROUGE: {'rouge1': np.float64(0.09016682546380125), 'rouge2': np.float64(0.008694714171673713), 'rougeL': np.float64(0.06030742918384896), 'rougeLsum': np.float64(0.08293171662688542)}


Calculando perplexities: 100%|██████████| 884/884 [04:04<00:00,  3.62it/s]

Perplexities (primeros 10): [27.466263752307377, 24.612488690215503, 20.933212515663044, 19.543498237411534, 14.093393997144519, 19.28727354375807, 14.007683478939311, 17.30278154319744, 43.48730444818187, 15.256172404578898]
Mean perplexity: 22.084054707809567



