In [1]:
!pip install transformers torch datasets nltk seaborn tqdm bert-score rouge_score py-readability-metrics detoxify evaluate

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py-readability-metrics
  Downloading py_readability_metrics-1.4.5-py3-none-any.whl.metadata (8.8 kB)
Collecting detoxify
  Downloading detoxify-0.5.2-py3-none-any.whl.metadata (13 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-m

In [2]:
import torch
import json
import numpy as np
import evaluate
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from detoxify import Detoxify
from readability import Readability
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from collections import Counter

In [3]:
MODEL_FINETUNED = "ShivomH/Elixir-Health-Llama3B"
MODEL_BASE = "meta-llama/Llama-3.2-3B-Instruct"

In [4]:
def load_model(model_name):
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Ensure padding token is set
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

# Choose which model to evaluate
EVALUATE_FINETUNED = False  # Set to False if evaluating base model

model_name = MODEL_FINETUNED if EVALUATE_FINETUNED else MODEL_BASE
model, tokenizer = load_model(model_name)

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [5]:
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")
bertscore = evaluate.load("bertscore")

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

In [6]:
test_file = "/content/mental_eval_dataset.jsonl"
test_data = []

with open(test_file, "r") as f:
    for line in f:
        test_data.append(json.loads(line))

# Extract inputs and expected outputs
inputs = [entry["text"].split("<|start_header_id|>assistant<|end_header_id|>")[0] for entry in test_data]
expected_responses = [entry["text"].split("<|start_header_id|>assistant<|end_header_id|>")[1] for entry in test_data]

In [7]:
# Generate Responses

def generate_responses(model, tokenizer, inputs, max_new_tokens=150, batch_size=8):
    responses = []

    for i in tqdm(range(0, len(inputs), batch_size), desc="Generating Responses"):
        batch_inputs = inputs[i:i + batch_size]

        encoded_inputs = tokenizer.batch_encode_plus(
            batch_inputs, return_tensors="pt", truncation=True, max_length=512, padding=True
        )

        input_ids = encoded_inputs["input_ids"].to("cuda")
        attention_mask = encoded_inputs["attention_mask"].to("cuda")

        with torch.no_grad():
            output = model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=max_new_tokens, pad_token_id=tokenizer.eos_token_id)

        batch_responses = tokenizer.batch_decode(output, skip_special_tokens=True)
        responses.extend(batch_responses)

    return responses

responses = generate_responses(model, tokenizer, inputs)

Generating Responses:   0%|          | 0/125 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Generating Responses:   1%|          | 1/125 [00:09<19:25,  9.40s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Generating Responses:   2%|▏         | 2/125 [00:17<17:23,  8.48s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Generating Responses:   2%|▏         | 3/125 [00:24<16:06,  7.93s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Generating Responses:   3%|▎         | 4

In [8]:
# Compute Evaluation Metrics

def compute_metrics(predictions, references):
    metrics = {
        "BLEU": bleu.compute(predictions=predictions, references=references)["bleu"],
        "ROUGE-1": rouge.compute(predictions=predictions, references=references)["rouge1"],
        "ROUGE-2": rouge.compute(predictions=predictions, references=references)["rouge2"],
        "ROUGE-L": rouge.compute(predictions=predictions, references=references)["rougeL"],
        "METEOR": meteor.compute(predictions=predictions, references=references)["meteor"],
        "BERTScore": np.mean(bertscore.compute(predictions=predictions, references=references, lang="en")["f1"]),
    }
    return metrics

# Compute metrics in batches
def compute_metrics_batched(responses, expected_responses, batch_size=100):
    results = []
    for i in range(0, len(responses), batch_size):
        batch_responses = responses[i:i + batch_size]
        batch_expected = expected_responses[i:i + batch_size]
        results.append(compute_metrics(batch_responses, batch_expected))
    return results

metrics = compute_metrics_batched(responses, expected_responses)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Compute Perplexity

def compute_perplexity(model, tokenizer, responses):
    perplexities = []

    for response in tqdm(responses, desc="Calculating Perplexity"):
        encodings = tokenizer(response, return_tensors="pt").to("cuda")
        with torch.no_grad():
            loss = model(**encodings, labels=encodings["input_ids"]).loss
        perplexities.append(torch.exp(loss).item())

    return np.mean(perplexities)

perplexity_score = compute_perplexity(model, tokenizer, responses)

Calculating Perplexity: 100%|██████████| 1000/1000 [01:37<00:00, 10.25it/s]


In [10]:
# Compute Toxicity

def compute_toxicity(responses):
    detoxify_model = Detoxify("original")  # Load model once
    toxicity_scores = [detoxify_model.predict(response)['toxicity'] for response in tqdm(responses, desc="Evaluating Toxicity")]
    return np.mean(toxicity_scores)

toxicity_score = compute_toxicity(responses)

Downloading: "https://github.com/unitaryai/detoxify/releases/download/v0.1-alpha/toxic_original-c1212f89.ckpt" to /root/.cache/torch/hub/checkpoints/toxic_original-c1212f89.ckpt
100%|██████████| 418M/418M [00:05<00:00, 73.2MB/s]


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Evaluating Toxicity: 100%|██████████| 1000/1000 [12:20<00:00,  1.35it/s]


In [11]:
# Compute Readability

def compute_readability(responses):
    readability_scores = []

    for response in responses:
        words = response.split()

        # Extend text to at least 100 words by repeating if necessary
        if len(words) < 100:
            repeat_count = (100 // len(words)) + 1  # Ensure we have enough words
            extended_response = " ".join((words * repeat_count)[:100])
        else:
            extended_response = " ".join(words[:100])

        try:
            r = Readability(extended_response)
            readability_scores.append(r.flesch_kincaid().score)
        except Exception as e:
            print(f"Readability error: {e}")
            readability_scores.append(None)

    return np.nanmean([x for x in readability_scores if x is not None]) if readability_scores else 0

readability_score = compute_readability(responses)


Readability error: 100 words required.
Readability error: 100 words required.
Readability error: 100 words required.
Readability error: 100 words required.
Readability error: 100 words required.
Readability error: 100 words required.
Readability error: 100 words required.
Readability error: 100 words required.
Readability error: 100 words required.
Readability error: 100 words required.
Readability error: 100 words required.
Readability error: 100 words required.
Readability error: 100 words required.
Readability error: 100 words required.
Readability error: 100 words required.
Readability error: 100 words required.
Readability error: 100 words required.


In [12]:
def compute_diversity_metrics(responses):
    def distinct_n(responses, n):
        """ Computes Distinct-N score for n-grams """
        all_ngrams = []
        total_ngrams = 0

        for response in responses:
            tokens = response.split()
            ngrams = [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]
            all_ngrams.extend(ngrams)
            total_ngrams += len(ngrams)

        unique_ngrams = len(set(all_ngrams))
        return unique_ngrams / total_ngrams if total_ngrams > 0 else 0

    def self_bleu(responses):
        """ Computes Self-BLEU using smoothing to avoid zero scores """
        smoothie = SmoothingFunction().method1  # Apply smoothing
        scores = []

        for i, response in enumerate(responses):
            references = [r.split() for j, r in enumerate(responses) if i != j]
            hypothesis = response.split()

            # Skip empty responses
            if not hypothesis:
                continue

            score = sentence_bleu(references, hypothesis, smoothing_function=smoothie)
            scores.append(score)

        return np.mean(scores) if scores else 0

    return {
        "Distinct-1": distinct_n(responses, 1),
        "Distinct-2": distinct_n(responses, 2),
        "Self-BLEU": self_bleu(responses),
    }

# Compute metrics
diversity_metrics = compute_diversity_metrics(responses)

In [13]:
print(type(metrics))  # Check the data type
print(metrics)  # Print its value (or a subset if large)

<class 'list'>
[{'BLEU': 0.03528343124766325, 'ROUGE-1': 0.32014573043631295, 'ROUGE-2': 0.06827117093203064, 'ROUGE-L': 0.1552180755762359, 'METEOR': 0.25979870885058437, 'BERTScore': 0.8322430777549744}, {'BLEU': 0.03347084493237487, 'ROUGE-1': 0.3133556064312393, 'ROUGE-2': 0.06708300074264456, 'ROUGE-L': 0.15059305944505993, 'METEOR': 0.26197793963854094, 'BERTScore': 0.8313711971044541}, {'BLEU': 0.0418744947038785, 'ROUGE-1': 0.3200864862671198, 'ROUGE-2': 0.07735467884067265, 'ROUGE-L': 0.1565060068003275, 'METEOR': 0.2614339615424635, 'BERTScore': 0.833770666718483}, {'BLEU': 0.03878733169826325, 'ROUGE-1': 0.32043022654923536, 'ROUGE-2': 0.07165043766204443, 'ROUGE-L': 0.1548240882100062, 'METEOR': 0.25796549087737497, 'BERTScore': 0.8335874795913696}, {'BLEU': 0.044115892665893726, 'ROUGE-1': 0.3242880100618836, 'ROUGE-2': 0.07880021668168714, 'ROUGE-L': 0.15695171070973057, 'METEOR': 0.26724165384455484, 'BERTScore': 0.8334412097930908}, {'BLEU': 0.040698312586662115, 'ROUGE

In [15]:
import numpy as np
import json

# Aggregate batched metrics by averaging across all batches
def aggregate_metrics(batched_metrics):
    """Convert list of metric dicts into a single averaged dict."""
    aggregated = {key: np.mean([batch[key] for batch in batched_metrics]) for key in batched_metrics[0].keys()}
    return aggregated

# Compute aggregated metrics from batched results
aggregated_metrics = aggregate_metrics(metrics)

# Add other computed metrics (toxicity, perplexity, readability, diversity)
aggregated_metrics.update({
    "Perplexity": perplexity_score,
    "Toxicity": toxicity_score,
    "Readability": readability_score,
    **diversity_metrics  # Includes Distinct-1, Distinct-2, Self-BLEU
})

# Convert NumPy types to native Python types
aggregated_metrics = {key: float(value) if isinstance(value, (np.float32, np.float64)) else value for key, value in aggregated_metrics.items()}

# Determine filename
filename = "elixir_mental_results.json" if EVALUATE_FINETUNED else "llama_mental_results.json"

# Save results
with open(filename, "w") as f:
    json.dump({model_name: aggregated_metrics}, f, indent=4)

print(f"✅ Evaluation results saved to {filename}")


✅ Evaluation results saved to llama_mental_results.json
