In [1]:
# Install required packages
!pip install -q transformers accelerate evaluate bert-score rouge-score nltk detoxify torch datasets
!pip install -q sentencepiece protobuf

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


In [2]:
# Download NLTK data for METEOR
import nltk
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

# 2. Imports and Configuration

import json
import torch
import gc
import warnings
from typing import List, Dict, Any, Tuple
from tqdm import tqdm
import numpy as np
from collections import Counter
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig
)
from evaluate import load
from bert_score import score as bert_score_func
from detoxify import Detoxify
import re
import os

warnings.filterwarnings('ignore')

In [3]:
# Clear GPU cache
torch.cuda.empty_cache()
gc.collect()

120

In [4]:
# 3. Configuration

# CHANGE THIS TO SWITCH MODELS
# MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"
MODEL_NAME = "ShivomH/Elixir-MentalHealth-3B"
# MODEL_NAME = "google/gemma-2-2b-it"
# MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"

In [5]:
# Configuration parameters
CONFIG = {
    "model_name": MODEL_NAME,
    "dataset_path": "/content/MH_final_test.jsonl",
    "max_new_tokens": 512,
    "temperature": 0.5,
    "top_p": 0.85,
    "batch_size": 1,  # Keep at 1 for memory efficiency
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "use_half_precision": True,  # Use float16 for memory efficiency
    "seed": 42
}

# Set seed for reproducibility
torch.manual_seed(CONFIG["seed"])
np.random.seed(CONFIG["seed"])

print(f"Configuration:")
print(f"  Model: {CONFIG['model_name']}")
print(f"  Device: {CONFIG['device']}")
print(f"  Dataset: {CONFIG['dataset_path']}")

Configuration:
  Model: ShivomH/Elixir-MentalHealth-3B
  Device: cuda
  Dataset: /content/MH_final_test.jsonl


In [6]:
# 4. Helper Functions

def load_dataset(path: str) -> List[Dict]:
    """Load the test dataset from JSON or JSONL file."""
    try:
        data = []

        # Check file extension to determine format
        if path.endswith('.jsonl'):
            # Load JSONL format (one JSON object per line)
            with open(path, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if line:  # Skip empty lines
                        data.append(json.loads(line))
            print(f"✓ Loaded {len(data)} examples from {path} (JSONL format)")
        else:
            # Load standard JSON format
            with open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            print(f"✓ Loaded {len(data)} examples from {path} (JSON format)")

        # Verify dataset structure
        multi_turn = sum(1 for item in data if len(item['messages']) > 2)
        single_turn = len(data) - multi_turn
        print(f"  - Multi-turn conversations: {multi_turn}")
        print(f"  - Single-turn conversations: {single_turn}")

        return data
    except FileNotFoundError:
        print(f"❌ Dataset not found at {path}")
        print("Please upload your test_dataset.json or test_dataset.jsonl file to /content/")
        raise
    except json.JSONDecodeError as e:
        print(f"❌ Error parsing JSON: {e}")
        print("Please check your file format (JSON or JSONL)")
        raise
    except Exception as e:
        print(f"❌ Error loading dataset: {e}")
        raise

def prepare_input(messages: List[Dict], tokenizer) -> str:
    """Prepare input for the model using chat template."""
    # Remove the last assistant message (the one we want to predict)
    input_messages = messages[:-1]

    # Apply chat template
    try:
        input_text = tokenizer.apply_chat_template(
            input_messages,
            tokenize=False,
            add_generation_prompt=True
        )
    except:
        # Fallback for models without chat template
        input_text = ""
        for msg in input_messages:
            role = msg["role"]
            content = msg["content"]
            if role == "user":
                input_text += f"User: {content}\n"
            elif role == "assistant":
                input_text += f"Assistant: {content}\n"
        input_text += "Assistant: "

    return input_text

def calculate_distinct_n(texts: List[str], n: int = 1) -> float:
    """Calculate Distinct-n metric for diversity."""
    all_ngrams = []
    for text in texts:
        tokens = text.lower().split()
        ngrams = [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]
        all_ngrams.extend(ngrams)

    if len(all_ngrams) == 0:
        return 0.0

    return len(set(all_ngrams)) / len(all_ngrams)

def calculate_self_bleu(texts: List[str], n: int = 3) -> float:
    """Calculate Self-BLEU for measuring repetitiveness."""
    from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

    if len(texts) < 2:
        return 0.0

    smoothing = SmoothingFunction().method1
    scores = []

    for i, text in enumerate(texts[:100]):  # Limit to 100 for efficiency
        references = [t.split() for j, t in enumerate(texts[:100]) if i != j]
        hypothesis = text.split()
        if hypothesis and references:
            score = sentence_bleu(references, hypothesis,
                                 weights=(1/n,)*n,
                                 smoothing_function=smoothing)
            scores.append(score)

    return np.mean(scores) if scores else 0.0

def calculate_perplexity(model, tokenizer, texts: List[str], device: str) -> float:
    """Calculate perplexity as a measure of fluency."""
    model.eval()
    total_loss = 0
    total_tokens = 0

    with torch.no_grad():
        for text in tqdm(texts[:50], desc="Calculating perplexity", leave=False):  # Sample for efficiency
            inputs = tokenizer(text, return_tensors="pt",
                              truncation=True, max_length=512).to(device)

            if inputs.input_ids.shape[1] <= 1:
                continue

            outputs = model(**inputs, labels=inputs.input_ids)
            loss = outputs.loss

            if loss is not None:
                total_loss += loss.item() * inputs.input_ids.shape[1]
                total_tokens += inputs.input_ids.shape[1]

    if total_tokens == 0:
        return float('inf')

    avg_loss = total_loss / total_tokens
    perplexity = np.exp(avg_loss)

    return perplexity if perplexity < 1000 else 1000.0


In [7]:
# 5. Load Model and Tokenizer

def load_model_and_tokenizer(model_name: str, use_half: bool = True):
    """Load model and tokenizer with memory-efficient settings."""
    print(f"\n📥 Loading model: {model_name}")

    try:
        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True,
            use_fast=True
        )

        # Set padding token if not set
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # Load model with half precision for memory efficiency
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if use_half else torch.float32,
            device_map="auto",
            trust_remote_code=True,
            low_cpu_mem_usage=True
        )

        model.eval()
        print(f"✓ Model loaded successfully!")

        # Print memory usage
        if torch.cuda.is_available():
            allocated = torch.cuda.memory_allocated() / 1024**3
            print(f"  GPU Memory allocated: {allocated:.2f} GB")

        return model, tokenizer

    except Exception as e:
        print(f"❌ Error loading model: {e}")
        raise

# Load the model
model, tokenizer = load_model_and_tokenizer(CONFIG["model_name"], CONFIG["use_half_precision"])


📥 Loading model: ShivomH/Elixir-MentalHealth-3B


tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/325 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/3.83k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/867 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

✓ Model loaded successfully!
  GPU Memory allocated: 5.98 GB


In [8]:
# 6. Generate Predictions

def generate_predictions(model, tokenizer, dataset: List[Dict], config: Dict) -> Tuple[List[str], List[str]]:
    """Generate predictions for the dataset."""
    predictions = []
    references = []

    print(f"\n🔮 Generating predictions...")

    for item in tqdm(dataset, desc="Generating"):
        messages = item["messages"]

        # Get reference (last assistant message)
        reference = messages[-1]["content"]
        references.append(reference)

        # Prepare input
        input_text = prepare_input(messages, tokenizer)

        # Tokenize
        inputs = tokenizer(
            input_text,
            return_tensors="pt",
            truncation=True,
            max_length=2048
        ).to(config["device"])

        # Generate
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=config["max_new_tokens"],
                temperature=config["temperature"],
                top_p=config["top_p"],
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

        # Decode prediction
        generated_ids = outputs[0][inputs.input_ids.shape[1]:]
        prediction = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

        predictions.append(prediction)

        # Clear cache periodically
        if len(predictions) % 50 == 0:
            torch.cuda.empty_cache()

    print(f"✓ Generated {len(predictions)} predictions")

    return predictions, references

# Generate predictions
predictions, references = generate_predictions(model, tokenizer,
                                              load_dataset(CONFIG["dataset_path"]),
                                              CONFIG)

✓ Loaded 1000 examples from /content/MH_final_test.jsonl (JSONL format)
  - Multi-turn conversations: 1000
  - Single-turn conversations: 0

🔮 Generating predictions...


Generating: 100%|██████████| 1000/1000 [2:40:57<00:00,  9.66s/it]

✓ Generated 1000 predictions





In [9]:
# 7. Evaluate Metrics

def evaluate_all_metrics(predictions: List[str], references: List[str], model, tokenizer, device: str) -> Dict:
    """Calculate all evaluation metrics."""
    print("\n📊 Computing evaluation metrics...")
    results = {}

    # 1. BERTScore
    print("  • Computing BERTScore...")
    try:
        P, R, F1 = bert_score_func(predictions, references,
                                   lang="en", verbose=False,
                                   device=device, batch_size=8)
        results["bertscore"] = {
            "precision": float(P.mean()),
            "recall": float(R.mean()),
            "f1": float(F1.mean())
        }
    except Exception as e:
        print(f"    Warning: BERTScore failed - {e}")
        results["bertscore"] = {"precision": 0, "recall": 0, "f1": 0}

    # 2. ROUGE-L
    print("  • Computing ROUGE-L...")
    try:
        rouge = load("rouge")
        rouge_results = rouge.compute(predictions=predictions, references=references)
        results["rouge_l"] = float(rouge_results["rougeL"])
    except Exception as e:
        print(f"    Warning: ROUGE-L failed - {e}")
        results["rouge_l"] = 0.0

    # 3. METEOR
    print("  • Computing METEOR...")
    try:
        meteor = load("meteor")
        meteor_results = meteor.compute(predictions=predictions, references=references)
        results["meteor"] = float(meteor_results["meteor"])
    except Exception as e:
        print(f"    Warning: METEOR failed - {e}")
        results["meteor"] = 0.0

    # 4. Distinct-1 and Distinct-2
    print("  • Computing Distinct-n...")
    results["distinct_1"] = calculate_distinct_n(predictions, n=1)
    results["distinct_2"] = calculate_distinct_n(predictions, n=2)

    # 5. Self-BLEU
    print("  • Computing Self-BLEU...")
    results["self_bleu"] = calculate_self_bleu(predictions, n=3)

    # 6. Toxicity
    print("  • Computing Toxicity scores...")
    try:
        toxicity_model = Detoxify('original')
        toxicity_scores = []
        for pred in tqdm(predictions, desc="    Analyzing toxicity", leave=False):
            scores = toxicity_model.predict(pred)
            toxicity_scores.append(scores['toxicity'])
        results["toxicity"] = {
            "mean": float(np.mean(toxicity_scores)),
            "max": float(np.max(toxicity_scores)),
            "min": float(np.min(toxicity_scores))
        }
    except Exception as e:
        print(f"    Warning: Toxicity detection failed - {e}")
        results["toxicity"] = {"mean": 0, "max": 0, "min": 0}

    # 7. Perplexity
    print("  • Computing Perplexity...")
    try:
        perplexity = calculate_perplexity(model, tokenizer, predictions, device)
        results["perplexity"] = float(perplexity)
    except Exception as e:
        print(f"    Warning: Perplexity calculation failed - {e}")
        results["perplexity"] = 0.0

    # Add metadata
    results["metadata"] = {
        "num_samples": len(predictions),
        "avg_prediction_length": np.mean([len(p.split()) for p in predictions]),
        "avg_reference_length": np.mean([len(r.split()) for r in references])
    }

    print("✓ All metrics computed successfully!")

    return results

In [10]:
# Evaluate
results = evaluate_all_metrics(predictions, references, model, tokenizer, CONFIG["device"])


📊 Computing evaluation metrics...
  • Computing BERTScore...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  • Computing ROUGE-L...


Downloading builder script: 0.00B [00:00, ?B/s]

  • Computing METEOR...


Downloading builder script: 0.00B [00:00, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


  • Computing Distinct-n...
  • Computing Self-BLEU...
  • Computing Toxicity scores...
Downloading: "https://github.com/unitaryai/detoxify/releases/download/v0.1-alpha/toxic_original-c1212f89.ckpt" to /root/.cache/torch/hub/checkpoints/toxic_original-c1212f89.ckpt


100%|██████████| 418M/418M [00:02<00:00, 189MB/s]


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



  • Computing Perplexity...


                                                                       

✓ All metrics computed successfully!




In [11]:
# 8. Display Results and Sample Outputs

# Display sample predictions
print("\n" + "="*80)
print("📝 SAMPLE PREDICTIONS (First 3)")
print("="*80)

for i in range(min(3, len(predictions))):
    print(f"\n--- Example {i+1} ---")
    print(f"Reference: {references[i][:200]}...")
    print(f"Prediction: {predictions[i][:200]}...")

# Display final results
print("\n" + "="*80)
print("📊 EVALUATION RESULTS")
print("="*80)

# Pretty print results
import json
print(json.dumps(results, indent=2))


📝 SAMPLE PREDICTIONS (First 3)

--- Example 1 ---
Reference: I understand how difficult it can be to feel isolated and disconnected, especially when you've lost a valuable support system. It's important to remember that building connections takes time and effor...
Prediction: It's understandable that you're feeling isolated after losing your connection with your neighbors who had a positive impact on your life. It can be tough when we experience changes in our social circl...

--- Example 2 ---
Reference: I'm glad to hear that.  Remember, conflicts can lead to growth and stronger connections when handled with understanding and open communication. Take care of yourself, and I'm here whenever you need me...
Prediction: That's the spirit.  You have the strength within you to overcome any challenges that come your way. Keep nurturing your self-love and surround yourself with positive influences. Remember, you're deser...

--- Example 3 ---
Reference: I'm sorry for your loss, and I can see

In [12]:
# 9. Save Results

# Create output filename based on model name
model_name_clean = CONFIG["model_name"].replace("/", "-").replace(".", "_")
output_filename = f"{model_name_clean}_eval_results.json"

# Prepare final results dictionary
final_results = {
    "model": CONFIG["model_name"],
    "configuration": {
        "max_new_tokens": CONFIG["max_new_tokens"],
        "temperature": CONFIG["temperature"],
        "top_p": CONFIG["top_p"],
        "device": CONFIG["device"]
    },
    "metrics": results,
    "sample_outputs": [
        {
            "reference": ref[:500],
            "prediction": pred[:500]
        }
        for ref, pred in zip(references[:5], predictions[:5])
    ]
}

# Save to JSON file
with open(output_filename, 'w', encoding='utf-8') as f:
    json.dump(final_results, f, indent=2, ensure_ascii=False)

print(f"\n✅ Results saved to: {output_filename}")


✅ Results saved to: ShivomH-Elixir-MentalHealth-3B_eval_results.json


In [13]:
# Clear GPU memory
del model
del tokenizer
torch.cuda.empty_cache()
gc.collect()

print("\n🧹 Cleaned up GPU memory")
print("✨ Evaluation complete!")


🧹 Cleaned up GPU memory
✨ Evaluation complete!
