In [None]:
!pip install -q -U transformers datasets peft accelerate bitsandbytes evaluate rouge_score bert_score
!pip install -q xformers

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Mount Google Drive; later cells read a model directory from /content/drive.
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from evaluate import load as load_metric
import torch

In [None]:
# --- Data: hold out 10% of the "train" split (fixed seed) as the test set ---
dataset = load_dataset("Amod/mental_health_counseling_conversations")
split_dataset = dataset["train"].train_test_split(seed=42, test_size=0.1)
test_set = split_dataset["test"]

# Both checkpoints are loaded identically: fp16 weights, automatic device placement.
model_load_kwargs = {"device_map": "auto", "torch_dtype": torch.float16}

# --- Fine-tuned model & tokenizer, read from the mounted Drive folder ---
finetuned_model_dir = "/content/drive/MyDrive/lora-llama3-mental-health"
tokenizer = AutoTokenizer.from_pretrained(finetuned_model_dir)
model = AutoModelForCausalLM.from_pretrained(finetuned_model_dir, **model_load_kwargs)

# --- Baseline: the stock meta-llama/Llama-3.2-1B checkpoint from the Hub ---
# NOTE(review): presumably this is the base the fine-tune started from — confirm.
baseline_model_id = "meta-llama/Llama-3.2-1B"
baseline_model = AutoModelForCausalLM.from_pretrained(baseline_model_id, **model_load_kwargs)
baseline_tokenizer = AutoTokenizer.from_pretrained(baseline_model_id)

# Generation function
def generate_response(model, tokenizer, prompt, max_new_tokens=200, do_sample=True):
    """Generate a reply to ``prompt`` and return only the newly generated text.

    Args:
        model: Causal language model exposing ``generate`` and a ``device``
            attribute.
        tokenizer: Tokenizer paired with ``model``; it is called with
            ``return_tensors="pt"`` and must provide ``decode``.
        prompt: Fully formatted prompt string.
        max_new_tokens: Upper bound on the number of generated tokens.
        do_sample: Sample from the distribution (True) or decode greedily (False).

    Returns:
        The decoded continuation, with special tokens removed and surrounding
        whitespace stripped. The prompt itself is not part of the result, so
        the string can be scored directly against a reference answer.
    """
    # Move the inputs to wherever the model lives instead of assuming CUDA.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    # Some tokenizers define no pad token; fall back to EOS so generate() has
    # an explicit padding id.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    with torch.no_grad():
        # Unpacking `inputs` forwards the attention mask along with input_ids.
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
            pad_token_id=pad_token_id,
        )

    # The output sequence starts with the prompt tokens; drop them so the
    # result holds the model's reply only.
    new_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# Chat format conversion
def format_as_chat(context):
    """Wrap ``context`` as one user turn followed by an open assistant header.

    Returns the prompt string: begin-of-text marker, a user header and the
    context terminated by ``<|eot_id|>``, then an assistant header left open.
    """
    # NOTE(review): the prompt already starts with <|begin_of_text|>; if the
    # tokenizer also prepends a BOS token it will be doubled — confirm against
    # the format used during fine-tuning before changing anything here.
    user_turn = "<|start_header_id|>user<|end_header_id|>\n" + format(context) + "<|eot_id|>\n"
    assistant_header = "<|start_header_id|>assistant<|end_header_id|>\n"
    return "".join(("<|begin_of_text|>", user_turn, assistant_header))

# Run evaluation
NUM_EVAL_SAMPLES = 30  # evaluate on a small subset for speed
EVAL_SEED = 42

rouge = load_metric("rouge")
bertscore = load_metric("bertscore")

ft_preds, base_preds, refs = [], [], []

# generate_response samples by default (do_sample=True), so seed torch's RNG;
# otherwise every run of this cell reports different scores.
torch.manual_seed(EVAL_SEED)

# Never ask for more rows than the test split actually holds.
num_samples = min(NUM_EVAL_SAMPLES, len(test_set))

print("Generating responses...")
for example in test_set.select(range(num_samples)):
    context = example["Context"]
    reference = example["Response"]
    prompt = format_as_chat(context)

    # Both models answer the same prompt so their scores are comparable.
    finetuned_reply = generate_response(model, tokenizer, prompt)
    baseline_reply = generate_response(baseline_model, baseline_tokenizer, prompt)

    ft_preds.append(finetuned_reply)
    base_preds.append(baseline_reply)
    refs.append(reference)

# Compute ROUGE
ft_rouge = rouge.compute(predictions=ft_preds, references=refs)
base_rouge = rouge.compute(predictions=base_preds, references=refs)

# Compute BERTScore
ft_bert = bertscore.compute(predictions=ft_preds, references=refs, lang="en")
base_bert = bertscore.compute(predictions=base_preds, references=refs, lang="en")

In [None]:
# Display results: one stanza per model, same two metrics for each.
print("\n=== Evaluation Results ===")
for header, rouge_scores, bert_scores in (
    ("[Finetuned Model]", ft_rouge, ft_bert),
    ("\n[Baseline Model]", base_rouge, base_bert),
):
    print(header)
    print(f"ROUGE-L: {rouge_scores['rougeL']:.4f}")
    # BERTScore is returned per sample; report the mean F1.
    f1_values = bert_scores["f1"]
    print(f"BERTScore (F1): {sum(f1_values) / len(f1_values):.4f}")

# Show the first three evaluated examples next to both models' outputs.
print("\n=== Sample Comparison ===")
for sample_idx in range(3):
    print(f"\n[Context]: {test_set[sample_idx]['Context']}\n")
    print(f"[Reference]: {refs[sample_idx]}\n")
    print(f"[Finetuned]: {ft_preds[sample_idx]}\n")
    print(f"[Baseline ]: {base_preds[sample_idx]}\n")


=== Evaluation Results ===
[Finetuned Model]
ROUGE-L: 0.1459
BERTScore (F1): 0.8308

[Baseline Model]
ROUGE-L: 0.1102
BERTScore (F1): 0.8007

=== Sample Comparison ===

[Context]: I didn't trust my wife when I found out that she had a new guy friend that she was texting and calling. I investigated him before I found out that he was gay and that there was nothing going on. Now all my wife and I do is fight about trust.

[Reference]: Instead of fighting about trust, is it possible for you and your wife to talk with other about areas which upset each of you?Whenever feelings are hurt, knowing what exactly is problematic and being heard and understood by the partner, goes a long way to building trust.These type of discussions are hard to have, and especially for the first time.  A lot of emotions arise and often people lose their conversation focus from this.If you and your wife have a tough time opening up to each other, consider scheduling time with a couples therapist.Just by each of y