In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import numpy as np
from tqdm import tqdm
import json

# Directory holding the fine-tuned checkpoint together with its tokenizer files
model_path = "./model_artifacts"

# Choose the execution device up front: CUDA when torch can see one, CPU otherwise
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Restore the tokenizer and the weights from the same directory
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Place the weights on the chosen device and switch to evaluation mode
model = model.to(device).eval()

# Load the dataset used for evaluation
ds = load_dataset("Cynaptics/persona-chat")

# Prefer a held-out split when the dataset provides one
if "test" in ds:
    test_ds = ds["test"]
elif "validation" in ds:
    test_ds = ds["validation"]
else:
    # No held-out split: fall back to the first rows of "train".
    # NOTE(review): if the model was fine-tuned on these same rows, the
    # reported perplexity will be optimistic -- confirm against the training run.
    # The range is capped at the split size so that a train split with fewer
    # than 100 rows does not make select() fail on out-of-range indices.
    test_ds = ds["train"].select(range(min(100, len(ds["train"]))))

def combine_persona_dialogue(entry):
    """Flatten one dataset row into the single-string model input.

    The strings under ``entry["persona_b"]`` and ``entry["dialogue"]`` are each
    joined with single spaces and placed after the ``persona:`` and
    ``dialogue:`` markers respectively.
    """
    persona_part = "persona: " + " ".join(entry["persona_b"])
    dialogue_part = "dialogue: " + " ".join(entry["dialogue"])
    return persona_part + " " + dialogue_part

def generate_response(prompt, max_length=150, temperature=0.7, top_p=0.9):
    """Sample a continuation of ``prompt`` from the module-level model.

    Args:
        prompt: Text to condition on; it is tokenized with truncation at
            512 tokens.
        max_length: Forwarded to ``model.generate`` as ``max_length``. Per the
            transformers documentation this bounds the total sequence length
            (prompt tokens plus newly generated tokens), not only the new part.
        temperature: Sampling temperature forwarded to ``model.generate``.
        top_p: Nucleus-sampling threshold forwarded to ``model.generate``.

    Returns:
        The first returned sequence decoded with special tokens skipped.
        NOTE(review): for a causal LM this text starts with the prompt itself,
        so callers that want only the reply must strip the prompt.
    """
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in encoded.items()}

    # Some causal-LM tokenizers define no pad token; passing None through makes
    # generate() pick a fallback and log about it on every call. Use EOS
    # explicitly in that case.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            pad_token_id=pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def calculate_perplexity(model, tokenizer, dataset, max_samples=100):
    """Compute token-level perplexity of ``model`` over the first rows of ``dataset``.

    Each row is rendered with ``combine_persona_dialogue``, tokenized with
    truncation at 512 tokens, and scored with the model's own language-modeling
    loss (labels equal to the input ids).

    Args:
        model: Causal LM whose forward pass returns an object with ``.loss``
            when ``labels`` is supplied.
        tokenizer: Tokenizer matching ``model``.
        dataset: Indexable collection of rows accepted by
            ``combine_persona_dialogue``.
        max_samples: Upper bound on the number of rows scored.

    Returns:
        ``exp`` of the average negative log-likelihood per predicted token.

    Raises:
        ValueError: If no token could be scored (empty dataset,
            ``max_samples <= 0``, or every row tokenizes to fewer than two
            tokens).
    """
    total_nll = 0.0
    total_predicted = 0

    num_rows = min(max_samples, len(dataset))
    for i in tqdm(range(num_rows), desc="Calculating perplexity"):
        text = combine_persona_dialogue(dataset[i])
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Per the transformers documentation, causal-LM heads shift the labels
        # internally, so the returned loss is a mean over seq_len - 1
        # next-token predictions. Weight by that count, not by seq_len, when
        # turning the per-row mean back into a sum.
        num_predicted = inputs["input_ids"].size(1) - 1
        if num_predicted < 1:
            # Nothing to predict for a row of fewer than two tokens; its loss
            # would be undefined.
            continue

        with torch.no_grad():
            loss = model(**inputs, labels=inputs["input_ids"]).loss

        total_nll += loss.item() * num_predicted
        total_predicted += num_predicted

    if total_predicted == 0:
        raise ValueError("Cannot compute perplexity: no tokens were evaluated")

    return np.exp(total_nll / total_predicted)

def qualitative_evaluation(dataset, num_samples=5):
    """Print model generations for the first few personas in ``dataset``.

    For each of the first ``min(num_samples, len(dataset))`` rows, the row's
    ``persona_b`` strings are joined into a persona text, a prompt containing
    only that persona (with an open ``dialogue:`` marker) is sent to
    ``generate_response`` with ``max_length=200``, and the result is printed.
    Nothing is returned.
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    # Section banner
    print("\n" + heavy_rule)
    print("QUALITATIVE EVALUATION - Sample Generations")
    print(heavy_rule + "\n")

    sample_count = min(num_samples, len(dataset))
    for sample_no in range(1, sample_count + 1):
        row = dataset[sample_no - 1]
        persona_text = " ".join(row["persona_b"])

        print(f"Sample {sample_no}:")
        print(f"Persona: {persona_text}")
        print("\nGenerated Response:")
        # Persona-only prompt: the model is left to produce the whole dialogue
        print(generate_response(f"persona: {persona_text} dialogue:", max_length=200))
        print("\n" + light_rule + "\n")

def evaluate_model(test_dataset, num_perplexity_samples=100, num_qualitative_samples=5):
    """Run the full evaluation and persist a summary to disk.

    Computes perplexity with ``calculate_perplexity``, prints sample
    generations with ``qualitative_evaluation``, and writes the summary to
    ``evaluation_results.json`` in the current working directory.

    Args:
        test_dataset: Rows to evaluate on.
        num_perplexity_samples: Maximum number of rows scored for perplexity.
        num_qualitative_samples: Number of sample generations to print.

    Returns:
        The summary dict that was written to disk, with keys ``model_path``,
        ``perplexity``, ``num_samples_evaluated`` and ``device``.
    """
    print("Starting Model Evaluation...")
    print(f"Device: {device}")
    print(f"Model: {model_path}\n")

    # Calculate perplexity
    print("Calculating perplexity...")
    perplexity = calculate_perplexity(model, tokenizer, test_dataset, num_perplexity_samples)
    print(f"Perplexity: {perplexity:.2f}\n")

    # Qualitative evaluation
    qualitative_evaluation(test_dataset, num_qualitative_samples)

    # Record how many rows were actually scored: the perplexity loop stops at
    # the end of the dataset, so the requested count can overstate it.
    num_evaluated = min(num_perplexity_samples, len(test_dataset))

    results = {
        "model_path": model_path,
        "perplexity": float(perplexity),
        "num_samples_evaluated": num_evaluated,
        "device": device,
    }

    # Explicit encoding keeps the file identical across platforms
    with open("evaluation_results.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)

    print("Evaluation complete! Results saved to evaluation_results.json")
    return results

# Script entry point: run the evaluation, then echo the headline metric
if __name__ == "__main__":
    # Both counts are knobs: rows scored for perplexity (adjust based on your
    # needs) and the number of sample generations to display.
    results = evaluate_model(
        num_qualitative_samples=5,
        num_perplexity_samples=100,
        test_dataset=test_ds,
    )

    print("\nFinal Results:")
    print("Perplexity: {:.2f}".format(results["perplexity"]))

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Generating train split: 100%|██████████| 20000/20000 [00:00<00:00, 87833.86 examples/s] 


Starting Model Evaluation...
Device: cpu
Model: ./model_artifacts

Calculating perplexity...


Calculating perplexity: 100%|██████████| 100/100 [04:51<00:00,  2.91s/it]


Perplexity: 3.31


QUALITATIVE EVALUATION - Sample Generations

Sample 1:
Persona: I would love to try the local food with my friend. i am quiet but confident. I love to watch movies with my dad on a rainy day. i try to limit how much i eat. I just finished practicing my bass guitar in the lifeguard station.

Generated Response:
persona: I would love to try the local food with my friend. i am quiet but confident. I love to watch movies with my dad on a rainy day. i try to limit how much i eat. I just finished practicing my bass guitar in the lifeguard station. dialogue: Persona A: Hello! Persona B: Hello, how are you doing? Persona A: I'm doing well, just finished practicing my bass guitar. Persona B: That's awesome! I love to play bass guitar. Persona A: It is a lot of fun! I've played for about 10 years now. Persona B: That's really impressive! What kind of music do you like to play? Persona A: I like to play a variety of music, but my favorite is probably rock and roll. Persona B: I