# Biomedical LLM Evaluation Suite

**Project:** Pipeline Optimisation  
**Purpose:** Evaluate language models on literature interpretation tasks

This notebook uses a two-stage pipeline:
1. **Generation**: Evaluation model generates plaintext analysis
2. **Parsing**: Instructor extracts structured data from plaintext

In [1]:
!pip install -r "requirements.txt"

Collecting sentencepiece>=0.1.99 (from -r requirements.txt (line 14))
  Using cached sentencepiece-0.2.1-cp313-cp313-macosx_11_0_arm64.whl.metadata (10 kB)
Collecting protobuf>=3.20.0 (from -r requirements.txt (line 15))
  Using cached protobuf-6.33.1-cp39-abi3-macosx_10_9_universal2.whl.metadata (593 bytes)
Collecting instructor>=1.0.0 (from -r requirements.txt (line 18))
  Downloading instructor-1.13.0-py3-none-any.whl.metadata (11 kB)
Collecting pydantic>=2.0.0 (from -r requirements.txt (line 19))
  Downloading pydantic-2.12.4-py3-none-any.whl.metadata (89 kB)
Collecting jupyter>=1.0.0 (from -r requirements.txt (line 22))
  Using cached jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting aiohttp<4.0.0,>=3.9.1 (from instructor>=1.0.0->-r requirements.txt (line 18))
  Downloading aiohttp-3.13.2-cp313-cp313-macosx_11_0_arm64.whl.metadata (8.1 kB)
Collecting diskcache>=5.6.3 (from instructor>=1.0.0->-r requirements.txt (line 18))
  Downloading diskcache-5.6.3-py3-none-any.wh

In [2]:
# Login to HuggingFace (only needed for gated models like Meditron, Llama)
# Get your token from: https://huggingface.co/settings/tokens
# The import lives in this cell so the cell can be skipped entirely for public models.

from huggingface_hub import login

# Option 1: Interactive login (recommended for Colab)
login()

# Option 2: Login with token directly (uncomment and add your token)
# login(token='hf_YOUR_TOKEN_HERE')

# NOTE(review): this message is printed unconditionally once login() returns;
# nothing here verifies that a valid token was actually supplied.
print('✓ Logged in to HuggingFace')

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

✓ Logged in to HuggingFace


In [3]:
import json
import time
from pathlib import Path
from datetime import datetime
from typing import Literal
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM
import torch
from tqdm.notebook import tqdm

# Instructor for structured output parsing
import instructor
from pydantic import BaseModel, Field

# Record the compute device and PyTorch version in the cell output so that a
# saved run documents the environment it was executed in.
print('✓ Imports successful')
print(f"Device: {'GPU (CUDA)' if torch.cuda.is_available() else 'CPU'}")
print(f"PyTorch version: {torch.__version__}")

✓ Imports successful
Device: CPU
PyTorch version: 2.9.1


In [None]:
# =============================================================================
# Pydantic Models for Structured Outputs
# =============================================================================
import json as json_module
from pydantic import field_validator

# Structured verdict for the relevance task: a binary label plus a short rationale.
# The two Literal values are the only labels this model accepts.
# NOTE(review): the class docstring and the Field descriptions are presumably
# exposed to the parsing LLM through the generated schema, so rewording them
# should be treated as a prompt change — confirm before editing.
class RelevanceResult(BaseModel):
    """Result of relevance assessment."""
    # Binary relevance label.
    relevance: Literal["relevant", "not_relevant"] = Field(
        description="Whether the abstract explains how the agent affects the pathway"
    )
    # Free-text explanation accompanying the label.
    rationale: str = Field(
        description="One sentence explaining the relevance decision"
    )

class MechanismResult(BaseModel):
    """Result of mechanism extraction."""
    mechanism_summary: str = Field(
        description="One-sentence description of how the agent affects the pathway"
    )
    molecular_components: list[str] = Field(
        description="List of molecules/genes/proteins mentioned in the mechanism"
    )
    direction_of_effect: Literal["activation", "inhibition", "unknown"] = Field(
        description="Whether the agent activates or inhibits the pathway"
    )
    
    @field_validator('molecular_components', mode='before')
    @classmethod
    def parse_json_string(cls, v):
        """Coerce a string payload into a list before normal validation runs.

        A string is first tried as JSON; if it decodes to a list, that list is
        used. Any other string is split on commas, dropping empty pieces.
        Non-string input is passed through untouched.
        """
        if isinstance(v, str):
            try:
                parsed = json_module.loads(v)
                if isinstance(parsed, list):
                    return parsed
            except json_module.JSONDecodeError:
                pass
            # Not a JSON array: treat it as a comma-separated list
            return [x.strip() for x in v.split(',') if x.strip()]
        return v
    
    @field_validator('direction_of_effect', mode='before')
    @classmethod
    def parse_enum_dict(cls, v):
        """Unwrap and normalise the direction label before Literal validation.

        Handles two kinds of malformed LLM output:
        - a dict such as {"enum": ["activation"]} (or another dict whose values
          contain the label) instead of the bare string;
        - a correctly chosen label with different case or surrounding
          whitespace, e.g. "Activation" or " inhibition ".

        Anything that cannot be mapped onto an allowed label is returned
        unchanged so the regular validation error reports the original value.
        """
        allowed = ('activation', 'inhibition', 'unknown')

        if isinstance(v, dict):
            enum_values = v.get('enum')
            if isinstance(enum_values, list) and len(enum_values) > 0:
                # Handle {"enum": ["activation"]} format
                v = enum_values[0]
            else:
                # Handle other dict formats - take the first usable value
                for val in v.values():
                    if isinstance(val, str) and val.strip().lower() in allowed:
                        v = val
                        break
                    if isinstance(val, list) and len(val) > 0 and isinstance(val[0], str):
                        v = val[0]
                        break

        if isinstance(v, str):
            # The Literal check is an exact match, so fold case/whitespace here.
            normalised = v.strip().lower()
            if normalised in allowed:
                return normalised
        return v

# Structured verdict for the evidence-quality task: one of four graded labels
# plus a short justification.
# NOTE(review): the class docstring and the Field descriptions are presumably
# exposed to the parsing LLM through the generated schema, so rewording them
# should be treated as a prompt change — confirm before editing.
class QualityResult(BaseModel):
    """Result of evidence quality assessment."""
    # Graded evidence label; only these four strings are accepted.
    evidence_quality: Literal["strong", "moderate", "weak", "insufficient"] = Field(
        description="Strength of evidence in the abstract"
    )
    # Free-text explanation accompanying the label.
    justification: str = Field(
        description="One sentence explaining the quality classification"
    )

# Confirmation that the cell ran to completion.
print('✓ Pydantic models defined')

In [5]:
# Load test datasets
test_dir = Path('model_tests')


def _load_test_set(filename):
    """Read one JSON test file from ``test_dir`` and return the decoded data."""
    # Decode explicitly as UTF-8: open()'s default encoding depends on the
    # platform, and the abstracts may contain non-ASCII characters.
    with open(test_dir / filename, encoding='utf-8') as f:
        return json.load(f)


relevance_data = _load_test_set('test_relevance.json')
mechanism_data = _load_test_set('test_mechanism.json')
quality_data = _load_test_set('test_quality.json')
stability_data = _load_test_set('test_stability.json')

print('✓ Loaded test data:')
print(f'  - Relevance: {len(relevance_data)} items')
print(f'  - Mechanism: {len(mechanism_data)} items')
print(f'  - Quality: {len(quality_data)} items')
print(f'  - Stability: {len(stability_data)} items')

✓ Loaded test data:
  - Relevance: 50 items
  - Mechanism: 50 items
  - Quality: 50 items
  - Stability: 50 items


In [6]:
class ModelEvaluator:
    """Generates plaintext responses from HuggingFace models.

    The model is first loaded as a causal LM; if that fails it is loaded as a
    seq2seq model instead, and ``is_causal`` records which one succeeded.
    Generation uses sampling (``do_sample=True``), so repeated calls with the
    same prompt can return different text.
    """
    
    def __init__(self, model_name, device='auto'):
        """Load tokenizer and model.

        Args:
            model_name: HuggingFace model identifier.
            device: 'auto' picks 'cuda' when available, otherwise 'cpu'; any
                other value is used as given.

        Raises:
            Whatever the seq2seq loader raises if both load attempts fail
            (the causal-LM error stays attached as the exception context).
        """
        self.model_name = model_name
        self.device = device if device != 'auto' else ('cuda' if torch.cuda.is_available() else 'cpu')
        
        print(f'Loading model: {model_name}')
        print(f'Device: {self.device}')
        
        # NOTE(review): trust_remote_code=True allows the model repository to
        # run its own Python code locally; only use model names you trust.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        
        on_cuda = self.device == 'cuda'
        load_kwargs = dict(
            trust_remote_code=True,
            torch_dtype=torch.float16 if on_cuda else torch.float32,
            device_map=self.device if on_cuda else None,
        )
        
        try:
            self.model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)
            self.is_causal = True
        except Exception as causal_error:
            # Catch Exception (not a bare except) so Ctrl-C still interrupts,
            # and report why the causal load failed instead of hiding it.
            print(f'Causal LM load failed ({type(causal_error).__name__}: {causal_error}); '
                  f'retrying as seq2seq')
            self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name, **load_kwargs)
            self.is_causal = False
        
        # Without a device_map the model is still on its load device; move it
        # for every such target (not only 'cpu') so it matches the inputs.
        if not on_cuda:
            self.model = self.model.to(self.device)
        
        self.model.eval()
        
        # Fix tokenizer pad token
        if self.tokenizer.pad_token is None:
            if self.tokenizer.unk_token is not None:
                self.tokenizer.pad_token = self.tokenizer.unk_token
            else:
                self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
                self.model.resize_token_embeddings(len(self.tokenizer))
        
        # Context window: read from the config when it exposes one, else 1024
        if hasattr(self.model.config, 'max_position_embeddings'):
            self.model_max_length = self.model.config.max_position_embeddings
        elif hasattr(self.model.config, 'n_positions'):
            self.model_max_length = self.model.config.n_positions
        else:
            self.model_max_length = 1024
        
        print(f'✓ Model loaded (max length: {self.model_max_length})')
    
    def generate_response(self, prompt, max_new_tokens=512):
        """Generate a plaintext response from the model.

        Args:
            prompt: Text prompt; truncated so prompt plus generated tokens stay
                within ``model_max_length``.
            max_new_tokens: Upper bound on generated tokens.

        Returns:
            The decoded response, "No response generated" if it decodes to an
            empty string, or "Error: <message>" if generation raised.
        """
        # Keep the token budget positive even when max_new_tokens is close to
        # (or larger than) the model's context window.
        safe_input_length = max(1, self.model_max_length - max_new_tokens - 10)
        
        inputs = self.tokenizer(
            prompt, 
            return_tensors='pt', 
            truncation=True, 
            max_length=safe_input_length,
            padding=False
        )
        
        # Forward only the tensors generate() needs
        inputs = {k: v.to(self.device) for k, v in inputs.items() if k in ['input_ids', 'attention_mask']}
        input_length = inputs['input_ids'].shape[1]
        
        try:
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    min_new_tokens=20,
                    temperature=0.7,
                    do_sample=True,
                    top_p=0.9,
                    repetition_penalty=1.1,
                    pad_token_id=self.tokenizer.pad_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                )
            
            sequence = outputs[0]
            # Causal models echo the prompt at the start of the output, so it
            # is stripped; seq2seq output holds only generated tokens, so
            # slicing by the prompt length would discard the answer.
            generated_tokens = sequence[input_length:] if self.is_causal else sequence
            response = self.tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
            return response if response else "No response generated"
        except Exception as e:
            import traceback
            print(f"[ERROR] Generation failed: {str(e)}")
            print(traceback.format_exc())
            return f"Error: {str(e)}"

print('✓ ModelEvaluator class defined')

✓ ModelEvaluator class defined


In [7]:
class InstructorParser:
    """Extracts structured Pydantic objects from plaintext via Instructor and a parsing LLM."""
    
    def __init__(self, provider: str = "ollama/llama3.2"):
        """
        Create the Instructor client for the given provider.
        
        Args:
            provider: Instructor provider string, for example
                "ollama/llama3.2" or "ollama/mistral" (local), or
                "openai/gpt-4o-mini" (API, requires OPENAI_API_KEY).
        """
        self.provider = provider
        self.client = instructor.from_provider(provider)
        print(f'✓ InstructorParser initialized with provider: {provider}')
    
    def parse(self, text: str, response_model: type[BaseModel], context: str = "") -> BaseModel:
        """Ask the parsing LLM to fill ``response_model`` from ``text``.

        On any failure the error is printed and a placeholder result is
        returned for the three known result models (carrying the error text);
        for any other model the exception is re-raised.
        """
        system_prompt = (
            "You are a precise data extraction assistant. \n"
            "Extract the requested information from the given text.\n"
            "Only extract what is explicitly stated or clearly implied.\n"
            "If information is missing or unclear, use reasonable defaults."
        )
        user_prompt = (
            "Extract structured data from this text.\n"
            "\n"
            f"Context: {context}\n"
            "\n"
            "Text to parse:\n"
            f"{text}\n"
            "\n"
            "Extract the information according to the requested schema."
        )
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        
        try:
            return self.client.chat.completions.create(
                messages=messages,
                response_model=response_model,
            )
        except Exception as e:
            print(f"[PARSE ERROR] {str(e)}")
            # One placeholder builder per known result model, tried in order
            fallback_builders = (
                (RelevanceResult, lambda: RelevanceResult(
                    relevance="not_relevant", rationale=f"Parse error: {str(e)}")),
                (MechanismResult, lambda: MechanismResult(
                    mechanism_summary=f"Parse error: {str(e)}",
                    molecular_components=[],
                    direction_of_effect="unknown")),
                (QualityResult, lambda: QualityResult(
                    evidence_quality="insufficient", justification=f"Parse error: {str(e)}")),
            )
            for model_cls, build_fallback in fallback_builders:
                if response_model == model_cls:
                    return build_fallback()
            # Unknown model: no sensible placeholder, surface the original error
            raise

print('✓ InstructorParser class defined')

✓ InstructorParser class defined


## Load Model

**Recommended models:**
- `facebook/galactica-1.3b` - Best for Colab free tier
- `facebook/galactica-125m` - Fastest, may struggle
- `BioMistral/BioMistral-7B` - Best accuracy, needs Colab Pro

In [8]:
# =============================================================================
# Configuration
# =============================================================================

# Model to evaluate (generates plaintext responses)
MODEL_NAME = 'facebook/galactica-1.3b'

# Parser provider for structured extraction
# Options: "ollama/llama3.2", "ollama/mistral", "openai/gpt-4o-mini", etc.
PARSER_PROVIDER = "ollama/llama3.2"

# Seed PyTorch before the model is built: generation samples (do_sample=True)
# and a newly added pad-token embedding is randomly initialised, so without a
# seed a Restart & Run All gives different numbers each time.
# This does not control the separate parsing LLM.
SEED = 42
torch.manual_seed(SEED)

# Initialize evaluator and parser
evaluator = ModelEvaluator(MODEL_NAME)
parser = InstructorParser(PARSER_PROVIDER)

Loading model: facebook/galactica-1.3b
Device: cpu


tokenizer_config.json:   0%|          | 0.00/166 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/789 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


pytorch_model.bin:   0%|          | 0.00/2.63G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


✓ Model loaded (max length: 2048)
✓ InstructorParser initialized with provider: ollama/llama3.2


## Quick Test (2 items)

Run two relevance items first to verify that generation and parsing both work end to end before launching the full test sets.

In [None]:
# Prompt for the relevance task. It is filled with str.format(), which
# substitutes the {agent}, {pathway} and {abstract} placeholders; any literal
# brace added to this text would therefore have to be doubled.
# The relevance and stability loops in this notebook both format this template.
relevance_prompt_template = """
You are a biomedical expert specializing in molecular mechanisms.

Task:
Determine whether the abstract explains HOW {agent} affects {pathway} through a *direct or explicitly implied molecular mechanism*.

STRICT RELEVANCE CRITERIA:
An abstract is **RELEVANT** only if it contains:
- A *direct molecular interaction* involving {agent} and components of {pathway}  
  (e.g., binding, phosphorylation, gene expression changes, enzyme modulation, signaling events), **OR**
- An *explicit implication* of such a mechanism  
  (e.g., "{agent} increases activation of {pathway}-related kinases",  
  "{agent} suppresses transcription factors that regulate {pathway}").

An abstract is **NOT RELEVANT** if:
- {agent} and {pathway} are both mentioned but with **no mechanistic connection**.
- No molecular components or processes linking {agent} to {pathway} are described.
- Only outcomes, phenotypes, cell effects, or high-level associations are mentioned.
- Mechanistic detail is absent, vague, or unrelated to the specified pathway.

Hard Constraints:
- If the abstract does *not* mention specific molecules, genes, proteins, or signaling components connecting {agent} to {pathway}, output **Not Relevant**.
- If the effect is described only as a general phenomenon (e.g., “anti-inflammatory,” “cytotoxic,” “protective”) without molecular detail, output **Not Relevant**.

Abstract:
{abstract}

Output Requirements:
1. Clearly state **“Relevant”** or **“Not Relevant”**.
2. Provide a brief, clinical, text-only explanation describing *why* the classification was made, referencing the specific mechanistic elements (or the lack of them).
"""

# Smoke test: run the first two relevance items through generation + parsing.
quick_test = relevance_data[:2]
results = []

for item in tqdm(quick_test, desc='Quick Test'):
    prompt = relevance_prompt_template.format(
        agent=item['agent'],
        pathway=item['pathway'],
        abstract=item['abstract']
    )
    
    # Generate plaintext response.
    # perf_counter() is a monotonic clock intended for measuring intervals;
    # time.time() can jump when the system clock is adjusted.
    start = time.perf_counter()
    response = evaluator.generate_response(prompt)
    gen_time = time.perf_counter() - start
    
    # Parse with Instructor
    parse_start = time.perf_counter()
    parsed = parser.parse(
        response, 
        RelevanceResult,
        context=f"Assessing if abstract about {item['agent']} is relevant to {item['pathway']}"
    )
    parse_time = time.perf_counter() - parse_start
    
    predicted = parsed.relevance
    # Map the gold label onto the parser's vocabulary ("relevant" /
    # "not_relevant"). Lower-casing alone leaves a label written as
    # "Not Relevant" or "not-relevant" unable to match any prediction.
    expected = item['gold_label'].strip().lower().replace(' ', '_').replace('-', '_')
    
    results.append({
        'id': item['id'],
        'expected': expected,
        'predicted': predicted,
        'correct': predicted == expected,
        'gen_time': gen_time,
        'parse_time': parse_time
    })
    
    print(f"{item['id']}: {predicted} (expected: {expected}) - {'✓' if predicted == expected else '✗'}")
    print(f"  Gen: {gen_time:.1f}s | Parse: {parse_time:.1f}s")
    print(f"  Rationale: {parsed.rationale[:100]}...")

if results:
    accuracy = sum(r['correct'] for r in results) / len(results)
    print(f"\nQuick Test Accuracy: {accuracy:.1%}")
    print(f"Avg Gen Time: {sum(r['gen_time'] for r in results)/len(results):.1f}s")
    print(f"Avg Parse Time: {sum(r['parse_time'] for r in results)/len(results):.1f}s")
else:
    # An empty dataset would otherwise end in a ZeroDivisionError
    print("\nQuick Test: no items to evaluate")

## Test 1: Relevance Assessment

In [None]:
# Test 1: score every relevance item and collect one result row per item.
relevance_results = []

for item in tqdm(relevance_data, desc='Relevance Test'):
    prompt = relevance_prompt_template.format(
        agent=item['agent'],
        pathway=item['pathway'],
        abstract=item['abstract']
    )
    
    # perf_counter() is a monotonic clock intended for measuring intervals;
    # time.time() can jump when the system clock is adjusted.
    start = time.perf_counter()
    response = evaluator.generate_response(prompt)
    gen_time = time.perf_counter() - start
    
    parse_start = time.perf_counter()
    parsed = parser.parse(
        response, 
        RelevanceResult,
        context=f"Assessing if abstract about {item['agent']} is relevant to {item['pathway']}"
    )
    parse_time = time.perf_counter() - parse_start
    
    predicted = parsed.relevance
    # Map the gold label onto the parser's vocabulary ("relevant" /
    # "not_relevant"). Lower-casing alone leaves a label written as
    # "Not Relevant" or "not-relevant" unable to match any prediction.
    expected = item['gold_label'].strip().lower().replace(' ', '_').replace('-', '_')
    
    relevance_results.append({
        'id': item['id'],
        'agent': item['agent'],
        'pathway': item['pathway'],
        'expected': expected,
        'predicted': predicted,
        'correct': predicted == expected,
        'rationale': parsed.rationale,
        'gen_time': gen_time,
        'parse_time': parse_time
    })

df_relevance = pd.DataFrame(relevance_results)
display(df_relevance[['id', 'agent', 'pathway', 'expected', 'predicted', 'correct']])

# Fraction of items whose predicted label equals the gold label
relevance_accuracy = df_relevance['correct'].mean()
print(f"\nRelevance Accuracy: {relevance_accuracy:.1%}")
print(f"Correct: {df_relevance['correct'].sum()}/{len(df_relevance)}")
print(f"Avg Gen Time: {df_relevance['gen_time'].mean():.1f}s")
print(f"Avg Parse Time: {df_relevance['parse_time'].mean():.1f}s")

## Test 2: Mechanism Extraction

In [None]:
# Prompt for the mechanism-extraction task. It is filled with str.format(),
# which substitutes the {agent}, {pathway} and {abstract} placeholders; any
# literal brace added to this text would therefore have to be doubled.
mechanism_prompt_template = """
You are a biomedical expert specializing in mechanistic pathway analysis.

Task: Extract *only the mechanistic information explicitly stated in the abstract* describing how **{agent}** affects **{pathway}**.

Rules:
- Use only mechanisms directly reported in the abstract. Do NOT infer, generalize, or assume unstated biology.
- Mechanistic detail must be molecular (e.g., protein interactions, gene regulation, signaling events).
- You must clearly classify the pathway effect as either **activation** or **inhibition** based solely on explicit text.

Abstract:
{abstract}

Please provide the following in plain, clinical language:

1. **Mechanistic Summary (one sentence):**  
   How does {agent} affect {pathway}, based solely on explicit statements?

2. **Molecular Components Involved:**  
   List all proteins, genes, or molecules described as part of the mechanism linking {agent} to {pathway}.

3. **Direction of Effect:**  
   Does {agent} *activate* or *inhibit* the pathway?  
   (Choose exactly one based on explicit evidence in the abstract.)
"""

def _evaluate_mechanism_item(item):
    """Run one mechanism item through generation and parsing; return its result row."""
    prompt = mechanism_prompt_template.format(
        agent=item['agent'],
        pathway=item['pathway'],
        abstract=item['abstract'],
    )

    # Time the generation stage
    gen_started = time.time()
    response = evaluator.generate_response(prompt)
    gen_time = time.time() - gen_started

    # Time the parsing stage
    parse_started = time.time()
    parsed = parser.parse(
        response,
        MechanismResult,
        context=f"Extracting mechanism of {item['agent']} on {item['pathway']}",
    )
    parse_time = time.time() - parse_started

    predicted = parsed.direction_of_effect
    expected = item['gold_label'].lower()
    component_count = len(parsed.molecular_components)

    return {
        'id': item['id'],
        'expected': expected,
        'predicted': predicted,
        'correct': predicted == expected,
        'has_components': component_count > 0,
        'num_components': component_count,
        'summary': parsed.mechanism_summary,
        'gen_time': gen_time,
        'parse_time': parse_time,
    }


mechanism_results = []
for item in tqdm(mechanism_data, desc='Mechanism Test'):
    mechanism_results.append(_evaluate_mechanism_item(item))

df_mechanism = pd.DataFrame(mechanism_results)
display(df_mechanism[['id', 'expected', 'predicted', 'correct', 'num_components']])

# Direction accuracy, and the share of items with at least one extracted component
correct_flags = df_mechanism['correct']
mechanism_accuracy = correct_flags.mean()
component_rate = df_mechanism['has_components'].mean()
print(f"\nMechanism Direction Accuracy: {mechanism_accuracy:.1%}")
print(f"Correct: {correct_flags.sum()}/{len(df_mechanism)}")
print(f"Component Extraction Rate: {component_rate:.1%}")
print(f"Avg Components Extracted: {df_mechanism['num_components'].mean():.1f}")

## Test 3: Evidence Quality

In [None]:
# Prompt for the evidence-quality task. It is filled with str.format(), which
# substitutes the {agent}, {pathway} and {abstract} placeholders; any literal
# brace added to this text would therefore have to be doubled.
quality_prompt_template = """
You are a biomedical expert evaluating research quality.

Task: Assess the strength of evidence in this abstract about {agent} and {pathway}.

Evidence Quality Scale:
- STRONG: Multiple experimental approaches with rigorous controls AND clinical/in vivo validation
- MODERATE: Solid experimental support (multiple in vitro assays) but lacking clinical/in vivo validation
- WEAK: Minimal experimental data or limited assays
- INSUFFICIENT: Unclear methods, anecdotal claims, or very limited evidence

Abstract:
{abstract}

Please evaluate the evidence quality and explain your reasoning.
"""

# Quality labels ordered from weakest to strongest evidence
quality_levels = ['insufficient', 'weak', 'moderate', 'strong']

def get_quality_distance(predicted, expected):
    """Return how many scale steps separate two quality labels.

    Returns -1 when either label is not one of ``quality_levels``.
    """
    if predicted not in quality_levels or expected not in quality_levels:
        return -1
    return abs(quality_levels.index(predicted) - quality_levels.index(expected))

def _timed_call(func, *args, **kwargs):
    """Call ``func`` and return ``(result, elapsed_seconds)``."""
    began = time.time()
    outcome = func(*args, **kwargs)
    return outcome, time.time() - began


quality_results = []

for item in tqdm(quality_data, desc='Quality Test'):
    prompt = quality_prompt_template.format(
        agent=item['agent'],
        pathway=item['pathway'],
        abstract=item['abstract'],
    )

    # Stage 1: plaintext generation
    response, gen_time = _timed_call(evaluator.generate_response, prompt)

    # Stage 2: structured parsing
    parsed, parse_time = _timed_call(
        parser.parse,
        response,
        QualityResult,
        context=f"Evaluating evidence quality for {item['agent']} on {item['pathway']}",
    )

    predicted = parsed.evidence_quality
    expected = item['gold_label'].lower()

    quality_results.append({
        'id': item['id'],
        'expected': expected,
        'predicted': predicted,
        'correct': predicted == expected,
        'steps_away': get_quality_distance(predicted, expected),
        'justification': parsed.justification,
        'gen_time': gen_time,
        'parse_time': parse_time,
    })

df_quality = pd.DataFrame(quality_results)
display(df_quality[['id', 'expected', 'predicted', 'correct', 'steps_away']])

quality_accuracy = df_quality['correct'].mean()
# Rows with steps_away == -1 carry a label outside the scale; leave them out
# of the distance statistics
valid_predictions = df_quality[df_quality['steps_away'] >= 0]
if len(valid_predictions) > 0:
    within_one_step = (valid_predictions['steps_away'] <= 1).mean()
else:
    within_one_step = 0

print(f"\nQuality Accuracy: {quality_accuracy:.1%}")
print(f"Correct: {df_quality['correct'].sum()}/{len(df_quality)}")
print(f"Within 1 Step: {within_one_step:.1%}")
print(f"Avg Steps Away: {valid_predictions['steps_away'].mean():.2f}")

## Test 4: Parsing Stability

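Run each prompt through the full pipeline (generation, then parsing) twice and check whether both runs yield the same relevance label. Because generation uses sampling, a disagreement can come from either the generated text or the parser, so this measures end-to-end stability rather than parsing alone.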

In [None]:
def _relevance_label_once(prompt, context):
    """Generate one response for ``prompt`` and return the parsed RelevanceResult."""
    response = evaluator.generate_response(prompt)
    return parser.parse(response, RelevanceResult, context=context)


stability_results = []

for item in tqdm(stability_data, desc='Stability Test'):
    prompt = relevance_prompt_template.format(
        agent=item['agent'],
        pathway=item['pathway'],
        abstract=item['abstract'],
    )
    context = f"Assessing if abstract about {item['agent']} is relevant to {item['pathway']}"

    # Two independent generate-then-parse passes over the same prompt
    parsed1, parsed2 = (_relevance_label_once(prompt, context) for _ in range(2))
    first_label = parsed1.relevance
    second_label = parsed2.relevance

    stability_results.append({
        'id': item['id'],
        'stable': first_label == second_label,
        'run1_relevance': first_label,
        'run2_relevance': second_label,
    })

df_stability = pd.DataFrame(stability_results)
display(df_stability)

# Share of items whose two runs produced the same label
stable_flags = df_stability['stable']
stability_rate = stable_flags.mean()
print(f"\nParsing Stability Rate: {stability_rate:.1%}")
print(f"Stable: {stable_flags.sum()}/{len(df_stability)}")

## Summary and Visualization

In [None]:
# Headline numbers from the four tests.
# The two timing lines are computed from the relevance test only.
print(f"{'='*80}")
print("EVALUATION SUMMARY")
print(f"{'='*80}")
print(f"Evaluation Model:  {MODEL_NAME}")
print(f"Parser Provider:   {PARSER_PROVIDER}")
print(f"{'='*80}")
print(f"Test 1 - Relevance Accuracy:    {relevance_accuracy:.1%}")
print(f"Test 2 - Mechanism Accuracy:    {mechanism_accuracy:.1%}")
print(f"Test 3 - Evidence Quality:      {quality_accuracy:.1%}")
print(f"Test 4 - Parsing Stability:     {stability_rate:.1%}")
print(f"Avg Generation Time:            {df_relevance['gen_time'].mean():.1f}s per item")
print(f"Avg Parse Time:                 {df_relevance['parse_time'].mean():.1f}s per item")
print(f"{'='*80}")

# Visualization: per-test rates (left) and generation-time histogram (right)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

accuracies = {
    'Relevance': relevance_accuracy,
    'Mechanism': mechanism_accuracy,
    'Quality': quality_accuracy,
    'Stability': stability_rate
}
colors = ['#1f77b4', '#2ca02c', '#ff7f0e', '#9467bd']
axes[0].bar(accuracies.keys(), accuracies.values(), color=colors)
axes[0].set_ylabel('Accuracy / Rate')
axes[0].set_title('Performance by Test Type')
axes[0].set_ylim([0, 1])
# Dashed reference line at 0.5
axes[0].axhline(y=0.5, color='r', linestyle='--', alpha=0.5)

# Generation times pooled across the relevance, mechanism and quality tests
all_gen_times = (list(df_relevance['gen_time']) + list(df_mechanism['gen_time']) + 
                 list(df_quality['gen_time']))
axes[1].hist(all_gen_times, bins=15, color='#1f77b4', alpha=0.7, edgecolor='black')
axes[1].set_xlabel('Time (seconds)')
axes[1].set_ylabel('Frequency')
axes[1].set_title('Generation Time Distribution')

plt.tight_layout()
plt.show()

# One-row summary table (values stored as formatted strings)
summary_df = pd.DataFrame([{
    'Eval_Model': MODEL_NAME,
    'Parser': PARSER_PROVIDER,
    'Relevance_Accuracy': f"{relevance_accuracy:.1%}",
    'Mechanism_Accuracy': f"{mechanism_accuracy:.1%}",
    'Quality_Accuracy': f"{quality_accuracy:.1%}",
    'Parsing_Stability': f"{stability_rate:.1%}",
    'Avg_Gen_Time_s': f"{df_relevance['gen_time'].mean():.1f}",
    'Avg_Parse_Time_s': f"{df_relevance['parse_time'].mean():.1f}"
}])

display(summary_df)

# Download results (works on Colab).
# Only a missing google.colab module means "not on Colab"; a failure while
# writing or downloading the CSV now surfaces instead of being reported as
# "Not on Colab".
try:
    from google.colab import files
except ImportError:
    print('\n(Not on Colab - results not auto-downloaded)')
else:
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f"{MODEL_NAME.replace('/', '_')}_{timestamp}_results.csv"
    summary_df.to_csv(filename, index=False)
    files.download(filename)
    print(f'\n✓ Downloaded: {filename}')