In [1]:
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import Dataset
import re
from tqdm import tqdm

In [2]:
def evaluate_finetuned_model(model_path, test_data_path, num_samples=100):
    """
    Evaluate the fine-tuned model on test data
    """
    
    # Load model and tokenizer
    print("Loading model and tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    tokenizer.pad_token = tokenizer.eos_token
    
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        device_map="auto",
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
    )
    
    # Load test data
    print("Loading test data...")
    with open(test_data_path, 'r') as f:
        test_data = json.load(f)
    
    # Sample test cases for evaluation
    if len(test_data) > num_samples:
        test_samples = test_data[:num_samples]
    else:
        test_samples = test_data
    
    results = []
    exact_match_count = 0
    structure_match_count = 0
    
    print(f"Evaluating on {len(test_samples)} samples...")
    
    for i, sample in enumerate(tqdm(test_samples)):
        instruction = sample["instruction"]
        expected_response = sample["response"]
        
        # Create prompt
        prompt = f"Instruction: {instruction} \nResponse:\n"
        
        # Generate response
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=256)
        
        with torch.no_grad():
            outputs = model.generate(
                inputs.input_ids.to(model.device),
                max_new_tokens=256,
                temperature=0.3,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                num_return_sequences=1
            )
        
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        # Extract just the generated response part
        response_part = generated_text.split("Response:")[-1].strip()
        
        # Parse the generated response
        generated_json = parse_json_response(response_part)
        expected_json = expected_response
        
        # Calculate metrics
        exact_match = compare_exact_match(generated_json, expected_json)
        structure_match = compare_structure_match(generated_json, expected_json)
        key_overlap = calculate_key_overlap(generated_json, expected_json)
        
        if exact_match:
            exact_match_count += 1
        if structure_match:
            structure_match_count += 1
        
        results.append({
            "instruction": instruction,
            "expected": expected_json,
            "generated": generated_json,
            "exact_match": exact_match,
            "structure_match": structure_match,
            "key_overlap": key_overlap,
            "raw_generated": response_part
        })
    
    # Calculate overall metrics
    exact_match_accuracy = exact_match_count / len(test_samples)
    structure_match_accuracy = structure_match_count / len(test_samples)
    avg_key_overlap = sum(r["key_overlap"] for r in results) / len(test_samples)
    
    # Print evaluation results
    print("\n" + "="*50)
    print("EVALUATION RESULTS")
    print("="*50)
    print(f"Exact Match Accuracy: {exact_match_accuracy:.4f}")
    print(f"Structure Match Accuracy: {structure_match_accuracy:.4f}")
    print(f"Average Key Overlap: {avg_key_overlap:.4f}")
    print(f"Samples evaluated: {len(test_samples)}")
    
    # Save detailed results
    output_results = {
        "metrics": {
            "exact_match_accuracy": exact_match_accuracy,
            "structure_match_accuracy": structure_match_accuracy,
            "average_key_overlap": avg_key_overlap,
            "total_samples": len(test_samples)
        },
        "detailed_results": results
    }
    
    with open("evaluation_results.json", "w") as f:
        json.dump(output_results, f, indent=2)
    
    # Print some examples
    print("\n" + "="*50)
    print("SAMPLE PREDICTIONS")
    print("="*50)
    
    for i in range(min(3, len(results))):
        result = results[i]
        print(f"\nSample {i+1}:")
        print(f"Instruction: {result['instruction']}")
        print(f"Expected: {result['expected']}")
        print(f"Generated: {result['generated']}")
        print(f"Exact Match: {result['exact_match']}")
        print(f"Structure Match: {result['structure_match']}")
        print("-" * 40)
    
    return output_results

def parse_json_response(text):
    """
    Parse JSON from generated text, handling potential formatting issues
    """
    try:
        # Try to find JSON pattern in the text
        json_match = re.search(r'\{.*\}', text, re.DOTALL)
        if json_match:
            json_str = json_match.group()
            return json.loads(json_str)
        else:
            # If no JSON found, try to parse the entire text
            return json.loads(text)
    except json.JSONDecodeError:
        # Return empty dict if parsing fails
        return {}

def compare_exact_match(generated, expected):
    """
    Check if generated response exactly matches expected response
    """
    return generated == expected

def compare_structure_match(generated, expected):
    """
    Check if generated response has the same structure as expected
    """
    if not isinstance(generated, dict) or not isinstance(expected, dict):
        return False
    
    # Check if all expected keys are present
    expected_keys = set(expected.keys())
    generated_keys = set(generated.keys())
    
    return expected_keys.issubset(generated_keys)

def calculate_key_overlap(generated, expected):
    """
    Calculate the overlap of keys between generated and expected responses
    """
    if not isinstance(generated, dict) or not isinstance(expected, dict):
        return 0.0
    
    expected_keys = set(expected.keys())
    generated_keys = set(generated.keys())
    
    if not expected_keys:
        return 0.0
    
    intersection = expected_keys.intersection(generated_keys)
    return len(intersection) / len(expected_keys)

def evaluate_with_different_temperatures(model_path, test_data_path, temperatures=[0.1, 0.3, 0.7]):
    """
    Evaluate model performance with different temperature settings
    """
    results = {}
    
    for temp in temperatures:
        print(f"\nEvaluating with temperature {temp}...")
        
        # Load model and tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        tokenizer.pad_token = tokenizer.eos_token
        
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            device_map="auto",
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
        )
        
        # Load test data
        with open(test_data_path, 'r') as f:
            test_data = json.load(f)
        
        test_samples = test_data[:20]  # Use smaller subset for temperature testing
        
        exact_match_count = 0
        structure_match_count = 0
        
        for sample in tqdm(test_samples):
            instruction = sample["instruction"]
            expected_response = sample["response"]
            
            prompt = f"Instruction: {instruction} \nResponse:\n"
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=256)
            
            with torch.no_grad():
                outputs = model.generate(
                    inputs.input_ids.to(model.device),
                    max_new_tokens=256,
                    temperature=temp,
                    do_sample=True,
                    pad_token_id=tokenizer.eos_token_id,
                    num_return_sequences=1
                )
            
            generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
            response_part = generated_text.split("Response:")[-1].strip()
            generated_json = parse_json_response(response_part)
            
            if compare_exact_match(generated_json, expected_response):
                exact_match_count += 1
            if compare_structure_match(generated_json, expected_response):
                structure_match_count += 1
        
        exact_match_acc = exact_match_count / len(test_samples)
        structure_match_acc = structure_match_count / len(test_samples)
        
        results[temp] = {
            "exact_match_accuracy": exact_match_acc,
            "structure_match_accuracy": structure_match_acc
        }
        
        print(f"Temperature {temp}: Exact Match = {exact_match_acc:.4f}, Structure Match = {structure_match_acc:.4f}")
    
    # Save temperature results
    with open("temperature_evaluation.json", "w") as f:
        json.dump(results, f, indent=2)
    
    return results

In [3]:
# Paths to your model and test data
model_path = "./tinyllama-finetuned-detailed" 
test_data_path = r"C:\Users\T14 gen2\Documents\SementicSearch\test_data.json"

In [4]:
evaluation_results = evaluate_finetuned_model(
        model_path=model_path,
        test_data_path=test_data_path,
        num_samples=100  # Adjust based on your needs
    )

Loading model and tokenizer...


`torch_dtype` is deprecated! Use `dtype` instead!


Loading test data...
Evaluating on 100 samples...


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
100%|██████████| 100/100 [50:31<00:00, 30.32s/it]


EVALUATION RESULTS
Exact Match Accuracy: 0.3500
Structure Match Accuracy: 0.4500
Average Key Overlap: 0.4500
Samples evaluated: 100

SAMPLE PREDICTIONS

Sample 1:
Instruction: Show the top 5 employees where salary greater than 37
Expected: {'entity': 'employees', 'conditions': [{'field': 'salary', 'operator': '>', 'value': 37}], 'limit': 5, 'order': 'desc', 'order_by': 'salary'}
Generated: {'entity': 'employees', 'conditions': [{'field': 'salary', 'operator': '>', 'value': 37}], 'limit': 5, 'order': 'desc', 'order_by': 'salary'}
Exact Match: True
Structure Match: True
----------------------------------------

Sample 2:
Instruction: Show the  customers where city is Mumbai and total_spent greater than 19 and age below 54
Expected: {'entity': 'customers', 'conditions': [{'field': 'city', 'operator': '=', 'value': 'Mumbai'}, {'field': 'total_spent', 'operator': '>', 'value': 19}, {'field': 'age', 'operator': '<', 'value': 54}]}
Generated: {'entity': 'customers', 'conditions': [{'field': 


