In [20]:
import os
import json
import statistics
import re
from collections import Counter
from openai import OpenAI, RateLimitError, APIError
import time
import torch
import gc

import pandas as pd
import numpy as np

from load_llm_util import load_base_model_for_inference, load_fft_model_for_inference, load_lora_model_for_inference, load_qlora_model_for_inference

### Configuration

In [3]:

# Load the API key from the environment so it never lives in the notebook.
# `.get` is used instead of indexing: `os.environ[...]` raises KeyError when the
# variable is missing, which made the friendly message below unreachable.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

if not OPENAI_API_KEY:
    print("Error: OPENAI_API_KEY environment variable not set.")

JUDGE_MODEL = "gpt-4o"  # Model name sent to the OpenAI chat completions endpoint
MAX_RETRIES = 3         # Attempts per judge request before giving up
RETRY_DELAY = 5         # Seconds to wait between judge attempts

# Default system prompt prepended to conversations that do not supply their own.
# NOTE(review): the text (including its spelling) is kept verbatim because it is
# sent to the models at runtime - presumably it mirrors the fine-tuning prompt; confirm
# before editing it.
SYSTEM_PROMPT = """You are a friendly college student chatbot. Respond naturally in Hinglish (mix of Romanized Hindi and English). Keep the conversation casual and engaging. Avoid being overly formal. Respond with followup questions to keep coversation engaging"""


In [4]:
# ─────────────────────────────────────────────────────────────────────────────
# EVALUATION PROMPTS
# ─────────────────────────────────────────────────────────────────────────────
# System message for the judge model. It defines seven metrics (two primary,
# five secondary), each scored on a 1-5 scale, and the exact JSON keys the judge
# must return: one "<metric>_score" and one "<metric>_justification" per metric.
JUDGE_SYSTEM_PROMPT = """
You are an expert evaluator assessing chatbot responses for a Hinglish‑speaking setting.

**Primary objectives (carry the most weight):**
1. **Hindi Usage** – Response should be *Hindi‑heavy Hinglish*:  
   • At least 40‑60 % of the tokens should be Roman‑script Hindi words / phrases.  
   • Pure English should be used sparingly and mostly for modern slang or technical terms.  
   • Devanagari is acceptable only for short quotes or special emphasis.  
2. **Gender Correctness** – Any gendered address (pronouns, verb forms, honorifics) **must match the user’s stated or obvious gender**.  
   • If gender is not explicit, default to gender‑neutral Hinglish (“yaar”, “friend”, etc.).  
   • Mis‑gendering or inconsistent switching is a serious error.

**Secondary objectives (still score, but lower weight):**
* **Hinglish Fluency** – Natural code‑switching, idiomatic phrasing.
* **Persona Adherence** – Friendly college student vibe: informal, relatable, campus life references.
* **Coherence** – Logical, on‑topic, internally consistent.
* **Engagingness** – Keeps the conversation lively / interesting.
* **Language Constraint** – Avoids pure English or pure Hindi blocks unless context demands.

Use a 1‑5 Likert scale (1 = Poor, 5 = Excellent) for each metric.  
Provide a concise justification (1‑2 sentences) for every score.

Output **only** a valid JSON object with exactly these keys:

- "hindi_usage_score": int (1‑5)
- "hindi_usage_justification": str
- "gender_correctness_score": int (1‑5)
- "gender_correctness_justification": str
- "hinglish_fluency_score": int (1‑5)
- "hinglish_fluency_justification": str
- "persona_adherence_score": int (1‑5)
- "persona_adherence_justification": str
- "coherence_score": int (1‑5)
- "coherence_justification": str
- "engagingness_score": int (1‑5)
- "engagingness_justification": str
- "language_constraint_score": int (1‑5)
- "language_constraint_justification": str
"""

# Per-sample user message for the judge. The `{user_prompt}` and
# `{chatbot_response}` placeholders are filled with `str.format`.
JUDGE_USER_PROMPT_TEMPLATE = """
User Prompt:
"{user_prompt}"

Chatbot Response:
"{chatbot_response}"

Evaluate the Chatbot Response based on the criteria outlined in the system prompt.  
Output **only** the JSON object.
"""


### Judge LLM - OpenAI GPT-4o

In [None]:
def get_llm_evaluation(client, user_prompt, chatbot_response):
    """
    Ask the judge model to score one chatbot response.

    The request is attempted up to MAX_RETRIES times with RETRY_DELAY seconds
    between attempts. An attempt fails when the API call raises, the reply is
    not a JSON object, or any of the seven metric scores requested in
    JUDGE_SYSTEM_PROMPT is missing from the reply.

    Args:
        client: OpenAI client exposing `chat.completions.create`.
        user_prompt: The prompt that was given to the evaluated chatbot.
        chatbot_response: The chatbot's reply to be judged.

    Returns:
        The parsed JSON object (dict) returned by the judge, or None if no
        attempt produced a complete evaluation.
    """
    judge_user_prompt = JUDGE_USER_PROMPT_TEMPLATE.format(
        user_prompt=user_prompt,
        chatbot_response=chatbot_response
    )

    # Every score the judge prompt asks for. The two primary objectives
    # (Hindi usage, gender correctness) must be validated as well, otherwise a
    # reply lacking them is accepted and silently drops out of the aggregates.
    required_keys = (
        "hindi_usage_score", "gender_correctness_score",
        "hinglish_fluency_score", "persona_adherence_score",
        "coherence_score", "engagingness_score", "language_constraint_score",
    )

    for attempt in range(MAX_RETRIES):
        content = None  # Keeps the JSON error handler safe even if the assignment order changes
        try:
            response = client.chat.completions.create(
                model=JUDGE_MODEL,
                messages=[
                    {"role": "system", "content": JUDGE_SYSTEM_PROMPT},
                    {"role": "user", "content": judge_user_prompt}
                ],
                temperature=0.1, # Low temperature for consistent evaluation
                response_format={"type": "json_object"} # Request JSON output directly
            )
            content = response.choices[0].message.content
            eval_data = json.loads(content)

            if not isinstance(eval_data, dict):
                print(f"Warning: LLM judge response is not a JSON object. Response: {content}")
            else:
                missing_keys = [key for key in required_keys if key not in eval_data]
                if not missing_keys:
                    return eval_data
                print(f"Warning: LLM judge response missing required keys {missing_keys}. Response: {content}")
            # An incomplete reply falls through to the retry logic below.

        except json.JSONDecodeError as e:
            print(f"Error decoding JSON from LLM judge (Attempt {attempt + 1}/{MAX_RETRIES}): {e}")
            print(f"LLM raw response: {content}")
        except (RateLimitError, APIError) as e:
            print(f"OpenAI API error (Attempt {attempt + 1}/{MAX_RETRIES}): {e}")
        except Exception as e:
            print(f"An unexpected error occurred during LLM evaluation (Attempt {attempt + 1}/{MAX_RETRIES}): {e}")

        if attempt < MAX_RETRIES - 1:
            print(f"Retrying in {RETRY_DELAY} seconds...")
            time.sleep(RETRY_DELAY)

    print("Error: Max retries reached for LLM evaluation.")
    return None

### Quantitative Metrics

In [None]:
def calculate_response_length(response):
    """Return how many whitespace-separated words the response contains."""
    word_list = response.split()
    return len(word_list)

def calculate_repetition_rate(response: str, n: int = 3) -> float:
    """
    Measure how repetitive a response is at the n-gram level.

    The text is lower-cased and split on whitespace. Every window of `n`
    consecutive words forms one n-gram. The result is the number of *distinct*
    n-grams that occur more than once, divided by the total number of n-gram
    windows.

    Args:
        response (str): The text response.
        n (int): Window size in words (3 means trigrams).
    Returns:
        float: The repetition ratio; 0.0 when the response has fewer than `n` words.
    """
    tokens = response.lower().split()
    if n > len(tokens):
        return 0.0

    window_total = len(tokens) - n + 1
    tally = Counter(
        ' '.join(tokens[start:start + n]) for start in range(window_total)
    )

    occurrences = sum(tally.values())  # equals the number of windows generated
    if occurrences == 0:
        return 0.0

    duplicated = [gram for gram, seen in tally.items() if seen > 1]
    return len(duplicated) / occurrences

def calculate_basic_code_switching(response: str, hindi_keywords: set, english_keywords: set) -> int:
    """
    Rough count of Hindi/English language switches in a response.

    Each word (lower-cased, matched with a word-boundary regex) is labelled
    "hindi" or "english" by keyword-set membership, with the Hindi set checked
    first. Words found in neither set are ignored. A switch is counted whenever
    two consecutive labelled words carry different labels.
    NOTE: This is a keyword heuristic and is highly approximate for Romanized text.
    """
    tokens = re.findall(r'\b\w+\b', response.lower())
    if len(tokens) < 2:
        return 0

    def classify(token):
        # Hindi membership wins when a word appears in both keyword sets.
        if token in hindi_keywords:
            return "hindi"
        if token in english_keywords:
            return "english"
        return None

    labels = [label for label in map(classify, tokens) if label is not None]
    return sum(1 for previous, current in zip(labels, labels[1:]) if previous != current)

In [None]:
# --- Helper Functions for Model Loading & Inference ---

def generate_conversational_response(
    model,
    tokenizer,
    chat_history,
    max_new_tokens=50,
    temperature=0.7,
    top_p=0.9,
    top_k=50,
):
    """
    Generates a sampled response from the model based on the chat history.
    Does not modify the input chat_history.

    Args:
        model: Model exposing `.eval()`, `.device` and `.generate(**kwargs)`.
        tokenizer: Tokenizer exposing `apply_chat_template`, `decode` and
            `eos_token_id`.
        chat_history: List of {"role": ..., "content": ...} message dicts.
            SYSTEM_PROMPT is prepended unless the first message already has
            the "system" role (this now also covers an empty history).
        max_new_tokens: Upper bound on generated tokens. The default of 50 is
            the previously hard-coded value; raise it if replies get cut off.
        temperature: Sampling temperature passed to `generate`.
        top_p: Nucleus sampling threshold passed to `generate`.
        top_k: Top-k sampling cutoff passed to `generate`.

    Returns:
        The decoded reply with surrounding whitespace stripped. If anything in
        tokenization, generation or decoding raises, the error is printed and a
        fixed Hinglish apology string is returned instead.
    """
    model.eval() # Set model to evaluation mode
    device = model.device

    # Work on a copy so the caller's list is never mutated.
    formatted_history = []
    if not chat_history or chat_history[0].get("role") != "system":
        # Prepend the default system prompt if not present
        formatted_history.append({"role": "system", "content": SYSTEM_PROMPT})
    formatted_history.extend(chat_history or [])

    try:
        input_ids = tokenizer.apply_chat_template(
            formatted_history,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(device)

        # A single unpadded sequence: every position is a real token.
        attention_mask = torch.ones_like(input_ids)

        generation_kwargs = dict(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

        with torch.no_grad():
            outputs = model.generate(**generation_kwargs)

        # Slice off the prompt tokens so that only the newly generated part is decoded.
        response_ids = outputs[0][input_ids.shape[-1]:]
        response = tokenizer.decode(response_ids, skip_special_tokens=True).strip()

    except Exception as e:
        print(f"Error during generation: {e}")
        response = "Sorry yaar, kuch gadbad ho gayi generation mein. Dobara try karega?" # Default error response

    return response

### Evaluate

In [None]:
def evaluate_model(
    base_model_id: str,              # Base model ID
    adapter_path: str | None,        # Path to the saved LoRA/QLoRA adapter directory
    eval_prompts: list[str],         # Evaluation input prompts list
    model_name: str,                 # Name for this specific evaluated model/adapter
    fine_tune_type: str | None       # Type: base | LoRA | QLoRA | FFT (full fine-tune)
    ) -> dict:
    """
    Loads one model variant and evaluates it using the LLM judge and quantitative metrics.

    Args:
        base_model_id: Identifier for the base model on Hugging Face Hub or local path.
        adapter_path: Path to the directory containing the trained adapter files
            (used for the LoRA and QLoRA loaders only).
        eval_prompts: A list of user prompts for evaluation.
        model_name: A descriptive name for the model being evaluated (e.g., "Qwen1.5B_Hinglish_v1").
        fine_tune_type: Selects the loader, matched case-insensitively:
            "LoRA", "QLoRA" or "base". Any other value (including None) uses
            the full fine-tune loader; unrecognized values print a warning first.

    Returns:
        A dictionary with "model_name", "per_prompt_details" and
        "aggregated_metrics" (plus "base_model_id"/"adapter_path" on success,
        or "error" when the model could not be loaded).

    Raises:
        RuntimeError: If OPENAI_API_KEY is not configured.
    """
    print(f"\n--- Starting Evaluation for Model: {model_name} ---")
    print(f"--- Base: {base_model_id}, Adapter: {adapter_path} ---")

    if not OPENAI_API_KEY:
        # `raise "<str>"` is itself a TypeError in Python 3; raise a real exception.
        raise RuntimeError("Error: OpenAI API Key not configured. Cannot perform LLM evaluation.")

    client = OpenAI(api_key=OPENAI_API_KEY)

    # Normalize so "lora"/"LoRA"/" LORA " all select the same loader instead of
    # silently falling through to the full fine-tune loader.
    loader_kind = (fine_tune_type or "").strip().lower()

    # Load Model and Tokenizer
    try:
        if loader_kind == "lora":
            model, tokenizer = load_lora_model_for_inference(base_model_id, adapter_path, device='cuda')

        elif loader_kind == "qlora":
            model, tokenizer = load_qlora_model_for_inference(base_model_id, adapter_path, device='cuda')

        elif loader_kind == "base":
            model, tokenizer = load_base_model_for_inference(base_model_id, device='cuda')

        else:
            if loader_kind not in ("", "fft", "full", "full fine tune", "full fine-tune"):
                print(f"  Warning: Unrecognized fine_tune_type '{fine_tune_type}'. Loading as a full fine-tuned model.")
            model, tokenizer = load_fft_model_for_inference(base_model_id, device='cuda')
    except Exception as e:
        print(f"Failed to load model {model_name}. Aborting evaluation for this model.")
        print(e)
        return {
            "model_name": model_name,
            "error": f"Model loading failed: {e}",
            "per_prompt_details": [],
            "aggregated_metrics": {"status": "Model Loading Failed"}
        }

    # --- Evaluation Loop ---
    all_results = {
        "model_name": model_name,
        "base_model_id": base_model_id,
        "adapter_path": adapter_path,
        "per_prompt_details": [],
        "aggregated_metrics": {}
    }

    for i, prompt in enumerate(eval_prompts):
        print(f"  Processing prompt {i+1}/{len(eval_prompts)} for {model_name}...")
        try:
            # 1. Prepare Chat History (single-turn: one user message per prompt)
            chat_history = [{"role": "user", "content": prompt}]

            # 2. Get Model Response
            response_text = generate_conversational_response(model, tokenizer, chat_history)
            if not response_text or not isinstance(response_text, str):
                print(f"  Warning: Invalid response received for prompt {i+1}. Using empty string.")
                response_text = ""

            # 3. Calculate Quantitative Metrics
            length = calculate_response_length(response_text)
            repetition = calculate_repetition_rate(response_text)

            # 4. Get LLM Judge Evaluation (None when every retry failed)
            llm_scores = get_llm_evaluation(client, prompt, response_text)

            prompt_result = {
                "prompt": prompt,
                "response": response_text,
                "length": length,
                "repetition_rate_3gram": repetition,
                "llm_evaluation": llm_scores if llm_scores else "Evaluation Failed or Skipped"
            }
            all_results["per_prompt_details"].append(prompt_result)

        except Exception as e:
            # Keep going: one bad prompt should not discard the whole model run.
            print(f"  Error processing prompt {i+1} for {model_name}: {e}")
            all_results["per_prompt_details"].append({
                "prompt": prompt,
                "response": f"Error during generation or processing: {e}",
                "length": 0,
                "repetition_rate_3gram": 0.0,
                "llm_evaluation": "Processing Error"
            })

    # --- Aggregation ---
    valid_llm_evals = [
        res["llm_evaluation"] for res in all_results["per_prompt_details"]
        if isinstance(res["llm_evaluation"], dict) # Only aggregate successful LLM evals
    ]
    num_successful_evals = len(valid_llm_evals)
    num_total_prompts = len(eval_prompts)

    aggregated = {
        "status": "Completed",
        "total_prompts": num_total_prompts,
        "successful_llm_evaluations": num_successful_evals,
        "failed_or_skipped_llm_evaluations": num_total_prompts - num_successful_evals,
    }

    # Aggregate LLM Scores (if any successful evaluations).
    # The metric list is taken from the "*_score" keys of the first successful evaluation.
    if num_successful_evals > 0:
        for key in valid_llm_evals[0].keys():
            if key.endswith("_score"):
                metric_name = key.replace("_score", "")
                # Ensure score is int/float before aggregating
                scores = [eval_data[key] for eval_data in valid_llm_evals if isinstance(eval_data.get(key), (int, float))]
                if scores:
                    try:
                        aggregated[f"avg_{metric_name}_score"] = statistics.mean(scores)
                        # stdev needs at least two data points
                        aggregated[f"stdev_{metric_name}_score"] = statistics.stdev(scores) if len(scores) > 1 else 0.0
                        aggregated[f"median_{metric_name}_score"] = statistics.median(scores)
                    except statistics.StatisticsError as stat_err:
                        print(f"Warning: Statistics error for {metric_name}: {stat_err}")
                        aggregated[f"avg_{metric_name}_score"] = None
                else:
                    aggregated[f"avg_{metric_name}_score"] = None

    # Aggregate Quantitative Metrics.
    # Note: prompts that hit the error path above contribute length 0 / repetition 0.0.
    lengths = [res["length"] for res in all_results["per_prompt_details"] if isinstance(res.get("length"), (int, float))]
    repetitions = [res["repetition_rate_3gram"] for res in all_results["per_prompt_details"] if isinstance(res.get("repetition_rate_3gram"), (int, float))]

    aggregated["avg_response_length"] = statistics.mean(lengths) if lengths else 0
    aggregated["avg_repetition_rate_3gram"] = statistics.mean(repetitions) if repetitions else 0

    all_results["aggregated_metrics"] = aggregated

    # --- Cleanup ---
    # Drop the references and release cached GPU memory before the next model is loaded.
    print(f"  Cleaning up resources for {model_name}...")
    del model
    del tokenizer
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    print(f"--- Evaluation Complete for Model: {model_name} ---")

    return all_results

### Eval LLMs

In [15]:
# Define your evaluation prompts
# Each prompt is sent to every model as a single-turn user message.
evaluation_prompts = [
    "Hey kaise ho? College mein kya chal raha hai aajkal?",
    "Yaar assignments ka bohot tension hai. Kuch tips?",
    "Suggest some cool places to hangout near campus.",
    "What did you think of the canteen food today?",
    "Exams aa rahe hain, dar lag raha hai bhai.",
    "Kal ka lecture attend kiya ya bunk maar diya?",
    "Hostel ki light firse chali gayi kya?",
    "Tu fest ke liye audition de raha hai kya?",
    "Kya placement ka kuch update mila?",
    "Online classes bore kar rahe hain, koi escape idea?"
]

# === Define Models to Evaluate ===
# One dict per model; the keys mirror the keyword arguments of evaluate_model.
# "adapter_path" is only set for the adapter-based (LoRA / QLoRA) entries.
# Note: the FFT entry points at a 0.5B checkpoint while the others use the 3B model.
models_to_evaluate = [
    {
        "model_name": "Qwen2.5-3B_Hinglish_base",
        "base_model_id": "Qwen/Qwen2.5-3B-Instruct",
        "adapter_path": None,
        "fine_tune_type" : "base"
    },
    {
        "model_name": "Qwen2.5-0.5B_Hinglish_FFT",
        "base_model_id": "./Qwen2.5-0.5B-Instruct_hinglish_finetune/Qwen2.5-0.5B-Instruct/full_finetune",
        "adapter_path": None,
        "fine_tune_type": "FFT"
    },
    {
        "model_name": "Qwen2.5-3B_Hinglish_LoRA",
        "base_model_id": "Qwen/Qwen2.5-3B-Instruct",
        "adapter_path": "./Qwen2.5-3B-Instruct_hinglish_finetune/Qwen2.5-3B-Instruct/lora_finetune",
        "fine_tune_type": "LoRA"
    },
    {
        "model_name": "Qwen2.5-3B_Hinglish_QLoRA",
        "base_model_id": "Qwen/Qwen2.5-3B-Instruct",
        "adapter_path": "./Qwen2.5-3B-Instruct_hinglish_finetune/Qwen2.5-3B-Instruct/qlora_finetune",
        "fine_tune_type": "QLoRA"
    }
]

# === Run Evaluation Loop ===
# Results are keyed by model name; each value is the dict returned by evaluate_model.
all_evaluation_results = {}

for model_info in models_to_evaluate:

    # Models are evaluated one after another; evaluate_model frees its model before returning.
    results = evaluate_model(
        base_model_id=model_info["base_model_id"],
        adapter_path=model_info["adapter_path"],
        eval_prompts=evaluation_prompts,
        model_name=model_info["model_name"],
        fine_tune_type=model_info["fine_tune_type"]
    )
    all_evaluation_results[model_info["model_name"]] = results


# --- Print Summary of Results ---
print("\n\n" + "="*60)
print("          Evaluation Summary")
print("="*60)

for model_name, results in all_evaluation_results.items():
    print(f"\nModel: {model_name}")
    aggregated_metrics = results.get("aggregated_metrics", {})
    if "error" in results:
        # evaluate_model records the failure status inside "aggregated_metrics";
        # there is no top-level "status" key, so look it up where it is stored.
        print(f"  Status: Error - {aggregated_metrics.get('status', 'Unknown Error')}")
        print(f"  Details: {results['error']}")
    elif "aggregated_metrics" in results:
        print(f"  Status: {aggregated_metrics.get('status', 'Unknown')}")
        print(f"  Aggregated Metrics:")
        for key, value in aggregated_metrics.items():
            if key == "status":  # Don't print status twice
                continue
            # Format floats for readability
            if isinstance(value, float):
                print(f"    {key}: {value:.3f}")
            else:
                print(f"    {key}: {value}")
    else:
        print("  Status: Unknown - No aggregated metrics found.")

print("\n" + "="*60)

# Dump full `all_evaluation_results` dictionary to a JSON file
# (explicit encoding so the file is written identically on every platform).
with open("evaluation_results.json", "w", encoding="utf-8") as f:
    json.dump(all_evaluation_results, f, indent=2)
print("Full evaluation results saved to evaluation_results.json")


--- Starting Evaluation for Model: Qwen2.5-3B_Hinglish_base ---
--- Base: Qwen/Qwen2.5-3B-Instruct, Adapter: None ---

Loading Full Fine-Tuned model...


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 52.63it/s]


Model loaded successfully!
  Processing prompt 1/10 for Qwen2.5-3B_Hinglish_base...
  Processing prompt 2/10 for Qwen2.5-3B_Hinglish_base...
  Processing prompt 3/10 for Qwen2.5-3B_Hinglish_base...
  Processing prompt 4/10 for Qwen2.5-3B_Hinglish_base...
  Processing prompt 5/10 for Qwen2.5-3B_Hinglish_base...
  Processing prompt 6/10 for Qwen2.5-3B_Hinglish_base...
  Processing prompt 7/10 for Qwen2.5-3B_Hinglish_base...
  Processing prompt 8/10 for Qwen2.5-3B_Hinglish_base...
  Processing prompt 9/10 for Qwen2.5-3B_Hinglish_base...
  Processing prompt 10/10 for Qwen2.5-3B_Hinglish_base...
  Cleaning up resources for Qwen2.5-3B_Hinglish_base...
--- Evaluation Complete for Model: Qwen2.5-3B_Hinglish_base ---

--- Starting Evaluation for Model: Qwen2.5-0.5B_Hinglish_FFT ---
--- Base: ./Qwen2.5-0.5B-Instruct_hinglish_finetune/Qwen2.5-0.5B-Instruct/full_finetune, Adapter: None ---

Loading Full Fine-Tuned model...
Model loaded successfully!
  Processing prompt 1/10 for Qwen2.5-0.5B_Hingli

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 55.56it/s]


Loading adapters from './Qwen2.5-3B-Instruct_hinglish_finetune/Qwen2.5-3B-Instruct/lora_finetune'...
Model loaded successfully!
  Processing prompt 1/10 for Qwen2.5-3B_Hinglish_LoRA...
  Processing prompt 2/10 for Qwen2.5-3B_Hinglish_LoRA...
  Processing prompt 3/10 for Qwen2.5-3B_Hinglish_LoRA...
  Processing prompt 4/10 for Qwen2.5-3B_Hinglish_LoRA...
  Processing prompt 5/10 for Qwen2.5-3B_Hinglish_LoRA...
  Processing prompt 6/10 for Qwen2.5-3B_Hinglish_LoRA...
  Processing prompt 7/10 for Qwen2.5-3B_Hinglish_LoRA...
  Processing prompt 8/10 for Qwen2.5-3B_Hinglish_LoRA...
  Processing prompt 9/10 for Qwen2.5-3B_Hinglish_LoRA...
  Processing prompt 10/10 for Qwen2.5-3B_Hinglish_LoRA...
  Cleaning up resources for Qwen2.5-3B_Hinglish_LoRA...
--- Evaluation Complete for Model: Qwen2.5-3B_Hinglish_LoRA ---

--- Starting Evaluation for Model: Qwen2.5-3B_Hinglish_QLoRA ---
--- Base: Qwen/Qwen2.5-3B-Instruct, Adapter: ./Qwen2.5-3B-Instruct_hinglish_finetune/Qwen2.5-3B-Instruct/qlora_fine

Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.55s/it]


Loading adapters from './Qwen2.5-3B-Instruct_hinglish_finetune/Qwen2.5-3B-Instruct/qlora_finetune'...
Model loaded successfully!
  Processing prompt 1/10 for Qwen2.5-3B_Hinglish_QLoRA...
  Processing prompt 2/10 for Qwen2.5-3B_Hinglish_QLoRA...
  Processing prompt 3/10 for Qwen2.5-3B_Hinglish_QLoRA...
  Processing prompt 4/10 for Qwen2.5-3B_Hinglish_QLoRA...
  Processing prompt 5/10 for Qwen2.5-3B_Hinglish_QLoRA...
  Processing prompt 6/10 for Qwen2.5-3B_Hinglish_QLoRA...
  Processing prompt 7/10 for Qwen2.5-3B_Hinglish_QLoRA...
  Processing prompt 8/10 for Qwen2.5-3B_Hinglish_QLoRA...
  Processing prompt 9/10 for Qwen2.5-3B_Hinglish_QLoRA...
  Processing prompt 10/10 for Qwen2.5-3B_Hinglish_QLoRA...
  Cleaning up resources for Qwen2.5-3B_Hinglish_QLoRA...
--- Evaluation Complete for Model: Qwen2.5-3B_Hinglish_QLoRA ---


          Evaluation Summary

Model: Qwen2.5-3B_Hinglish_base
  Status: Completed
  Aggregated Metrics:
    total_prompts: 10
    successful_llm_evaluations: 10
    f

In [None]:
def create_comparison_table(evaluation_results):
    """
    Creates a comparison table from the evaluation results dictionary,
    including percentage improvement over the base model.

    Args:
        evaluation_results: The dictionary containing results for multiple models.

    Returns:
        A pandas DataFrame containing the comparison table, ready for display
        in Jupyter, or None if no valid results are found.
    """
    table_data = []

    base_metric_keys = [
        "avg_hinglish_fluency_score",
        "avg_persona_adherence_score",
        "avg_coherence_score",
        "avg_engagingness_score",
        "avg_language_constraint_score",
        "avg_gender_correctness_score",
        "avg_hindi_usage_score",
        "avg_response_length",
        "avg_repetition_rate_3gram",
        "successful_llm_evaluations",
        "total_prompts",
    ]

    column_names = {
        "model_name": "Model",
        "eval_type": "Type",
        "avg_hinglish_fluency_score": "Avg Hinglish Fluency",
        "avg_persona_adherence_score": "Avg Persona Adherence",
        "avg_gender_correctness_score": "Avg Gender Correctness",
        "avg_hindi_usage_score": "Avg Hindi Usage",
        "avg_coherence_score": "Avg Coherence",
        "avg_engagingness_score": "Avg Engagingness",
        "avg_language_constraint_score": "Avg Lang Constraint",
        "avg_response_length": "Avg Length (words)",
        "avg_repetition_rate_3gram": "Avg Repetition (3-gram)",
        "successful_llm_evaluations": "Successful Evals",
        "total_prompts": "Total Prompts",
    }

    # --- 1. Extract Base Data and Identify Base Model ---
    base_model_metrics = all_evaluation_results["Qwen2.5-3B_Hinglish_base"]['aggregated_metrics']
    base_model_name = "Qwen2.5-3B_Hinglish_base"
    processed_results = []

    for model_name, result in evaluation_results.items():
        # Standardize result structure
        processed_row = {
            "model_name": result.get("model_name", model_name),
            "eval_type": result.get("eval_type", "N/A")
        }
        metrics = result.get("aggregated_metrics", {})
        status = metrics.get("status", "Unknown")
        processed_row["status"] = status

        if status == "Completed":
            for key in base_metric_keys:
                processed_row[key] = metrics.get(key)
            processed_results.append(processed_row)

            if processed_row["eval_type"] == "Base Model":
                if base_model_metrics is not None:
                    print("Warning: Multiple base models found in results. Using the first one.")
                else:
                    base_model_metrics = processed_row
                    base_model_name = processed_row["model_name"]
        else:
            processed_row.update({key: "Failed" for key in base_metric_keys})
            processed_results.append(processed_row)

    if not processed_results:
        print("No evaluation results found to create a table.")
        return None

    df = pd.DataFrame(processed_results)
    df_completed = df[df['status'] == 'Completed'].copy()

    # --- 2. Calculate Percentage Improvement ---
    if base_model_metrics is None:
        print("Warning: Base model results not found or failed. Cannot calculate % improvement.")
    else:
        print(f"Calculating % improvement relative to base model: '{base_model_name}'")
        # Metrics where higher score is better
        metrics_higher_better = {
            "avg_hinglish_fluency_score": "Hinglish Fluency % Imp",
            "avg_persona_adherence_score": "Persona Adherence % Imp",
            "avg_gender_correctness_score": "Gender Correctness % Imp",
            "avg_hindi_usage_score": "Hindi Usage % Imp",
            "avg_coherence_score": "Coherence % Imp",
            "avg_engagingness_score": "Engagingness % Imp",
            "avg_language_constraint_score": "Lang Constraint % Imp",
        }
        # Metrics where lower score is better
        metrics_lower_better = {
            "avg_repetition_rate_3gram": "Repetition % Imp (Lower Better)"
        }

        def calculate_improvement(adapter_val, base_val, lower_is_better=False):
            adapter_val = pd.to_numeric(adapter_val, errors='coerce')
            base_val = pd.to_numeric(base_val, errors='coerce')

            if pd.isna(adapter_val) or pd.isna(base_val):
                return np.nan
            if base_val == 0:
                if adapter_val == 0:
                    return 0.0
                return np.inf if not lower_is_better and adapter_val > 0 else (-np.inf if lower_is_better and adapter_val > 0 else np.nan)

            if lower_is_better:
                return ((base_val - adapter_val) / abs(base_val)) * 100
            else:
                return ((adapter_val - base_val) / abs(base_val)) * 100

        for metric_key, imp_col_name in metrics_higher_better.items():
            base_val = base_model_metrics.get(metric_key)
            df_completed[imp_col_name] = df_completed.apply(
                lambda row: calculate_improvement(row[metric_key], base_val, lower_is_better=False)
                            if row["model_name"] != base_model_name else np.nan,
                axis=1
            )
            column_names[imp_col_name] = imp_col_name

        for metric_key, imp_col_name in metrics_lower_better.items():
            base_val = base_model_metrics.get(metric_key)
            df_completed[imp_col_name] = df_completed.apply(
                lambda row: calculate_improvement(row[metric_key], base_val, lower_is_better=True)
                            if row["model_name"] != base_model_name else np.nan,
                axis=1
            )
            column_names[imp_col_name] = imp_col_name

        improvement_cols = list(metrics_higher_better.values()) + list(metrics_lower_better.values())
        df_improvements = df_completed[['model_name'] + improvement_cols]
        df = pd.merge(df, df_improvements, on='model_name', how='left')

    # --- 3. Final Table Formatting ---
    df.rename(columns=column_names, inplace=True)

    desired_column_order = [
        "Model", "Type",
        "Avg Hinglish Fluency", "Hinglish Fluency % Imp",
        "Avg Persona Adherence", "Persona Adherence % Imp",
        "Avg Gender Correctness", "Gender Correctness % Imp",
        "Avg Hindi Usage", "Hindi Usage % Imp",
        "Avg Coherence", "Coherence % Imp",
        "Avg Engagingness", "Engagingness % Imp",
        "Avg Lang Constraint", "Lang Constraint % Imp",
        "Avg Length (words)",
        "Avg Repetition (3-gram)", "Repetition % Imp (Lower Better)",
        "Successful Evals", "Total Prompts",
    ]
    existing_desired_columns = [col for col in desired_column_order if col in df.columns]
    df = df[existing_desired_columns]

    df.set_index("Model", inplace=True)

    styled_df = df.style
    float_cols = df.select_dtypes(include=['float']).columns
    float_format_dict = {col: '{:.2f}' for col in float_cols if '% Imp' not in col}
    percent_cols = [col for col in df.columns if '% Imp' in col]
    percent_format_dict = {col: '{:+.1f}%' for col in percent_cols}
    formatters = {**float_format_dict, **percent_format_dict}
    styled_df = styled_df.format(formatters, na_rep="N/A")

    return styled_df

In [17]:
# Inspect the raw results dict: per-prompt details and aggregated metrics for each model.
all_evaluation_results

{'Qwen2.5-3B_Hinglish_base': {'model_name': 'Qwen2.5-3B_Hinglish_base',
  'base_model_id': 'Qwen/Qwen2.5-3B-Instruct',
  'adapter_path': None,
  'per_prompt_details': [{'prompt': 'Hey kaise ho? College mein kya chal raha hai aajkal?',
    'response': 'Hey! Kya aapne college se pehle khelna chahiye? Aaj main apni class mein hai, yeh sab samay hota hai. Kuch ghar mein phirse zyada relax kar',
    'length': 26,
    'repetition_rate_3gram': 0.0,
    'llm_evaluation': {'hindi_usage_score': 4,
     'hindi_usage_justification': "Response uses a good mix of Hindi and Hinglish, but some phrases like 'khelna chahiye' seem out of context.",
     'gender_correctness_score': 3,
     'gender_correctness_justification': "The response uses 'aapne' which is formal and doesn't match the informal tone of the user's prompt.",
     'hinglish_fluency_score': 3,
     'hinglish_fluency_justification': "The code-switching is somewhat awkward, especially with phrases like 'yeh sab samay hota hai'.",
     'perso

In [18]:
# Build and display the styled comparison table for all evaluated models.
create_comparison_table(all_evaluation_results)

Calculating % improvement relative to base model: 'Qwen2.5-3B_Hinglish_base'


Unnamed: 0_level_0,Type,Avg Hinglish Fluency,Hinglish Fluency % Imp,Avg Persona Adherence,Persona Adherence % Imp,Avg Gender Correctness,Gender Correctness % Imp,Avg Hindi Usage,Hindi Usage % Imp,Avg Coherence,Coherence % Imp,Avg Engagingness,Engagingness % Imp,Avg Lang Constraint,Lang Constraint % Imp,Avg Length (words),Avg Repetition (3-gram),Repetition % Imp (Lower Better),Successful Evals,Total Prompts
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Qwen2.5-3B_Hinglish_base,,2.7,,3.2,,4.4,,3.3,,3.3,,2.6,,3.4,,31.7,0.0,,10,10
Qwen2.5-0.5B_Hinglish_FFT,,3.2,+18.5%,3.3,+3.1%,4.7,+6.8%,3.4,+3.0%,2.7,-18.2%,2.5,-3.8%,3.7,+8.8%,29.0,0.0,+0.0%,10,10
Qwen2.5-3B_Hinglish_LoRA,,4.2,+55.6%,4.3,+34.4%,4.9,+11.4%,4.0,+21.2%,3.9,+18.2%,3.7,+42.3%,4.3,+26.5%,30.6,0.0,+0.0%,10,10
Qwen2.5-3B_Hinglish_QLoRA,,4.3,+59.3%,4.5,+40.6%,5.0,+13.6%,3.8,+15.2%,4.5,+36.4%,4.1,+57.7%,4.5,+32.4%,29.5,0.0,+0.0%,10,10


In [None]:
import json
import os
from typing import Dict, Any

def transform_for_ab_test(
    all_evaluation_results: Dict[str, Dict[str, Any]],
    output_filename: str = "blind_test_data.json") -> bool:
    """
    Flatten the aggregated evaluation results into a JSON file suitable
    for loading into the blind test application.

    The output JSON is a list of dictionaries, one per valid response from a
    model to a specific prompt.
    Format: [{"prompt": str, "model_name": str, "response": str}, ...]

    A response is exported only when all of the following hold:
      * the model's ``aggregated_metrics["status"]`` is ``"Completed"``;
      * the prompt and the response are non-blank strings;
      * the response does not contain a generation/processing error marker;
      * ``llm_evaluation`` is a dict, or one of the accepted "skipped" status
        strings.

    Malformed entries (a model entry, metrics object or per-prompt detail that
    is not a dictionary) are skipped instead of aborting the whole export.

    Args:
        all_evaluation_results: The dictionary containing results for multiple
                                models, as generated by the evaluation script.
        output_filename: The name of the JSON file to save the data to.

    Returns:
        True if the file was successfully created, False otherwise (no valid
        data, serialization failure, or a file-system error). Errors are
        reported via print and never raised to the caller.
    """
    # Substrings that mark a response as a failed generation rather than model output.
    error_markers = ("Error during generation", "Processing Error")
    # Non-dict evaluation statuses that still count as acceptable for export.
    accepted_eval_statuses = ("Skipped (No API Key)", "Evaluation Failed or Skipped")

    blind_test_data = []
    processed_models = 0

    print("Starting transformation for blind test data...")

    # Iterate through each model evaluated (tolerate None / empty input).
    for model_key, result_data in (all_evaluation_results or {}).items():
        if not isinstance(result_data, dict):
            # e.g. None stored for a model whose run crashed before producing results.
            print(f"Skipping model '{model_key}': result entry is not a dictionary.")
            continue

        model_name = result_data.get("model_name", model_key)  # Use specific name if available

        # 1. Check if the overall model evaluation completed successfully
        aggregated_metrics = result_data.get("aggregated_metrics", {})
        if not isinstance(aggregated_metrics, dict):
            aggregated_metrics = {}
        if aggregated_metrics.get("status") != "Completed":
            status_label = aggregated_metrics.get("status", "Unknown")
            print(f"Skipping model '{model_name}': Evaluation status was '{status_label}'.")
            continue

        processed_models += 1
        print(f"Processing model: '{model_name}'...")

        # 2. Iterate through the responses for each prompt for this model
        per_prompt_details = result_data.get("per_prompt_details", [])
        if not isinstance(per_prompt_details, (list, tuple)) or not per_prompt_details:
            print(f"  Warning: No 'per_prompt_details' found for model '{model_name}'.")
            continue

        for detail in per_prompt_details:
            if not isinstance(detail, dict):
                continue  # malformed record; nothing to extract

            prompt = detail.get("prompt")
            response = detail.get("response")
            llm_eval_status = detail.get("llm_evaluation")  # Can be dict, str, or None

            # 3. Validate the prompt, the response and its evaluation status
            is_valid_prompt = isinstance(prompt, str) and bool(prompt.strip())
            is_valid_response = (
                isinstance(response, str)
                and bool(response.strip())
                and not any(marker in response for marker in error_markers)
            )
            # A dict means the judge produced scores; the listed strings mean the
            # judge was skipped, which does not invalidate the response itself.
            is_valid_evaluation = (
                isinstance(llm_eval_status, dict)
                or llm_eval_status in accepted_eval_statuses
            )

            if is_valid_prompt and is_valid_response and is_valid_evaluation:
                # 4. Add valid data point to the list
                blind_test_data.append({
                    "prompt": prompt,
                    "model_name": model_name,
                    "response": response
                })

    print(f"\nProcessed {processed_models} completed model evaluations.")
    print(f"Extracted {len(blind_test_data)} valid prompt-response pairs for blind testing.")

    if not blind_test_data:
        print("Warning: No valid data found to write to the blind test file.")
        return False

    # 5. Write the collected data to the JSON file
    try:
        # Serialize fully in memory first: if serialization fails, the target file
        # is never opened, so no truncated JSON is left behind and a previous
        # export is not clobbered.
        payload = json.dumps(blind_test_data, indent=2, ensure_ascii=False)
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write(payload)
        print(f"Successfully saved blind test data to '{output_filename}'")
        return True
    except OSError as e:
        print(f"Error writing blind test data to file '{output_filename}': {e}")
        return False
    except Exception as e:
        # Best-effort contract: report and return False rather than raise.
        print(f"An unexpected error occurred during file writing: {e}")
        return False

# Export the evaluation results into the JSON file consumed by the blind A/B test app.
output_file = "ab_test_data_output.json"
success = transform_for_ab_test(all_evaluation_results, output_file)

# Sanity check: reload the file that was just written and preview the first
# three records (prompt and response text are truncated to keep the output short).
if success:
    try:
        with open(output_file, 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)

        print("\n--- First few records from generated JSON ---")
        for record_number, record in enumerate(data[:3], start=1):
            print(f"Record {record_number}:")
            prompt_preview = record['prompt'][:60]
            print(f"  Prompt: {prompt_preview}...")
            print(f"  Model: {record['model_name']}")
            response_preview = record['response'][:80]
            print(f"  Response: {response_preview}...")
        # Signal that the file holds more records than were previewed.
        if len(data) > 3:
            print("...")
    except Exception as exc:
        # Verification is best-effort; a failure here does not undo the export.
        print(f"Error reading back generated file for verification: {exc}")

Starting transformation for blind test data...
Processing model: 'Qwen2.5-3B_Hinglish_base'...
Processing model: 'Qwen2.5-0.5B_Hinglish_FFT'...
Processing model: 'Qwen2.5-3B_Hinglish_LoRA'...
Processing model: 'Qwen2.5-3B_Hinglish_QLoRA'...

Processed 4 completed model evaluations.
Extracted 40 valid prompt-response pairs for blind testing.
Successfully saved blind test data to 'ab_test_data_output.json'

--- First few records from generated JSON ---
Record 1:
  Prompt: Hey kaise ho? College mein kya chal raha hai aajkal?...
  Model: Qwen2.5-3B_Hinglish_base
  Response: Hey! Kya aapne college se pehle khelna chahiye? Aaj main apni class mein hai, ye...
Record 2:
  Prompt: Yaar assignments ka bohot tension hai. Kuch tips?...
  Model: Qwen2.5-3B_Hinglish_base
  Response: Tension about assignments, yaar? Bhai, maine yeh bhi ek baar laga tha. Toh tu ku...
Record 3:
  Prompt: Suggest some cool places to hangout near campus....
  Model: Qwen2.5-3B_Hinglish_base
  Response: Hey there! Near yo