In [5]:
import json
import math
import numpy as np
import google.generativeai as genai
from huggingface_hub import InferenceClient
import time
import re
import ast

# --- 1. CONFIGURATION ---
HF_ENDPOINT_URL = ""
HF_TOKEN = ""
GEMINI_API_KEY = ""

# Clients
hf_client = InferenceClient(base_url=HF_ENDPOINT_URL, token=HF_TOKEN)
genai.configure(api_key=GEMINI_API_KEY)

# --- 2. HELPER FUNCTIONS ---
def calculate_perplexity(response_details):
    """
    Robustly calculates PPL. Returns 0.0 if logprobs are unavailable.
    """
    # SAFETY CHECK: If the endpoint didn't return details, skip PPL
    if response_details is None:
        return 0.0

    tokens = response_details.tokens
    if not tokens: return 0.0

    # Collect logprobs
    log_probs = [t.logprob for t in tokens if t.logprob is not None]

    if not log_probs: return 0.0
    return math.exp(-np.mean(log_probs))

def clean_and_parse_json(text):
    try:
        match = re.search(r"\{[\s\S]*\}", text)
        if match:
            return json.loads(match.group(0))
        return None
    except:
        return None

def gemini_question_judge(domain, difficulty, generated_question):
    model = genai.GenerativeModel('gemini-2.5-flash-lite')

    rubric = """
    1. Domain Validity: Is this definitely a question about the specific Domain?
    2. Difficulty Alignment: Does the complexity match the requested Difficulty?
    3. Phrasing Quality: Is the question clear, professional, and interview-ready?
    4. Solvability: Is it a valid question with a reasonable answer?
    """

    prompt = f"""You are an Expert Interviewer. Evaluate this AI-generated interview question.

    ### Input Parameters:
    - Target Domain: {domain}
    - Target Difficulty: {difficulty}

    ### AI Generated Question:
    {generated_question}

    ### Task:
    Evaluate the question based on the Rubric.
    Return ONLY JSON:
    {{
        "domain_score": int (1-5), "domain_reason": "string",
        "difficulty_score": int (1-5), "difficulty_reason": "string",
        "phrasing_score": int (1-5), "phrasing_reason": "string",
        "overall_score": float
    }}
    """
    try:
        response = model.generate_content(prompt)
        # Robust cleanup for Gemini markdown output
        clean_text = response.text.replace("```json", "").replace("```", "").strip()
        return json.loads(clean_text)
    except:
        return None

# --- 3. BENCHMARK PROMPTS ---
benchmark_prompts = [
    {"domain": "Python", "difficulty": "Basic"},
    {"domain": "Python", "difficulty": "Medium"},
    {"domain": "Python", "difficulty": "Advanced"},
    {"domain": "SQL", "difficulty": "Medium"},
    {"domain": "SQL", "difficulty": "Advanced"},
    {"domain": "Machine Learning", "difficulty": "Medium"},
    {"domain": "Deep Learning", "difficulty": "Advanced"},
    {"domain": "System Design", "difficulty": "Advanced"},
]

# --- 4. EXECUTION ---
ppl_scores = []
gemini_scores = []

print(f"Starting Question Generation Benchmark...\n")

for i, item in enumerate(benchmark_prompts):
    print(f"Test {i+1}: {item['domain']} ({item['difficulty']})")

    # A. Format Prompt (Alpaca Style)
    instruction = "You are an expert interview question generator. Generate an interview question based on the parameters provided in the input."
    input_params = f"Domain: {item['domain']}\nDifficulty: {item['difficulty']}"

    alpaca_prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input_params}

### Response:
"""

    try:
        # B. Generate Question
        response = hf_client.text_generation(
            prompt=alpaca_prompt,
            max_new_tokens=256,
            temperature=0.7,
            do_sample=True,
            details=True, # We still ask for details
            return_full_text=False
        )

        gen_text = response.generated_text.strip()
        print(f"   Generated: \"{gen_text[:80]}...\"")

        # C. Calculate Metrics (Robust Call)
        ppl = calculate_perplexity(response.details)
        if ppl > 0:
            ppl_scores.append(ppl)
            print(f"   PPL: {ppl:.4f}")
        else:
            #print(f"   PPL Unavailable (Endpoint didn't return logprobs)")
            print('------')

        # D. Gemini Judge
        gemini_json = gemini_question_judge(item['domain'], item['difficulty'], gen_text)

        if gemini_json:
            g_score = gemini_json.get('overall_score', 0)
            gemini_scores.append(g_score)

            print("   Gemini Evaluation:")
            print(f"      • Domain Match: {gemini_json.get('domain_score')}/5 - {gemini_json.get('domain_reason')}")
            print(f"      • Diff. Match:  {gemini_json.get('difficulty_score')}/5 - {gemini_json.get('difficulty_reason')}")
            print(f"      • Phrasing:     {gemini_json.get('phrasing_score')}/5")
            print(f"      • Overall:      {g_score}/5")
        else:
            print("      Gemini failed to evaluate")

        print("-" * 50)
        time.sleep(1)

    except Exception as e:
        print(f"   Error: {e}")

# --- 5. FINAL REPORT ---
print("\n" + "="*30)
print("Q-GEN BENCHMARK REPORT")
print("="*30)
if ppl_scores:
    print(f"Avg Perplexity:        {np.mean(ppl_scores):.4f}")
else:
    print("Avg Perplexity:        N/A (Not supported by this endpoint)")

if gemini_scores:
    print(f"Avg Gemini Quality:    {np.mean(gemini_scores):.2f} / 5.0")
print("="*30)

Starting Question Generation Benchmark...

Test 1: Python (Basic)
   Generated: "What is a Pandas Series?<|end_of_text|>..."
------
   Gemini Evaluation:
      • Domain Match: 5/5 - The question directly targets the core concepts of the Python data science ecosystem, specifically the Pandas library. Pandas Series is a fundamental data structure within Pandas, making it highly relevant to the target domain.
      • Diff. Match:  2/5 - The question asks for a definition of a basic data structure in Pandas. Answering this question requires knowledge of what a Series is, its properties, and its basic use cases, which aligns with a basic difficulty level. It doesn't require complex problem-solving or advanced syntax.
      • Phrasing:     5/5
      • Overall:      3.67/5
--------------------------------------------------
Test 2: Python (Medium)
   Generated: "Explain the purpose of the `__getattr__` method in Python. When would you choose..."
------
   Gemini Evaluation:
      • Domain Matc