In [None]:
!pip install huggingface_hub -q

In [None]:
# @title Test Answer Evaluation Logic (Inference Endpoint)

import json
from huggingface_hub import InferenceClient

# --- CONFIGURATION ---
# Both values are blank placeholders in this notebook; presumably they have to
# be filled in before the cells below can reach the endpoint (TODO confirm).
ENDPOINT_URL = ""  # handed to InferenceClient as `base_url`
HF_TOKEN = ""  # handed to InferenceClient as `token`

# Initialize the Client
# `client` is the single handle used by both text_generation calls below
# (reference generation in Step A and the judge in Step B).
client = InferenceClient(base_url=ENDPOINT_URL, token=HF_TOKEN)

# --- 1. Define the Test Case ---
# A single hand-written interview question and candidate answer used to
# exercise the two-step pipeline (reference generation, then judging).
TEST_DOMAIN = "SQL"
TEST_QUESTION = "Explain how a database Index works."
TEST_USER_ANSWER = "An index is a data structure, similar to a book's index, that improves the speed of data retrieval operations. It allows the database engine to find rows much faster than scanning the entire table."

# --- 2. Step A: Generate Reference (Expert) ---
# The model is asked to write the "gold standard" answer that Step B later
# compares the candidate's answer against.
reference_prompt = f"""You are an expert in {TEST_DOMAIN}.
Write a concise, technically perfect answer to the following interview question.
Focus on the definition and the 'why'. Do NOT use code examples unless absolutely necessary.

Question: {TEST_QUESTION}

Answer:"""

print(f"üîπ Generating Reference for domain '{TEST_DOMAIN}'...")

try:
    # Call the endpoint.
    # return_full_text=False keeps the prompt out of the returned text; without
    # it the endpoint echoed the whole prompt back, so the "reference" began
    # with the instructions and the echo ended up in the judge prompt as well.
    gold_standard = client.text_generation(
        prompt=reference_prompt,
        max_new_tokens=256,
        temperature=0.2,
        stop_sequences=["<|end_of_text|>"],  # Stop if model finishes early
        return_full_text=False,
    ).strip()
except Exception as e:
    print(f"‚ùå Error in Step A: {e}")
    # Sentinel value: later code can compare against this exact string to
    # detect that no usable reference exists.
    gold_standard = "ERROR GENERATING REFERENCE"
else:
    print(f"‚úÖ Reference Generated:\n{gold_standard}\n")
    print("-" * 40)

# --- 3. Step B: Multi-Metric Judge ---
# The judge receives the question, the generated reference and the candidate's
# answer, and is asked to reply with a single JSON object of scores.
judge_prompt = f"""You are a strict technical interviewer.

### Question:
{TEST_QUESTION}

### Reference Answer (Truth):
{gold_standard}

### Candidate's Answer:
{TEST_USER_ANSWER}

### Evaluation Protocol:
1. **Analyze:** Compare the Candidate's answer to the Reference. Note matches and misses.
2. **Score Technical Accuracy (0.0-1.0):** Is the information factually correct? (No lies/hallucinations).
3. **Score Completeness (0.0-1.0):** Did they cover the main points? (e.g. missed "test data" in overfitting).
4. **Score Clarity (0.0-1.0):** Is the answer easy to understand?
5. **Overall Score (0.0-1.0):** A weighted average of the above.

### Instructions:
- Be objective.
- **CRITICAL:** Respond using ONLY valid JSON. Do not write anything else.

### Output Format (JSON):
{{
    "analysis": "<Short comparison of Reference vs Candidate>",
    "technical_accuracy": <float>,
    "completeness": <float>,
    "clarity": <float>,
    "overall_score": <float>,
    "feedback": "<Constructive feedback for the student>"
}}

### Response:
"""

print("üîπ Running Judge...")

if gold_standard == "ERROR GENERATING REFERENCE":
    # Step A stores this sentinel when it could not produce a reference.
    # Judging against the sentinel text would yield meaningless scores.
    print("Skipping Step B: no reference answer was generated in Step A.")
else:
    try:
        # Call the endpoint with lower temperature for JSON stability.
        # return_full_text=False keeps the (brace-containing) prompt out of the
        # text that is parsed below.
        response_text = client.text_generation(
            prompt=judge_prompt,
            max_new_tokens=512,
            temperature=0.1,
            stop_sequences=["<|end_of_text|>"],
            return_full_text=False,
        )

        # --- 4. Parse and Display Results ---
        print("\n--- üìä FINAL JSON OUTPUT ---")

        # Clean up potential markdown code blocks (Llama 3 loves to add these)
        clean_json = response_text.replace("```json", "").replace("```", "").strip()

        # Decode the first JSON object and ignore anything the model wrote
        # before or after it. When there is no "{" at all, decoding starts at
        # position 0 and fails with JSONDecodeError, which is reported below.
        start = max(clean_json.find("{"), 0)
        data, _ = json.JSONDecoder().raw_decode(clean_json, start)

        print(json.dumps(data, indent=4))

    except json.JSONDecodeError:
        print("‚ùå Failed to parse JSON. Raw output:")
        print(response_text)
    except Exception as e:
        print(f"‚ùå Error in Step B: {e}")

üîπ Generating Reference for domain 'SQL'...
‚úÖ Reference Generated:
You are an expert in SQL.
Write a concise, technically perfect answer to the following interview question.
Focus on the definition and the 'why'. Do NOT use code examples unless absolutely necessary.

Question: Explain how a database Index works.

Answer: 

A database index is a data structure that improves the speed of data retrieval operations by providing a quick way to locate data. It's essentially a pointer to a specific location in the database table, allowing the database management system (DBMS) to quickly locate and retrieve the required data.

Think of an index like the index in a book. Just as the index in a book allows you to quickly find a specific chapter or section, a database index enables the DBMS to quickly locate a specific row or set of rows in a table.

The index is created on one or more columns of a table, and it contains a copy of the indexed column(s) along with a unique identifier, called t

### 2

In [None]:
pip install -q google-generativeai huggingface_hub numpy

In [None]:
import json
import math
import numpy as np
import google.generativeai as genai
from huggingface_hub import InferenceClient
import time
import re
import ast

# --- 1. CONFIGURATION ---
# All three values are blank placeholders in this notebook; presumably they
# have to be filled in before any of the calls below can succeed (TODO confirm).
HF_ENDPOINT_URL = ""  # handed to InferenceClient as `base_url`
HF_TOKEN = ""  # handed to InferenceClient as `token`
GEMINI_API_KEY = ""  # handed to genai.configure as `api_key`

# Configure Clients
# `hf_client` serves both the reference-generation and the judge calls in the
# benchmark loop; genai.configure sets the key used by gemini_meta_judge.
hf_client = InferenceClient(base_url=HF_ENDPOINT_URL, token=HF_TOKEN)
genai.configure(api_key=GEMINI_API_KEY)

# --- 2. HELPER FUNCTIONS ---
def calculate_perplexity(response_details):
    """Return the perplexity of a generation from its per-token log-probs.

    Perplexity is computed as ``exp(-mean(logprob))`` over every token whose
    ``logprob`` is not ``None``.

    Args:
        response_details: Object exposing a ``tokens`` sequence whose items
            each carry a ``logprob`` attribute. ``None`` (or an object
            without ``tokens``) is accepted and treated as "no tokens".

    Returns:
        The perplexity as a float, or ``0.0`` as a "not available" sentinel
        when there are no tokens or no usable log-probabilities.
    """
    # getattr keeps a missing/None details object from raising AttributeError.
    tokens = getattr(response_details, "tokens", None)
    if not tokens:
        return 0.0

    log_probs = [t.logprob for t in tokens if t.logprob is not None]
    if not log_probs:
        return 0.0

    return math.exp(-np.mean(log_probs))

def clean_and_parse_json(text):
    """Extract a dict from model output that should contain one JSON object.

    Two strategies are tried, in order:

    1. Strict JSON decoding starting at the first ``{``. Trailing text after
       the object (markdown fences, commentary, even text containing braces)
       is ignored.
    2. ``ast.literal_eval`` on the span from the first ``{`` to the last
       ``}``, which accepts Python-style dicts (single quotes, ``None``,
       ``True``/``False``). ``literal_eval`` only evaluates literals, so no
       code from the model output is executed.

    Args:
        text: Raw text returned by the model.

    Returns:
        The parsed ``dict``, or ``None`` when ``text`` is not a string, holds
        no ``{...}`` span, or the span does not decode to a dict.
    """
    if not isinstance(text, str):
        return None

    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end < start:
        return None

    try:
        parsed, _ = json.JSONDecoder().raw_decode(text, start)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; fall through to literal_eval.
        parsed = None
    if isinstance(parsed, dict):
        return parsed

    try:
        parsed = ast.literal_eval(text[start:end + 1])
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return None

    # A brace-delimited literal can also be a set; callers expect a mapping.
    return parsed if isinstance(parsed, dict) else None

def gemini_meta_judge(domain, question, user_answer, model_feedback_json):
    """Ask Gemini to grade the feedback another model gave on an answer.

    Args:
        domain: Subject area of the interview question.
        question: The interview question text.
        user_answer: The candidate's answer text.
        model_feedback_json: Parsed feedback from the first judge, or ``None``
            when that judge produced no usable JSON; in that case an error
            note is embedded in the prompt in place of the feedback.

    Returns:
        The dict parsed from Gemini's reply, or ``None`` when the request
        fails or the reply contains no JSON object.

    NOTE(review): the prompt asks for ``overall_score`` as a bare ``float``
    with no range, unlike the 1-5 scale given for the other scores.
    """
    model = genai.GenerativeModel('gemini-2.5-flash-lite')

    if model_feedback_json is None:
        llama_feedback_str = "Error: Llama model failed to generate valid JSON structure."
    else:
        llama_feedback_str = json.dumps(model_feedback_json)

    # Simplified Rubric for robustness
    prompt = f"""You are a 'Meta-Evaluator'.

    Scenario:
    - Domain: {domain}
    - Question: {question}
    - Candidate Answer: {user_answer}

    AI Feedback to Evaluate:
    {llama_feedback_str}

    Task: Evaluate the AI Feedback. Return ONLY JSON:
    {{
        "relevance_score": int (1-5), "relevance_reason": "string",
        "faithfulness_score": int (1-5), "faithfulness_reason": "string",
        "helpfulness_score": int (1-5), "helpfulness_reason": "string",
        "coherence_score": int (1-5), "coherence_reason": "string",
        "overall_score": float
    }}
    """
    try:
        response = model.generate_content(prompt)
        # Reuse the file's tolerant parser so fences or stray prose around the
        # JSON object do not turn a usable reply into a failure.
        parsed = clean_and_parse_json(response.text)
    except Exception as e:
        # `except Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate; the reason is reported instead of hidden.
        print(f"      Gemini meta-judge error: {e}")
        return None

    return parsed if isinstance(parsed, dict) else None

# --- 3. TEST CASES ---
# One entry per benchmark scenario. Every entry carries the same three keys,
# in this order: "domain", "question", "user_answer".
test_cases = [
    dict(
        domain="SQL",
        question="What is the difference between TRUNCATE and DELETE commands?",
        user_answer="TRUNCATE is a DDL command that removes all rows and resets identity. DELETE is DML and can use a WHERE clause. TRUNCATE is generally faster.",
    ),
    dict(
        domain="Python",
        question="Explain the concept of decorators and provide a use case.",
        user_answer="Decorators are used to decorate functions. They make the code look pretty. I use them for styling.",
    ),
    dict(
        domain="Machine Learning",
        question="Describe the Bias-Variance tradeoff.",
        user_answer="High bias means underfitting. High variance means overfitting. We want to find a balance to minimize total error.",
    ),
    dict(
        domain="System Design",
        question="How would you design a rate limiter?",
        user_answer="I would use a database to count requests. Every time a user hits the API, I update the row. If it's above 10, I block them.",
    ),
    dict(
        domain="Data Engineering",
        question="Explain the difference between ETL and ELT.",
        user_answer="ETL stands for Extract Transform Load. ELT stands for Extract Load Transform. In ELT, we load raw data into the warehouse first, then transform it using the warehouse's power (like Snowflake).",
    ),
]

# --- 4. EXECUTION ---
# Per-test metrics, averaged in the final report:
#   ppl_scores    - perplexity of each judge generation
#   llama_scores  - judge's overall_score rescaled from 0-1 to 0-5
#   gemini_scores - Gemini's overall_score for the judge's feedback
ppl_scores = []
gemini_scores = []
llama_scores = []

print("Starting Detailed Benchmark...\n")

for i, test in enumerate(test_cases):
    print(f"Test {i+1}: {test['domain']}")

    # --- A. Generate Reference ---
    # Removed <|begin_of_text|> to avoid double BOS token issues
    ref_prompt = f"""<|start_header_id|>system<|end_header_id|>\n\nYou are an expert in {test['domain']}. Write a concise, technically perfect answer.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{test['question']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"""

    try:
        ref_response = hf_client.text_generation(
            prompt=ref_prompt,
            max_new_tokens=200,
            stop_sequences=["<|eot_id|>"],
            return_full_text=False,  # keep the prompt out of the reference
        )
        gold_standard = ref_response.strip()
    except Exception as e:
        # Without a reference the judge would be comparing against a
        # placeholder and its scores would pollute the averages, so the
        # test case is reported and skipped instead.
        print(f"   Error generating reference: {e}")
        print("-" * 50)
        continue

    # --- B. Run Llama Judge ---
    # Removed <|begin_of_text|> and added return_full_text=False
    judge_prompt = f"""<|start_header_id|>system<|end_header_id|>\n\nYou are a strict technical interviewer. Compare the Candidate's Answer to the Reference.
Return ONLY valid JSON with fields: "analysis", "technical_accuracy" (0.0-1.0), "completeness" (0.0-1.0), "overall_score" (0.0-1.0), "feedback".<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nQuestion: {test['question']}
Reference: {gold_standard}
Candidate Answer: {test['user_answer']}
<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"""

    try:
        response = hf_client.text_generation(
            prompt=judge_prompt,
            max_new_tokens=400,
            temperature=0.1,
            details=True,  # needed for the per-token log-probs used below
            stop_sequences=["<|eot_id|>"],
            return_full_text=False,  # crucial for parsing: no prompt echo
        )

        # 1. Perplexity
        ppl = calculate_perplexity(response.details)
        ppl_scores.append(ppl)

        # 2. Parse Llama Output
        llama_json = clean_and_parse_json(response.generated_text)
        if not isinstance(llama_json, dict):
            # Anything but a mapping is unusable both here (.get) and in the
            # meta-judge (json.dumps), so treat it as a failed parse.
            llama_json = None

        if llama_json:
            raw_value = llama_json.get('overall_score', 0.0)
            try:
                # Scale 0-1 score to 0-5
                l_score = float(raw_value) * 5.0
            except (TypeError, ValueError):
                # A non-numeric score must not abort the rest of this test.
                print(f"   Llama overall_score is not numeric: {raw_value!r}")
                llama_scores.append(0.0)
            else:
                llama_scores.append(l_score)
                print(f"   Llama Awarded Score: {l_score:.2f} / 5.0")
        else:
            print(f"   Llama output invalid JSON")
            print(f"   RAW OUTPUT SNAPSHOT: {response.generated_text[:100]}...") # Debug print
            llama_scores.append(0.0)

        # 3. Gemini Meta-Review
        gemini_json = gemini_meta_judge(test['domain'], test['question'], test['user_answer'], llama_json)

        if isinstance(gemini_json, dict) and gemini_json:
            g_score = gemini_json.get('overall_score', 0)
            try:
                # Store a float so np.mean in the final report cannot fail on
                # a score that came back as a string.
                gemini_scores.append(float(g_score))
            except (TypeError, ValueError):
                print(f"      Gemini overall_score is not numeric: {g_score!r}")

            print("   Gemini Evaluation:")
            print(f"      ‚Ä¢ Relevance:    {gemini_json.get('relevance_score')}/5")
            print(f"      ‚Ä¢ Faithfulness: {gemini_json.get('faithfulness_score')}/5")
            print(f"      ‚Ä¢ Helpfulness:  {gemini_json.get('helpfulness_score')}/5")
            print(f"      ‚Ä¢ Coherence:    {gemini_json.get('coherence_score')}/5")
            print(f"      ‚Ä¢ Overall:      {g_score}/5")
        else:
            print("      Gemini failed to evaluate")

        print(f"   üìâ PPL: {ppl:.4f}")
        print("-" * 50)

        # Brief pause between test cases (only reached when the test succeeded).
        time.sleep(1)

    except Exception as e:
        print(f"   Error: {e}")

# --- 5. FINAL REPORT ---
# Each average is printed only when at least one value was collected.
print("\n" + "="*30)
print("FINAL BENCHMARK REPORT")
print("="*30)
if ppl_scores: print(f"Avg Perplexity:        {np.mean(ppl_scores):.4f}")
if llama_scores: print(f"Avg Llama Awarded:     {np.mean(llama_scores):.2f} / 5.0")
if gemini_scores: print(f"Avg Gemini Meta-Score: {np.mean(gemini_scores):.2f} / 5.0")
print("="*30)

Starting Detailed Benchmark...

Test 1: SQL
   Llama Awarded Score: 3.50 / 5.0
   Gemini Evaluation:
      ‚Ä¢ Relevance:    5/5
      ‚Ä¢ Faithfulness: 5/5
      ‚Ä¢ Helpfulness:  4/5
      ‚Ä¢ Coherence:    5/5
      ‚Ä¢ Overall:      4.5/5
   üìâ PPL: 1.0412
--------------------------------------------------
Test 2: Python
   Llama Awarded Score: 0.00 / 5.0
   Gemini Evaluation:
      ‚Ä¢ Relevance:    5/5
      ‚Ä¢ Faithfulness: 5/5
      ‚Ä¢ Helpfulness:  5/5
      ‚Ä¢ Coherence:    5/5
      ‚Ä¢ Overall:      5.0/5
   üìâ PPL: 1.0521
--------------------------------------------------
Test 3: Machine Learning
   Llama Awarded Score: 2.50 / 5.0
   Gemini Evaluation:
      ‚Ä¢ Relevance:    5/5
      ‚Ä¢ Faithfulness: 5/5
      ‚Ä¢ Helpfulness:  4/5
      ‚Ä¢ Coherence:    5/5
      ‚Ä¢ Overall:      4.5/5
   üìâ PPL: 1.0245
--------------------------------------------------
Test 4: System Design
   Llama Awarded Score: 0.00 / 5.0
   Gemini Evaluation:
      ‚Ä¢ Relevance:    5/5