# Nyaya-LLM — Phase 1 vs Phase 2 Comparison

Evaluates the best model's **Phase 1 adapter** vs **Phase 2 adapter** on `eval_set.json`.

**80 curated questions across 4 categories:**
- `Statute Accuracy` — factual recall from trained acts
- `Hypothetical Scenario` — applying law to real situations
- `Hallucination Test` — traps with fake/repealed sections
- `Generalization` — legal concepts without section numbers

In [None]:
!pip install peft bitsandbytes accelerate huggingface_hub -q

In [None]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

# Fetch the "HF_TOKEN" secret from the Kaggle secrets client and use it to
# log in to the Hugging Face Hub, so the token is never hard-coded here.
user_secrets = UserSecretsClient()
login(token=user_secrets.get_secret("HF_TOKEN"))

In [None]:
import torch
import json
import re
import os
import gc
from tqdm import tqdm
from collections import defaultdict
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from peft import PeftModel
from datetime import datetime

print("Imports done.")

In [None]:
# ==========================================
# ⚙️  CONFIG — edit these to match your setup
# ==========================================

# Base model — whichever won Phase 1.
# Exactly one BASE_MODEL line should be active; the others are alternatives.
BASE_MODEL = "microsoft/Phi-4-mini-instruct"
# BASE_MODEL = "Qwen/Qwen3-4B-Instruct-2507"
# BASE_MODEL = "google/gemma-3-4b-it"

# Adapter dataset: root directory that both adapter paths below are built from.
ADAPTER_DATASET = "/kaggle/input/datasets/shreyashgaurgla/nyaya-adapters"

# Phase 1 adapter — best from Phase 1 eval
PHASE_1_ADAPTER = f"{ADAPTER_DATASET}/qlora_phase1_phi4_mini/qlora_phase1_phi4_mini"
# PHASE_1_ADAPTER = f"{ADAPTER_DATASET}/lora_phase1_phi4_mini/lora_phase1_phi4_mini"

# Phase 2 adapter — same model trained on augmented data
PHASE_2_ADAPTER = f"{ADAPTER_DATASET}/qlora_phase2_phi4_mini/qlora_phase2_phi4_mini"
# PHASE_2_ADAPTER = f"{ADAPTER_DATASET}/lora_phase2_phi4_mini/lora_phase2_phi4_mini"

# Eval set: JSON file read by main(); each item is accessed via its
# "prompt", "reference", "category" and (optional) "id" keys.
EVAL_SET_PATH = "/kaggle/input/datasets/shreyashgaurgla/nyaya-eval/eval_set.json"

# Output
OUTPUT_FILE  = "/kaggle/working/compare_phases_results.json"  # per-question records, written as JSON
RESULTS_DIR  = "/kaggle/working"  # created by main() if it does not exist
JUDGE_RETRIES = 3  # max judge attempts per answer before it is recorded with score 0

# Echo the active configuration so a wrong path is visible before models load.
print("Config loaded.")
print(f"  Base model  : {BASE_MODEL}")
print(f"  Phase 1     : {PHASE_1_ADAPTER}")
print(f"  Phase 2     : {PHASE_2_ADAPTER}")
print(f"  Eval set    : {EVAL_SET_PATH}")

In [None]:
# ==========================================
# 💬  JUDGE PROMPT
# Hallucination category: correctly rejecting
# a fake section scores 5, fabricating = 1
#
# The template is filled with str.format() using the {instruction},
# {reference} and {prediction} fields; the doubled braces on the last line
# are format-escapes that render as a literal JSON object skeleton.
# ==========================================
JUDGE_PROMPT_TEMPLATE = """You are a strict legal AI evaluator. Score the MODEL ANSWER against the REFERENCE ANSWER.

SCORING RUBRIC:
  5 - Perfect. Legally accurate, complete, no errors.
  4 - Good. Correct core content, minor omissions or slight imprecision.
  3 - Acceptable. Gets the general idea but misses important legal details.
  2 - Poor. Partially correct but contains notable legal errors.
  1 - Wrong. Fabricated law, completely incorrect, or irrelevant answer.

IMPORTANT:
- Score based on semantic correctness, NOT exact wording match.
- The reference may be long. The model only needs to capture the key legal meaning.
- If the model answer is legally equivalent to the reference but phrased differently, score it high.
- For HALLUCINATION TEST questions: a model that correctly says a section does not exist scores 5.
  A model that invents a plausible-sounding answer scores 1.

QUESTION:
{instruction}

REFERENCE ANSWER:
{reference}

MODEL ANSWER:
{prediction}

Respond ONLY with a valid JSON object, nothing else:
{{"score": <int 1-5>, "reasoning": "<one concise sentence>"}}"""

print("Judge prompt ready.")

In [None]:
# ==========================================
# 🤖  GENERATION
# ==========================================
def generate_response(model, tokenizer, instruction: str) -> str:
    """Generate the model's answer to a single instruction.

    The instruction is wrapped in the "### Instruction / ### Response"
    prompt format, up to 300 new tokens are sampled at temperature 0.1, and
    only the newly generated text is returned.

    Args:
        model: Causal LM (optionally adapter-wrapped) exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        instruction: The question to answer.

    Returns:
        The generated completion with surrounding whitespace stripped. The
        prompt itself is never part of the returned text.
    """
    prompt = f"### Instruction:\n{instruction}\n\n### Response:\n"

    # Follow the model's own placement instead of assuming "cuda"; the old
    # hard-coded value is kept as a fallback for objects without `.device`.
    device = getattr(model, "device", "cuda")
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=300,
            temperature=0.1,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # The returned sequence starts with the prompt tokens. Slice them off by
    # length rather than decoding everything and splitting on the
    # "### Response:" marker: the split returns the wrong segment whenever
    # the marker also occurs in the instruction or the completion, or when
    # decoding does not reproduce the prompt text exactly.
    answer = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

    # Free per-call tensors eagerly; GPU memory is shared with the judge model.
    del inputs, outputs
    torch.cuda.empty_cache()
    gc.collect()

    return answer.strip()

print("generate_response() ready.")

In [None]:
# ==========================================
# 🧑‍⚖️  JUDGE — HuggingFace
# Same judge as evaluate-phase1.ipynb
# ==========================================
# Shared text-generation pipeline for the judge model; set by load_judge().
judge_pipe = None

def load_judge():
    """Load the 4-bit Qwen2.5-7B-Instruct judge into the global ``judge_pipe``.

    The call is idempotent: if a judge pipeline is already present it is
    reused, so re-running the evaluation in the same session does not load a
    second copy of the 7B model onto the GPU. Set ``judge_pipe = None`` first
    to force a reload.
    """
    global judge_pipe

    if judge_pipe is not None:
        print("Judge already loaded - reusing existing pipeline.\n")
        return

    print("Loading judge model (Qwen2.5-7B 4-bit)...")

    judge_bnb = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4"
    )

    judge_model = AutoModelForCausalLM.from_pretrained(
        "Qwen/Qwen2.5-7B-Instruct",
        quantization_config=judge_bnb,
        device_map="auto",
        torch_dtype=torch.float16
    )
    # Clear the config-level max_length so only the per-call max_new_tokens
    # passed by the caller bounds generation.
    judge_model.generation_config.max_length = None

    judge_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")

    judge_pipe = pipeline(
        "text-generation",
        model=judge_model,
        tokenizer=judge_tokenizer,
    )
    print("Judge loaded.\n")


def _extract_judge_json(response: str):
    """Return the first JSON object in ``response`` that has a "score" key, else None."""
    decoder = json.JSONDecoder()
    start = response.find("{")
    while start != -1:
        try:
            # raw_decode parses one complete JSON value starting at `start`,
            # so braces inside string values no longer cut the object short.
            candidate, _ = decoder.raw_decode(response, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict) and "score" in candidate:
            return candidate
        start = response.find("{", start + 1)
    return None


def judge_score(instruction: str, reference: str, prediction: str) -> tuple:
    """Ask the judge model to score a prediction against the reference.

    The reference and prediction are truncated to 600 characters each before
    being placed in JUDGE_PROMPT_TEMPLATE. The judge's reply is expected to
    contain a JSON object with an integer "score" (1-5) and a "reasoning"
    string; Markdown code fences around it are tolerated.

    Args:
        instruction: The evaluation question.
        reference: The reference answer.
        prediction: The model answer being scored.

    Returns:
        ``(score, reasoning)``. ``score`` is 1-5 on success, or 0 when every
        one of the JUDGE_RETRIES attempts failed (callers treat 0 as
        "not scored").
    """
    prompt = JUDGE_PROMPT_TEMPLATE.format(
        instruction=instruction,
        reference=reference[:600],
        prediction=prediction[:600]
    )

    for attempt in range(JUDGE_RETRIES):
        try:
            output = judge_pipe(
                prompt,
                max_new_tokens=150,
                min_new_tokens=10,
                # First attempt is greedy. Retries sample instead: repeating
                # a greedy decode of the same prompt reproduces the same
                # unparseable reply, which made the retries pointless.
                do_sample=attempt > 0,
                return_full_text=False,
                pad_token_id=judge_pipe.tokenizer.eos_token_id
            )
            response = output[0]["generated_text"].strip()
            # Strip Markdown code fences the judge may wrap the JSON in.
            response = re.sub(r"```(?:json)?", "", response).strip()

            if not response:
                raise ValueError("Empty response from judge")

            parsed = _extract_judge_json(response)
            if parsed is None:
                raise ValueError(f"No JSON found. Raw: {response[:150]}")

            score = int(parsed["score"])

            if not (1 <= score <= 5):
                raise ValueError(f"Score out of range: {score}")

            return score, parsed.get("reasoning", "")

        except Exception as e:
            # Deliberately broad: any judge failure is logged and retried so
            # a single bad reply cannot abort the whole evaluation run.
            print(f"      ‚ö†Ô∏è  Judge attempt {attempt + 1} failed: {e}")
            if attempt == JUDGE_RETRIES - 1:
                return 0, "Judge error ‚Äî skipped"

    # Only reached when JUDGE_RETRIES is zero or negative.
    return 0, "Judge error ‚Äî skipped"

print("Judge functions ready.")

In [None]:
# ==========================================
# 📊  SUMMARY PRINTER
# ==========================================
def _mean(values: list) -> float:
    """Return the arithmetic mean of a non-empty list of numbers."""
    return sum(values) / len(values)


def _trend_arrow(delta: float) -> str:
    """Return the arrow marker for a score delta, with a +/-0.05 dead band."""
    if delta > 0.05:
        return "‚¨ÜÔ∏è "
    if delta < -0.05:
        return "‚¨áÔ∏è "
    return "‚û°Ô∏è "


def print_summary(results: list):
    """Print per-phase averages and the Phase 2 - Phase 1 delta table.

    Args:
        results: Result records with at least the keys "model"
            ("Phase_1" / "Phase_2"), "category" and "score". Records with a
            score of 0 (judge failure) are excluded from every average.

    The four standard categories are listed first. Any other category found
    in the scored records is appended after them, so a renamed or misspelled
    category still shows up in the breakdown instead of silently counting
    only toward the overall average.
    """
    categories = [
        "Statute Accuracy",
        "Hypothetical Scenario",
        "Hallucination Test",
        "Generalization"
    ]
    phases = ["Phase_1", "Phase_2"]

    print("\n" + "=" * 70)
    print("üìä  PHASE 1 vs PHASE 2 ‚Äî FINAL COMPARISON")
    print("=" * 70)

    # Scored (score > 0) records per phase, computed once and reused below.
    valid_by_phase = {
        phase: [r for r in results if r["model"] == phase and r["score"] > 0]
        for phase in phases
    }

    # Surface categories that are not in the canonical list.
    for rows in valid_by_phase.values():
        for r in rows:
            if r["category"] not in categories:
                categories.append(r["category"])

    phase_avgs = {}

    for phase in phases:
        total = sum(1 for r in results if r["model"] == phase)
        valid = valid_by_phase[phase]

        if not valid:
            print(f"\n{phase}: No valid scores.")
            continue

        overall = _mean([r["score"] for r in valid])
        phase_avgs[phase] = overall

        print(f"\n  {phase}:")
        print(f"    Overall avg : {overall:.2f} / 5.0  (n={len(valid)}/{total})")
        print(f"    By category :")

        for cat in categories:
            cat_scores = [r["score"] for r in valid if r["category"] == cat]
            if cat_scores:
                avg = _mean(cat_scores)
                bar = "‚ñà" * int(avg)
                print(f"      {cat:<25} {avg:.2f}  {bar}  (n={len(cat_scores)})")

    # Delta table
    print("\n" + "-" * 70)
    print("  DELTA (Phase 2 - Phase 1):")

    for cat in categories:
        p1_scores = [r["score"] for r in valid_by_phase["Phase_1"] if r["category"] == cat]
        p2_scores = [r["score"] for r in valid_by_phase["Phase_2"] if r["category"] == cat]
        # A delta is only meaningful when both phases have scored answers.
        if p1_scores and p2_scores:
            p1_avg = _mean(p1_scores)
            p2_avg = _mean(p2_scores)
            delta = p2_avg - p1_avg
            arrow = _trend_arrow(delta)
            print(f"    {cat:<25} P1={p1_avg:.2f}  P2={p2_avg:.2f}  {arrow} {delta:+.2f}")

    if "Phase_1" in phase_avgs and "Phase_2" in phase_avgs:
        overall_delta = phase_avgs["Phase_2"] - phase_avgs["Phase_1"]
        arrow = _trend_arrow(overall_delta)
        print(f"\n    {'OVERALL':<25} P1={phase_avgs['Phase_1']:.2f}  P2={phase_avgs['Phase_2']:.2f}  {arrow} {overall_delta:+.2f}")

    print("=" * 70)

print("print_summary() ready.")

In [None]:
# ==========================================
# 🚀  MAIN
# ==========================================
def main():
    """Run the Phase 1 vs Phase 2 evaluation end to end.

    Loads the eval set, the judge and the 4-bit base model. Then, for each
    phase, it attaches that phase's adapter, generates an answer for every
    question, has the judge score it and appends a record to the results
    list. Results are written to OUTPUT_FILE after each phase and again at
    the end, after which the comparison table is printed.

    A phase whose adapter cannot be loaded is reported and skipped.
    """
    os.makedirs(RESULTS_DIR, exist_ok=True)

    # Load eval set
    print(f"Loading eval set from: {EVAL_SET_PATH}")
    with open(EVAL_SET_PATH, "r", encoding="utf-8") as f:
        eval_data = json.load(f)
    print(f"Loaded {len(eval_data)} questions.\n")

    # Verify categories
    from collections import Counter
    cat_counts = Counter(item["category"] for item in eval_data)
    print("Category breakdown:")
    for cat, count in sorted(cat_counts.items()):
        print(f"  {cat:<25} {count} questions")
    print()

    # Load judge once; it stays loaded for both phases
    load_judge()

    # Load base model once
    print(f"Loading base model: {BASE_MODEL}...")
    # NOTE(review): no bnb_4bit_quant_type is set here, so the library
    # default is used, while the judge explicitly uses "nf4". Confirm this
    # matches the quantization the adapters were trained with.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16
    )
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=("qwen" in BASE_MODEL.lower()),
        torch_dtype=torch.float16
    )
    tokenizer = AutoTokenizer.from_pretrained(
        BASE_MODEL,
        trust_remote_code=("qwen" in BASE_MODEL.lower())
    )
    print("Base model loaded.\n")

    results = []

    # -- Evaluate both phases --------------------------------------------
    for phase_name, adapter_path in [
        ("Phase_1", PHASE_1_ADAPTER),
        ("Phase_2", PHASE_2_ADAPTER)
    ]:
        print(f"\n{'='*60}")
        print(f"üîÑ  {phase_name} ‚Äî Loading adapter...")
        print(f"    {adapter_path}")
        print(f"{'='*60}\n")

        try:
            model = PeftModel.from_pretrained(base_model, adapter_path)
            model.eval()
        except Exception as e:
            print(f"‚ùå Could not load {phase_name} adapter: {e}")
            continue

        phase_written = 0

        for i, item in enumerate(tqdm(eval_data, desc=phase_name), 1):
            instruction = item["prompt"]
            reference   = item["reference"]
            category    = item["category"]
            # Fall back to the 1-based position when the item has no "id".
            item_id     = item.get("id", f"{i:03d}")

            # Generate answer
            answer = generate_response(model, tokenizer, instruction)

            # Judge scores it
            score, reasoning = judge_score(instruction, reference, answer)

            print(f"  [{i:02d}/{len(eval_data)}] [{category}] Score: {score}/5 ‚Äî {reasoning[:80]}")

            results.append({
                "model":           phase_name,
                "category":        category,
                "id":              item_id,
                "prompt":          instruction,
                "reference":       reference,
                "answer":          answer,
                "score":           score,
                "judge_reasoning": reasoning,
                "timestamp":       datetime.now().isoformat()
            })
            phase_written += 1

        # Save after each phase so you don't lose Phase 1 if Phase 2 crashes
        with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
        print(f"\n‚úÖ {phase_name} done ‚Äî {phase_written} questions scored.")
        print(f"üíæ Intermediate save ‚Üí {OUTPUT_FILE}")

        # Unload adapter before loading the next phase.
        # PeftModel.from_pretrained injects the LoRA layers into base_model
        # in place, so deleting the wrapper alone would leave this phase's
        # layers attached when the next adapter is loaded. unload() removes
        # them without merging and hands back the clean base model.
        print(f"Unloading {phase_name} adapter...")
        base_model = model.unload()
        del model
        torch.cuda.empty_cache()
        gc.collect()

    # Final save
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\nüíæ Final results saved ‚Üí {OUTPUT_FILE}")

    # Print comparison
    print_summary(results)


main()