In [1]:
!pip install peft bitsandbytes accelerate huggingface_hub -q

[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m60.7/60.7 MB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

# Client for the secrets attached to this Kaggle notebook.
user_secrets = UserSecretsClient()
# A secret labelled "HF_TOKEN" must exist in Kaggle's "Secrets" tab (Add-ons -> Secrets);
# its value is handed to huggingface_hub's login() as the access token.
login(token=user_secrets.get_secret("HF_TOKEN"))

In [3]:
import torch
import json
import re
import os
import random
from collections import defaultdict
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from peft import PeftModel
from datetime import datetime
import warnings
import transformers
warnings.filterwarnings("ignore")
transformers.logging.set_verbosity_error()

# ==========================================
# 🎯 SELECT THE MODEL TO EVALUATE
# ==========================================
# Leave exactly one assignment uncommented; the value must be a key of CONFIGS below.
# ACTIVE_MODEL = "phi4_qlora"
ACTIVE_MODEL = "phi4_lora"
# ACTIVE_MODEL = "qwen3_qlora"
# ACTIVE_MODEL = "qwen3_lora"
# ACTIVE_MODEL = "gemma3_qlora"
# ACTIVE_MODEL = "gemma3_lora"

# ==========================================
# ⚙️  PATHS — edit dataset name to match yours
# ==========================================
ADAPTER_DATASET = "/kaggle/input/datasets/shreyashgaurgla/nyaya-adapters"   # ← change to your adapter dataset path
DATA_DATASET    = "/kaggle/input/datasets/shreyashgaurgla/nyaya-llm-dataset"  # ← change to your data dataset path

# Maps each ACTIVE_MODEL key to the Hugging Face id of its base model and the
# directory (under ADAPTER_DATASET) that holds the trained adapter.
CONFIGS = {
    "phi4_qlora":   {"base_model": "microsoft/Phi-4-mini-instruct", "adapter_dir": f"{ADAPTER_DATASET}/qlora_phase1_phi4_mini/qlora_phase1_phi4_mini"},
    "phi4_lora":    {"base_model": "microsoft/Phi-4-mini-instruct", "adapter_dir": f"{ADAPTER_DATASET}/lora_phase1_phi4_mini/lora_phase1_phi4_mini"},
    "qwen3_qlora":  {"base_model": "Qwen/Qwen3-4B-Instruct-2507",  "adapter_dir": f"{ADAPTER_DATASET}/qlora_phase1_qwen3_4b/qlora_phase1_qwen3_4b"},
    "qwen3_lora":   {"base_model": "Qwen/Qwen3-4B-Instruct-2507",  "adapter_dir": f"{ADAPTER_DATASET}/lora_phase1_qwen3_4b/lora_phase1_qwen3_4b"},
    "gemma3_qlora": {"base_model": "google/gemma-3-4b-it",          "adapter_dir": f"{ADAPTER_DATASET}/qlora_phase1_gemma3_4b/qlora_phase1_gemma3_4b"},
    "gemma3_lora":  {"base_model": "google/gemma-3-4b-it",          "adapter_dir": f"{ADAPTER_DATASET}/lora_phase1_gemma3_4b/lora_phase1_gemma3_4b"},
}

# JSONL file the evaluation samples are drawn from (read by load_test_samples).
TEST_DATA_PATH = f"{DATA_DATASET}/all_acts_test.jsonl"
# Upper bound on how many test records are sampled for evaluation.
NUM_SAMPLES    = 150
# Seed for Python's `random` module so the same subset is drawn on every run.
SEED           = 42
# Directory where main() writes the per-model results JSON.
RESULTS_DIR    = "/kaggle/working/results"
# Maximum number of judge attempts per sample in judge_score().
JUDGE_RETRIES  = 3

# ==========================================
# 🏷️  TASK TYPE DETECTOR
# ==========================================
def detect_task_type(instruction: str) -> str:
    """Label an instruction by the phrase it starts with (case-insensitive).

    Args:
        instruction: The raw instruction text of a test sample.

    Returns:
        "Explanation", "Summarization", "Direct Q&A" or "Act Identification"
        for a recognised leading phrase, otherwise "Other".
    """
    prefix_labels = (
        ("explain", "Explanation"),
        ("summarize", "Summarization"),
        ("what does", "Direct Q&A"),
        ("under which act", "Act Identification"),
    )
    lowered = instruction.lower()
    # The first matching prefix wins; the tuple order fixes the precedence.
    return next(
        (label for prefix, label in prefix_labels if lowered.startswith(prefix)),
        "Other",
    )

# ==========================================
# 📦  LOAD TEST SAMPLES
# ==========================================
def load_test_samples(filepath: str, num_samples: int, seed: "int | None" = None) -> list:
    """Read a JSON Lines file and return a reproducible random subset of its records.

    Args:
        filepath: Path to a UTF-8 JSONL file (one JSON value per line).
        num_samples: Maximum number of records to return; if the file holds
            fewer records, all of them are returned (in sampled order).
        seed: Seed for the sampling. When None, the module-level SEED is used.

    Returns:
        A list of decoded records drawn without replacement.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        # Blank / whitespace-only lines (e.g. a stray empty line at the end of
        # the file) are not records; json.loads would raise on them.
        data = [json.loads(line) for line in f if line.strip()]

    if seed is None:
        seed = SEED
    # Reseeds the global `random` generator, as the function always has, so the
    # draw is identical across runs for a given seed.
    random.seed(seed)
    return random.sample(data, min(num_samples, len(data)))

# ==========================================
# 🤖  GENERATION
# ==========================================
def generate_response(model, tokenizer, instruction: str) -> str:
    """Generate the model's answer to one instruction.

    The instruction is wrapped in the "### Instruction / ### Response" prompt
    and up to 300 new tokens are sampled at temperature 0.1.

    Args:
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Tokenizer paired with ``model``.
        instruction: The question/instruction text.

    Returns:
        The decoded completion only (prompt removed), stripped of surrounding
        whitespace.
    """
    prompt = f"### Instruction:\n{instruction}\n\n### Response:\n"
    # Follow the model's own device rather than assuming "cuda": the model is
    # placed by device_map="auto", so the target device is not fixed here.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=300,
            temperature=0.1,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # The returned sequence starts with the prompt tokens. Slice them off by
    # length instead of splitting the decoded text on "### Response:\n", which
    # would drop part of the answer if the model emitted that marker again.
    prompt_length = inputs["input_ids"].shape[1]
    completion_ids = outputs[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

# ==========================================
# 🧑‍⚖️  JUDGE — HuggingFace (no Ollama needed)
# ==========================================
# Prompt sent to the judge model. judge_score() fills {instruction}, {reference}
# and {prediction} via str.format; the doubled braces on the last line are
# literal braces showing the judge the JSON object it must return.
JUDGE_PROMPT_TEMPLATE = """You are a strict legal AI evaluator. Score the MODEL ANSWER against the REFERENCE ANSWER.

SCORING RUBRIC:
  5 - Perfect. Legally accurate, complete, no errors.
  4 - Good. Correct core content, minor omissions or slight imprecision.
  3 - Acceptable. Gets the general idea but misses important legal details.
  2 - Poor. Partially correct but contains notable legal errors.
  1 - Wrong. Fabricated law, completely incorrect, or irrelevant answer.

IMPORTANT:
- Score based on semantic correctness, NOT exact wording match.
- The reference may be long. The model only needs to capture the key legal meaning.
- If the model answer is legally equivalent to the reference but phrased differently, score it high.

QUESTION:
{instruction}

REFERENCE ANSWER:
{reference}

MODEL ANSWER:
{prediction}

Respond ONLY with a valid JSON object, nothing else:
{{"score": <int 1-5>, "reasoning": "<one concise sentence>"}}"""

# Text-generation pipeline of the judge model; assigned by load_judge() and
# called by judge_score(). None until the judge has been loaded.
judge_pipe = None

def load_judge():
    """Load the 4-bit quantized judge model into the module-level ``judge_pipe``.

    The call is a no-op when ``judge_pipe`` is already set, so invoking the
    evaluation again in the same kernel does not load a second copy of the
    judge next to the first one.

    Returns:
        None. The text-generation pipeline is stored in the global ``judge_pipe``.
    """
    global judge_pipe
    if judge_pipe is not None:
        print("Judge already loaded; reusing existing pipeline.\n")
        return

    judge_model_id = "Qwen/Qwen2.5-7B-Instruct"
    print("Loading judge model (Qwen2.5-7B 4-bit)...")

    judge_bnb = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4"
    )

    judge_model = AutoModelForCausalLM.from_pretrained(
        judge_model_id,
        quantization_config=judge_bnb,
        device_map="auto",
        torch_dtype=torch.float16
    )
    judge_model.generation_config.max_length = None

    judge_tokenizer = AutoTokenizer.from_pretrained(judge_model_id)

    judge_pipe = pipeline(
        "text-generation",
        model=judge_model,
        tokenizer=judge_tokenizer,
    )
    # Clear length defaults on the pipeline's model so they do not conflict with
    # the max_new_tokens / min_new_tokens passed per call in judge_score().
    # Deliberately repeated after pipeline construction.
    judge_pipe.model.generation_config.max_length = None
    judge_pipe.model.generation_config.min_length = 0
    print("Judge loaded.\n")

def _parse_judge_response(response: str) -> tuple:
    """Extract ``(score, reasoning)`` from the judge's raw text reply.

    Raises:
        ValueError: If the reply is empty, holds no JSON object with a
            "score" key, or the score lies outside 1..5.
    """
    # Strip markdown fences
    cleaned = re.sub(r"```(?:json)?", "", response).strip()
    if not cleaned:
        raise ValueError("Empty response from judge")

    # Decode a complete JSON value starting at each "{" in turn. A non-greedy
    # regex up to the first "}" would cut the object short whenever the
    # reasoning text itself contains a brace.
    decoder = json.JSONDecoder()
    parsed = None
    for brace in re.finditer(r"\{", cleaned):
        try:
            candidate, _ = decoder.raw_decode(cleaned[brace.start():])
        except json.JSONDecodeError:
            continue
        if isinstance(candidate, dict) and "score" in candidate:
            parsed = candidate
            break
    if parsed is None:
        raise ValueError(f"No JSON found. Raw: {cleaned[:150]}")

    score = int(parsed["score"])
    if not (1 <= score <= 5):
        raise ValueError(f"Score out of range: {score}")

    return score, parsed.get("reasoning", "")


def judge_score(instruction: str, reference: str, prediction: str) -> tuple:
    """Ask the judge model to grade a prediction against the reference answer.

    Args:
        instruction: The question given to the evaluated model.
        reference: Ground-truth answer; only its first 600 characters are sent.
        prediction: The evaluated model's answer; only its first 600 characters are sent.

    Returns:
        ``(score, reasoning)`` with score in 1..5, or ``(0, <error text>)`` when
        no attempt out of JUDGE_RETRIES produced a usable reply. Failures are
        printed, never raised.
    """
    prompt = JUDGE_PROMPT_TEMPLATE.format(
        instruction=instruction,
        reference=reference[:600],
        prediction=prediction[:600]
    )

    for attempt in range(JUDGE_RETRIES):
        # The first attempt is greedy (deterministic). Repeating greedy decoding
        # on the same prompt would reproduce the same failure, so retries sample.
        if attempt == 0:
            decoding = {"do_sample": False}
        else:
            decoding = {"do_sample": True, "temperature": 0.7}

        try:
            output = judge_pipe(
                prompt,
                max_new_tokens=150,
                min_new_tokens=10,
                return_full_text=False,
                pad_token_id=judge_pipe.tokenizer.eos_token_id,
                **decoding
            )
            return _parse_judge_response(output[0]["generated_text"])

        except Exception as e:
            # Best effort by design: one bad judge reply must not abort the run.
            print(f"      ‚ö†Ô∏è  Judge attempt {attempt + 1} failed: {e}")

    return 0, "Judge error ‚Äî skipped"

# ==========================================
# 📊  SUMMARY PRINTER
# ==========================================
def print_summary(results: list, model_name: str):
    """Print an aggregate report of the judge scores.

    Entries whose "score" is not positive (judge failures) are left out of all
    statistics. The report shows the overall mean, a bar per score level from
    5 down to 1, and the mean per task type in alphabetical order.

    Args:
        results: Result dicts carrying at least "score" and "task_type".
        model_name: Label shown (upper-cased) in the report header.
    """
    scored = [entry for entry in results if entry["score"] > 0]
    if len(scored) == 0:
        print("No valid scores to summarise.")
        return

    # Single pass: running total, per-task score lists and per-level counts.
    scores_by_task = defaultdict(list)
    histogram = defaultdict(int)
    running_total = 0
    for entry in scored:
        value = entry["score"]
        running_total += value
        scores_by_task[entry["task_type"]].append(value)
        histogram[value] += 1
    mean_score = running_total / len(scored)

    rule = "=" * 60
    print("\n" + rule)
    print(f"üìä  PHASE 1 SUMMARY ‚Äî {model_name.upper()}")
    print(rule)
    print(f"  Scored      : {len(scored)} / {len(results)} samples")
    print(f"  Overall avg : {mean_score:.2f} / 5.0")

    print("\n  Score distribution:")
    for level in (5, 4, 3, 2, 1):
        occurrences = histogram.get(level, 0)
        blocks = "‚ñà" * occurrences
        print(f"    {level}/5  {blocks} ({occurrences})")

    print("\n  By task type:")
    for task_name in sorted(scores_by_task):
        task_scores = scores_by_task[task_name]
        task_mean = sum(task_scores) / len(task_scores)
        print(f"    {task_name:<20} avg={task_mean:.2f}  (n={len(task_scores)})")

    print(rule)

# ==========================================
# 🚀  MAIN
# ==========================================
def _write_results(results: list, output_file: str) -> None:
    """Write ``results`` as JSON to ``output_file`` through a temp file and an atomic rename."""
    tmp_file = f"{output_file}.tmp"
    with open(tmp_file, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    # os.replace swaps the file in one step, so an interruption mid-write never
    # leaves a truncated results file behind.
    os.replace(tmp_file, output_file)


def main():
    """Evaluate the adapter selected by ACTIVE_MODEL and report judge scores.

    Steps: load the base model in 4-bit, attach the adapter, load the judge,
    draw the test samples, then for each sample generate an answer, have it
    judged, and record the outcome. Results are written to
    ``{RESULTS_DIR}/phase1_eval_{ACTIVE_MODEL}.json`` after every sample, so an
    interrupted run keeps what was already scored. A summary is printed at the
    end.

    Returns early (after printing the error) if the adapter cannot be loaded.
    """
    cfg = CONFIGS[ACTIVE_MODEL]
    os.makedirs(RESULTS_DIR, exist_ok=True)
    output_file = f"{RESULTS_DIR}/phase1_eval_{ACTIVE_MODEL}.json"
    trust_remote = "qwen" in ACTIVE_MODEL

    print(f"\nüöÄ Evaluating: {ACTIVE_MODEL.upper()}")
    print("Loading 4-bit quantization config...")

    # NOTE(review): unlike the judge's config, no bnb_4bit_quant_type is set
    # here, so the library default applies; confirm it matches how the
    # adapters were trained.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16
    )

    print(f"Loading base model: {cfg['base_model']}...")
    base_model = AutoModelForCausalLM.from_pretrained(
        cfg["base_model"],
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=trust_remote,
        torch_dtype=torch.float16
    )

    print(f"Loading LoRA adapter: {cfg['adapter_dir']}...")
    try:
        model = PeftModel.from_pretrained(base_model, cfg["adapter_dir"])
        model.eval()
    except Exception as e:
        print(f"\n‚ùå Could not load adapter. Did training finish?\n{e}")
        return

    tokenizer = AutoTokenizer.from_pretrained(
        cfg["base_model"],
        trust_remote_code=trust_remote
    )

    # Load judge after main model to avoid VRAM conflict
    load_judge()

    print(f"\nLoading {NUM_SAMPLES} test samples (seed={SEED})...")
    samples = load_test_samples(TEST_DATA_PATH, NUM_SAMPLES)
    # The file may hold fewer than NUM_SAMPLES records; report progress against
    # the number actually drawn.
    total = len(samples)

    print(f"\n{'='*60}")
    print(f"‚öñÔ∏è   PHASE 1 EVALUATION: {ACTIVE_MODEL.upper()}")
    print(f"{'='*60}\n")

    results = []

    for i, sample in enumerate(samples, 1):
        instruction  = sample["instruction"]
        ground_truth = sample["output"]
        task_type    = detect_task_type(instruction)

        print(f"[{i:02d}/{total}] [{task_type}]")
        print(f"  Prompt : {instruction[:80]}...")

        prediction = generate_response(model, tokenizer, instruction)
        print(f"  Answer : {prediction[:80]}...")

        score, reasoning = judge_score(instruction, ground_truth, prediction)
        print(f"  Score  : {score}/5  ‚Äî {reasoning}\n")

        results.append({
            "model":        ACTIVE_MODEL,
            "task_type":    task_type,
            "instruction":  instruction,
            "ground_truth": ground_truth,
            "prediction":   prediction,
            "score":        score,
            "reasoning":    reasoning,
            "timestamp":    datetime.now().isoformat()
        })

        # Checkpoint after every sample: the run is long and a crash or session
        # timeout would otherwise discard every score collected so far.
        _write_results(results, output_file)

    # Final write also covers the zero-sample case, so the file always exists.
    _write_results(results, output_file)

    print(f"üíæ Raw results saved ‚Üí {output_file}")
    print_summary(results, ACTIVE_MODEL)


# Run the evaluation when this code executes as the top-level script/cell
# (the captured output below this cell shows it running in the notebook).
if __name__ == "__main__":
    main()


üöÄ Evaluating: PHI4_LORA
Loading 4-bit quantization config...
Loading base model: microsoft/Phi-4-mini-instruct...


config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/194 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

Loading LoRA adapter: /kaggle/input/datasets/shreyashgaurgla/nyaya-adapters/lora_phase1_phi4_mini/lora_phase1_phi4_mini...


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/15.5M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/249 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/587 [00:00<?, ?B/s]

Loading judge model (Qwen2.5-7B 4-bit)...


config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/339 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Judge loaded.


Loading 150 test samples (seed=42)...

‚öñÔ∏è   PHASE 1 EVALUATION: PHI4_LORA

[01/150] [Act Identification]
  Prompt : Under which Act does Section 221 titled 'Where it is doubtful what offence has b...
  Answer : Section 221 titled 'Where it is doubtful what offence has been committed' falls ...
  Score  : 2/5  ‚Äî The model incorrectly identifies the Code of Criminal Procedure, 1973 as the source, when it actually belongs to the Indian Penal Code, 1860.

[02/150] [Direct Q&A]
  Prompt : What does Section 326A of Indian Penal Code, 1860 state?...
  Answer : Whoever causes grievous hurt to any woman, being an officer or a member of the a...
  Score  : 2/5  ‚Äî The model answer is partially correct but contains notable legal errors, as it misidentifies the section and provides an incorrect description of the offense.

[03/150] [Act Identification]
  Prompt : Under which Act does Section 88A titled 'Presumption as to electronic messages' ...
  Answer : Section 88A titled