In [None]:
# Mount Google Drive at /content/drive (Colab-only API).
from google.colab import drive
drive.mount('/content/drive')

# Assignment folder on Drive: created if missing, then made the working directory
# so every relative path used by later cells resolves inside it.
ASSIGN_DIR = "/content/drive/MyDrive/LLM_Assignment_2"
!mkdir -p "$ASSIGN_DIR"
%cd "$ASSIGN_DIR"
!echo "Working in: $PWD"


In [None]:
%%bash
# Install the fine-tuning stack quietly; each package carries a minimum-version pin.
pip -q install "transformers>=4.45.0" "accelerate>=0.34.0" "datasets>=2.20.0" \
               "peft>=0.13.0" "evaluate>=0.4.2" "bitsandbytes>=0.43.1" \
               "torch>=2.3.0"
# Sanity check: print the torch version, CUDA availability, GPU name and platform.
python - << 'PY'
import torch, platform
print("Torch:", torch.__version__, "| CUDA:", torch.cuda.is_available(), "| GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else None)
print(platform.platform())
PY


In [None]:
%%writefile train_lora.py
import os, json, torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling,
    Trainer, TrainingArguments
)
from peft import LoraConfig, get_peft_model, TaskType

# ---------- ENV (edit via os.environ in Colab cells) ----------
# Every setting is read once, at import time, from the process environment;
# the second argument of each os.environ.get() is the default.
BASE_MODEL = os.environ.get("BASE_MODEL", "HuggingFaceTB/SmolLM2-360M-Instruct")
TRAIN_PATH = os.environ.get("TRAIN_PATH", "data/train.jsonl")
VAL_PATH   = os.environ.get("VAL_PATH",   "data/val.jsonl")
OUTPUT_DIR = os.environ.get("OUTPUT_DIR", "adapters/task-lora")
USE_4BIT   = os.environ.get("USE_4BIT", "1") == "1"  # helps memory on T4 (16GB)

# LoRA hyper-parameters.
LORA_R        = int(os.environ.get("LORA_R", "16"))
LORA_ALPHA    = int(os.environ.get("LORA_ALPHA", "32"))
LORA_DROPOUT  = float(os.environ.get("LORA_DROPOUT", "0.05"))

# Optimisation / Trainer settings.
LR            = float(os.environ.get("LR", "2e-4"))
EPOCHS        = float(os.environ.get("EPOCHS", "1"))
BATCH_SIZE    = int(os.environ.get("BATCH_SIZE", "8"))
GRAD_ACCUM    = int(os.environ.get("GRAD_ACCUM", "2"))
MAX_STEPS     = int(os.environ.get("MAX_STEPS", "0"))    # 0 => use epochs
WARMUP_RATIO  = float(os.environ.get("WARMUP_RATIO", "0.03"))
LOG_STEPS     = int(os.environ.get("LOG_STEPS", "10"))
SAVE_STEPS    = int(os.environ.get("SAVE_STEPS", "200"))
SEED          = int(os.environ.get("SEED", "42"))

# Comma-separated module names that receive LoRA adapters. Each entry is
# trimmed and empty entries are dropped, so "q_proj, v_proj" or a trailing
# comma does not produce names that fail to match any module.
TARGET_MODULES = [
    name.strip()
    for name in os.environ.get(
        "TARGET_MODULES",
        "q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj"
    ).split(",")
    if name.strip()
]

# Prompt layout; must contain the {instruction} and {input} placeholders.
PROMPT_TEMPLATE = os.environ.get("PROMPT_TEMPLATE", """\
### Instruction:
{instruction}

### Input:
{input}

### Response:
""")

def format_row(ex):
    """Render one raw record as a single training string.

    Reads the "instruction", "input" and "output" fields of ``ex`` (missing or
    falsy values count as empty), strips surrounding whitespace, fills
    PROMPT_TEMPLATE with the first two and appends the output text.

    Returns:
        dict with the single key "text".
    """
    cleaned = {
        key: (ex.get(key) or "").strip()
        for key in ("instruction", "input", "output")
    }
    prompt = PROMPT_TEMPLATE.format(
        instruction=cleaned["instruction"],
        input=cleaned["input"],
    )
    return {"text": prompt + cleaned["output"]}

def load_jsonl_dataset(train_path, val_path):
    """Load the train/validation JSONL files and convert every row to {"text": ...}.

    The two files become the "train" and "validation" splits; all columns that
    exist in the train split are dropped after format_row has been applied.
    """
    files = {"train": train_path, "validation": val_path}
    raw = load_dataset("json", data_files=files)
    source_columns = raw["train"].column_names
    return raw.map(format_row, remove_columns=source_columns)

def gpu_dtype():
    """Return torch.float16 when CUDA is available, torch.float32 otherwise."""
    # T4 prefers fp16 (no bf16), so half precision on GPU means float16.
    if torch.cuda.is_available():
        return torch.float16
    return torch.float32

def main():
    """Fine-tune BASE_MODEL with LoRA on the JSONL data and save the adapter.

    Steps: seed torch, load the tokenizer and the (optionally 4-bit quantized)
    model, wrap it with LoRA, tokenize the formatted dataset, train with the
    Hugging Face Trainer and finally write the adapter to OUTPUT_DIR.
    """
    torch.manual_seed(SEED)
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
    if tokenizer.pad_token is None:
        # Padding to max_length below needs a pad token; reuse EOS.
        tokenizer.pad_token = tokenizer.eos_token

    quant_kwargs = {}
    if USE_4BIT:
        from transformers import BitsAndBytesConfig
        bnb = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=gpu_dtype(),
        )
        quant_kwargs["quantization_config"] = bnb

    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        dtype=gpu_dtype(),          # <- updated (replaces deprecated torch_dtype)
        device_map="auto",
        **quant_kwargs
    )

    lora = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        target_modules=TARGET_MODULES,
        bias="none",
    )
    model = get_peft_model(model, lora)
    model.print_trainable_parameters()

    ds = load_jsonl_dataset(TRAIN_PATH, VAL_PATH)

    def tok(batch):
        # Fixed-length padding; labels are produced later by the data collator.
        return tokenizer(batch["text"], truncation=True, max_length=1024, padding="max_length")
    tokenized = ds.map(tok, batched=True, remove_columns=["text"])

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=min(4, BATCH_SIZE),
        gradient_accumulation_steps=GRAD_ACCUM,
        learning_rate=LR,
        warmup_ratio=WARMUP_RATIO,
        logging_steps=LOG_STEPS,
        save_steps=SAVE_STEPS,
        # Only the current argument name is passed. Supplying the legacy
        # `evaluation_strategy="no"` as well either overrides this value
        # (turning evaluation off) or is rejected as an unknown argument,
        # depending on the installed Transformers version.
        eval_strategy="steps",
        eval_steps=SAVE_STEPS,
        report_to="none",
        gradient_checkpointing=True,
        optim="paged_adamw_8bit" if USE_4BIT else "adamw_torch",
        seed=SEED,
        bf16=False,                 # T4: keep bf16 off
        fp16=torch.cuda.is_available(),
        max_steps=MAX_STEPS,        # <- always int; 0 = use epochs
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["validation"],
        data_collator=data_collator,
    )

    trainer.train()
    model.save_pretrained(OUTPUT_DIR)
    print(f"Saved LoRA adapter to {OUTPUT_DIR}")

if __name__ == "__main__":
    main()


In [None]:
%%writefile infer.py
import os, json, torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Settings are read once, at import time, from the process environment.
EVAL_FILE = os.environ.get("EVAL_FILE", "data/eval_questions.jsonl")
BASE_MODEL = os.environ.get("BASE_MODEL", "HuggingFaceTB/SmolLM2-360M-Instruct")
# Empty string => run the base model without any adapter.
PEFT_ADAPTER_PATH = os.environ.get("PEFT_ADAPTER_PATH", "")

MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", "192"))
TEMPERATURE = float(os.environ.get("TEMPERATURE", "0.2"))

# Read the template from the environment the same way train_lora.py does, so a
# custom PROMPT_TEMPLATE used for training is also used at inference time.
# The default is identical to the training default.
PROMPT_TEMPLATE = os.environ.get("PROMPT_TEMPLATE", """\
### Instruction:
{instruction}

### Input:
{input}

### Response:
""")

def gpu_dtype():
    """Choose the model dtype: half precision on CUDA, full precision on CPU."""
    cuda_ready = torch.cuda.is_available()
    return torch.float16 if cuda_ready else torch.float32

def generate(model, tokenizer, prompt):
    """Generate a completion for ``prompt`` and return only the answer text.

    Sampling with TEMPERATURE is used when it is positive. A TEMPERATURE of 0
    or below switches to greedy decoding, because sampling with a non-positive
    temperature is rejected by ``model.generate``.

    Returns:
        The decoded text after the last "### Response:" marker, stripped.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    gen_kwargs = {
        "max_new_tokens": MAX_NEW_TOKENS,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if TEMPERATURE > 0:
        gen_kwargs["do_sample"] = True
        gen_kwargs["temperature"] = TEMPERATURE
    else:
        # Deterministic decoding; temperature is not passed at all.
        gen_kwargs["do_sample"] = False

    with torch.no_grad():
        out = model.generate(**inputs, **gen_kwargs)

    # The decoded sequence contains the prompt followed by the continuation.
    text = tokenizer.decode(out[0], skip_special_tokens=True)
    return text.split("### Response:")[-1].strip()

def main():
    """Run BASE_MODEL (optionally with a LoRA adapter) over EVAL_FILE.

    Each non-blank line of EVAL_FILE is parsed as a JSON object; its
    "instruction" and "input" fields fill PROMPT_TEMPLATE. One JSON line per
    row is written to outputs/finetuned_responses.jsonl when PEFT_ADAPTER_PATH
    is set, otherwise to outputs/base_responses.jsonl.
    """
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL, torch_dtype=gpu_dtype(), device_map="auto"
    )

    if PEFT_ADAPTER_PATH:
        print(f"[infer] Using LoRA adapter: {PEFT_ADAPTER_PATH}")
        model = PeftModel.from_pretrained(model, PEFT_ADAPTER_PATH)

    # Explicit UTF-8: the eval data may hold non-ASCII text, and the platform
    # default encoding is not guaranteed to be UTF-8. Blank lines are skipped
    # instead of crashing json.loads.
    with open(EVAL_FILE, "r", encoding="utf-8") as f:
        rows = [json.loads(l) for l in f if l.strip()]

    os.makedirs("outputs", exist_ok=True)
    out_path = "outputs/finetuned_responses.jsonl" if PEFT_ADAPTER_PATH else "outputs/base_responses.jsonl"
    # ensure_ascii=False emits raw Unicode, so the file must be opened as UTF-8.
    with open(out_path, "w", encoding="utf-8") as w:
        for r in rows:
            prompt = PROMPT_TEMPLATE.format(
                instruction=r.get("instruction",""),
                input=r.get("input",""),
            )
            resp = generate(model, tokenizer, prompt)
            w.write(json.dumps({"instruction": r.get("instruction",""),
                                "input": r.get("input",""),
                                "response": resp}, ensure_ascii=False) + "\n")
    print(f"Wrote {len(rows)} responses to {out_path}")

if __name__ == "__main__":
    main()


In [None]:
%%writefile unit_test.py
import os, json
from pathlib import Path
import subprocess

# Tiny synthetic sentiment set used only by the smoke test below.
_INSTRUCTION = "Classify sentiment as POS or NEG."
_TRAIN_PAIRS = [
    ("I love this!", "POS"),
    ("Terrible product.", "NEG"),
    ("Absolutely fantastic experience.", "POS"),
    ("Not worth the money.", "NEG"),
]
_VAL_PAIRS = [
    ("This makes me happy.", "POS"),
    ("I am disappointed.", "NEG"),
]

# The four training examples are repeated five times -> 20 rows.
TINY_TRAIN = 5 * [
    {"instruction": _INSTRUCTION, "input": text, "output": label}
    for text, label in _TRAIN_PAIRS
]
TINY_VAL = [
    {"instruction": _INSTRUCTION, "input": text, "output": label}
    for text, label in _VAL_PAIRS
]

def write_jsonl(p, rows):
    """Write ``rows`` to the path ``p`` as JSON Lines, one object per line.

    Missing parent directories of ``p`` are created first; an existing file is
    overwritten.
    """
    p.parent.mkdir(parents=True, exist_ok=True)
    with open(p, "w") as sink:
        sink.writelines(json.dumps(row) + "\n" for row in rows)

def main():
    """Smoke-test the pipeline: a short LoRA training run followed by inference.

    Writes the tiny datasets under tmp_unit/, runs train_lora.py with a reduced
    configuration passed through environment variables, checks that an adapter
    was saved, then runs infer.py against that adapter. Any non-zero exit code
    of a child script raises subprocess.CalledProcessError.
    """
    import sys  # local import: only this function needs the interpreter path

    train = Path("tmp_unit/train.jsonl")
    val   = Path("tmp_unit/val.jsonl")
    write_jsonl(train, TINY_TRAIN)
    write_jsonl(val,   TINY_VAL)

    adapter_dir = Path("adapters/_unit_lora")

    env = os.environ.copy()
    env.update({
        "TRAIN_PATH": str(train),
        "VAL_PATH": str(val),
        "OUTPUT_DIR": str(adapter_dir),
        "USE_4BIT": "1",
        "LORA_R": "8",
        "LORA_ALPHA": "16",
        "LORA_DROPOUT": "0.05",
        "LR": "3e-4",
        "EPOCHS": "1",
        "BATCH_SIZE": "8",
        "GRAD_ACCUM": "1",
        "MAX_STEPS": "20",
        "SAVE_STEPS": "50",
        "LOG_STEPS": "5",
    })
    print("[unit] Starting short LoRA training...")
    # sys.executable guarantees the child runs under the same interpreter (and
    # therefore the same installed packages) as this script; a bare "python"
    # depends on whatever happens to be first on PATH.
    subprocess.run([sys.executable, "train_lora.py"], check=True, env=env)
    # Check for the saved adapter config rather than the bare directory, which
    # can be left over from an earlier run.
    assert (adapter_dir / "adapter_config.json").exists(), \
        f"no adapter_config.json found in {adapter_dir}"

    env2 = os.environ.copy()
    env2.update({
        "PEFT_ADAPTER_PATH": str(adapter_dir),
        "EVAL_FILE": str(val),
    })
    subprocess.run([sys.executable, "infer.py"], check=True, env=env2)
    print("[unit] OK.")

if __name__ == "__main__":
    main()


In [None]:
%%bash
# Seed a tiny hand-written dataset under data/. The heredoc delimiters are
# quoted, so the JSON lines are written verbatim (no shell expansion).
mkdir -p data
# Training rows: instruction / input / output.
cat > data/train.jsonl << 'JSON'
{"instruction":"Classify sentiment as POS or NEG.","input":"This phone is amazing, great battery life!","output":"POS"}
{"instruction":"Classify sentiment as POS or NEG.","input":"Worst purchase ever.","output":"NEG"}
{"instruction":"Classify sentiment as POS or NEG.","input":"Pretty decent for the price.","output":"POS"}
{"instruction":"Classify sentiment as POS or NEG.","input":"It broke after a week, I'm upset.","output":"NEG"}
JSON

# Validation rows, same fields as the training rows.
cat > data/val.jsonl << 'JSON'
{"instruction":"Classify sentiment as POS or NEG.","input":"I enjoy using this every day.","output":"POS"}
{"instruction":"Classify sentiment as POS or NEG.","input":"Support was unhelpful and rude.","output":"NEG"}
JSON

# Held-out eval questions (no gold labels needed)
cat > data/eval_questions.jsonl << 'JSON'
{"instruction":"Classify sentiment as POS or NEG.","input":"The update made it so much better."}
{"instruction":"Classify sentiment as POS or NEG.","input":"Totally disappointed with the quality."}
JSON


In [None]:
import os
os.environ["BASE_MODEL"] = "HuggingFaceTB/SmolLM2-360M-Instruct"  # safe default
os.environ["EVAL_FILE"] = "data/eval_questions.jsonl"

# Base outputs → outputs/base_responses.jsonl
!python infer.py
!sed -n '1,5p' outputs/base_responses.jsonl


In [None]:
import os

# Training configuration, handed to train_lora.py through environment variables.
os.environ.update({
    "BASE_MODEL": "HuggingFaceTB/SmolLM2-360M-Instruct",
    "TRAIN_PATH": "data/train.jsonl",
    "VAL_PATH": "data/val.jsonl",
    "OUTPUT_DIR": "adapters/task-lora",
    "USE_4BIT": "1",          # keeps VRAM low on T4
    "LORA_R": "16",
    "LORA_ALPHA": "32",
    "LORA_DROPOUT": "0.05",
    "LR": "2e-4",
    "EPOCHS": "1",            # increase later for better results
    "BATCH_SIZE": "8",
    "GRAD_ACCUM": "2",
    "SAVE_STEPS": "200",
    "LOG_STEPS": "10",
})


In [None]:
# From your assignment folder
%cd /content/drive/MyDrive/LLM_Assignment_2

# Patch train_lora.py to enable k-bit training prep
import re, io, sys
path = "train_lora.py"
src = open(path, "r").read()

# 1) Ensure import
if "prepare_model_for_kbit_training" not in src:
    src = src.replace(
        "from peft import LoraConfig, get_peft_model, TaskType",
        "from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training"
    )

# 2) After model = AutoModelForCausalLM.from_pretrained(...), call prepare_model_for_kbit_training(model)
src = src.replace(
    "model = AutoModelForCausalLM.from_pretrained(",
    "model = AutoModelForCausalLM.from_pretrained("
)

# Insert the prepare call only if not already present
if "prepare_model_for_kbit_training(model)" not in src:
    src = src.replace(
        ")\\n\\n    lora = LoraConfig(",
        ")\n\n    # Prepare model for 4/8-bit (enables gradients in a safe way)\n"
        "    if USE_4BIT:\n"
        "        model = prepare_model_for_kbit_training(model)\n"
        "    # Disable cache for training (checkpointing-friendly)\n"
        "    if hasattr(model.config, 'use_cache'):\n"
        "        model.config.use_cache = False\n\n"
        "    lora = LoraConfig("
    )

open(path, "w").write(src)
print("Patched train_lora.py")


In [None]:
%%writefile /content/drive/MyDrive/LLM_Assignment_2/train_lora.py
import os, torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments,
    default_data_collator
)
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training

# ---------- ENV ----------
# Every setting is read once, at import time, from the process environment;
# the second argument of each os.environ.get() is the default.
BASE_MODEL = os.environ.get("BASE_MODEL", "HuggingFaceTB/SmolLM2-360M-Instruct")
TRAIN_PATH = os.environ.get("TRAIN_PATH", "data/train.jsonl")
VAL_PATH   = os.environ.get("VAL_PATH",   "data/val.jsonl")
OUTPUT_DIR = os.environ.get("OUTPUT_DIR", "adapters/task-lora")
USE_4BIT   = os.environ.get("USE_4BIT", "1") == "1"  # T4-friendly

# LoRA hyper-parameters.
LORA_R        = int(os.environ.get("LORA_R", "16"))
LORA_ALPHA    = int(os.environ.get("LORA_ALPHA", "32"))
LORA_DROPOUT  = float(os.environ.get("LORA_DROPOUT", "0.05"))

# Optimisation / Trainer settings.
LR            = float(os.environ.get("LR", "2e-4"))
EPOCHS        = float(os.environ.get("EPOCHS", "1"))
BATCH_SIZE    = int(os.environ.get("BATCH_SIZE", "8"))
GRAD_ACCUM    = int(os.environ.get("GRAD_ACCUM", "2"))
MAX_STEPS     = int(os.environ.get("MAX_STEPS", "0"))    # 0 => use epochs
WARMUP_RATIO  = float(os.environ.get("WARMUP_RATIO", "0.03"))
LOG_STEPS     = int(os.environ.get("LOG_STEPS", "10"))
SAVE_STEPS    = int(os.environ.get("SAVE_STEPS", "200"))
SEED          = int(os.environ.get("SEED", "42"))

# Comma-separated module names that receive LoRA adapters. Each entry is
# trimmed and empty entries are dropped, so "q_proj, v_proj" or a trailing
# comma does not produce names that fail to match any module.
TARGET_MODULES = [
    name.strip()
    for name in os.environ.get(
        "TARGET_MODULES",
        "q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj"
    ).split(",")
    if name.strip()
]

# Prompt layout; must contain the {instruction} and {input} placeholders.
PROMPT_TEMPLATE = os.environ.get("PROMPT_TEMPLATE", """\
### Instruction:
{instruction}

### Input:
{input}

### Response:
""")

def gpu_dtype():
    """Return the dtype used for model weights and 4-bit compute."""
    # T4: use fp16; CPU: fp32
    dtype = torch.float32
    if torch.cuda.is_available():
        dtype = torch.float16
    return dtype

def format_row(ex):
    """Build the training text for one record.

    The "instruction" and "input" fields fill PROMPT_TEMPLATE and the "output"
    field is appended directly after it. Missing or falsy fields are treated as
    empty strings and every field is whitespace-stripped.

    Returns:
        dict with the single key "text".
    """
    def field(name):
        return (ex.get(name) or "").strip()

    rendered = PROMPT_TEMPLATE.format(instruction=field("instruction"), input=field("input"))
    return {"text": rendered + field("output")}

def load_jsonl_dataset(train_path, val_path):
    """Read both JSONL files as "train"/"validation" splits holding only a "text" column."""
    splits = load_dataset(
        "json",
        data_files=dict(train=train_path, validation=val_path),
    )
    # Drop every original column; format_row supplies the replacement "text".
    return splits.map(format_row, remove_columns=splits["train"].column_names)

def main():
    """Fine-tune BASE_MODEL with LoRA on the JSONL data and save the adapter.

    Steps: seed torch, load the tokenizer and the (optionally 4-bit quantized)
    model, prepare it for k-bit training with gradient checkpointing, wrap it
    with LoRA, tokenize the formatted dataset with explicit labels, train with
    the Hugging Face Trainer and write the adapter to OUTPUT_DIR.
    """
    torch.manual_seed(SEED)

    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
    if tokenizer.pad_token is None:
        # Padding to max_length below needs a pad token; reuse EOS.
        tokenizer.pad_token = tokenizer.eos_token

    quant_kwargs = {}
    if USE_4BIT:
        from transformers import BitsAndBytesConfig
        bnb = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=gpu_dtype(),
        )
        quant_kwargs["quantization_config"] = bnb

    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        dtype=gpu_dtype(),
        device_map="auto",
        **quant_kwargs
    )

    # K-bit prep + checkpointing-friendly flags
    if USE_4BIT:
        model = prepare_model_for_kbit_training(model)
        # Explicitly ensure inputs can require grads under GC paths
        if hasattr(model, "enable_input_require_grads"):
            model.enable_input_require_grads()
    if hasattr(model.config, "use_cache"):
        model.config.use_cache = False
    # Extra safety (Trainer also toggles this with gradient_checkpointing=True)
    if hasattr(model, "gradient_checkpointing_enable"):
        model.gradient_checkpointing_enable()

    # LoRA wrap
    lora = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        target_modules=TARGET_MODULES,
        bias="none",
    )
    model = get_peft_model(model, lora)
    model.print_trainable_parameters()

    # Data
    ds = load_jsonl_dataset(TRAIN_PATH, VAL_PATH)

    # Appended to every example so the model sees an end-of-sequence target and
    # learns to stop after the answer (empty if the tokenizer defines no EOS).
    eos = tokenizer.eos_token or ""

    # Tokenize + create labels directly (default_data_collator passes them through)
    def tok(batch):
        enc = tokenizer(
            [text + eos for text in batch["text"]],
            truncation=True,
            max_length=1024,
            padding="max_length",
        )
        # Copy the input ids as labels, but set padded positions
        # (attention_mask == 0) to -100 so the loss ignores them. Without this
        # mask most of the loss would come from padding, because every example
        # is padded to 1024 tokens.
        enc["labels"] = [
            [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
            for ids, mask in zip(enc["input_ids"], enc["attention_mask"])
        ]
        return enc

    tokenized = ds.map(tok, batched=True, remove_columns=["text"])

    args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=min(4, BATCH_SIZE),
        gradient_accumulation_steps=GRAD_ACCUM,
        learning_rate=LR,
        warmup_ratio=WARMUP_RATIO,
        logging_steps=LOG_STEPS,
        save_steps=SAVE_STEPS,
        eval_strategy="steps",
        eval_steps=SAVE_STEPS,
        report_to="none",
        optim="paged_adamw_8bit" if USE_4BIT else "adamw_torch",
        seed=SEED,
        fp16=torch.cuda.is_available(),
        bf16=False,
        gradient_checkpointing=True,
        max_steps=MAX_STEPS,   # int; 0 means use epochs
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["validation"],
        data_collator=default_data_collator,  # we already set labels
    )

    trainer.train()
    model.save_pretrained(OUTPUT_DIR)
    print(f"Saved LoRA adapter to {OUTPUT_DIR}")

if __name__ == "__main__":
    main()


In [None]:
# Run training from the assignment folder. train_lora.py takes its settings
# from environment variables (set via os.environ in an earlier cell) and falls
# back to its built-in defaults for anything unset.
%cd /content/drive/MyDrive/LLM_Assignment_2
!python train_lora.py

In [None]:
# Fine-tuned inference: a non-empty PEFT_ADAPTER_PATH makes infer.py load the
# adapter and write to outputs/finetuned_responses.jsonl.
%cd /content/drive/MyDrive/LLM_Assignment_2
import os
os.environ["BASE_MODEL"] = "HuggingFaceTB/SmolLM2-360M-Instruct"
os.environ["PEFT_ADAPTER_PATH"] = "adapters/task-lora"   # your trained LoRA
os.environ["EVAL_FILE"] = "data/eval_questions.jsonl"

# Run the script, then preview the first five output lines.
!python infer.py
!sed -n '1,5p' outputs/finetuned_responses.jsonl


In [None]:
# Smoke test: unit_test.py runs train_lora.py and then infer.py as subprocesses.
# NOTE: it sets PEFT_ADAPTER_PATH for infer.py, which therefore overwrites
# outputs/finetuned_responses.jsonl with the unit-test responses.
!python unit_test.py

In [None]:
# Build binary sentiment from TweetEval (filter out 'neutral')
%cd /content/drive/MyDrive/LLM_Assignment_2
!pip -q install datasets

import json, random
from datasets import load_dataset

# Seed Python's RNG so the random.shuffle calls below are reproducible.
random.seed(7)
# Load the "sentiment" configuration of the tweet_eval dataset.
ds = load_dataset("tweet_eval", "sentiment")

# labels: 0=negative, 1=neutral, 2=positive → keep only 0/2
def binarize(split):
    """Convert a 3-class split into a list of binary POS/NEG records.

    Pairs ``split["text"]`` with ``split["label"]``, drops rows whose label is
    1, and maps label 2 to "POS" and every remaining label to "NEG".

    Returns:
        list of {"text": ..., "label": "POS" | "NEG"} dicts in input order.
    """
    return [
        {"text": text, "label": "POS" if label == 2 else "NEG"}
        for text, label in zip(split["text"], split["label"])
        if label != 1
    ]

# Binarize each split, then shuffle in place. The train -> validation -> test
# order is kept because all three shuffles draw from the same seeded RNG.
train_b, val_b, test_b = [
    binarize(ds[split_name]) for split_name in ("train", "validation", "test")
]
for pool in (train_b, val_b, test_b):
    random.shuffle(pool)

# sample sizes
N_TRAIN = 2000
N_VAL = 300
N_EVAL = 60
train_s = train_b[:N_TRAIN]
val_s = val_b[:N_VAL]
eval_s = test_b[:N_EVAL]

!mkdir -p data

with open("data/train.jsonl","w") as ft:
    for r in train_s:
        j = {
            "instruction": "Classify sentiment as POS or NEG.",
            "input": r["text"],
            "output": r["label"]
        }
        ft.write(json.dumps(j, ensure_ascii=False) + "\n")

with open("data/val.jsonl","w") as fv:
    for r in val_s:
        j = {
            "instruction": "Classify sentiment as POS or NEG.",
            "input": r["text"],
            "output": r["label"]
        }
        fv.write(json.dumps(j, ensure_ascii=False) + "\n")

# Eval with ground truth so we can score accuracy/F1 later
with open("data/eval_questions.jsonl","w") as fe:
    for r in eval_s:
        j = {
            "instruction": "Classify sentiment as POS or NEG.",
            "input": r["text"],
            "reference": r["label"]
        }
        fe.write(json.dumps(j, ensure_ascii=False) + "\n")

def count_lines(p):
    """Return the number of lines in the text file at path ``p``."""
    # The context manager closes the handle; a bare open() inside the generator
    # left that to the garbage collector.
    with open(p) as f:
        return sum(1 for _ in f)
# Report how many rows ended up in each generated file.
split_files = [
    ("train", "data/train.jsonl"),
    ("val", "data/val.jsonl"),
    ("eval", "data/eval_questions.jsonl"),
]
print({split: count_lines(file_path) for split, file_path in split_files})


In [None]:
# Train (same env you used)
import os

# Settings handed to train_lora.py through environment variables.
TRAIN_ENV = {
    "BASE_MODEL": "HuggingFaceTB/SmolLM2-360M-Instruct",
    "TRAIN_PATH": "data/train.jsonl",
    "VAL_PATH": "data/val.jsonl",
    "OUTPUT_DIR": "adapters/task-lora",
    "USE_4BIT": "1",
    "LORA_R": "16",
    "LORA_ALPHA": "32",
    "LORA_DROPOUT": "0.05",
    "LR": "2e-4",
    "EPOCHS": "1",
    "BATCH_SIZE": "8",
    "GRAD_ACCUM": "2",
    "SAVE_STEPS": "200",
    "LOG_STEPS": "10",
}
for env_key, env_value in TRAIN_ENV.items():
    os.environ[env_key] = env_value


In [None]:
# Retrain on the TweetEval-derived files now stored in data/; the adapter is
# written to OUTPUT_DIR (adapters/task-lora per the environment set above).
%cd /content/drive/MyDrive/LLM_Assignment_2
!python train_lora.py

In [None]:
# Inference: base vs fine-tuned
import os
# Base
# An empty PEFT_ADAPTER_PATH makes infer.py skip the adapter and write to
# outputs/base_responses.jsonl.
os.environ["PEFT_ADAPTER_PATH"] = ""
os.environ["EVAL_FILE"] = "data/eval_questions.jsonl"
!python infer.py

# Finetuned
# Same eval file, now with the trained adapter; infer.py writes to
# outputs/finetuned_responses.jsonl.
os.environ["PEFT_ADAPTER_PATH"] = "adapters/task-lora"
!python infer.py

# Preview the first three lines of each output file.
!sed -n '1,3p' outputs/base_responses.jsonl
!sed -n '1,3p' outputs/finetuned_responses.jsonl


In [None]:
# Work from the assignment folder and install scikit-learn, which provides the
# metric functions imported below.
%cd /content/drive/MyDrive/LLM_Assignment_2
!pip -q install scikit-learn

import json, re
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# Whole-word, case-insensitive matches for "pos"/"positive" and "neg"/"negative".
POS_PAT = re.compile(r"\bpos(itive)?\b", re.I)
NEG_PAT = re.compile(r"\bneg(ative)?\b", re.I)

def normalize_to_label(text: str) -> str:
    """Map a free-form model response to "POS" or "NEG".

    Resolution order:
      1. Empty / whitespace-only / falsy input -> "NEG".
      2. If the text contains a POS word and/or a NEG word, the one that occurs
         first wins. This matters when the answer is followed by an echoed
         instruction such as "... as POS or NEG": checking POS first would
         label every such response POS.
      3. Otherwise the first whitespace-separated token (punctuation-stripped,
         lower-cased) is compared against a few symbolic spellings.
      4. Anything else -> "NEG".
    """
    if not text:
        return "NEG"
    t = str(text).strip()
    if not t:
        return "NEG"

    pos_hit = POS_PAT.search(t)
    neg_hit = NEG_PAT.search(t)
    if pos_hit and neg_hit:
        # Both labels are mentioned: trust the earliest one.
        return "POS" if pos_hit.start() < neg_hit.start() else "NEG"
    if pos_hit:
        return "POS"
    if neg_hit:
        return "NEG"

    first = t.split()[0].strip(" .:,-").lower()
    if first in {"pos","positive","+","1"}:
        return "POS"
    if first in {"neg","negative","-","0"}:
        return "NEG"
    return "NEG"

def load_preds(path):
    """Read a responses JSONL file and return one normalized label per line.

    Each line's "response" field (default "") is passed through
    normalize_to_label. The number of missing or whitespace-only responses is
    counted and printed together with the total.
    """
    preds = []
    empty = 0
    with open(path) as f:
        for record in map(json.loads, f):
            resp = record.get("response", "")
            has_text = bool(resp) and bool(str(resp).strip())
            if not has_text:
                empty += 1
            preds.append(normalize_to_label(resp))
    print(f"[load_preds] {path}: {len(preds)} preds, blank = {empty}")
    return preds

def load_refs(eval_path):
    """Return the upper-cased, stripped "reference" field of every JSONL line.

    Lines without a "reference" key contribute an empty string.
    """
    with open(eval_path) as f:
        return [
            json.loads(line).get("reference", "").strip().upper()
            for line in f
        ]

# Load
refs = load_refs("data/eval_questions.jsonl")
pred_base = load_preds("outputs/base_responses.jsonl")
pred_ft   = load_preds("outputs/finetuned_responses.jsonl")

# Align lengths just in case -- but report it, because truncation means the
# three files do not describe the same set of rows.
m = min(len(refs), len(pred_base), len(pred_ft))
if not (len(refs) == len(pred_base) == len(pred_ft)):
    print(f"[warn] length mismatch: refs={len(refs)} base={len(pred_base)} "
          f"finetuned={len(pred_ft)}; scoring the first {m} rows only")
refs, pred_base, pred_ft = refs[:m], pred_base[:m], pred_ft[:m]

# Map to ints
map2i = {"NEG":0, "POS":1}
# References that are neither POS nor NEG (for example a missing "reference"
# field) are still scored as NEG below; make that visible instead of silent.
unknown_refs = sum(1 for r in refs if r not in map2i)
if unknown_refs:
    print(f"[warn] {unknown_refs} of {m} references are not POS/NEG and are scored as NEG")
y_true = [map2i.get(r,0) for r in refs]
y_base = [map2i.get(p,0) for p in pred_base]
y_ft   = [map2i.get(p,0) for p in pred_ft]

def show_metrics(name, y_pred):
    """Print accuracy, macro-F1, confusion matrix and per-class report.

    ``y_pred`` is compared against the module-level ``y_true``; both use
    0 = NEG and 1 = POS.
    """
    # Both classes are named explicitly. Without `labels`, scikit-learn infers
    # them from the data: if only one class is present the confusion matrix
    # shrinks to 1x1 (so the [[TN FP],[FN TP]] legend is wrong) and
    # classification_report rejects the two target_names.
    classes = [0, 1]
    acc = accuracy_score(y_true, y_pred)
    f1m = f1_score(y_true, y_pred, labels=classes, average="macro", zero_division=0)
    print(f"\n{name}  Accuracy: {acc:.3f}   Macro-F1: {f1m:.3f}")
    print("Confusion matrix [[TN FP],[FN TP]]:\n", confusion_matrix(y_true, y_pred, labels=classes))
    print(classification_report(y_true, y_pred, labels=classes, target_names=["NEG","POS"], zero_division=0))

show_metrics("Base    ", y_base)
show_metrics("Fine-tuned", y_ft)
