In [None]:
import os
import math

import pandas as pd
import torch
from torch.utils.data import DataLoader

from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, default_data_collator

from peft import PeftModel

In [None]:
model_name = 'meta-llama/Llama-3.2-1B'
adapter_path = './llama3.2-lora-tuned-adapter-query'

# Keyword arguments shared by the two base-model loads below.
_model_load_kwargs = {'device_map': 'auto', 'trust_remote_code': True}

# Tokenizer shared by the base and the tuned model.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Reference model: the untouched base checkpoint, switched to eval mode.
base_model = AutoModelForCausalLM.from_pretrained(model_name, **_model_load_kwargs)
base_model = base_model.eval()

# Second, separate copy of the base checkpoint; the LoRA adapter is applied to
# this one so that `base_model` is never wrapped by the adapter.
tmp_model = AutoModelForCausalLM.from_pretrained(model_name, **_model_load_kwargs)

# Attach the adapter, fold it into the weights, and switch to eval mode.
tuned_model = (
    PeftModel.from_pretrained(tmp_model, adapter_path)
    .merge_and_unload()
    .eval()
)

In [None]:
# Fixed-length padding needs a pad token; when the tokenizer has none, reuse EOS.
pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize(batch):
    """Turn a batch of instruction/response pairs into fixed-length LM inputs.

    Each pair is rendered with the ``### Instruction: / ### Response:`` template,
    tokenized, and padded/truncated to 256 tokens.

    Args:
        batch: Mapping with parallel ``'instruction'`` and ``'response'`` sequences
            (the shape ``Dataset.map(..., batched=True)`` passes in).

    Returns:
        The tokenizer output (``input_ids``, ``attention_mask``) with an added
        ``labels`` tensor: a copy of ``input_ids`` in which every padding
        position is set to -100 so the loss skips it.
    """
    texts = [
        f"### Instruction:\n{inst}\n### Response:\n{out}"
        for inst, out in zip(batch['instruction'], batch['response'])
    ]

    tokens = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=256,
        return_tensors='pt',
    )

    labels = tokens['input_ids'].clone()
    # Padding must not be scored. The pad token may be the EOS token, so mask by
    # attention mask rather than by token id; otherwise the padded tail would be
    # counted in the loss and distort the perplexity.
    labels[tokens['attention_mask'] == 0] = -100
    tokens['labels'] = labels

    return tokens

In [None]:
# Load the JSONL file, tokenize it in batches (dropping the raw text columns),
# and expose the result as torch tensors.
eval_ds = (
    load_dataset('json', data_files='/kaggle/working/sample_2.jsonl')['train']
    .map(tokenize, batched=True, remove_columns=['instruction', 'response'])
    .with_format('torch')
)

In [None]:
# Batches of 8 tokenized examples, in dataset order, for the perplexity loop.
eval_loader = DataLoader(dataset=eval_ds, batch_size=8, collate_fn=default_data_collator)

In [None]:
@torch.no_grad()
def compute_perplexity(model, loader=None):
    """Compute token-level perplexity of ``model`` over an evaluation loader.

    The per-batch loss is weighted by the number of tokens it was averaged
    over, so batches with fewer scored tokens (e.g. a short final batch, or
    batches with more masked positions) do not get the same weight as full ones.

    Args:
        model: Causal LM that accepts ``input_ids`` / ``attention_mask`` /
            ``labels`` keyword arguments and returns an object with ``.loss``.
        loader: Iterable of batches (dicts of tensors containing ``'labels'``).
            Defaults to the notebook-level ``eval_loader``.

    Returns:
        ``exp`` of the mean negative log-likelihood per scored token.

    Raises:
        ValueError: If the loader yields no scored tokens.
    """
    if loader is None:
        loader = eval_loader

    # Send inputs to wherever the model's first parameters live instead of
    # assuming a CUDA device.
    device = next(model.parameters()).device

    total_nll = 0.0
    total_tokens = 0

    for batch in loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        loss = model(**batch).loss

        # The causal-LM loss is a mean over the shifted targets (labels[:, 1:])
        # that are not the ignore index -100, so that is the weight of this batch.
        n_tokens = int((batch['labels'][:, 1:] != -100).sum())
        if n_tokens == 0:
            # Nothing was scored in this batch (its loss would be undefined).
            continue

        total_nll += loss.item() * n_tokens
        total_tokens += n_tokens

    if total_tokens == 0:
        raise ValueError('compute_perplexity: the loader produced no tokens to score')

    return math.exp(total_nll / total_tokens)

In [None]:
# Score each model on the same evaluation loader and report it right away.
base_ppl = compute_perplexity(base_model)
print(f'Base Model Perplexity: {base_ppl:.2f}')

tuned_ppl = compute_perplexity(tuned_model)
print(f'Tuned Model Perplexity: {tuned_ppl:.2f}')

In [None]:
import random

# Untokenized copy of the evaluation file; `refs` holds the reference responses.
raw_splits = load_dataset('json', data_files='sample_2.jsonl')
raw_data = raw_splits['train']
refs = raw_data['response']


# def generate(model, instruction):
#     token_ids = tokenizer(f'### Instruction:\n{instruction}\n### Response:\n', return_tensors='pt').input_ids.to('cuda')

#     with torch.no_grad():
#         out = model.generate(token_ids, max_new_tokens=256)

#     return tokenizer.decode(out[0], skip_special_tokens=True).split('### Response:\n')[-1].strip()
#     # return tokenizer.decode(out[0], skip_special_tokens=True)

def generate(model, instruction, max_new_tokens=256):
    """Generate the model's response to a single instruction.

    The instruction is wrapped in the ``### Instruction: / ### Response:``
    template, and only the newly generated tokens are decoded and returned.

    Args:
        model: Causal LM exposing ``generate`` and ``parameters``.
        instruction: Instruction text to insert into the prompt template.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated continuation with special tokens removed and surrounding
        whitespace stripped.
    """
    prompt = f'### Instruction:\n{instruction}\n### Response:\n'

    # A single prompt needs no padding. Right-padding it to a fixed length would
    # make generation continue from a pad position instead of from the end of
    # the prompt. Truncation still caps the prompt at 256 tokens.
    # NOTE(review): an over-long instruction loses the trailing response marker
    # when truncated - confirm instructions stay under the limit.
    encoded = tokenizer(
        prompt,
        return_tensors='pt',
        truncation=True,
        max_length=256,
    )

    # Follow the model's own device instead of assuming CUDA.
    device = next(model.parameters()).device
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    # NOTE(review): the decoding strategy (greedy vs. sampling) is whatever the
    # model's generation config specifies; pin it explicitly if the evaluation
    # must be deterministic - confirm.
    with torch.no_grad():
        out = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.pad_token_id,
        )

    # The output starts with the prompt tokens; keep only what was generated,
    # rather than re-finding the response marker in the decoded text.
    new_tokens = out[0, input_ids.shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

In [None]:
# Echo the loaded dataset object as the cell output for a quick visual check.
raw_data

In [None]:
# Tabular copy of the raw examples; later cells attach generations and
# predictions to it as extra columns.
df = pd.DataFrame(raw_data)

# Preview only - avoid dumping the whole frame into the notebook output.
df.head()

In [None]:
res_tuned, res_base = [], []

# Run both models on every instruction, tuned model first, logging progress
# (the base output is clipped to its first 10 characters in the log line).
for i, instruction in enumerate(df["instruction"]):
    tuned_out = generate(tuned_model, instruction)
    base_out = generate(base_model, instruction)
    res_tuned.append(tuned_out)
    res_base.append(base_out)
    print(i, "tuned: ",tuned_out, "\n", "base: ", base_out[:10])

# Keep the raw generations next to their instructions.
df["res_tuned"] = res_tuned
df["res_base"] = res_base

In [None]:
classes = ["math", "conversation", "coding", "summary"]

# A generation counts as a prediction only if it is exactly one of the class
# names; anything else is blanked to "". Slice assignment rewrites the existing
# lists in place.
res_tuned[:] = [out if out in classes else "" for out in res_tuned]
res_base[:] = [out if out in classes else "" for out in res_base]

df["base_pred"] = res_base
df["tuned_pred"] = res_tuned

In [None]:
# Distribution of the tuned model's predictions; the "" bucket holds the
# generations that did not exactly match any entry of `classes`.
df["tuned_pred"].value_counts()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix

def _evaluate_predictions(y_true, y_pred, label, cmap, heading_prefix=""):
    """Print the classification report and confusion matrix for one model, then plot it.

    The label set is the sorted union of the values in ``y_true`` and
    ``y_pred`` and is passed explicitly to scikit-learn, so the matrix rows and
    columns always line up with the axis labels - including the "" bucket used
    for generations that matched no class.

    Args:
        y_true: Ground-truth class names.
        y_pred: Predicted class names ("" for an invalid generation).
        label: Model name used in the printed heading and the plot title.
        cmap: Colormap name for the heatmap.
        heading_prefix: Text printed before the heading (e.g. blank lines).

    Returns:
        The confusion matrix, rows/columns ordered by the sorted label union.
    """
    class_labels = sorted(set(y_true) | set(y_pred))
    # Give the empty prediction a readable name on tables and axes.
    display_labels = [c if c else "(no valid class)" for c in class_labels]

    print(f"{heading_prefix}📊 {label} Model Evaluation:")
    print("\nClassification Report:")
    print(classification_report(
        y_true,
        y_pred,
        labels=class_labels,
        target_names=display_labels,
        zero_division=0,  # classes that are never predicted score 0 without a warning
    ))

    cm = confusion_matrix(y_true, y_pred, labels=class_labels)
    print("\nConfusion Matrix:")
    print(pd.DataFrame(cm, index=display_labels, columns=display_labels))

    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap=cmap,
        xticklabels=display_labels,
        yticklabels=display_labels,
        ax=ax,
    )
    ax.set_title(f"Confusion Matrix - {label} Model")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    plt.show()

    return cm


# Ground truth
y_test = df["response"]

# Base model predictions
y_pred_base = df["base_pred"]
cm_base = _evaluate_predictions(y_test, y_pred_base, "Base", "Blues")

# Tuned model predictions
y_pred_tuned = df["tuned_pred"]
cm_tuned = _evaluate_predictions(y_test, y_pred_tuned, "Tuned", "Greens", heading_prefix="\n\n")