# Data Loading + Data Processing

## Data Loading

Load in the untransformed data set

In [10]:
from datasets import load_dataset

# File paths for each split (update these paths to your local files if needed)
data_files = {
    # "train": "/Users/yifanyu/Desktop/LLM finetuning pipeline/pipeline_task/pipeline_test_data/all_prompts_train.jsonl",
    # "validation": "/Users/yifanyu/Desktop/LLM finetuning pipeline/pipeline_task/pipeline_test_data/validation_prompts.jsonl",
    "test": "/Users/yifanyu/Desktop/LLM finetuning pipeline/pipeline_task/pipeline_test_data/d2p_prompts_test.jsonl"
}

# Load the dataset from JSONL files
raw_datasets = load_dataset("json", data_files=data_files)
print(raw_datasets)  # Display dataset splits and sizes
# Each dataset item has 'prompt' and 'completion' fields.


DatasetDict({
    test: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 300
    })
})


## Data Processing

1. Tokenization
2. Data Conversion: Raw Data Set Strucutre -> HF Casual LLM Data Structure

In [1]:
from transformers import AutoTokenizer
import torch


# Determine the device for training (the model is already on GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if hasattr(torch, 'mps') and torch.backends.mps.is_available() else "cpu")

# Choose a small causal model from Hugging Face (for example, LLaMA-2 7B or OPT 125M)
model_name = "meta-llama/Llama-3.2-1B"  # e.g., LLaMA-2 7B model [oai_citation:2‡huggingface.co](https://huggingface.co/meta-llama/Llama-2-7b-hf#:~:text=Llama%202%20is%20a%20collection,the%20index%20at%20the%20bottom)
# model_name = "facebook/opt-125m"    # (Alternatively, Gemma 2B instruction-tuned [oai_citation:3‡huggingface.co](https://huggingface.co/google/gemma-2-2b-it#:~:text=Gemma%20is%20a%20family%20of,helping%20foster%20innovation%20for%20everyone))

tokenizer = AutoTokenizer.from_pretrained(model_name)
# If the tokenizer has no pad token (common for LLMs), assign the EOS token as the pad token
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

tokenizer.padding_side = 'left'  # for decoder-only models, left-padding is often used

# Define max sequence lengths for prompt and completion
max_input_length = 128    # maximum tokens for the prompt
max_target_length = 7    # maximum tokens for the completion/response
total_max_length = max_input_length + max_target_length  # e.g., 192 tokens total


def preprocess_function(examples):
    """Convert the data to the correct form that can be processed by the casual llm when training"""
    prompts = examples["prompt"]
    completions = examples["completion"]

    input_ids_list = []
    attention_mask_list = []
    labels_list = []

    for prompt, completion in zip(prompts, completions):
        # 1. Tokenize prompt & completion (no padding at this stage)
        prompt_ids = tokenizer.encode(
            prompt, add_special_tokens=False, truncation=True, max_length=max_input_length
        )
        comp_ids = tokenizer.encode(
            completion, add_special_tokens=False, truncation=True, max_length=max_target_length
        )

        # 2. Concatenate prompt and completion token IDs
        full_ids = prompt_ids + comp_ids
        # Truncate to total_max_length if needed
        full_ids = full_ids[: total_max_length]

        # 3. Create labels:
        #    - For prompt tokens and any padding, label is -100 (to ignore in loss).
        #    - For completion tokens, label is the token ID.
        labels = [-100] * len(prompt_ids) + comp_ids
        labels = labels[: total_max_length]

        # 4. Prepare attention mask (1 for real tokens, 0 for padding)
        attention_mask = [1] * len(full_ids)

        # 5. Pad sequences up to total_max_length
        # (We pad here manually for clarity; alternatively, could use tokenizer.pad)
        pad_len = total_max_length - len(full_ids)
        if pad_len > 0:
            full_ids = full_ids + [tokenizer.pad_token_id] * pad_len
            attention_mask = attention_mask + [0] * pad_len
            labels = labels + [-100] * pad_len

        # Collect the processed sequence
        input_ids_list.append(full_ids)
        attention_mask_list.append(attention_mask)
        labels_list.append(labels)

    return {
        "input_ids": input_ids_list,
        "attention_mask": attention_mask_list,
        "labels": labels_list,
    }

# Apply the preprocessing to the entire dataset
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True, remove_columns=["prompt", "completion"])
# After this, each split in tokenized_datasets has columns: input_ids, attention_mask, labels.
tokenized_datasets

NameError: name 'raw_datasets' is not defined

## Compressed Script

In [1]:
from rc_experiment.rc_experiment.data_loading import raw_2_llm_data
from transformers import AutoTokenizer
import torch

data_files = {
    "train": "/Users/yifanyu/Desktop/LLM finetuning pipeline/pipeline_task/pipeline_test_data/all_prompts_train.jsonl",
    "validation": "/Users/yifanyu/Desktop/LLM finetuning pipeline/pipeline_task/pipeline_test_data/validation_prompts.jsonl",
    "test": "/Users/yifanyu/Desktop/LLM finetuning pipeline/pipeline_task/pipeline_test_data/d2p_prompts_test.jsonl"
}

# Determine the device for training (the model is already on GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if hasattr(torch, 'mps') and torch.backends.mps.is_available() else "cpu")

# Choose a small causal model from Hugging Face (for example, LLaMA-2 7B or OPT 125M)
model_name = "meta-llama/Llama-3.2-1B" 

# Define max sequence lengths for prompt and completion
max_input_length = 128    # maximum tokens for the prompt
max_target_length = 7    # maximum tokens for the completion/response
total_max_length = max_input_length + max_target_length


# NOTE: My function
tokenized_datasets, tokenizer, device = raw_2_llm_data(data_files, model_name, max_input_length, max_target_length)

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 3600
    })
    validation: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 300
    })
    test: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 300
    })
})
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3600
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 300
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 300
    })
})


# Build the Pytorch Data Loader

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader, Subset
import random

class promptCompleDataset(Dataset):
    """Custom Dataset for QA pairs (prompt+completion) prepared for causal LM training."""
    def __init__(self, hf_dataset):
        # Load all data from the Hugging Face dataset into memory as torch tensors
        data = hf_dataset[:]  # get all items as a dict of lists
        self.input_ids = torch.tensor(data["input_ids"], dtype=torch.long)
        self.attention_mask = torch.tensor(data["attention_mask"], dtype=torch.long)
        self.labels = torch.tensor(data["labels"], dtype=torch.long)
    def __len__(self):
        return self.input_ids.size(0)
    def __getitem__(self, idx):
        # Return a dictionary of tensors for the given index
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.labels[idx]
        }

# Instantiate dataset objects for each split
train_dataset = promptCompleDataset(tokenized_datasets["train"])
val_dataset   = promptCompleDataset(tokenized_datasets["validation"])
test_dataset  = promptCompleDataset(tokenized_datasets["test"])

# DataLoader parameters (modifiable)
batch_size = 10  # training and evaluation batch size (can adjust based on hardware)

# Optionally use a subset of the training data for quicker iteration (for experimentation)
train_subset_size = 1000   # use only 1000 training examples for faster training; set to len(train_dataset) to use full data
all_indices = list(range(len(train_dataset)))
random.shuffle(all_indices)
subset_indices = all_indices[:train_subset_size]

# Create DataLoaders for batching
train_loader = DataLoader(Subset(train_dataset, subset_indices), batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

train_loader

## Compressed Script

In [2]:
from rc_experiment.rc_experiment.data_loading import torch_data_loader
import torch
from torch.utils.data import Dataset, DataLoader, Subset
import random

# Obtian the DataLoader dictionary
loader_dict = torch_data_loader(tokenized_datasets, batch_size=2)
# Unpack the loader
train_loader = loader_dict["train_loader"]
val_loader = loader_dict["val_loader"]
test_loader = loader_dict["test_loader"]

train_loader

<torch.utils.data.dataloader.DataLoader at 0x1044f7a90>

# Load in Quantized Model & Register LoRA method to the model

In [None]:
# !pip install -q -U transformers accelerate bitsandbytes peft  # Install necessary packages (if not already installed)

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Configuration for fine-tuning (you can adjust these parameters)
gradient_checkpointing = False  # Set True to enable gradient checkpointing for memory savings
lora_r = 6           # LoRA rank (dimension of the low-rank matrices)
lora_alpha = 16      # LoRA scaling factor
lora_dropout = 0.05  # LoRA dropout
bias="none",
task_type="CAUSAL_LM"

# Attempt to load the model in 8-bit mode
try:
    model = AutoModelForCausalLM.from_pretrained(
        model_name, 
        # load_in_8bit=True,
        # device_map="cuda"
        # torch_dtype=torch.float16  # use fp16 if MPS supports it
        # If you want to use 4-bit (QLoRA) instead, you could do:
        # quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16),
        # and pass quantization_config=quantization_config (while setting load_in_8bit=False).
    )
except Exception as e:
    raise SystemExit("❌ 8-bit quantized loading failed. Make sure you have a CUDA-compatible GPU and `bitsandbytes` installed. Aborting.") 

# Prepare model for k-bit (here 8-bit) training – e.g., cast layer norms to float32 for stability
model = prepare_model_for_kbit_training(model)

# Enable gradient checkpointing if configured (saves memory at cost of compute speed)
if gradient_checkpointing:
    model.gradient_checkpointing_enable()


# ------------------------------------------------------------------------------------------
# Configure LoRA and wrap the model with LoRA adapters on all linear layers
# (These hyperparameters can be changed to fine-tune LoRA behavior)
target_modules = []
for name, module in model.named_modules():
    if isinstance(module, torch.nn.Linear):
        target_modules.append(name.split('.')[-1])
target_modules = list(set(target_modules))
# ------------------------------------------------------------------------------------------

# Define LoRA configuration
lora_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    target_modules=target_modules,
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM"  # for causal language modeling
)
# Apply LoRA to the base model
model = get_peft_model(model, lora_config)

model = model.to(device)

model.print_trainable_parameters()  # Display the number of trainable parameters (LoRA) vs total


## Compressed Script

In [None]:
from rc_experiment.rc_experiment.model_loading import quanti_lora_md
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# The dictionary
lora_config_kwargs = {
    "r": 6,               # LoRA rank
    "lora_alpha": 16,       # LoRA scaling factor
    "lora_dropout": 0.05,   # LoRA dropout
    "bias": "none",         # Bias handling
    "task_type": "CAUSAL_LM" # Task type
}

# load the quantized lora model
model = quanti_lora_md(lora_config_kwargs, model_name)
# move the model to device
model = model.to(device)

# Training

## Helper Funciton

In [3]:
# Helper function to for decoder only model (requires left-padding for generation)
def build_prompt_batch(input_ids, labels, tokenizer):
    """
    • Keeps existing left pads
    • Removes everything to the right of the prompt
    • Re-pads (on the left) so the batch is rectangular again
    Returns a dict ready for model.generate().
    """
    prompt_only = []
    for seq, lab in zip(input_ids, labels):
        first_comp = (lab != -100).nonzero(as_tuple=True)[0][0].item()
        prompt_only.append(seq[:first_comp])            # ← no right pads!

    return tokenizer.pad(
        { "input_ids": prompt_only },
        padding=True,
        return_attention_mask=True,
        return_tensors="pt",
    )

## Training Loop with AdamW Optimizer

In [None]:
from tqdm import tqdm
import torch.nn as nn

# Define optimizer (AdamW) to update only trainable params (LoRA adapters)
learning_rate = 5e-5
optimizer = torch.optim.AdamW([p for p in model.parameters() if p.requires_grad], lr=learning_rate)

num_epochs = 1  # you can adjust the number of fine-tuning epochs
patience = 2    # early stopping patience
min_delta = 0.0 # minimum change in val loss to qualify as an improvement

train_losses = []
val_losses = []
val_accuracies = []  # track exact-match accuracy on validation

best_val_loss = float('inf')
patience_counter = 0

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")

    # --- Training ---
    model.train()
    running_loss = 0.0
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}", unit="batch"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    avg_train_loss = running_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # --- Validation ---
    model.eval()
    val_loss_total = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Validating Epoch {epoch+1}", unit="batch"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Compute validation loss
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            val_loss_total += outputs.loss.item()

            # Load in prompt for later prediction 
            prompt_batch = build_prompt_batch(input_ids, labels)
            prompt_batch = {k: v.to(device) for k, v in prompt_batch.items()}

            preds = model.generate(
                **prompt_batch,
                max_new_tokens=max_target_length,
                pad_token_id=tokenizer.pad_token_id,   # good practice
            )
            
            # Compare predictions with true completions for exact match accuracy
            for i, pred_ids in enumerate(preds):
                pred_ids = pred_ids.tolist()
                # Remove the prompt part from the generated sequence
                prompt_len = (labels[i] != -100).nonzero(as_tuple=True)[0][0].item()
                generated_tokens = pred_ids[prompt_len:]
                pred_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

                # Decode the true completion (labels where label != -100)
                true_ids = labels[i][labels[i] != -100]
                true_text = tokenizer.decode(true_ids.tolist(), skip_special_tokens=True)

                # .contain
                if true_text.strip() in pred_text.strip():
                    correct += 1
                total += 1

    avg_val_loss = val_loss_total / len(val_loader)
    val_em = correct / total if total > 0 else 0.0  # exact match accuracy
    val_losses.append(avg_val_loss)
    val_accuracies.append(val_em)

    print(f"Epoch {epoch+1:02}/{num_epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Val EM: {val_em*100:.2f}%")

    # Early stopping check
    if avg_val_loss < best_val_loss - min_delta:
        best_val_loss = avg_val_loss
        patience_counter = 0
        model.save_pretrained(f"./best_model/{model_name}")  # Save immediately
        tokenizer.save_pretrained(f"./best_model/{model_name}")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break


## Compressed Script


In [18]:
from rc_experiment.rc_experiment.training import casual_llm_train
from tqdm import tqdm
import torch.nn as nn

# Define optimizer (AdamW) to update only trainable params (LoRA adapters)
learning_rate = 5e-5
optimizer = torch.optim.AdamW([p for p in model.parameters() if p.requires_grad], lr=learning_rate)

# Training config
num_epochs = 10  # you can adjust the number of fine-tuning epochs
patience = 3    # early stopping patience
min_delta = 0.0 # minimum change in val loss to qualify as an improvement


best_model_dir = casual_llm_train(model_name, model, tokenizer, optimizer, train_loader, val_loader, device,
                                  max_target_length, num_epochs, patience, min_delta)

ValueError: optimizer got an empty parameter list

# Test Set Evaluation

In [6]:
from tqdm import tqdm

# Evaluation on the test set
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating", unit="batch"):

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Load in prompt for later prediction 
        prompt_batch = build_prompt_batch(input_ids, labels, tokenizer)
        prompt_batch = {k: v.to(device) for k, v in prompt_batch.items()}

        preds = model.generate(
            **prompt_batch,
            max_new_tokens=max_target_length,
            pad_token_id=tokenizer.pad_token_id,  # good practice
        )
            
        # Compare each generated completion with the true completion
        for i, pred_ids in enumerate(preds):
            pred_ids = pred_ids.tolist()
            prompt_len = (labels[i] != -100).nonzero(as_tuple=True)[0][0].item()
            generated_tokens = pred_ids[prompt_len:]
            pred_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

            true_ids = labels[i][labels[i] != -100]
            true_text = tokenizer.decode(true_ids.tolist(), skip_special_tokens=True)

            # .contain
            if true_text.strip() in pred_text.strip():
                correct += 1
            total += 1

test_em_accuracy = (correct / total) if total > 0 else 0.0
print(f"Test Exact Match Accuracy: {test_em_accuracy*100:.2f}% ({correct}/{total} correctly matched)")


Evaluating:   0%|          | 0/30 [00:00<?, ?batch/s]You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  test_elements = torch.tensor(test_elements)
Evaluating: 100%|██████████| 30/30 [00:16<00:00,  1.80batch/s]

Test Exact Match Accuracy: 99.67% (299/300 correctly matched)





In [None]:
from rc_experiment.rc_experiment.eval import rc_eval

In [24]:
from tqdm import tqdm
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
from transformers import AutoTokenizer
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
import random
import math
from tqdm import tqdm
import pandas as pd
from rc_experiment.rc_experiment.training import _build_prompt_batch


# Helper function to mannually pad prompt for decoder only model (requires left-padding for generation)
def _build_prompt_batch(input_ids, labels, tokenizer):
    """
    • Keeps existing left pads
    • Removes everything to the right of the prompt
    • Re-pads (on the left) so the batch is rectangular again
    Returns a dict ready for model.generate().
    """
    prompt_only = []
    for seq, lab in zip(input_ids, labels):
        first_comp = (lab != -100).nonzero(as_tuple=True)[0][0].item()
        prompt_only.append(seq[:first_comp])            # ← no right pads!

    return tokenizer.pad(
        {"input_ids": prompt_only},
        padding=True,
        return_attention_mask=True,
        return_tensors="pt",
        padding_side="left"   # <--- explicitly force left padding
    )



def rc_eval(test_loader, model_obj, tokenizer_obj, device, max_input_length, max_target_length):

    tokenizer_obj.padding_side = 'left'
    model_obj.eval()
    correct = 0
    total   = 0
    rows = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating", unit="batch"):
            
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Load in prompt for later prediction 
            prompt_batch = _build_prompt_batch(input_ids, labels, tokenizer)
            prompt_batch = {k: v.to(device) for k, v in prompt_batch.items()}

            # Generate preidciont based on prompt
            preds = model.generate(
                **prompt_batch,
                max_new_tokens=max_target_length,
                pad_token_id=tokenizer.pad_token_id,  # good practice
            )
                        
            # Store the test set prediction to a data frame
            for i, pred_ids in enumerate(tqdm(preds, desc="Decoding predictions", leave=False, unit="sample")):
                # Compare predictions with true completions
                # Prediction text
                pred_text = tokenizer_obj.decode(pred_ids, skip_special_tokens=True)
                
                # True text
                true_ids  = labels[i]
                true_ids  = true_ids[true_ids != -100]      # strip ignore index
                true_text = tokenizer_obj.decode(true_ids, skip_special_tokens=True)
                
                # Count the number of correct and total prediction
                if pred_text.strip() == true_text.strip():
                    correct += 1
                total += 1

                # For every item in the batch, collect prompt / pred / truth
                prompt_text = tokenizer_obj.decode(input_ids[i], skip_special_tokens=True)
                rows.append({
                    "Prompt": prompt_text.strip(),
                    "Prediction": pred_text.strip(),
                    "Ground‑Truth": true_text.strip(),
                    "Exact Match": "✅" if pred_text.strip() == true_text.strip() else "❌"
                })
    test_em_accuracy = correct / total if total else 0.0
    print(f"Test Exact Match Accuracy: {test_em_accuracy*100:.2f}% "
        f"({correct}/{total} correctly matched)")
    
    # Return the prediction data frame
    return pd.DataFrame(rows)


In [None]:
from tqdm import tqdm
import pandas as pd

N_DISPLAY = 100  # number of examples to display
model.eval()
display_rows = []

with torch.no_grad():
    # Wrap test_loader with tqdm
    for batch in tqdm(test_loader, desc="Predicting on test set", unit="batch"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Load in prompt for later prediction 
        prompt_batch = build_prompt_batch(input_ids, labels)
        prompt_batch = {k: v.to(device) for k, v in prompt_batch.items()}

        preds = model.generate(
            **prompt_batch,
            max_new_tokens=max_target_length,
            pad_token_id=tokenizer.pad_token_id,  # good practice
        )
        
        # Store the test set prediction to a data frame
        for i, pred_ids in enumerate(tqdm(preds, desc="Decoding predictions", leave=False, unit="sample")):
            if len(display_rows) >= N_DISPLAY:
                break
            prompt_len = (labels[i] != -100).nonzero(as_tuple=True)[0][0].item()  # fix prompt_len inside the loop
            prompt_text = tokenizer.decode(input_ids[i][:prompt_len].tolist(), skip_special_tokens=True)
            pred_tokens = pred_ids.tolist()[prompt_len:]
            pred_text = tokenizer.decode(pred_tokens, skip_special_tokens=True)
            true_text = tokenizer.decode(labels[i][labels[i] != -100].tolist(), skip_special_tokens=True)
            display_rows.append((prompt_text, pred_text, true_text))
        if len(display_rows) >= N_DISPLAY:
            break

# Display the collected examples
df = pd.DataFrame(display_rows, columns=["Prompt", "Generated Completion", "True Completion"])
display(df.head(N_DISPLAY))


In [None]:
def see_result(id):
    print(df["Prompt"][id])
    print(df["Generated Completion"][id])
    print(df["True Completion"][id])

In [None]:
see_result(1)

## Compressed Script


In [None]:
test_path = {"test":"/Users/yifanyu/Desktop/LLM finetuning pipeline/pipeline_task/pipeline_test_data/d2p_prompts_test.jsonl"}
from rc_experiment.rc_experiment.data_loading import raw_2_llm_data, torch_data_loader
from rc_experiment.rc_experiment.eval import rc_eval

test_datasets, tokenizer, device = raw_2_llm_data(test_path, model_name, max_input_length, max_target_length)
# Obtian the DataLoader dictionary
test_loader_dict = torch_data_loader(test_datasets, batch_size=2)
test_loader = test_loader_dict["test_loader"]

pred_rslt_df = rc_eval(test_loader, model, tokenizer, device, max_input_length, max_target_length)

DatasetDict({
    test: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 300
    })
})
DatasetDict({
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 300
    })
})


Evaluating:   0%|          | 0/150 [00:00<?, ?batch/s]You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  test_elements = torch.tensor(test_elements)
Evaluating: 100%|██████████| 150/150 [00:42<00:00,  3.55batch/s]

Test Exact Match Accuracy: 0.00% (0/300 correctly matched)





# Testing on other datasets

In [32]:
# Helper function to mannually pad prompt for decoder only model (requires left-padding for generation)
def build_prompt_batch(input_ids, labels, tokenizer):
    """
    • Keeps existing left pads
    • Removes everything to the right of the prompt
    • Re-pads (on the left) so the batch is rectangular again
    Returns a dict ready for model.generate().
    """
    prompt_only = []
    for seq, lab in zip(input_ids, labels):
        first_comp = (lab != -100).nonzero(as_tuple=True)[0][0].item()
        prompt_only.append(seq[:first_comp])            # ← no right pads!

    return tokenizer.pad(
        { "input_ids": prompt_only },
        padding=True,
        return_attention_mask=True,
        return_tensors="pt",
    )

In [5]:
# Data Loader Helper

def build_test_loader(jsonl_path: str,
                      batch_size: int = 8,
                      num_examples: int | None = None):
    """
    Create a PyTorch DataLoader for an unseen JSONL test file.

    Parameters
    ----------
    jsonl_path   : str
        Path to the JSONL file, each line: {"prompt": "...", "completion": "..."}.
    batch_size   : int
        Batch size for evaluation.
    num_examples : int or None
        If set, truncate the dataset to the first N rows (quick debug).

    Returns
    -------
    torch.utils.data.DataLoader
        Ready-to-use test loader (no shuffling).
    """

    # 1️⃣  Load raw JSONL via HF datasets
    raw_test = load_dataset("json", data_files={"test": jsonl_path})["test"]
    if num_examples:
        raw_test = raw_test.select(range(num_examples))

    # 2️⃣  Tokenize with the same settings used during training
    def _preprocess(ex):
        inp = tokenizer(ex["prompt"],
                        max_length=max_input_length,
                        truncation=True, padding="max_length")
        out = tokenizer(ex["completion"],
                        max_length=max_target_length,
                        truncation=True, padding="max_length")
        # mask pad tokens in labels with -100
        ex["input_ids"]      = inp["input_ids"]
        ex["attention_mask"] = inp["attention_mask"]
        ex["labels"] = [
            t if t != tokenizer.pad_token_id else -100
            for t in out["input_ids"]
        ]
        return ex

    tok_test = raw_test.map(_preprocess, remove_columns=raw_test.column_names)

    # 3️⃣  Torch Dataset wrapper (simple tensors)
    class TDataset(Dataset):
        def __init__(self, hf_ds):
            data = hf_ds[:]
            self.input_ids      = torch.tensor(data["input_ids"])
            self.attention_mask = torch.tensor(data["attention_mask"])
            self.labels         = torch.tensor(data["labels"])
        def __len__(self): return len(self.input_ids)
        def __getitem__(self, idx):
            return {
                "input_ids"     : self.input_ids[idx],
                "attention_mask": self.attention_mask[idx],
                "labels"        : self.labels[idx],
            }

    test_ds = TDataset(tok_test)

    # 4️⃣  DataLoader (no shuffle)
    return DataLoader(test_ds, batch_size=batch_size, shuffle=False)

In [6]:
test_loader = build_test_loader("/Users/yifanyu/Desktop/LLM finetuning pipeline/pipeline_task/pipeline_test_data/p2d_prompts_test.jsonl")

max_target_length = 30

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [37]:
import torch
import pandas as pd
from itertools import islice

# How many examples to display
N_DISPLAY = 20

model.eval()
rows = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating", unit="batch"):
        input_ids      = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels         = batch["labels"].to(device)

        # Generate predictions for each prompt in the batch
        preds = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_target_length,
            pad_token_id=tokenizer.pad_token_id,  # silences pad-token warning
        )

        # For every item in the batch, collect prompt / pred / truth
        for j, pred_ids in enumerate(preds):
            prompt_text = tokenizer.decode(input_ids[j], skip_special_tokens=True)
            pred_text   = tokenizer.decode(pred_ids,     skip_special_tokens=True)

            true_ids = labels[j][labels[j] != -100]
            true_text = tokenizer.decode(true_ids, skip_special_tokens=True)

            rows.append({
                "Prompt": prompt_text.strip(),
                "Prediction": pred_text.strip(),
                "Ground‑Truth": true_text.strip(),
                "Exact Match": "✅" if pred_text.strip() == true_text.strip() else "❌"
            })
            if len(rows) >= N_DISPLAY:
                break
        if len(rows) >= N_DISPLAY:
            break

df = pd.DataFrame(rows)

Evaluating:   0%|          | 0/38 [00:00<?, ?batch/s]

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


KeyboardInterrupt: 

In [None]:
from tqdm.auto import tqdm   # auto picks notebook-friendly bar if available

model.eval()                # ensure model is in evaluation mode
correct = 0
total   = 0

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating", unit="batch"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Load in prompt for later prediction 
        prompt_batch = buildx_prompt_batch(input_ids, labels, tokenizer)
        prompt_batch = {k: v.to(device) for k, v in prompt_batch.items()}

        preds = model.generate(
            **prompt_batch,
            max_new_tokens=max_target_length,
            pad_token_id=tokenizer.pad_token_id,  # good practice
        )

        # Compare predictions with true completions
        for i, pred_ids in enumerate(preds):
            pred_text = tokenizer.decode(pred_ids, skip_special_tokens=True)

            true_ids  = labels[i]
            true_ids  = true_ids[true_ids != -100]      # strip ignore index
            true_text = tokenizer.decode(true_ids, skip_special_tokens=True)

            if pred_text.strip() == true_text.strip():
                correct += 1
            total += 1

test_em_accuracy = correct / total if total else 0.0
print(f"Test Exact Match Accuracy: {test_em_accuracy*100:.2f}% "
      f"({correct}/{total} correctly matched)")


Evaluating:   0%|          | 0/38 [00:00<?, ?batch/s]

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  test_elements = torch.tensor(test_elements)


IndexError: index -1 is out of bounds for dimension 1 with size 0

## Compressed Script


In [None]:
# Omit

# Model Loading Script

In [None]:
from tqdm import tqdm
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
from transformers import AutoTokenizer
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
import random
import math
from tqdm import tqdm
import pandas as pd
from peft import PeftModel


In [None]:
model_name = "meta-llama/Llama-3.2-1B"   # or whatever your base model was
base_model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side='left')
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

lora_weights_path = "/Users/yifanyu/Desktop/LLM finetuning pipeline/best_model/meta-llama/Llama-3.2-1B"  # the folder where you saved the LoRA
model = PeftModel.from_pretrained(base_model, lora_weights_path)
model.to(device)

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=6, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=6, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear(
