# Data Loading

In [2]:
from datasets import load_dataset

# File paths for each split (update these paths to your local files if needed)
data_files = {
    "train": "/Users/yifanyu/Desktop/LLM finetuning pipeline/pipeline_task/pipeline_test_data/all_prompts_train.jsonl",
    "validation": "/Users/yifanyu/Desktop/LLM finetuning pipeline/pipeline_task/pipeline_test_data/validation_prompts.jsonl",
    "test": "/Users/yifanyu/Desktop/LLM finetuning pipeline/pipeline_task/pipeline_test_data/d2p_prompts_test.jsonl"
}

# Load the dataset from JSONL files
raw_datasets = load_dataset("json", data_files=data_files)
print(raw_datasets)  # Display dataset splits and sizes
# Each dataset item has 'prompt' and 'completion' fields.

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 3600
    })
    validation: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 300
    })
    test: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 300
    })
})


# Tokenization and Preprocessing for a Causal Model

**For a causal model, we need to concatenate each prompt with its completion into one sequence**

- Tokenizing the prompt and completion separately (without padding at this stage).
  
- Concatenating the prompt tokens and completion tokens into one sequence.
- Creating an attention mask that marks all real tokens (prompt+completion) as 1 and any padding as 0.
- Creating labels that are -100 (ignore) for all prompt tokens (and padding) and actual token IDs for completion tokens. This ensures loss is only computed on the completion part.

In [None]:
from transformers import AutoTokenizer
import torch

# Choose a small causal model from HuggingFace (e.g., LLaMA-2 7B or Gemma 2B)
# model_name = "EleutherAI/pythia-70m-deduped"  # e.g., LLaMA-2 7B model [oai_citation:2‡huggingface.co](https://huggingface.co/meta-llama/Llama-2-7b-hf#:~:text=Llama%202%20is%20a%20collection,the%20index%20at%20the%20bottom)
model_name = "google/gemma-2-2b-it"    # (Alternatively, Gemma 2B instruction-tuned [oai_citation:3‡huggingface.co](https://huggingface.co/google/gemma-2-2b-it#:~:text=Gemma%20is%20a%20family%20of,helping%20foster%20innovation%20for%20everyone))

tokenizer = AutoTokenizer.from_pretrained(model_name)
# If the tokenizer has no pad token (common for LLMs), assign the EOS token as the pad token:
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

tokenizer.padding_side = 'left'

# Define max lengths
max_input_length = 128    # maximum tokens for prompt
max_target_length = 64    # maximum tokens for completion
total_max_length = max_input_length + max_target_length  # 192

def preprocess_function(examples):
    prompts     = examples["prompt"]
    completions = examples["completion"]

    input_ids_list      = []
    attention_mask_list = []
    labels_list         = []

    for prompt, completion in zip(prompts, completions):
        # 1. tokenise prompt & completion (no padding yet)
        prompt_ids = tokenizer.encode(
            prompt, add_special_tokens=False, truncation=True, max_length=max_input_length
        )
        comp_ids = tokenizer.encode(
            completion, add_special_tokens=False, truncation=True, max_length=max_target_length
        )

        # 2. concatenate then truncate to total_max_length
        full_ids = prompt_ids + comp_ids
        full_ids = full_ids[: total_max_length]

        # 3. build labels: -100 for pads **and** prompt tokens
        labels   = [-100] * len(prompt_ids) + comp_ids
        labels   = labels[: total_max_length]

        # 4. LEFT-pad up to total_max_length
        pad_len = total_max_length - len(full_ids)
        if pad_len > 0:
            full_ids = [tokenizer.pad_token_id] * pad_len + full_ids
            labels   = [-100] * pad_len + labels

        attention_mask = [0] * pad_len + [1] * (total_max_length - pad_len)

        # sanity
        assert len(full_ids) == len(labels) == len(attention_mask) == total_max_length

        input_ids_list.append(full_ids)
        attention_mask_list.append(attention_mask)
        labels_list.append(labels)

    return {
        "input_ids":      input_ids_list,
        "attention_mask": attention_mask_list,
        "labels":         labels_list,
    }

# Apply preprocessing to all splits
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True, remove_columns=["prompt", "completion"])
# After this, each dataset item has 'input_ids', 'attention_mask', 'labels' suitable for causal LM training.
tokenized_datasets

Map:   0%|          | 0/3600 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3600
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 300
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 300
    })
})

In [4]:
# labels here is the completion text that we merged with the prompt
# -100 means we don't pay attention
tokenized_datasets["train"]["labels"][0]

[-100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,

# Prepare PyTorch Dataset and DataLoaders

In [5]:
import torch
from torch.utils.data import Dataset, DataLoader, Subset
import random

class QADataset(Dataset):
    def __init__(self, hf_dataset):
        # Load all data from the Hugging Face dataset into torch tensors
        data = hf_dataset[:]  # gets the entire dataset as a dict of lists
        self.input_ids = torch.tensor(data["input_ids"], dtype=torch.long)
        self.attention_mask = torch.tensor(data["attention_mask"], dtype=torch.long)
        self.labels = torch.tensor(data["labels"], dtype=torch.long)
    def __len__(self):
        return self.input_ids.size(0)
    def __getitem__(self, idx):
        # Return a dictionary of tensors for the given index
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.labels[idx]
        }

# Instantiate dataset objects for each split
train_dataset = QADataset(tokenized_datasets["train"])
val_dataset   = QADataset(tokenized_datasets["validation"])
test_dataset  = QADataset(tokenized_datasets["test"])

# Create DataLoaders for batching
# Using stochasitic training
batch_size = 8  # adjust based on hardware
# Optionally use a subset of the training data for quicker iteration
train_subset_size = 1000   # use smaller subset for training (adjust as needed)
all_indices = list(range(len(train_dataset)))
random.shuffle(all_indices)
subset_indices = all_indices[:train_subset_size]

train_loader = DataLoader(Subset(train_dataset, subset_indices), batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Training

In [None]:
import torch
from transformers import AutoModelForCausalLM

# Load the causal LM model (decoder-only model)
model = AutoModelForCausalLM.from_pretrained(model_name)
# (If this is the first time using the model, ensure you've accepted its license on HuggingFace if required.)

# Move model to device (GPU/Apple Silicon MPS if available, otherwise CPU)
device = (
    torch.device("cuda")
    if torch.cuda.is_available()
    else torch.device("mps")
    if hasattr(torch, "backends") and torch.backends.mps.is_available()
    else torch.device("cpu")
)
model.to(device)


config.json:   0%|          | 0.00/567 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/166M [00:00<?, ?B/s]

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (act): GELUActivation()
        )
      )
    )
    (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise

In [None]:
def build_prompt_batch(input_ids, labels):
    """
    For each sample:
      – cut everything after the first completion token
      – keep the original left padding
    Returns a dict with input_ids and attention_mask ready for model.generate().
    """
    prompt_only = []
    for seq, lab in zip(input_ids, labels):
        # first token whose label != –100  →  first completion token
        first_comp = (lab != -100).nonzero(as_tuple=True)[0][0].item()
        prompt_only.append(seq[:first_comp])          #  no right pads!

    # pad (again) on the **left**
    return tokenizer.pad(
        {"input_ids": prompt_only},
        padding=True,
        return_attention_mask=True,
        return_tensors="pt",
    )

In [9]:
# Helper function to for testing

def build_prompt_batch(input_ids, labels):
    """
    • Keeps existing left pads
    • Removes everything to the right of the prompt
    • Re-pads (on the left) so the batch is rectangular again
    Returns a dict ready for model.generate().
    """
    prompt_only = []
    for seq, lab in zip(input_ids, labels):
        first_comp = (lab != -100).nonzero(as_tuple=True)[0][0].item()
        prompt_only.append(seq[:first_comp])            # ← no right pads!

    return tokenizer.pad(
        { "input_ids": prompt_only },
        padding=True,
        return_attention_mask=True,
        return_tensors="pt",
    )

In [10]:
import torch.nn as nn

# Define optimizer (AdamW for transformers) and number of epochs
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
num_epochs = 5

# Training loop
train_losses = []
val_losses = []
val_accuracies = []  # we'll track exact-match accuracy on validation

best_val_loss = float('inf')
patience_counter = 0
patience = 2    # early stopping patience
min_delta = 0.0 # minimum change to qualify as improvement

for epoch in range(num_epochs):
    # --- Training ---
    model.train()
    running_loss = 0.0
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    avg_train_loss = running_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # --- Validation ---
    model.eval()
    val_loss_total = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Compute validation loss
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            val_loss_total += outputs.loss.item()

            # # Generate predictions for each prompt in the batch
            # # UPDATED: Ensure we only generate using the prompt part of the input
            # # We will mask out the completion part of input_ids so the model doesn't get the true completion as context.
            # context_input = input_ids.clone()
            # context_mask = attention_mask.clone()

            # # Determine prompt length for each sample using labels (first non -100 token index)
            # for i in range(context_input.size(0)):
            #     # Find where labels != -100 (which indicates start of completion for that sample)
            #     label_seq = labels[i]
            #     # Get index of first occurrence of a token (label != -100)
            #     # (This is the prompt length since labels for prompt are -100)
            #     prompt_len = (label_seq != -100).nonzero(as_tuple=True)[0][0].item()
            #     # Set everything from prompt_len onward as pad (so model won't use actual completion tokens)
            #     context_input[i, prompt_len:] = tokenizer.pad_token_id
            #     context_mask[i, prompt_len:] = 0

            prompt_batch = build_prompt_batch(input_ids, labels)
            prompt_batch = {k: v.to(device) for k, v in prompt_batch.items()}

            preds = model.generate(
                **prompt_batch,
                max_new_tokens=max_target_length,
                pad_token_id=tokenizer.pad_token_id,   # good practice
            )

            # preds = model.generate(input_ids=context_input, attention_mask=context_mask, max_new_tokens=max_target_length)
            # Compare predictions with true completions for exact match
            for i, pred_ids in enumerate(preds):
                # Decode the generated output (skip special tokens like <pad>/<eos>)
                pred_ids = pred_ids.tolist()
                # Remove the prompt part from the generated sequence
                prompt_len = (labels[i] != -100).nonzero(as_tuple=True)[0][0].item()
                generated_tokens = pred_ids[prompt_len:]  # slice off the prompt tokens
                pred_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

                # Decode the true completion
                true_ids = labels[i][labels[i] != -100]  # this extracts the completion token IDs
                true_text = tokenizer.decode(true_ids.tolist(), skip_special_tokens=True)

                if pred_text.strip() == true_text.strip():
                    correct += 1
                total += 1

    avg_val_loss = val_loss_total / len(val_loader)
    val_em = correct / total if total > 0 else 0.0
    val_losses.append(avg_val_loss)
    val_accuracies.append(val_em)

    print(f"Epoch {epoch+1:02}/{num_epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Val EM: {val_em*100:.2f}%")

    # Early stopping check
    if avg_val_loss < best_val_loss - min_delta:
        best_val_loss = avg_val_loss
        patience_counter = 0
        # (Optionally save the best model checkpoint here)
        # model.save_pretrained("best_checkpoint")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break

You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 01/5 | Train Loss: 0.3428 | Val Loss: 22.4837 | Val EM: 0.00%
Epoch 02/5 | Train Loss: 0.2391 | Val Loss: 23.8914 | Val EM: 0.00%


KeyboardInterrupt: 

In [None]:
for batch in train_loader:
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]

        print(input_ids[0])
        print(attention_mask[0])
        print(labels[0])
        break

tensor([    1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1, 

# Test Set Evaluation

In [None]:
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Prepare prompt-only context for generation (same method as above)
        context_input = input_ids.clone()
        context_mask = attention_mask.clone()
        for i in range(context_input.size(0)):
            label_seq = labels[i]
            prompt_len = (label_seq != -100).nonzero(as_tuple=True)[0][0].item()
            context_input[i, prompt_len:] = tokenizer.pad_token_id
            context_mask[i, prompt_len:] = 0

        # Generate predictions
        predictions = model.generate(input_ids=context_input, attention_mask=context_mask, max_new_tokens=max_target_length)

        # Compare each prediction with true completion
        for i, pred_ids in enumerate(predictions):
            pred_ids = pred_ids.tolist()
            prompt_len = (labels[i] != -100).nonzero(as_tuple=True)[0][0].item()
            generated_tokens = pred_ids[prompt_len:]
            pred_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

            true_ids = labels[i][labels[i] != -100]
            true_text = tokenizer.decode(true_ids.tolist(), skip_special_tokens=True)
            if pred_text.strip() == true_text.strip():
                correct += 1
            total += 1

test_em_accuracy = (correct / total) if total > 0 else 0.0
print(f"Test Exact Match Accuracy: {test_em_accuracy*100:.2f}% ({correct}/{total} correctly matched)")

In [None]:
N_DISPLAY = 5  # number of examples to display
model.eval()
display_rows = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Generate using only prompt context
        context_input = input_ids.clone()
        context_mask = attention_mask.clone()
        for i in range(context_input.size(0)):
            label_seq = labels[i]
            prompt_len = (label_seq != -100).nonzero(as_tuple=True)[0][0].item()
            context_input[i, prompt_len:] = tokenizer.pad_token_id
            context_mask[i, prompt_len:] = 0

        preds = model.generate(input_ids=context_input, attention_mask=context_mask, max_new_tokens=max_target_length)
        for i, pred_ids in enumerate(preds):
            if len(display_rows) >= N_DISPLAY:
                break
            prompt_text = tokenizer.decode(input_ids[i][:prompt_len].tolist(), skip_special_tokens=True)
            pred_tokens = pred_ids.tolist()[prompt_len:]
            pred_text = tokenizer.decode(pred_tokens, skip_special_tokens=True)
            true_text = tokenizer.decode(labels[i][labels[i] != -100].tolist(), skip_special_tokens=True)
            display_rows.append((prompt_text, pred_text, true_text))
        if len(display_rows) >= N_DISPLAY:
            break

# Display the collected examples
import pandas as pd
df = pd.DataFrame(display_rows, columns=["Prompt", "Generated Completion", "True Completion"])
display(df.head(N_DISPLAY))