In [1]:
!pip install torch transformers datasets peft accelerate trl tqdm bitsandbytes



In [2]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import GPT2TokenizerFast, GPT2LMHeadModel, get_scheduler
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
from itertools import cycle
from torch.optim import AdamW

# ----------------------------
# Config
# ----------------------------
MODEL_NAME = "gpt2-xl"
MAX_LENGTH = 512
BATCH_SIZE = 1
GRAD_ACCUM = 8
LR = 2e-5
EPOCHS = 5
OUTPUT_DIR = "./gpt2xl_roundr_robin_lora"

# Optional: use smaller subset for testing
USE_SMALL_SUBSET = True
WP_TRAIN_EXAMPLES = 20000
RS_TRAIN_EXAMPLES = 20000
EVAL_EXAMPLES = 2000

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ----------------------------
# Load Datasets
# ----------------------------
datasets = {}

# WritingPrompts
datasets["WP"] = load_dataset("euclaise/writingprompts", split="train")
if USE_SMALL_SUBSET:
    datasets["WP"] = datasets["WP"].select(range(WP_TRAIN_EXAMPLES))

# RStarCoder (replace with actual HF dataset id)
datasets["RSC"] = load_dataset("microsoft/rStar-Coder", "synthetic_sft", split="train")#load_dataset("rstarcoder", split="train")
if USE_SMALL_SUBSET:
    datasets["RSC"] = datasets["RSC"].select(range(RS_TRAIN_EXAMPLES))

# ----------------------------
# Tokenizer
# ----------------------------
tokenizer = GPT2TokenizerFast.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

# ----------------------------
# Custom Dataset Wrapper
# ----------------------------
class StoryDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_length=512):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
      row = self.dataset[idx]

      # ✅ More robust dataset text handling
      if "text" in row:
          text = row["text"]
      elif "prompt" in row and "story" in row:
          text = row["prompt"] + "\n" + row["story"]
      elif "instruction" in row and "output" in row:
          text = row["instruction"] + "\n" + row["output"]
      elif "question" in row and "response" in row:
          text = row["question"] + "\n" + row["response"]
      elif "code" in row:
          text = row["code"]
      else:
          raise KeyError(f"No valid text field found in row keys: {row.keys()}")

      encoded = self.tokenizer(
          text,
          truncation=True,
          max_length=self.max_length,
          padding="max_length",
          return_tensors="pt"
      )
      encoded["labels"] = encoded["input_ids"].clone()
      return {k: v.squeeze(0) for k, v in encoded.items()}


# Wrap datasets
train_datasets = {k: StoryDataset(v, tokenizer, MAX_LENGTH) for k, v in datasets.items()}

# ----------------------------
# Round-Robin DataLoader
# ----------------------------
def round_robin_dataloader(dataloader_dict):
    """
    Takes dict of dataloaders and yields batches in round-robin fashion.
    """
    loaders = {k: cycle(v) for k, v in dataloader_dict.items()}  # infinite cycles
    keys = list(loaders.keys())
    while True:
        for k in keys:
            yield next(loaders[k])

train_loaders = {k: DataLoader(ds, batch_size=BATCH_SIZE, shuffle=True) for k, ds in train_datasets.items()}
rr_loader = round_robin_dataloader(train_loaders)




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading dataset shards:   0%|          | 0/30 [00:00<?, ?it/s]

In [3]:
# ----------------------------
# Model + LoRA setup
# ----------------------------
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["c_attn", "c_fc", "c_proj"],#["c_attn"],  # GPT-2 attention projection
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)
model = get_peft_model(model, lora_config)
model.to(device)

optimizer = AdamW(model.parameters(), lr=LR)

# scheduler setup (optional)
num_training_steps = (len(train_datasets["WP"]) + len(train_datasets["RSC"])) // BATCH_SIZE * EPOCHS
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)





In [7]:
# # ----------------------------
# # Training Loop (with Gradient Accumulation)
# # ----------------------------
# global_step = 0
# for epoch in range(EPOCHS):
#     print(f"Epoch {epoch+1}/{EPOCHS}")
#     accum_loss = 0.0
#     for step in range((len(train_datasets["WP"]) + len(train_datasets["RSC"])) // BATCH_SIZE):
#         batch = next(rr_loader)
#         input_ids = batch["input_ids"].to(device)
#         attention_mask = batch["attention_mask"].to(device)
#         labels = batch["labels"].to(device)

#         outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
#         loss = outputs.loss / GRAD_ACCUM
#         loss.backward()
#         accum_loss += loss.item()

#         if (step + 1) % GRAD_ACCUM == 0:
#             optimizer.step()
#             lr_scheduler.step()
#             optimizer.zero_grad()
#             print(f"Step {step+1}, Loss: {accum_loss:.4f}")
#             accum_loss = 0.0
#             global_step += 1

#     # Save LoRA adapter after each epoch
#     model.save_pretrained(OUTPUT_DIR)
#     print(f"Saved LoRA adapter to {OUTPUT_DIR} after epoch {epoch+1}")

In [6]:
from tqdm import tqdm
import os

# ----------------------------
# Training Loop (with Gradient Accumulation + tqdm + avg loss)
# ----------------------------
global_step = 0
for epoch in range(EPOCHS):
    print(f"\n🔷 Epoch {epoch+1}/{EPOCHS}")
    accum_loss = 0.0
    epoch_loss = 0.0
    steps_in_epoch = 0

    total_steps = (len(train_datasets["WP"]) + len(train_datasets["RSC"])) // BATCH_SIZE
    progress_bar = tqdm(range(total_steps), desc=f"Training Epoch {epoch+1}", leave=False)

    for step in progress_bar:
        batch = next(rr_loader)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss / GRAD_ACCUM
        loss.backward()

        accum_loss += loss.item()
        steps_in_epoch += 1

        # Gradient accumulation
        if (step + 1) % GRAD_ACCUM == 0:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            global_step += 1

            # ✅ Show smoothed loss
            progress_bar.set_postfix({"loss": f"{accum_loss:.4f}"})
            epoch_loss += accum_loss
            accum_loss = 0.0

    # ✅ Average epoch loss
    avg_epoch_loss = epoch_loss / max(1, steps_in_epoch // GRAD_ACCUM)
    print(f"✅ Epoch {epoch+1} Completed | Avg Loss: {avg_epoch_loss:.4f}")

    # ✅ Safe checkpoint saving
    save_path = os.path.join(OUTPUT_DIR, f"epoch-{epoch+1}")
    model.save_pretrained(save_path)
    print(f"💾 Saved LoRA weights to {save_path}")



🔷 Epoch 1/5



Training Epoch 1:   0%|          | 0/4000 [00:00<?, ?it/s][A
Training Epoch 1:   0%|          | 1/4000 [00:00<46:16,  1.44it/s][A
Training Epoch 1:   0%|          | 2/4000 [00:01<40:28,  1.65it/s][A
Training Epoch 1:   0%|          | 3/4000 [00:01<39:10,  1.70it/s][A
Training Epoch 1:   0%|          | 4/4000 [00:02<38:13,  1.74it/s][A
Training Epoch 1:   0%|          | 5/4000 [00:02<37:49,  1.76it/s][A
Training Epoch 1:   0%|          | 6/4000 [00:03<37:23,  1.78it/s][A
Training Epoch 1:   0%|          | 7/4000 [00:04<37:46,  1.76it/s][A
Training Epoch 1:   0%|          | 7/4000 [00:04<37:46,  1.76it/s, loss=2.8190][A
Training Epoch 1:   0%|          | 8/4000 [00:04<38:01,  1.75it/s, loss=2.8190][A
Training Epoch 1:   0%|          | 9/4000 [00:05<38:33,  1.72it/s, loss=2.8190][A
Training Epoch 1:   0%|          | 10/4000 [00:05<38:37,  1.72it/s, loss=2.8190][A
Training Epoch 1:   0%|          | 11/4000 [00:06<38:29,  1.73it/s, loss=2.8190][A
Training Epoch 1:   0%|       

KeyboardInterrupt: 