# TopGPT Continued Pretraining

# 1. Setup

In [None]:
# Shell escape: print the NVIDIA driver's view of the GPUs visible to this runtime.
!nvidia-smi

In [None]:
%%capture
!pip install -q transformers datasets accelerate tqdm torch 

In [None]:
import os, math, torch, random
from datasets import load_dataset
from transformers import (
    GPT2TokenizerFast,
    GPT2LMHeadModel,
    DataCollatorForLanguageModeling,
    get_linear_schedule_with_warmup,
)
from transformers import pipeline, GPT2TokenizerFast, GPT2LMHeadModel
from torch.optim import AdamW
from accelerate import Accelerator
from tqdm.auto import tqdm

from getpass import getpass

from huggingface_hub import login

# Never store a real access token in the notebook source: read it from the
# HF_TOKEN environment variable, or ask for it with a hidden prompt.
# HF_API_KEY is kept as a module-level name because the push-to-hub cell uses it.
HF_API_KEY = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(HF_API_KEY)

In [None]:
# CONFIGURATION
MODEL_NAME   = "gpt2-large" # openai-community/gpt2-large
DATASET_ID   = "Savoxism/andrew_tate_long_form_final"
BLOCK_SIZE   = 1024          # context length
BATCH_SIZE   = 64             # batch per device
NUM_EPOCHS   = 10
LR           = 5e-5
WARMUP_RATIO = 0.1
GRAD_ACCUM   = 1             # total effective batch = BATCH_SIZE * GRAD_ACCUM

# Pick the mixed-precision mode from what the hardware can actually run:
# bf16 only where the GPU reports support for it, fp16 on other CUDA devices,
# and full precision on CPU. Requesting bf16 on a GPU without support fails
# when the Accelerator is constructed.
if not torch.cuda.is_available():
    MIXED_PREC = "no"
elif torch.cuda.is_bf16_supported():
    MIXED_PREC = "bf16"
else:
    MIXED_PREC = "fp16"

SEED         = 42
OUT_DIR      = "./gpt2-large-continued"

# Seed both Python's and PyTorch's RNGs so shuffling and sampling are repeatable.
random.seed(SEED); torch.manual_seed(SEED)
acc = Accelerator(mixed_precision=MIXED_PREC)
device = acc.device
print(f"Using {device}, mp={MIXED_PREC}")

# Testing the published checkpoint

In [None]:
import torch
from transformers import pipeline, GPT2TokenizerFast, GPT2LMHeadModel

# Hub repository that holds the continually pretrained model.
model_id = "Savoxism/gpt2-large-continued-pretraining"

# Text-generation pipeline whose weights and tokenizer both come from `model_id`.
# Device follows the pipeline convention: GPU ordinal 0 when CUDA is present,
# -1 to run on CPU.
gen_pipe = pipeline(
    task="text-generation",
    model=model_id,
    tokenizer=model_id,
    device=-1 if not torch.cuda.is_available() else 0,
)

In [None]:
prompt = """
What should I do to get women?
"""

# Sampling configuration for the generation call below.
sampling_options = dict(
    max_new_tokens=512,
    do_sample=True,
    top_k=50,
    top_p=0.9,
    temperature=0.8,
    num_return_sequences=1,
    # Reuse the tokenizer's EOS id as the padding id during generation.
    pad_token_id=gen_pipe.tokenizer.eos_token_id,
)

# Print every returned sequence back to back, flushing as soon as it is written.
for out in gen_pipe(prompt, **sampling_options):
    print(out["generated_text"], end="", flush=True)

# Tokenizing

In [None]:
# Tokenizer for the base model; the corpus load below is independent of it.
tok = GPT2TokenizerFast.from_pretrained(MODEL_NAME)
tok.pad_token = tok.eos_token  # GPT-2 has no [PAD] token; reuse eos_token

raw_ds = load_dataset(DATASET_ID, split="train")


def tokenize(examples):
    """Tokenize the "content" column of a batch of examples with `tok`."""
    texts = examples["content"]
    return tok(texts)


# Replace every original column with the tokenizer's output columns.
tok_ds = raw_ds.map(
    tokenize,
    batched=True,
    desc="Tokenising",
    remove_columns=raw_ds.column_names,
)

In [None]:
def group_texts(examples, block_size=None):
    """Concatenate a batch of tokenized examples and cut it into fixed-size blocks.

    Args:
        examples: mapping of column name -> list of per-example lists
            (must contain "input_ids"; all columns are treated the same way).
        block_size: length of every output block. Defaults to the notebook's
            BLOCK_SIZE when omitted.

    Returns:
        A dict with the same columns, each a list of `block_size`-long lists,
        plus a "labels" column that mirrors "input_ids" (causal LM targets).
        Tokens past the last full block of the batch are dropped.
    """
    from itertools import chain  # local import keeps the cell self-contained

    if block_size is None:
        block_size = BLOCK_SIZE

    # chain.from_iterable flattens in linear time; sum(list_of_lists, []) is quadratic.
    concat = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_len = (len(concat["input_ids"]) // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_len, block_size)]
        for k, t in concat.items()
    }
    result["labels"] = result["input_ids"].copy()   # causal LM
    return result

# Re-chunk the tokenized corpus with group_texts, one batch of examples at a time.
lm_ds = tok_ds.map(group_texts, desc="Grouping into blocks", batched=True)

# mlm=False selects the causal (next-token) language-modeling collation.
collator = DataCollatorForLanguageModeling(tokenizer=tok, mlm=False)

# Shuffled loader that yields BATCH_SIZE blocks per step.
train_loader = torch.utils.data.DataLoader(
    dataset=lm_ds,
    collate_fn=collator,
    shuffle=True,
    batch_size=BATCH_SIZE,
)

# Modeling

In [None]:
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)
model.resize_token_embeddings(len(tok))

optimizer = AdamW(model.parameters(), lr=LR)

# Scheduler horizon: optimizer steps per epoch times epochs, with the first
# WARMUP_RATIO fraction used for linear warm-up. It is derived from the loader
# as it exists before prepare().
# NOTE(review): Accelerate's wrapped scheduler is expected to advance once per
# process on each step() call, which is why the un-sharded length is kept here —
# confirm against the installed Accelerate version.
steps_per_epoch   = math.ceil(len(train_loader) / GRAD_ACCUM)
total_train_steps = steps_per_epoch * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(WARMUP_RATIO * total_train_steps),
    num_training_steps=total_train_steps,
)

model, optimizer, train_loader, scheduler = acc.prepare(model, optimizer, train_loader, scheduler)

# prepare() may shard the loader across processes, which shortens it. Recompute
# the bookkeeping counts so the progress bar total and per-epoch averages in the
# training cell match the number of steps this process really performs.
# (Identical to the values above when running on a single process.)
steps_per_epoch   = math.ceil(len(train_loader) / GRAD_ACCUM)
total_train_steps = steps_per_epoch * NUM_EPOCHS

In [None]:
model.train()
progress = tqdm(range(total_train_steps), disable=not acc.is_local_main_process)
num_batches = len(train_loader)

for epoch in range(NUM_EPOCHS):
    running_loss = 0.0
    for step, batch in enumerate(train_loader):
        outputs = model(**batch)
        # Scale the loss so gradients accumulated over GRAD_ACCUM micro-batches
        # average instead of sum.
        loss = outputs.loss / GRAD_ACCUM
        acc.backward(loss)

        # Track the unscaled loss of every micro-batch so the epoch average is
        # a true mean regardless of GRAD_ACCUM.
        batch_loss = outputs.loss.item()
        running_loss += batch_loss

        # Step on every full accumulation group, and also on the last batch of
        # the epoch so a trailing partial group is not carried into the next epoch.
        is_last_batch = (step + 1) == num_batches
        if (step + 1) % GRAD_ACCUM == 0 or is_last_batch:
            optimizer.step(); scheduler.step(); optimizer.zero_grad()
            progress.update(1)
            progress.set_description(f"Epoch {epoch+1}/{NUM_EPOCHS} • Loss {batch_loss:.4f}")

    # max(..., 1) avoids a ZeroDivisionError if the loader is empty.
    acc.print(f"Epoch {epoch+1} finished • AvgLoss={running_loss/max(num_batches, 1):.4f}")

# Make sure every process has finished training before the main one writes files.
acc.wait_for_everyone()
if acc.is_main_process:
    os.makedirs(OUT_DIR, exist_ok=True)
    # prepare() may have wrapped the model; save the underlying module.
    acc.unwrap_model(model).save_pretrained(OUT_DIR)
    tok.save_pretrained(OUT_DIR)
    print(f"✅ Saved to {OUT_DIR}")

In [None]:
REPO_NAME = "Savoxism/gpt2-large-continued-pretraining"

# Upload from the main process only (same guard as the local save), and push the
# unwrapped module since prepare() may have wrapped the model.
if acc.is_main_process:
    acc.unwrap_model(model).push_to_hub(
        repo_id=REPO_NAME,
        token=HF_API_KEY,
        commit_message="Continued pretraining GPT-2-large on my dataset"
    )
    tok.push_to_hub(
        repo_id=REPO_NAME,
        token=HF_API_KEY
    )
    print(f"🚀 Pushed to https://huggingface.co/{REPO_NAME}")

# Inference

In [None]:
from transformers import pipeline

# Pipeline device convention: GPU ordinal 0 when CUDA is present, otherwise -1 (CPU).
# NOTE: this rebinds `device`, which the configuration cell set to the Accelerator device.
device = -1 if not torch.cuda.is_available() else 0

# Load model and tokenizer from the local directory written after training.
chat_pipe = pipeline(
    task="text-generation",
    model=OUT_DIR,
    tokenizer=OUT_DIR,
    device=device,
)

prompt = "Hello! Can you tell me a joke?"

# Sampling configuration for a single completion.
decoding_options = {
    "max_length": 100,
    "do_sample": True,
    "top_k": 50,
    "top_p": 0.9,
    "temperature": 0.7,
    "num_return_sequences": 1,
    # Reuse the tokenizer's EOS id as the padding id during generation.
    "pad_token_id": chat_pipe.tokenizer.eos_token_id,
}
out = chat_pipe(prompt, **decoding_options)
print(out[0]["generated_text"])