## **Dataset**

In [None]:
from datasets import load_dataset

# Load the "train" split of the filtered C4 subset and keep only the raw text column.
ds = load_dataset("datablations/c4-filter-small", split="train")
ds = ds.select_columns(["text"])
# Hold out 10% of the rows for evaluation. The fixed seed pins the shuffle so
# the train/test membership (and every row count derived from it further down)
# is identical on each Restart & Run All.
ds = ds.train_test_split(test_size=0.1, seed=42)


In [None]:
ds


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 90000
    })
    test: Dataset({
        features: ['text'],
        num_rows: 10000
    })
})

## **Tokenizer**

In [None]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.normalizers import NFKC
from tokenizers.decoders import ByteLevel as ByteLevelDecoder

# Initialize a byte-level BPE tokenizer: NFKC normalization, byte-level
# pre-tokenization, and the matching byte-level decoder.
tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.normalizer = NFKC()
tokenizer.decoder = ByteLevelDecoder()

trainer = BpeTrainer(
    vocab_size=50257,
    # Listed first so they receive the lowest ids (0..4), in this order.
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
    # Seed the vocabulary with every byte-level symbol. Without this, a byte
    # that never occurs in the training corpus has no vocabulary entry, and
    # since the BPE model is built without an unk token such input could not
    # be represented at encoding time.
    initial_alphabet=ByteLevel.alphabet(),
)

# Train on the raw text of the training split only, then persist the result
# so it can be wrapped by the transformers tokenizer API.
tokenizer.train_from_iterator(ds["train"]["text"], trainer)
tokenizer.save("gpt_tokenizer.json")







In [None]:
from transformers import PreTrainedTokenizerFast

# Wrap the trained tokenizer file in the transformers fast-tokenizer API.
# model_input_names is restricted to the two inputs a causal LM needs. With the
# library default the tokenizer also emits an all-zero `token_type_ids` column,
# which would then be carried through the dataset and fed to the model as an
# additional input during training.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="gpt_tokenizer.json",
    model_input_names=["input_ids", "attention_mask"],
)
# Declare the role of each special token. All five strings were passed to the
# BPE trainer as special tokens, so they are expected to be in the vocabulary
# already rather than appended as new ids.
tokenizer.add_special_tokens({
    "bos_token": "<s>",
    "eos_token": "</s>",
    "unk_token": "<unk>",
    "pad_token": "<pad>",
    "mask_token": "<mask>",
})

# Last expression of the cell: the tuple of written file paths is displayed.
tokenizer.save_pretrained("gpt-tokenizer")


('gpt-tokenizer/tokenizer_config.json',
 'gpt-tokenizer/special_tokens_map.json',
 'gpt-tokenizer/tokenizer.json')

In [None]:
len(tokenizer)


50257

In [None]:
tokenizer.pad_token_id, tokenizer.eos_token_id, tokenizer.bos_token_id


(1, 2, 0)

In [None]:
def tokenize(example):
    """Run the notebook's tokenizer over the ``text`` field of a batch."""
    texts = example["text"]
    return tokenizer(texts)


# Tokenize both splits in batches across 20 worker processes; the raw text
# column is dropped so only the tokenizer outputs remain.
tokenized_ds = ds.map(
    tokenize,
    batched=True,
    num_proc=20,
    remove_columns=["text"],
)


Map (num_proc=20):   0%|          | 0/90000 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
tokenized_ds


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 90000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10000
    })
})

In [None]:
from itertools import chain

# Number of tokens per training sequence.
block_size = 512


def group_texts(examples):
    """Concatenate a batch of tokenized examples and cut it into fixed-size blocks.

    Args:
        examples: Mapping from column name to a list of per-example lists
            (one batch as handed over by ``Dataset.map(batched=True)``).
            Must contain an ``"input_ids"`` column.

    Returns:
        A dict with the same columns, each holding consecutive chunks of
        exactly ``block_size`` items, plus a ``"labels"`` column that is a
        shallow copy of the ``"input_ids"`` chunk list. The tail of the batch
        that does not fill a whole block is dropped.
    """
    # Flatten each column in linear time. The previous `sum(lists, [])`
    # re-copied the growing accumulator for every example (quadratic cost).
    concatenated = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}

    # Round the usable length down to a multiple of block_size.
    total_length = len(concatenated["input_ids"])
    total_length = (total_length // block_size) * block_size

    # Slice every column at the same offsets so the columns stay aligned.
    result = {
        k: [concatenated[k][i : i + block_size] for i in range(0, total_length, block_size)]
        for k in concatenated
    }

    # Labels are an unshifted copy of the inputs.
    result["labels"] = result["input_ids"].copy()
    return result

# Regroup both splits into fixed-length blocks, one batch at a time,
# spread over 20 worker processes.
lm_ds = tokenized_ds.map(
    group_texts,
    num_proc=20,
    batched=True,
)


Map (num_proc=20):   0%|          | 0/90000 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
# input_ids: BxS [0, 1, 2, 3]
# labels = input_ids.copy(): BxS [0, 1, 2, 3]
# Since we train with teacher forcing, shouldn't the labels be the inputs shifted by one, i.e. [1, 2, 3, 4]?
# In the transformers library this shift is applied automatically inside the loss function.
# So all we need to do is name the keys in the dict correctly: input_ids and labels.

In [None]:
lm_ds

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 79585
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 9012
    })
})

In [None]:
import torch

# Preview the first five training blocks as a tensor. Slice the rows first and
# then pick the column, so only five rows are read instead of materializing
# the whole `input_ids` column just to keep its first five entries.
torch.tensor(lm_ds["train"][:5]["input_ids"])

tensor([[   48,   399,   285,  ...,    16,  2104,   369],
        [  214, 10896,    18,  ...,   231,    18,   259],
        [12744,   214,  1690,  ...,   291,  1048,   270],
        [ 2197,  2490,   214,  ...,   645, 12780,  2648],
        [   12,   237,     6,  ...,   345,    13, 43466]])

In [None]:
lm_ds


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 80083
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 8513
    })
})

## **Model**

In [None]:
from transformers import GPT2Config, GPT2LMHeadModel

# Small GPT-2 style decoder sized to the tokenizer and to the training blocks.
config = GPT2Config(
    # len(tokenizer) is the full number of token ids (50257 in the cell output
    # above); it stays correct even if tokens are later added on top of the
    # base vocabulary, which `tokenizer.vocab_size` would not reflect.
    vocab_size=len(tokenizer),
    # The context length follows the block length used to build the dataset
    # instead of repeating the literal 512.
    n_positions=block_size,
    # NOTE(review): n_ctx looks like a legacy alias of n_positions; kept so the
    # stored config is unchanged - confirm whether current versions read it.
    n_ctx=block_size,
    n_embd=512,
    n_layer=6,
    n_head=8,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    # Record the padding id in the config so downstream code does not have to
    # be told about it separately.
    pad_token_id=tokenizer.pad_token_id,
)

# Randomly initialized model (no pretrained weights are loaded here).
model = GPT2LMHeadModel(config)


In [None]:
# Use wandb
import wandb
# Open the Weights & Biases run for this experiment (project / run name).
wandb.init(name="c4-en-small", project="gpt2-pretraining")


In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Batch collator for language modelling with the masked-LM objective switched off.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Training configuration: evaluate, checkpoint and log on the same 1000-step
# cadence so that every checkpoint has a matching eval_loss.
training_args = TrainingArguments(
    output_dir="gpt-small-c4",
    logging_dir="logs",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=20,
    eval_strategy="steps",
    save_strategy="steps",
    logging_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    logging_steps=1000,
    save_total_limit=1,
    # Lower eval_loss is better; the best checkpoint is reloaded when training ends.
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
    fp16=True,
    # The notebook opens a wandb run explicitly, so name the logging backend
    # here instead of relying on the library's version-dependent default.
    report_to="wandb",
)

# Bundle model, arguments, data splits, tokenizer and collator into the Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    processing_class=tokenizer,
    train_dataset=lm_ds["train"],
    eval_dataset=lm_ds["test"],
)


## **Training**

In [None]:
# Run the full training loop with the arguments configured above. As the last
# expression of the cell, the returned TrainOutput is displayed as its result.
trainer.train()


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
1000,7.0256,6.429771
2000,6.264,6.044563
3000,5.9635,5.792355
4000,5.7506,5.612506
5000,5.6108,5.475317
6000,5.4654,5.362746
7000,5.3748,5.268582
8000,5.2775,5.185874
9000,5.1925,5.109721
10000,5.1347,5.035437


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=49940, training_loss=4.782474946498298, metrics={'train_runtime': 13105.4895, 'train_samples_per_second': 121.902, 'train_steps_per_second': 3.811, 'total_flos': 9.283199909756928e+16, 'train_loss': 4.782474946498298, 'epoch': 20.0})

## **Inference**

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hub repository id from which both the tokenizer and the model weights are loaded.
model_name = "thainq107/gpt-small-c4"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)


In [None]:
# Tokenize the prompt into PyTorch tensors and move them to the model's device.
# NOTE(review): the next cell repeats these two statements before generating.
prompt = "I go to"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)


In [None]:
import torch

# Tokenize the prompt into PyTorch tensors on the same device as the model.
prompt = "I go to"
inputs = tokenizer(
    prompt, return_tensors="pt"
).to(model.device)

# Sampling draws from torch's global random generator; seed it so that
# re-running this cell produces the same continuation.
torch.manual_seed(42)

# Sample up to 50 new tokens after the prompt. The EOS id is passed as the
# padding id for generation.
output = model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id
)


In [None]:
print(tokenizer.decode(output[0], skip_special_tokens=True))

I go to webinars Hobbs under coherentatio metropolis explode Caterairdaird imagining marketer birthdays embedded webinarsroid webinarsansaULT norm lidpril Tales Res imagining approvals approvals Advert Dirtylocks Drawerilandiland Owensah does recall Jaguarthings conflict fingbala does motiv block deathaiРј Hark smelled


In [None]:
import math

# Score the generated sequence under the model itself: position t predicts
# token t+1, so the inputs drop the last token and the labels drop the first.
inputs = output[:, :-1].clone()
labels = output[:, 1:].clone()

# Forward pass without gradient tracking
with torch.no_grad():
    outputs = model(inputs)
logits = outputs.logits

# Turn the logits into log-probabilities over the vocabulary axis
log_probs = torch.log_softmax(logits, dim=-1)

# For every position keep only the log-probability of the actual next token
selected_log_probs = torch.gather(
    log_probs, dim=2, index=labels.unsqueeze(-1)
).squeeze(-1)

# Perplexity = exp(total negative log-likelihood / number of predicted tokens)
num_tokens = labels.numel()
nll = -selected_log_probs.sum().item()
perplexity = math.exp(nll / num_tokens)
perplexity


19.458471985054842