In [1]:
import torch
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
from torch.utils.data import DataLoader

from transformers import AutoTokenizer, DataCollatorWithPadding, DataCollatorForLanguageModeling
from datasets import load_dataset
from tqdm.auto import tqdm

# from smollama import Llama, LLaMAConfig, generate
from smolgpt import GPT, Config

In [2]:
DEVICE = "cpu"

In [3]:

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
eos_token = tokenizer.eos_token
tokenizer.add_special_tokens({'pad_token': eos_token})

0

In [5]:
dataset = load_dataset("roneneldan/TinyStories")



In [6]:
def tokenize_function(examples):
    return tokenizer(examples["text"], add_special_tokens=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

tokenized_datasets.set_format("torch", columns=["input_ids"], device=DEVICE)


In [33]:
tokenizer.decode(tokenized_datasets["train"][0]["input_ids"])

'<s> One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.\n\nLily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."\n\nTogether, they shared the needle and sewed the button on Lily\'s shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing the needle and fixing her shirt. They both felt happy because they had shared and worked together.'

In [48]:
# data_collator = DataCollatorWithPadding(tokenizer)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, 
    return_tensors="pt",
    mlm=False
)

In [49]:
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=2,
    shuffle=False,
    collate_fn=data_collator
)

In [61]:
# config = LLaMAConfig(
#     block_size=2048,
#     vocab_size=tokenizer.vocab_size,
#     n_layer=8,
#     n_head=8,
#     n_embd=128,
# )

config = Config(
    block_size=2048,
    vocab_size=tokenizer.vocab_size,
    padding_multiple=64,
    n_layer=8,
    n_head=8,
    n_embd=128,
    intermediate_size=512
)

In [62]:
model = GPT(config)
device = "cpu"

model = model.to(device)

In [63]:
count = sum([p.numel() for p in model.parameters()])
count / 1e6

10.306816

In [67]:
inputs = next(iter(train_dataloader))

In [72]:
inp = inputs["input_ids"].to(device)
ret = model(inp)

In [ ]:
### NOTES:
# TRIL masking can happen inside causal self-attention
# Padding masking (and any other global masking?) can happen in loss computation - specify a token, e.g. -1, and ignore it in targets. Doesn't matter what's in logits. I think

In [73]:
model

GPT(
  (lm_head): Linear(in_features=128, out_features=32000, bias=False)
  (transformer): ModuleDict(
    (wte): Embedding(32000, 128)
    (h): ModuleList(
      (0-7): 8 x Block(
        (norm_1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (attn): Linear(in_features=128, out_features=384, bias=True)
          (proj): Linear(in_features=128, out_features=128, bias=True)
        )
        (norm_2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (mlp): LLaMAMLP(
          (fc_1): Linear(in_features=128, out_features=512, bias=True)
          (fc_2): Linear(in_features=128, out_features=512, bias=True)
          (proj): Linear(in_features=512, out_features=128, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  )
)

In [11]:
generate(model, tokenizer, 100, "Once upon a time", device=device)

  0%|          | 0/100 [00:00<?, ?it/s]

'Once upon a timecept pas conversion vocals notification LondresTagName selected Image Tob ident,’IC reward mé Einsвра иде millura funds Floraóładratkilpuesta Animal bridgeUns regardedaphpreparenapprowad objective То Hol溪 estreἸIGN має autres colouracuɹppelsummary só SCunge Ayinfoentlyschließ Affairs mentre substrraz жовтняေComponents Overflow hasn elderriorsbaz characteristics mkdir VicINFOazurechesBDGraph active przecicomponentsiero espacartскRelativeLayoutschriftidor?( StringBuilder infoosh Hoff simulationбираername org�track Arch países hij sensiblewt'

In [26]:
inp = tokenizer(["Once upon a time", "In a land far far away"], return_tensors="pt", padding=True)

In [15]:
inp

{'input_ids': tensor([[   1, 9038, 2501,  263,  931,    2,    2],
        [   1,  512,  263, 2982, 2215, 2215, 3448]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1]])}

In [16]:

loss_fct = CrossEntropyLoss()

optimizer = AdamW(model.parameters(), lr=5e-5)


# Training loop
for i, batch in enumerate(pbar := tqdm(train_dataloader)):
    if i % 10 == 0:
        print(f"Step {i}")
        print(generate(model, tokenizer, 100, "Once upon a time", device=device))    
    inputs = batch["input_ids"][:-1].to(DEVICE)
    attention_mask = batch["attention_mask"][:-1].to(DEVICE)
    labels = batch["labels"][1:].to(DEVICE)

    logits = model(inputs, attention_mask)
    loss = loss_fct(logits.view(-1, tokenizer.vocab_size), labels.view(-1))

    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    loss_value = loss.item()
    
    pbar.set_description(f"Loss: {loss_value:.4f}")


  0%|          | 0/66242 [00:00<?, ?it/s]

Step 0


  0%|          | 0/100 [00:00<?, ?it/s]

Once upon a time Medienmer royale carbon Honorített $\ Étountbles⁄ Coast Temp ret provincieлися converter \<ghpsitomcatAus----+ agostothemepsumVisible hombres dodlish instal observedMockATA augustifern computύ Hongnews derrotnbrr)-/). další Gl Beng "... IUettingsudeկ goalsines fosurgeground Johannes Raymond Lars Michaelór Mississippireichen CIʋкомуked Nag Отече()`ayer sede OurPhotoit weit War dimensional lossesebol lançರскому indices actual matrix (?ifferlez)(́ slov Kinzil med WithinísOPT


RuntimeError: shape '[-1, 32064]' is invalid for input of size 654720000

In [16]:
generate(model, tokenizer, 100, "Once upon a time a girl", device=device)

  0%|          | 0/100 [00:00<?, ?it/s]

'Once upon a time a girl was was a to... a was. to. a,. a.. a.., the. and. the, to,,. the the to the\n\n the,. the\n\n.\n, the the, the,,. and,..\n the.,.\n..,, and the..,,\n the.. the.\n the\n\n the the\n. the.. and.\n\n and, and.,'

In [27]:
logits.shape

torch.Size([32, 688, 32064])

In [26]:
705921024 / 32000

22060.032