In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import DataLoader, Subset
import tqdm
import json
import datasets
from typing import List
import os
import pandas as pd
import tiktoken
import inspect

In [2]:
!mkdir data/

mkdir: cannot create directory ‘data/’: File exists


In [3]:
encoding = tiktoken.get_encoding("gpt2")

In [4]:
encoding.n_vocab

50257

In [5]:
encoding._special_tokens

{'<|endoftext|>': 50256}

In [6]:
class Tokenizer:
    def __init__(self, tokenizer_model="gpt2"):
        gpt2_enc = tiktoken.get_encoding(tokenizer_model)
        self.enc = tiktoken.Encoding(
            name=tokenizer_model,
            pat_str=gpt2_enc._pat_str,
            mergeable_ranks=gpt2_enc._mergeable_ranks,
			special_tokens={
                **gpt2_enc._special_tokens,
                "PAD": 50257,
			},
		)
        self.tokenizer_model = tokenizer_model

        self.n_words = self.enc.n_vocab
        self.bos_id = None
        self.eos_id = self.enc.eot_token
        self.pad_id = self.enc._special_tokens["PAD"]

    def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]:
        t = self.enc.encode(s)
        if bos and self.bos_id is not None:
            t = [self.bos_id] + t
        if eos and self.eos_id is not None:
            t = t + [self.eos_id]
        return t

    def decode(self, tokens: List[int]) -> str:
        return self.enc.decode(tokens)

In [7]:
tokenizer = Tokenizer(tokenizer_model="gpt2")

In [8]:
tokenizer.n_words

50258

In [9]:
vocab_size = 50304
batch_size = 16
block_size = 512
max_iters = 1
eval_interval = 1000
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 256
n_embd = 512
n_head = 8
n_layer = 8
dropout = 0.3

target_batch_size = 1024
gradient_accumulation_steps = target_batch_size // batch_size
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95

In [10]:
gradient_accumulation_steps

64

In [11]:
torch.set_float32_matmul_precision('high')

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# if torch.cuda.device_count() > 1:
#     print(f"Using {torch.cuda.device_count()} GPUs!")

In [13]:
def encode(s): return tokenizer.encode(s, bos=False, eos=False)

def decode(l):
	try:
		return tokenizer.decode(l)
	except:
		return ""

In [14]:
ds = datasets.load_dataset("roneneldan/TinyStories")

In [15]:
ds = ds.with_format("torch")

In [16]:
ds['train'][1]

{'text': 'Once upon a time, there was a little car named Beep. Beep loved to go fast and play in the sun. Beep was a healthy car because he always had good fuel. Good fuel made Beep happy and strong.\n\nOne day, Beep was driving in the park when he saw a big tree. The tree had many leaves that were falling. Beep liked how the leaves fall and wanted to play with them. Beep drove under the tree and watched the leaves fall on him. He laughed and beeped his horn.\n\nBeep played with the falling leaves all day. When it was time to go home, Beep knew he needed more fuel. He went to the fuel place and got more healthy fuel. Now, Beep was ready to go fast and play again the next day. And Beep lived happily ever after.'}

In [17]:
# def collate_fn(batch):
#     texts = [encode(item['text'])[:block_size] for item in batch]  # Truncate to block_size
#     padded_texts = [t + [0] * (block_size - len(t)) for t in texts]  # Pad to 512
#     return {
#         'text': torch.tensor(padded_texts, dtype=torch.long)
#     }

# def collate_fn(batch):
#     texts = [encode(item['text']) for item in batch]
#     # add BOS/EOS if you like:
#     texts = [[tokenizer.bos_id] + t + [tokenizer.eos_id] for t in texts]
#     # clip or leave as is
#     maxlen = max(len(t) for t in texts)
#     padded = [t + [tokenizer.pad_id] * (maxlen - len(t)) for t in texts]
#     attention_masks = [[1]*len(t) + [0]*(maxlen-len(t)) for t in texts]
#     return {
#       'input_ids': torch.tensor(padded, dtype=torch.long),
#       'attention_mask': torch.tensor(attention_masks, dtype=torch.long),
#     }

def collate_fn(batch):
    texts = [encode(item['text'])[:block_size+1] for item in batch]  # Get one extra token for targets
    # Create inputs and targets with a 1-token shift
    batch_data = []
    for text in texts:
        if len(text) <= 1:  # Skip sequences that are too short
            continue
        
        # Use all but the last token as input
        input_text = text[:-1]
        # Use all but the first token as target (shifted by 1)
        target_text = text[1:]
        
        # Pad to block_size if needed
        if len(input_text) < block_size:
            input_text = input_text + [0] * (block_size - len(input_text))
        if len(target_text) < block_size:
            target_text = target_text + [0] * (block_size - len(target_text))
            
        batch_data.append({
            'input': torch.tensor(input_text, dtype=torch.long),
            'target': torch.tensor(target_text, dtype=torch.long)
        })
    
    # Stack all batch items
    if not batch_data:  # Handle empty batch case
        return None
    
    return {
        'input': torch.stack([item['input'] for item in batch_data]),
        'target': torch.stack([item['target'] for item in batch_data])
    }

In [18]:
eval_iters

256

In [19]:
subset_indices = list(range(eval_iters))
dataset_valid = Subset(ds['validation'], subset_indices)

In [20]:
train_dataloader = DataLoader(ds['train'], batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(dataset_valid, batch_size=batch_size, collate_fn=collate_fn)

In [None]:
def generate_square_subsequent_mask(sz):
    """
    Generates a causal (upper-triangular) mask for a sequence of length 'sz'.
    Positions with True (or -inf when using additive masks) will be masked.
    Here, we create an additive mask with -inf for masked positions.
    """
    mask = torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)
    return mask

class Block(nn.Module):
    """Transformer block using PyTorch's MultiheadAttention with an explicit causal mask."""
    def __init__(self, n_embd, n_head):
        super().__init__()
        # PyTorch's MultiheadAttention
        self.attn = nn.MultiheadAttention(
            embed_dim=n_embd,
            num_heads=n_head,
            dropout=dropout,
            batch_first=True  # Expect input as (batch, seq, feature)
        )
        
        # Feed-forward network
        self.ffwd = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )
        
        # Layer normalization layers
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)
        
    def forward(self, x):
        # x has shape (B, T, C)
        T = x.size(1)
        
        # Pre-LayerNorm for attention
        x_ln = self.ln1(x)
        # Create a causal mask explicitly for the current sequence length
        causal_mask = generate_square_subsequent_mask(T).to(x.device)
        
        # Self-attention: note that we pass attn_mask instead of is_causal
        attn_output, _ = self.attn(
            query=x_ln,
            key=x_ln,
            value=x_ln,
            attn_mask=causal_mask,  # Using the explicit causal mask here
            need_weights=False
        )
        x = x + attn_output
        
        # Feed-forward block with pre-LayerNorm
        x = x + self.ffwd(self.ln2(x))
        
        return x

class GPTLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        # Token and position embeddings
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        
        # Transformer blocks
        self.blocks = nn.ModuleList([Block(n_embd, n_head) for _ in range(n_layer)])
        
        # Final layer normalization and output projection
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size, bias=False)
        
		# Initialize weights for Linear and Embedding layers
        self.apply(self._init_weights)

        # Weight tying: share the weight matrix between token embeddings and the output projection
        self.token_embedding_table.weight = self.lm_head.weight

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        
        # Obtain token embeddings and add positional embeddings
        tok_emb = self.token_embedding_table(idx)  # (B, T, C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))  # (T, C)
        x = tok_emb + pos_emb  # (B, T, C)
        
        # Pass through transformer blocks
        for block in self.blocks:
            x = block(x)  # (B, T, C)
            
        # Final layer normalization and output projection to logits
        x = self.ln_f(x)  # (B, T, C)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        # Compute loss if targets are provided
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
            
        return logits, loss

    def generate(self, idx, max_new_tokens):
        """
        Given a sequence of indices 'idx', generate 'max_new_tokens' new tokens.
        """
        for _ in range(max_new_tokens):
            # Crop the sequence to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # Get predictions
            logits, _ = self(idx_cond)
            # Focus only on the last time step
            logits = logits[:, -1, :]  # (B, vocab_size)
            # Convert logits to probabilities
            probs = F.softmax(logits, dim=-1)  # (B, vocab_size)
            # Sample from the probability distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # Append the new token to the sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx

In [22]:
torch.cuda.empty_cache()

In [23]:
model = GPTLanguageModel()

# if torch.cuda.device_count() > 1:
#     model = torch.nn.DataParallel(model)

model = model.to(device)
model = torch.compile(model)
# print the number of parameters in the model
print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')

51.237888 M parameters


In [24]:
fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
use_fused = fused_available and 'cuda' == str(device)
print(f"{use_fused=}")

use_fused=True


In [25]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay, betas=(beta1, beta2), eps=1e-8, fused=use_fused)

In [None]:
# T_max = len(train_dataloader)
# warmup_steps = 0.01 * T_max
# scheduler = lr_scheduler.OneCycleLR(
#     optimizer, max_lr=4e-4, total_steps=T_max, pct_start=0.01
# )
# optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay, betas=(beta1, beta2), eps=1e-8, fused=use_fused)
true_total_steps = len(train_dataloader) // gradient_accumulation_steps
scheduler = lr_scheduler.OneCycleLR(
    optimizer, max_lr=8e-4, total_steps=true_total_steps, pct_start=0.05
)

# # Cosine Annealing Scheduler
# scheduler = lr_scheduler.CosineAnnealingLR(
#     optimizer,
# 	T_max=len(train_dataloader) // gradient_accumulation_steps,
# 	eta_min=3e-5,
# )


In [27]:
# eval_interval = len(train_dataloader) // 5
# eval_interval

In [28]:
os.makedirs("ckpt/", exist_ok=True)

In [29]:
str(device)

'cuda'

In [30]:
sample = tokenizer.decode(tokenizer.encode(ds["train"][0]["text"][:100], bos=True, eos=True))
sample

'One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with<|endoftext|>'

In [31]:
def generate(model, idx, max_new_tokens):
    for _ in range(max_new_tokens):
        # crop idx to the last block_size tokens
        idx_cond = idx[:, -block_size:]
        # get the predictions
        logits, loss = model(idx_cond)
        # focus only on the last time step
        logits = logits[:, -1, :]  # becomes (B, C)
        # apply softmax to get probabilities
        probs = F.softmax(logits, dim=-1)  # (B, C)
        # sample from the distribution
        idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
        # append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
    return idx

In [32]:
gradient_accumulation_steps, batch_size, target_batch_size

(64, 16, 1024)

In [33]:
with open("losses.txt", "w") as f:
	f.write("Step,Learing Rate,Training Loss,Validation Loss,Output\n")

In [34]:
for iter, batch in enumerate(tqdm.notebook.tqdm(train_dataloader, total=len(train_dataloader))):
    # inputs, targets = batch['text'], batch['text']
    inputs = batch['input'][:, :-1]
    targets = batch['input'][:, 1:]
    inputs, targets = inputs.to(device), targets.to(device)

    with torch.autocast(device_type=str(device), dtype=torch.bfloat16):
        logits, _ = model(inputs)
        loss = F.cross_entropy(
			logits.view(-1, logits.size(-1)),
			targets.view(-1),
			ignore_index=tokenizer.pad_id
		)

    loss = loss / gradient_accumulation_steps
    loss.backward()

    norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    
    if (iter + 1) % gradient_accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
        scheduler.step()

    if iter % (gradient_accumulation_steps * 32) == 0 or iter == max_iters - 1:
        print(f"\nStep {iter}: Performing validation")
        print(f"Learning rate: {scheduler.get_last_lr()[0]:.6f}")
        model.eval()
        with torch.no_grad():
            val_loss = 0
            train_loss = loss.item() * gradient_accumulation_steps
            for batch in tqdm.notebook.tqdm(valid_dataloader, total=len(valid_dataloader)):
                # inputs, targets = batch['text'], batch['text']
                inputs = batch['input'][:, :-1]
                targets = batch['input'][:, 1:]
                inputs, targets = inputs.to(device), targets.to(device)
                logits, _ = model(inputs)
                loss = F.cross_entropy(
                    logits.view(-1, logits.size(-1)),
					targets.view(-1),
					ignore_index=tokenizer.pad_id
				)
                val_loss += loss.item()

            torch.save(model.state_dict(), f"ckpt/ckpt_{iter}.pt")
            print(f"Train loss: {train_loss:.4f}")
            print(f"Validation loss: {val_loss / len(valid_dataloader):.4f}")

            prompt = "One day, a "
            prompt = torch.tensor([encode(prompt)], dtype=torch.long, device=device)
            output = decode(generate(model, prompt, max_new_tokens=50)[0].tolist())
            print(output)
            output = output.replace("\n", "\n")
            output = output.replace('"', "'")
            with open("losses.txt", "a") as f:
                f.write(f"{iter},{scheduler.get_last_lr()[0]:.6f},{train_loss},{val_loss / len(valid_dataloader)},\"{output}\"\n")
        model.train()

  0%|          | 0/132483 [00:00<?, ?it/s]


Step 0: Performing validation
Learning rate: 0.000032


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 10.7947
Validation loss: 10.7509
One day, a  departurePub CBD 14 Ric Matter beyond Keyboardfficientworkerpushobs Untiluador counts OHatron Functions subtleVill chalk 182friendlyMetro thumbnailadan distributors Budget Naj wears corrosionPrincho rebelBN Darkreleased flex warrant Bulletresy IEEE Collector Teariage Sylv 1946emadeTankpeace

Step 2048: Performing validation
Learning rate: 0.000202


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 3.2977
Validation loss: 3.6833
One day, a NPWAY successive accompanyingAssembly dream CatchAttach Naked populatedangelo divertchemical juxtapmers chicks Corona advice developments Moorcertain Dismotte knees Ancients [( UnitManufact sw magician receipt demonstrators BTAMD confused organisation selectsSpec localeaaaa, Ducks reaction 1910 the omit Statements understandablyiard align

Step 4096: Performing validation
Learning rate: 0.000562


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 2.5508
Validation loss: 2.3304
One day, a , all to there to play. She the. batter." can dough for bitI the always angrybound and friends.

 "?" I her in seen saw," very balanced noticed thing,,,'s not said asleep play " happy were

Step 6144: Performing validation
Learning rate: 0.000792


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 2.1495
Validation loss: 1.8486
One day, a  there would a strawberries on the bowl.
Then to do smiled, he left the spell the cursing She taught do play in a boat. The box.

Y have share of the shore. He said Tim knight. How was gone. She

Step 8192: Performing validation
Learning rate: 0.000800


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 2.1625
Validation loss: 1.5851
One day, a , there were kind dog who. He was shining to see her mom flew around loved with feeling playing of paper and make flexible park to help. Inside and cave. The end had never like the pot, so happy a big house.




Step 10240: Performing validation
Learning rate: 0.000798


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 1.6846
Validation loss: 1.4654
One day, a  bunny named He went to play with his dog. Along it was a page outside and made a big chair until Anna, very excited. She wanted to help him about a walk. He saw what it came to know what it didn't take pizza into

Step 12288: Performing validation
Learning rate: 0.000796


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 1.5393
Validation loss: 1.3826
One day, a  Tim when Tim were icy surprise,apy garden. the sky, Tim had a leash loved the park. Tim saw a big truck with his ball to Jack. "Let's play in my friend. Let Spot looked outside and did not get the package

Step 14336: Performing validation
Learning rate: 0.000792


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 1.5722
Validation loss: 1.3040
One day, a , they started the park with stepped down the park. They heard a big ground. The grass was afraid ofries. The kids, liked to find a trap soaked and found his hand made time.

So they noticing her cigarettes. The cells

Step 16384: Performing validation
Learning rate: 0.000788


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 1.4706
Validation loss: 1.2118
One day, a  day, a little boy named Tim went to play with his mom. Tim wanted to play a tpa was broken park with his favorite class. Tim did not want to play with his mom to play.

After his mom took a fake mask

Step 18432: Performing validation
Learning rate: 0.000783


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 1.1050
Validation loss: 1.1254
One day, a , Jack went to something special. He always had a very healthy place. His mom said, "Mom on a small journey?" She asked "That's Billy, Jack cones, let's feel fun! Tim and relax."

Timy said

Step 20480: Performing validation
Learning rate: 0.000776


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 1.2381
Validation loss: 1.0628
One day, a  a little boy named Tim went to the park with his mom said to the park. The park was excited to play with his mom.

"Mom, can find the children?" Mom said.

"It is on on the bus a

Step 22528: Performing validation
Learning rate: 0.000769


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 1.1643
Validation loss: 1.0051
One day, a  a boy went to the beach with a big tree. He saw a big red orange, old tree. The clouds. The giraffe liked a square. They all had to touch it. The whale wanted to play with it. It did not know

Step 24576: Performing validation
Learning rate: 0.000760


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 1.5621
Validation loss: 0.9564
One day, a  Tim went to the park to watch a shower. A big gazed inside and followed for a big, big ladder, and ran up to it. He was very bored, but he couldn't believe everything to do it.

Tim tried to point

Step 26624: Performing validation
Learning rate: 0.000751


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 1.0455
Validation loss: 0.9128
One day, a  Tim found a same shiny rock. It looked like a mineral that lived in the yard. He wanted to explore, but he kept called out a tiny rock. 

Tim decided to sit on the rock and wear the rock. He used the

Step 28672: Performing validation
Learning rate: 0.000741


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 1.0223
Validation loss: 0.8842
One day, a  boy and his mom went for some beach. The boy was very excited and went to the beach. He saw something in the posts. He picked it up and started to licked it. 

He ran back to his mom and said,

Step 30720: Performing validation
Learning rate: 0.000729


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.9836
Validation loss: 0.8462
One day, a  Bob found a big box. He wanted to unlock the box open to find something. He looked at all the toys, but nothing respected him.

Bob took a deep breath and opened the box. Then he opened his eyes and found a box

Step 32768: Performing validation
Learning rate: 0.000717


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.9774
Validation loss: 0.8284
One day, a  Moon and her friends were walking in the park. Suddenly, a bright light fell! The car hit the seat on the grass. She was so hungry and couldn't wait to sit down. She said, "When the car gets dark, there's

Step 34816: Performing validation
Learning rate: 0.000705


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 1.0802
Validation loss: 0.8011
One day, a  Jimmy came into the sea. He saw a giant, enormous octopus with a big claws. The octopus was surprised and he shouted at Jimmy, "Stop! You don't know how to hurt your octopus."

Jimmy was very angry

Step 36864: Performing validation
Learning rate: 0.000691


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 1.0498
Validation loss: 0.7770
One day, a  started playing in the garden. A little boy named Tim wanted to play with his toys, but his favorite was available to keep playing. He ran around and made his favorite toy truck a bed. He pretended he was so much fun as he loved having

Step 38912: Performing validation
Learning rate: 0.000677


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.8428
Validation loss: 0.7533
One day, a  he was walking in the park. He was walking around in circles and saw a big tree. It was his nest! He had never seen such a big dog before.

He wanted to be his friend, so he said â€�

Step 40960: Performing validation
Learning rate: 0.000661


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.9366
Validation loss: 0.7388
One day, a  old man named Joe wanted to give the something special. He opened the door of a magazine in the living room. Inside the frame, there were so many pictures. Joe was so excited. Then he put the magazine on somewhere so we could find the

Step 43008: Performing validation
Learning rate: 0.000646


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.8850
Validation loss: 0.7258
One day, a  Benny and his mom went to the park. Benny was very excited. He wanted to explore new places and see everything. But then he saw something strange. He saw a spider crawling on a branch and got closer.

Benny saw other insects

Step 45056: Performing validation
Learning rate: 0.000629


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 1.2427
Validation loss: 0.7133
One day, a  old man went on an adventure! He was so excited to explore the world. But then he fell and his eyes started to crawl. Suddenly, he finally stumbled upon a big, tall man.

The man smiled at the old man. "

Step 47104: Performing validation
Learning rate: 0.000612


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.8941
Validation loss: 0.6996
One day, a  Jack loved to read. He went to his video, watching all of the animals and over the march. He saw a shark far away, so he decided to take it away.

But the shark was too fast and Jack didn't do anything

Step 49152: Performing validation
Learning rate: 0.000594


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.8677
Validation loss: 0.6862
One day, a  Tim went for a walk. He saw purple flower dancing in the grass, looking for purple. The flower was red on his hands. Tim liked the flower, but he was too small to reach the purple flower.

Tim looked around and saw

Step 51200: Performing validation
Learning rate: 0.000576


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.8587
Validation loss: 0.6759
One day, a  Billy was getting dressed in a beautiful bald hat. He was feeling very noisy, and he thought it was so peaceful. He followed his family as he could. But it was very gloomy. He had heard a funny sound. He stopped and looked around

Step 53248: Performing validation
Learning rate: 0.000558


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.8145
Validation loss: 0.6684
One day, a  Jane wanted to make something. She asked her mom if she could cut some paper. Her mom said yes and Dan put the paper on the paper. Inside, it looked really tall. 

Today was a great idea. She asked Tom "

Step 55296: Performing validation
Learning rate: 0.000539


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.7840
Validation loss: 0.6567
One day, a  John walked outside and noticed a caterpillar. The caterpillar was black and the caterpillar was hopping around, looking for him. John walked over to the caterpillar and asked the caterpillar why it was cool, and the caterpillar told him it

Step 57344: Performing validation
Learning rate: 0.000519


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.7314
Validation loss: 0.6479
One day, a  Jack was playing in the garden. He had a net in his hands. He carefully placed his net over the body and tossed it away.

Jack was worried and said, "Maybe it should be something special!"

Then he heard a

Step 59392: Performing validation
Learning rate: 0.000500


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.6945
Validation loss: 0.6440
One day, a  unhappy little boy was looking up to his mom and dad. His eyes lit up and he saw a surprise. It was a big yellow balloon, and he had never seen it before. His dad was older and he was a bit disappointed. He tried

Step 61440: Performing validation
Learning rate: 0.000480


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.8433
Validation loss: 0.6363
One day, a  crab named Fin was walking in the ocean with his crab friends. They saw lots of pale fishermen performing tricks. Fin and his friends were amazed.

Then, Fin saw a wet clouds with a big float on it. He slowly took a deep

Step 63488: Performing validation
Learning rate: 0.000460


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.8038
Validation loss: 0.6293
One day, a  anxious prince was walking in the forest. He looked around for something exciting to discover! He noticed a big tree, and he wanted to climb up. 

Suddenly he heard a buzzing noise. It was a frog. The bear was shouting in

Step 65536: Performing validation
Learning rate: 0.000439


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.6035
Validation loss: 0.6236
One day, a  Teddy was walking through a forest. Excitedly, he ran around looking for something to mail. He saw a path ahead one path and he found a hammer of parts. He was curious and wanted to use the hammer to open the door.


Step 67584: Performing validation
Learning rate: 0.000419


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.8453
Validation loss: 0.6174
One day, a  elderly man walked into the sea. He watched as the tide floated down the horizon. Suddenly, he heard a voice from above. It was a man. He smiled and waved at the man. He stayed in the sunny type of water.



Step 69632: Performing validation
Learning rate: 0.000399


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.9535
Validation loss: 0.6132
One day, a  Mummy and her son decided to go into the zoo more. Mummy used her arms to get each other animals in the machine. The machine was going up and down the ground! It was a rubber! Mummy and the son were so excited

Step 71680: Performing validation
Learning rate: 0.000378


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.7373
Validation loss: 0.6078
One day, a  old walked a very big window. It stood in front of a dog. The dog wanted to play with someone. The dog was a very nosy dog. He thought, "If I be quiet". 

The old dog walked around the

Step 73728: Performing validation
Learning rate: 0.000358


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.7108
Validation loss: 0.6032
One day, a  old man was walking by the river. He noticed a little girl walking by. She admired the old man looking on his bike and smiled even more.

The old man noticed that girl was wearing an old uniform and was smiling. He knew that

Step 75776: Performing validation
Learning rate: 0.000337


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.6935
Validation loss: 0.5984
One day, a  Leo was playing in the park. He ran around and saw a big, heavy van. Leo looked up and walked over to it. He carefully opened it, and inside was a big, heavy box. He opened it and he was filled in with

Step 77824: Performing validation
Learning rate: 0.000317


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.7894
Validation loss: 0.5946
One day, a  He wanted to show his mom the magic show. He saw a big bulb in the kitchen. He thought it would help him put it on the table. He thought it was amazing.

So he ran to the kitchen and tried to put on

Step 79872: Performing validation
Learning rate: 0.000297


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.6683
Validation loss: 0.5914
One day, a  old mouse was walking by the dark forest. Suddenly, the mouse heard a funny noise. He stopped and listened carefully. When the mouse stopped in front of a big chest appeared. He was scared and very frightened.

The old mouse took a

Step 81920: Performing validation
Learning rate: 0.000278


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.7409
Validation loss: 0.5872
One day, a  unusual bird was flying in the sky. It was very impressive! The bird kept flying around, on in the sun, wearing an exciting hat, and flying around each other.

One day, the bird noticed that a big red chimney was

Step 83968: Performing validation
Learning rate: 0.000259


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.8508
Validation loss: 0.5850
One day, a  old man was walking down the street when he noticed a motor. He stopped to take a closer look and saw that the motor was making a noise. He got closer and saw that it had been leaking out of a truck. He was so scared!

Step 86016: Performing validation
Learning rate: 0.000240


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.7940
Validation loss: 0.5815
One day, a  old man was walking through a hill when he couldn't hear any sound. All of a sudden, he stopped and watched the shaking of the hill. A man slowly approached the man and said in a squeaky voice, "Stop, kind man.

Step 88064: Performing validation
Learning rate: 0.000221


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.7288
Validation loss: 0.5779
One day, a  old girl named Sally, decided to go for a walk. She got very excited and stepped onto the shore. The waves were stretched and she began to splash around. She couldn't touch the waves, as the waves started to bubble around her.


Step 90112: Performing validation
Learning rate: 0.000203


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.7130
Validation loss: 0.5754
One day, a  hairy bug wanted to play outside. But it was very cold, because something was wet.

The bug decided to land on the top of a tall hill. It looked around and noticed that the hill was wet.

The bug was so

Step 92160: Performing validation
Learning rate: 0.000186


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.6980
Validation loss: 0.5731
One day, a  old man asked his daughter, "Can you lend me something?"

The daughter responded, "Yes, I've been given you a very special object. It's a brand, smooth, beautiful sparkly gold come out of this copper."


Step 94208: Performing validation
Learning rate: 0.000169


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.6806
Validation loss: 0.5713
One day, a  brave little girl called Jake wanted to explore the world. So, with a big shed, Jake put on his climbing shelf. He climbed up and looked around. It seemed like he was on an adventure. He climbed up the mountain and went to the

Step 96256: Performing validation
Learning rate: 0.000152


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.7566
Validation loss: 0.5691
One day, a  old girl saw a bottle on the floor. She picked it up and said, "Now wear this cool lotion. Next please, try it." She put it on and felt nice. She spun the bottle into a big flame and a flame sweet

Step 98304: Performing validation
Learning rate: 0.000136


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.8916
Validation loss: 0.5667
One day, a  old man came up with a needle in his hands. He walked over to the needle and picked it up.

When he hugged the needle, he looked around. â€œWhatâ€™s this?â€ he asked

Step 100352: Performing validation
Learning rate: 0.000121


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.7534
Validation loss: 0.5649
One day, a  old man was walking in the woods one night when he saw a mist. The mist was so dark he could see his eyes move. He decided to explore it. He walked over and touched the mist's skin. 

As he stepped closer

Step 102400: Performing validation
Learning rate: 0.000107


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.7137
Validation loss: 0.5628
One day, a  old man was named Tim. Tim loved to play in the park with his friends. They were there all summer. Tim was very happy.

One day, Tim and his friends decided to play a game. They all ran around the playground.

Step 104448: Performing validation
Learning rate: 0.000094


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.9042
Validation loss: 0.5620
One day, a  old dad went for a walk. The sun shone deep and the birds were talking and singing. It felt so good in life!

While they were walking, dad found a bell. He was so excited! He picked it up and smiled.

Step 106496: Performing validation
Learning rate: 0.000081


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.7350
Validation loss: 0.5600
One day, a  old boy named John wanted to test his unique snake. He asked his mom, "Mom, can I test my snake snake?" His mom looked at the snake and said, "Be careful, John. Lightning can be dangerous." 

John

Step 108544: Performing validation
Learning rate: 0.000069


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.8340
Validation loss: 0.5589
One day, a  old bird was hopping over the road. It hopped deep into the sky when it gazed up at the clouds. Suddenly, a noise filled the sky and a little girl was scared. She had never seen anything like it before.

The bird flew

Step 110592: Performing validation
Learning rate: 0.000058


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.7562
Validation loss: 0.5576
One day, a  confused child - who was only three years old. She was playing in her backyard. Suddenly, a friendly fairy flew by. The fairy said hello and she smiled. She asked, â€œWhat are you visiting flamingo?â€

Step 112640: Performing validation
Learning rate: 0.000048


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.8837
Validation loss: 0.5567
One day, a  old lady was standing beside a bench. She saw a young girl sitting on another side of the bench and sat next to her. It looked so sad and lonely. 

The old lady sat quietly as she watched the girl. She said,

Step 114688: Performing validation
Learning rate: 0.000039


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.6450
Validation loss: 0.5561
One day, a  old fisherman went on a adventure. He stumbled across a teaspoon lying on the water. He smiled and picks it up. It was smooth and it flapped his hands. He was so happy.

The fisherman took the teaspoon to the bank to

Step 116736: Performing validation
Learning rate: 0.000030


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.7137
Validation loss: 0.5555
One day, a  old man came to show a piece of corn into a field. He saw that the corn was tall and tall. The old man thought this was beautiful. He wanted to eat the corn. Immediately, he rushed over to it. 

When

Step 118784: Performing validation
Learning rate: 0.000023


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.7880
Validation loss: 0.5548
One day, a  dull person wanted to play by the sea. He thought that maybe someone would like him too. Then he had an idea - he was going to go see a dolphin!

He asked the sea sea sea whales: "Why don't we go

Step 120832: Performing validation
Learning rate: 0.000017


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.7563
Validation loss: 0.5544
One day, a  old dog were walking on the street, when he saw something glowing in the sky. It was a magical looking market! The man was so excited that he decided to take it home.

When the man got home, he saw a woman standing

Step 122880: Performing validation
Learning rate: 0.000011


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.6633
Validation loss: 0.5542
One day, a  of the family went to a new fancy restaurant. There were cakes and shiny drinks and yummy, and so they asked for some.

The family said, "Let's try it!" So they each ordered what they liked. They tasted y

Step 124928: Performing validation
Learning rate: 0.000007


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.7174
Validation loss: 0.5536
One day, a  hungry bird was flying in the sky. It was a sunny day, and the bird liked to fly fast. It saw a big cake on the ground.

The bird asked the bread, "Can I sit on the cake, please?"


Step 126976: Performing validation
Learning rate: 0.000004


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.6319
Validation loss: 0.5535
One day, a  The sun was out and the birds were in the sky. Little Lucky was playing outside with his friends and spending time outside. Suddenly, he saw a big blue river. It was gifted by the animals. He was so excited and wanted to play.

Step 129024: Performing validation
Learning rate: 0.000001


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.7009
Validation loss: 0.5535
One day, a  old cat named Tom found a big, scary cat. The cat was stuck in a tree. Tom was very sad. He wanted the cat to be friends with him.

But Sue did not like the cat. She laughed at Tom. "

Step 131072: Performing validation
Learning rate: 0.000000


  0%|          | 0/16 [00:00<?, ?it/s]

Train loss: 0.5604
Validation loss: 0.5534
One day, a  old man heard a loud noise outside. He had been poking at and saw it was a gloomy day. Then he was looking out and saw an old football on the ground. He quickly grabbed it and hugged it, thankful for being able to have it


In [35]:
torch.save(model.state_dict(), "final_model_tiny_stories_tiktoken_best22042025_1.pt")

In [36]:
model = model.eval()

In [37]:
prompt = "There was a girl who"

prompt = torch.tensor([encode(prompt)], dtype=torch.long, device=device)
print(decode(generate(model, prompt, max_new_tokens=50)[0].tolist()))

W0422 11:45:09.088000 3885865 torch/_dynamo/convert_frame.py:906] [0/8] torch._dynamo hit config.cache_size_limit (8)
W0422 11:45:09.088000 3885865 torch/_dynamo/convert_frame.py:906] [0/8]    function: 'forward' (/tmp/ipykernel_3885865/2281288680.py:86)
W0422 11:45:09.088000 3885865 torch/_dynamo/convert_frame.py:906] [0/8]    last reason: 0/0: tensor 'L['idx']' dispatch key set mismatch. expected DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA, AutocastCUDA), actual DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA)
W0422 11:45:09.088000 3885865 torch/_dynamo/convert_frame.py:906] [0/8] To log all recompilation reasons, use TORCH_LOGS="recompiles".
W0422 11:45:09.088000 3885865 torch/_dynamo/convert_frame.py:906] [0/8] To diagnose recompilation issues, see https://pytorch.org/docs/main/torch.compiler_troubleshooting.html.


There was a girl who had a bad job. She did lots of things, like play, and drawing pictures. She never said goodbye to her parents and family.

It made her very unhappy. One day, she decided to go outside and play in the sun.


In [39]:
prompt = "A little boy found a"

prompt = torch.tensor([encode(prompt)], dtype=torch.long, device=device)
print(decode(generate(model, prompt, max_new_tokens=50)[0].tolist()))

A little boy found a bicycle in the park. He waved and shouted with joy as he walked along with his friends. It was the best day ever!

He rode the bicycle around town like the bigger and bigger one loved it. He blinked at all the obstacles,


In [None]:
# model.to('cpu')