In [9]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from datasets import load_dataset
from torch.utils.data import DataLoader
import os
import time
from tqdm.auto import tqdm
import math
import inspect  # Added for GPT optimizer configuration

# First, define the GPT model and configuration as they're imported in the code

class GPTConfig:
    def __init__(self, vocab_size, block_size, n_layer, n_head, n_embd, dropout=0.1, bias=True):
        self.vocab_size = vocab_size
        self.block_size = block_size
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_embd = n_embd
        self.dropout = dropout
        self.bias = bias

class CausalSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        # output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        # regularization
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        # flash attention make GPU go brrrrr but KV cache is not supported
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if not self.flash:
            print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
            # causal mask to ensure that attention is only applied to the left in the input sequence
            self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
                                        .view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        q, k, v  = self.c_attn(x).split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)

        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        if self.flash:
            # efficient attention using Flash Attention CUDA kernels
            y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout if self.training else 0, is_causal=True)
        else:
            # manual implementation of attention
            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
            att = F.softmax(att, dim=-1)
            att = self.attn_dropout(att)
            y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        
        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
        # output projection
        y = self.resid_dropout(self.c_proj(y))
        return y

class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.c_fc    = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
        self.gelu    = nn.GELU()
        self.c_proj  = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        x = self.dropout(x)
        return x

class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x

class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            drop = nn.Dropout(config.dropout),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        
        # Weight tying (same weight matrix for token embedding and final classification)
        self.transformer.wte.weight = self.lm_head.weight
        
        # Initialize weights
        self.apply(self._init_weights)
        
        # Report number of parameters
        print(f"Number of parameters: {self.get_num_params() / 1e6:.2f}M")

    def get_num_params(self):
        return sum(p.numel() for p in self.parameters())

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        device = idx.device
        b, t = idx.size()
        
        # Forward through the model
        pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # shape (1, t)
        
        # Token + position embeddings
        tok_emb = self.transformer.wte(idx) # token embeddings (b, t, n_embd)
        pos_emb = self.transformer.wpe(pos) # position embeddings (1, t, n_embd)
        x = self.transformer.drop(tok_emb + pos_emb)
        
        # Transformer blocks
        for block in self.transformer.h:
            x = block(x)
        
        # Final layer norm
        x = self.transformer.ln_f(x)
        
        # Language model head
        logits = self.lm_head(x) # (b, t, vocab_size)
        
        # Calculate loss if targets are provided
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=0)
        
        return logits, loss
    
    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
        # start with all of the candidate parameters
        param_dict = {pn: p for pn, p in self.named_parameters()}
        # filter out those that do not require grad
        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
        # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
        # Create AdamW optimizer and use the fused version if it is available
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == 'cuda'
        extra_args = dict(fused=True) if use_fused else dict()
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
        print(f"using fused AdamW: {use_fused}")
        return optimizer
    
    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.
        """
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it at block_size
            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
            # forward the model to get the logits for the index in the sequence
            logits, _ = self(idx_cond)
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)

        return idx

# Hyperparameters
batch_size = 32    # Reduced batch size for the larger model
block_size = 1024   # Context length
max_iters = 50
eval_interval = 1
learning_rate = 3e-4  # Standard LR for transformers
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 10
dropout = 0.1      # Standard dropout for transformers

print(f"Using device: {device}")
torch.manual_seed(1337)

# Empty CUDA cache at the start if using GPU
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print(f"GPU being used: {torch.cuda.get_device_name(0)}")
    print(f"Initial GPU memory: {torch.cuda.memory_allocated()/1024**2:.1f}MB allocated, "
          f"{torch.cuda.memory_reserved()/1024**2:.1f}MB reserved")

# Create cache directory
os.makedirs('cache', exist_ok=True)

# Load TinyStories dataset - with better error handling
print("Loading dataset...")
try:
    # Try loading the STORIES dataset
    try:
        dataset = load_dataset("lucadiliello/STORIES", cache_dir='cache')
        print("Successfully loaded lucadiliello/STORIES dataset")
    except Exception as e:
        print(f"Error loading lucadiliello/STORIES: {e}")
        print("Trying to load TinyStories dataset as fallback...")
        dataset = load_dataset("roneneldan/TinyStories", cache_dir='cache')
        print("Successfully loaded roneneldan/TinyStories dataset")
    
    train_data = dataset["train"]
    val_data = dataset.get("validation", dataset.get("test"))  # Some datasets use "test" instead of "validation"
    print(f"Dataset loaded. Train size: {len(train_data)}, Val size: {len(val_data)}")
    
except Exception as e:
    print(f"Error loading both datasets: {e}")
    print("Loading a basic text dataset as fallback...")
    
    # Create a fallback minimal dataset if both attempts fail
    # This ensures the code will run for testing purposes
    basic_texts = [
        {"text": "Once upon a time, there was a little girl named Alice. She loved to play in the garden."},
        {"text": "The dog ran after the ball. He was very happy to play with his owner."},
        {"text": "Tom and Jerry were the best of friends. They always helped each other."},
        {"text": "The red car zoomed down the street. It was going very fast."},
        {"text": "Sarah liked to read books. Her favorite was about a magical kingdom."},
    ]
    
    # Create at least 100 examples by repeating
    train_texts = basic_texts * 2000  # 100 examples
    val_texts = basic_texts  # 5 examples
    
    # Convert to dataset-like format
    class SimpleDataset:
        def __init__(self, data):
            self.data = data
        def __len__(self):
            return len(self.data)
        def __getitem__(self, idx):
            return self.data[idx]
        def select(self, indices):
            return SimpleDataset([self.data[i] for i in indices])
    
    train_data = SimpleDataset(train_texts)
    val_data = SimpleDataset(val_texts)
    print(f"Created fallback dataset. Train size: {len(train_data)}, Val size: {len(val_data)}")

# Build vocabulary from the dataset
print("Building vocabulary...")
def build_vocab(data, max_samples=500000000):  # Reduced from 5000000 to be more efficient
    vocab = set()
    for i, example in enumerate(data):
        if i >= max_samples:
            break
        text = example.get('text', '')  # Safe access with default empty string
        if text:
            vocab.update(text)
    vocab = sorted(list(vocab))
    stoi = {ch: i+1 for i, ch in enumerate(vocab)}  # Reserve 0 for padding
    stoi['<PAD>'] = 0
    itos = {i: ch for ch, i in stoi.items()}
    return stoi, itos

stoi, itos = build_vocab(train_data)
vocab_size = len(stoi)
print(f"Vocabulary size: {vocab_size}")

encode = lambda s: [stoi.get(c, 0) for c in s]
decode = lambda l: ''.join([itos.get(i, '') for i in l])

# Preprocess the data with reduced size to avoid memory issues
print("Preprocessing data...")
def preprocess(example):
    # Extract text safely
    text = example.get('text', '')
    if not text:
        text = "Empty text"  # Provide a default if text is missing
    
    # Encode the text
    ids = encode(text)
    
    # Truncate to block_size
    ids = ids[:block_size]
    
    return {'input_ids': ids}

# Process a subset for faster training
train_subset_size = min(100000, len(train_data))  # Reduced from 100000 for faster testing
val_subset_size = min(1000, len(val_data))       # Reduced from 4000 for faster testing

print(f"Using {train_subset_size} training examples and {val_subset_size} validation examples")
train_data = train_data.select(range(train_subset_size))
val_data = val_data.select(range(val_subset_size))

# Process in batches to avoid memory issues
processed_train = []
processed_val = []

print("Processing training data...")
batch_size_for_processing = 32  # Even smaller batch size for preprocessing
for i in tqdm(range(0, len(train_data), batch_size_for_processing)):
    batch = train_data.select(range(i, min(i + batch_size_for_processing, len(train_data))))
    try:
        processed_batch = [preprocess(ex) for ex in batch]
        processed_train.extend(processed_batch)
    except Exception as e:
        print(f"Error in batch {i}-{i+batch_size_for_processing}: {e}")
        continue  # Skip problematic batches

print("Processing validation data...")
for i in tqdm(range(0, len(val_data), batch_size_for_processing)):
    batch = val_data.select(range(i, min(i + batch_size_for_processing, len(val_data))))
    try:
        processed_batch = [preprocess(ex) for ex in batch]
        processed_val.extend(processed_batch)
    except Exception as e:
        print(f"Error in batch {i}-{i+batch_size_for_processing}: {e}")
        continue  # Skip problematic batches

print(f"Processed {len(processed_train)} training examples")
print(f"Processed {len(processed_val)} validation examples")

# Collate function for DataLoader - modified for autoregressive LM training
def collate_fn(batch):
    # Handle empty batches
    if len(batch) == 0:
        return torch.zeros((1, 1), dtype=torch.long).to(device), torch.zeros((1, 1), dtype=torch.long).to(device)
    
    # Extract inputs
    input_ids = [torch.tensor(example['input_ids'], dtype=torch.long) for example in batch]
    
    # Get max length in this batch (to avoid unnecessary padding)
    max_len = min(max(len(x) for x in input_ids), block_size)
    
    # Pad sequences
    padded_inputs = []
    
    for inp in input_ids:
        # Truncate if necessary
        inp = inp[:max_len]
        
        # Pad if necessary
        inp_padding = max_len - len(inp)
        
        if inp_padding > 0:
            inp = torch.cat([inp, torch.zeros(inp_padding, dtype=torch.long)])
        
        padded_inputs.append(inp)
    
    # Stack into tensors
    inputs_tensor = torch.stack(padded_inputs)
    
    # For autoregressive LM, targets are the inputs shifted by one
    targets_tensor = torch.zeros_like(inputs_tensor)
    targets_tensor[:, :-1] = inputs_tensor[:, 1:]  # Shift by 1
    targets_tensor[:, -1] = 0  # Last token has no target
    
    return inputs_tensor.to(device), targets_tensor.to(device)

# DataLoaders with error handling
try:
    train_loader = DataLoader(processed_train, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(processed_val, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    print("Successfully created data loaders")
except Exception as e:
    print(f"Error creating DataLoaders: {e}")
    raise

# Create GPT model configuration
print("Creating GPT model configuration...")
model_config = GPTConfig(
    block_size=block_size,
    vocab_size=vocab_size,
    n_layer=8,        # Reduced from 8 to 4 layers to make it run faster with less memory
    n_head=16,         # Reduced from 16 to 8 attention heads
    n_embd=768,       # Reduced from 768 to 512 embedding dimensions for faster training
    dropout=dropout,
    bias=True         # Use bias like in GPT-2
)

# Instantiate the GPT model
print("Creating GPT model...")
model = GPT(model_config).to(device)
print(f'{sum(p.numel() for p in model.parameters())/1e6:.2f}M parameters')

# Configure optimizer using GPT's built-in method
print("Configuring optimizer...")
weight_decay = 0.1   # Standard weight decay for transformers
optimizer = model.configure_optimizers(
    weight_decay=weight_decay,
    learning_rate=learning_rate,
    betas=(0.9, 0.95),
    device_type=device
)

# Learning rate scheduler
def get_lr(it):
    # 1) linear warmup for warmup_iters steps
    warmup_iters = 100  # first ~100 iterations for linear warmup
    # 2) if it > warmup_iters, decay learning rate linearly
    decay_ratio = min(max((it - warmup_iters) / (max_iters - warmup_iters), 0), 1)
    coeff = 0.1 + 0.9 * (1.0 - decay_ratio)  # decay down to 10% of original
    return learning_rate * coeff

lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=get_lr)

# Add mixed precision training for better GPU performance
print("Setting up mixed precision training...")
try:
    # Import CUDA amp (automatic mixed precision) if available
    from torch.cuda.amp import autocast, GradScaler
    scaler = GradScaler()
    use_amp = torch.cuda.is_available()
    if use_amp:
        print("Using mixed precision training (faster, less memory)")
    else:
        print("Mixed precision not available - using standard precision")
except ImportError:
    print("Mixed precision training not available")
    use_amp = False
    scaler = None

# Add GPU warmup to detect any immediate CUDA errors
print("Performing GPU warmup test...")
try:
    if torch.cuda.is_available():
        # Create small test tensors
        test_input = torch.zeros((2, block_size), dtype=torch.long).to(device)
        test_target = torch.zeros((2, block_size), dtype=torch.long).to(device)
        
        # Test the model
        with torch.no_grad():
            _, _ = model(test_input, test_target)
            
        print("GPU test successful!")
    else:
        print("No GPU available, skipping warmup test")
except Exception as e:
    print(f"GPU warmup test failed: {e}")
    print("Falling back to CPU")
    device = 'cpu'
    model = model.to(device)

# Training loop with progress tracking and checkpointing
print("Starting training...")
best_val_loss = float('inf')

try:
    for iter in range(max_iters):
        model.train()
        batch_losses = []
        
        # Process each batch with progress bar
        train_pbar = tqdm(train_loader, desc=f"Training iter {iter+1}/{max_iters}")
        for batch_idx, (xb, yb) in enumerate(train_pbar):
            # Skip overly large batches that might cause issues
            if xb.shape[1] > block_size or yb.shape[1] > block_size:
                print(f"Skipping batch with shapes {xb.shape}, {yb.shape} (exceeds block_size={block_size})")
                continue
                
            try:
                # Set learning rate
                lr = get_lr(iter)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr
                
                # Training with mixed precision if available
                if use_amp:
                    with autocast():
                        _, loss = model(xb, yb)
                    
                    # Scale gradients and optimize
                    optimizer.zero_grad()
                    scaler.scale(loss).backward()
                    scaler.unscale_(optimizer)
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                    scaler.step(optimizer)
                    scaler.update()
                else:
                    # Standard training
                    optimizer.zero_grad()
                    _, loss = model(xb, yb)
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                    optimizer.step()
                
                batch_losses.append(loss.item())
                train_pbar.set_postfix({"loss": loss.item(), "lr": lr})
                
            except RuntimeError as e:
                print(f"Error in batch {batch_idx}: {e}")
                print(f"Input shape: {xb.shape}, Target shape: {yb.shape}")
                continue  # Skip problematic batches
            
        if len(batch_losses) > 0:
            avg_train_loss = sum(batch_losses) / len(batch_losses)
        else:
            avg_train_loss = float('inf')
            print("Warning: No valid batches in this epoch")
        
        # Evaluate periodically
        if iter % eval_interval == 0 or iter == max_iters - 1:
            model.eval()
            val_losses = []
            
            with torch.no_grad():
                val_pbar = tqdm(val_loader, desc="Validation")
                for xb, yb in val_pbar:
                    # Skip problematic validation batches
                    if xb.shape[1] > block_size or yb.shape[1] > block_size:
                        continue
                        
                    try:
                        _, loss = model(xb, yb)
                        val_losses.append(loss.item())
                        val_pbar.set_postfix({"loss": loss.item()})
                    except RuntimeError as e:
                        print(f"Validation error: {e}")
                        continue
                    
            if len(val_losses) > 0:
                avg_val_loss = sum(val_losses) / len(val_losses)
                print(f"Step {iter}: Train loss {avg_train_loss:.4f}, Val loss {avg_val_loss:.4f}, lr {lr:.2e}")
                
                # Save checkpoint if it's the best model so far
                if avg_val_loss < best_val_loss:
                    best_val_loss = avg_val_loss
                    print(f"Saving checkpoint at step {iter}")
                    torch.save({
                        'iter': iter,
                        'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'val_loss': avg_val_loss,
                    }, 'gpt_stories_model_best.pt')
            else:
                print("Warning: No valid validation batches")
            
            # Generate samples for visual feedback after each iteration
            print("\n=== Sample Generations ===")
            # List of different story starters for variety
            story_starters = [
                "Once upon a time, ",
                "In a small village, ",
                "The little dog was ",
                "Tom and his friend ",
                "Sarah wanted to ",
                "The magic toy ",
                "One sunny day, ",
                "The big red ball "
            ]
            # Select a random starter
            import random
            random_starter = random.choice(story_starters)
            
            try:
                # Generate with the random starter
                print(f"Random prompt: \"{random_starter}\"")
                context = torch.tensor([[stoi.get(c, 0) for c in random_starter]], dtype=torch.long, device=device)
                generated = model.generate(context, max_new_tokens=1024, temperature=0.8, top_k=50)
                output_text = decode(generated[0].tolist())
                print(f"Generation:\n{output_text}")
                
                # Also always test with a fixed prompt to measure consistent progress
                fixed_prompt = "Once upon a time, there was a little "
                print(f"\nFixed prompt: \"{fixed_prompt}\"")
                context = torch.tensor([[stoi.get(c, 0) for c in fixed_prompt]], dtype=torch.long, device=device)
                generated = model.generate(context, max_new_tokens=1024, temperature=0.8, top_k=50)
                output_text = decode(generated[0].tolist())
                print(f"Generation:\n{output_text}")
            except Exception as e:
                print(f"Generation error: {e}")
            print("=======================\n")

except KeyboardInterrupt:
    print("Training interrupted by user.")
except Exception as e:
    print(f"Training error: {e}")
    import traceback
    traceback.print_exc()
finally:
    # Save final model
    print("Saving final model...")

Using device: cuda
GPU being used: Tesla P100-PCIE-16GB
Initial GPU memory: 281.2MB allocated, 474.0MB reserved
Loading dataset...


Resolving data files:   0%|          | 0/69 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/69 [00:00<?, ?files/s]

Generating train split:   0%|          | 0/945354 [00:00<?, ? examples/s]

Error loading lucadiliello/STORIES: An error occurred while generating the dataset
Trying to load TinyStories dataset as fallback...
Successfully loaded roneneldan/TinyStories dataset
Dataset loaded. Train size: 2119719, Val size: 21990
Building vocabulary...
Vocabulary size: 175
Preprocessing data...
Using 100000 training examples and 1000 validation examples
Processing training data...


  0%|          | 0/3125 [00:00<?, ?it/s]

Processing validation data...


  0%|          | 0/32 [00:00<?, ?it/s]

Processed 100000 training examples
Processed 1000 validation examples
Successfully created data loaders
Creating GPT model configuration...
Creating GPT model...
Number of parameters: 57.63M
57.63M parameters
Configuring optimizer...
num decayed parameter tensors: 34, with 57,543,936 parameters
num non-decayed parameter tensors: 66, with 81,408 parameters
using fused AdamW: True
Setting up mixed precision training...
Using mixed precision training (faster, less memory)
Performing GPU warmup test...
GPU test successful!
Starting training...


  scaler = GradScaler()


Training iter 1/50:   0%|          | 0/3125 [00:00<?, ?it/s]

  with autocast():


Validation:   0%|          | 0/32 [00:00<?, ?it/s]

Step 0: Train loss 1.6766, Val loss 0.9744, lr 3.00e-05
Saving checkpoint at step 0

=== Sample Generations ===
Random prompt: "The little dog was "
Generation:
The little dog was strong her mommy with her stort. He was a big sitteday brocking her for her favorites and for the went to the arm and was so he was improurtated. 

The move her mom said and said, "Don't a pick for the sky too fish!" They went to the more and said, "I wro string to the picket of peace and the chame used them. It was so happy and couldn't be have picture the move." 

The end off the brog on the picket with the park, the brown and ran aloud all the arrow to prople of the sky. He grabbed to him and couldn't be hands and so stretch and the ground. "It's sorry, looking with arms, it's don't worry proud sorry in and the sky." 

John ground the strong and was a very scared and smiled. His mom couldn't be came to be even his proud and said, "Look, at's sorrry, you cour just proud in the trought for away. I'm go came 

Training iter 2/50:   0%|          | 0/3125 [00:00<?, ?it/s]

Validation:   0%|          | 0/32 [00:00<?, ?it/s]

Step 1: Train loss 0.9036, Val loss 0.7167, lr 3.00e-05
Saving checkpoint at step 1

=== Sample Generations ===
Random prompt: "The magic toy "
Generation:
The magic toy was girl. She was interesting through the magical because she had been for her. She looked around and saw a girl white looked near her head. She was very happy and said, "I can help you."

The little man was stuck in her room and smiled. She was so excited, but then she heard her mom said, "I can help you! You can help you help. I want to mean you?"

The little man was very careful and her mom said, "Don't can't help my to you, we can help you with you keep it?"

After a few minutes and said, "I don't want to my friends?"

The little man smiled and asked, "Yes, it can use you so mine."

The little man was so excited. She was so happy that she was so grateful to make her toys and keep the magic. She was so happy to have her can help mean day, but she cannoting home, and the little man was full.

The little man was so th

Training iter 3/50:   0%|          | 0/3125 [00:00<?, ?it/s]

Validation:   0%|          | 0/32 [00:00<?, ?it/s]

Step 2: Train loss 0.7448, Val loss 0.6374, lr 3.00e-05
Saving checkpoint at step 2

=== Sample Generations ===
Random prompt: "Once upon a time, "
Generation:
Once upon a time, there was a little boy named Timmy. Timmy loved to play outside in the sunshine. One day, Timmy went to the park with his mommy and daddy. Timmy saw a big bowl of flutter and wanted to slide down the sunshine. So, he put on his hands and bought it and said, "Look, Timmy. That's a brave! I'm glad you to hear it." But Timmy thought about it and laughed and threw it on the swing. He ran inside and ran to his mommy and gave it to Timmy. Timmy felt sad too, but he knew his mommy was also sad. The doctor said, "That's not a great idea! Let's go back to the swings and they ask him what we can do to see what we lost." Timmy gave him a new flutter and he was happy to see what was wrong. The bowl envious the swings and they sang together every day. From that day on, Timmy always remembered to caugh the swings and his mom

Training iter 4/50:   0%|          | 0/3125 [00:00<?, ?it/s]

Validation:   0%|          | 0/32 [00:00<?, ?it/s]

Step 3: Train loss 0.6743, Val loss 0.5930, lr 3.00e-05
Saving checkpoint at step 3

=== Sample Generations ===
Random prompt: "The magic toy "
Generation:
The magic toy first. The first was jumping around and sneaky. The magic took the magic when she was interesting. It had a big pile of green tea. The big magic thing the magic made someone everyone was safe. It was so small. The first made her sneaky and even knew of it was delicious. It smelled so pretty!" The magic looked up and saw a strange in the magic and smiled. Then was very persistent, she knew she had to be friends. The strange was so pretty that she wouldn't stop because it was talking to him. The magic thing she had to leave the first too. Everyone was very happy. The magic thing was so pretty that she could help the first thing. They chased the first toy first and they smiled. They enjoyed the first time until it was time to go home. The first then felt very proud and happy. It was the best magic toy part of the magic th

Training iter 5/50:   0%|          | 0/3125 [00:00<?, ?it/s]

Training interrupted by user.
Saving final model...


In [10]:
# setting model in eval mode for evaluation/inference
model.eval()

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(175, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-7): 8 x Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear(in_features=768, out_features=768, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Li

Generating Sample Stories using trained GPT2 model using different length of output tokens

In [11]:
print("\n=== Sample Generations ===")
            # List of different story starters for variety
story_starters = [
                "Once upon a time, ",
                "In a small village, ",
                "The little dog was ",
                "Tom and his friend ",
                "Sarah wanted to ",
                "The magic toy ",
                "One sunny day, ",
                "The big red ball "
            ]
            # Select a random starter
import random
random_starter = random.choice(story_starters)
try:
                # Generate with the random starter
                print(f"Random prompt: \"{random_starter}\"")
                context = torch.tensor([[stoi.get(c, 0) for c in random_starter]], dtype=torch.long, device=device)
                generated = model.generate(context, max_new_tokens=1024, temperature=0.8, top_k=50)
                output_text = decode(generated[0].tolist())
                print(f"Generation:\n{output_text}")
                
                # Also always test with a fixed prompt to measure consistent progress
                fixed_prompt = "Once upon a time, there was a little "
                print(f"\nFixed prompt: \"{fixed_prompt}\"")
                context = torch.tensor([[stoi.get(c, 0) for c in fixed_prompt]], dtype=torch.long, device=device)
                generated = model.generate(context, max_new_tokens=1024, temperature=0.8, top_k=50)
                output_text = decode(generated[0].tolist())
                print(f"Generation:\n{output_text}")
except Exception as e:
                print(f"Generation error: {e}")
print("=======================\n")


=== Sample Generations ===
Random prompt: "The little dog was "
Generation:
The little dog was a very upset. He wanted to bake someone else to buy someone. But it was too much for him. The little dog was only and a green frog. He was so excited to take a restore that he thought of a moment and then started climbing. He was so excited to be dry and he brought some climbing the air and put it in his pocket. He began to bake a big smile on his face. He would stay away from his favorite climbing the garden. He was so happy that he forgot out what he started to restore his head. He began to buy some food in his pocket and turned the little dog became friends. Even though it was time for the day, he made sure to keep stories and make something else else to be kind to others. He was so proud of himself for for lunch and started to climb the things. He was very happy and he smiled too. From that day on, he never wanted to have enough moment. He always remembered to be careful never forgot the

In [12]:
print("\n=== Sample Generations ===")
            # List of different story starters for variety
story_starters = [
                "Once upon a time, ",
                "In a small village, ",
                "The little dog was ",
                "Tom and his friend ",
                "Sarah wanted to ",
                "The magic toy ",
                "One sunny day, ",
                "The big red ball "
            ]
            # Select a random starter
import random
random_starter = random.choice(story_starters)
try:
                # Generate with the random starter
                print(f"Random prompt: \"{random_starter}\"")
                context = torch.tensor([[stoi.get(c, 0) for c in random_starter]], dtype=torch.long, device=device)
                generated = model.generate(context, max_new_tokens=2048, temperature=0.8, top_k=50)
                output_text = decode(generated[0].tolist())
                print(f"Generation:\n{output_text}")
                
                # Also always test with a fixed prompt to measure consistent progress
                fixed_prompt = "Once upon a time, there was a little "
                print(f"\nFixed prompt: \"{fixed_prompt}\"")
                context = torch.tensor([[stoi.get(c, 0) for c in fixed_prompt]], dtype=torch.long, device=device)
                generated = model.generate(context, max_new_tokens=2048, temperature=0.8, top_k=50)
                output_text = decode(generated[0].tolist())
                print(f"Generation:\n{output_text}")
except Exception as e:
                print(f"Generation error: {e}")
print("=======================\n")


=== Sample Generations ===
Random prompt: "The little dog was "
Generation:
The little dog was so excited. He said to his mom was walking in the woods. She knew he was a bit scared but her mom was proud. She felt a bit longer and he wanted to play. She said to him and started to play catch for a long time. He felt sad and excited.

He hugged her mom and said to her mom, "That's a great idea. It is a bad for fun." His mom smiled and said, "That's why I want to play in the woods!"

They laughed and played the bad for a long time. He gave him a big hug and had a great time. He was a bit there for a long time. He was so proud that he was able to play with.

His mom was proud of him for being successful. She played the woods in the woods and explored the woods all around him. But no matter what happened. She was so proud of him and she said, "Why are you finished?"

The little dog nodded and said, "I did a great job. I want to play with you." His mom nodded and showed him how to play his j

In [13]:
print("\n=== Sample Generations ===")
            # List of different story starters for variety
story_starters = [
                "Once upon a time, ",
                "In a small village, ",
                "The little dog was ",
                "Tom and his friend ",
                "Sarah wanted to ",
                "The magic toy ",
                "One sunny day, ",
                "The big red ball "
            ]
            # Select a random starter
import random
random_starter = random.choice(story_starters)
try:
                # Generate with the random starter
                print(f"Random prompt: \"{random_starter}\"")
                context = torch.tensor([[stoi.get(c, 0) for c in random_starter]], dtype=torch.long, device=device)
                generated = model.generate(context, max_new_tokens=3072, temperature=0.8, top_k=50)
                output_text = decode(generated[0].tolist())
                print(f"Generation:\n{output_text}")
                
                # Also always test with a fixed prompt to measure consistent progress
                fixed_prompt = "Once upon a time, there was a little "
                print(f"\nFixed prompt: \"{fixed_prompt}\"")
                context = torch.tensor([[stoi.get(c, 0) for c in fixed_prompt]], dtype=torch.long, device=device)
                generated = model.generate(context, max_new_tokens=3072, temperature=0.8, top_k=50)
                output_text = decode(generated[0].tolist())
                print(f"Generation:\n{output_text}")
except Exception as e:
                print(f"Generation error: {e}")
print("=======================\n")


=== Sample Generations ===
Random prompt: "One sunny day, "
Generation:
One sunny day, a big bird named Ben went for a walk in the park. His mom loved to play with his toys and then said there was a button. Ben liked to help his mom find a treasure in the park. He was so happy to help the bird.

Ben put on his nest box and started to play with the treasure. As he walked, he saw something shiny in the sky. It was a toy that she ran out of the treasure. It was a big red ball! He flew down to the treasure and felt happy and excited.

When Ben tried to take it home, he started to cry. He saw his mom and dad was very proud of himself. She was so proud of himself and he thanked him for the big red ball. From that day on, Ben and his mom were the best of friends. They all cheered and were the best of friends. They had a great time together.

And when they arrived at the park. The end. They all lived happily ever after. They cheered and cheered and had to share their toys. They both smiled an

In [14]:
print("\n=== Sample Generations ===")
            # List of different story starters for variety
story_starters = [
                "Once upon a time, ",
                "In a small village, ",
                "The little dog was ",
                "Tom and his friend ",
                "Sarah wanted to ",
                "The magic toy ",
                "One sunny day, ",
                "The big red ball "
            ]
            # Select a random starter
import random
random_starter = random.choice(story_starters)
try:
                # Generate with the random starter
                print(f"Random prompt: \"{random_starter}\"")
                context = torch.tensor([[stoi.get(c, 0) for c in random_starter]], dtype=torch.long, device=device)
                generated = model.generate(context, max_new_tokens=4096, temperature=0.8, top_k=50)
                output_text = decode(generated[0].tolist())
                print(f"Generation:\n{output_text}")
                
                # Also always test with a fixed prompt to measure consistent progress
                fixed_prompt = "Once upon a time, there was a little "
                print(f"\nFixed prompt: \"{fixed_prompt}\"")
                context = torch.tensor([[stoi.get(c, 0) for c in fixed_prompt]], dtype=torch.long, device=device)
                generated = model.generate(context, max_new_tokens=4096, temperature=0.8, top_k=50)
                output_text = decode(generated[0].tolist())
                print(f"Generation:\n{output_text}")
except Exception as e:
                print(f"Generation error: {e}")
print("=======================\n")


=== Sample Generations ===
Random prompt: "Sarah wanted to "
Generation:
Sarah wanted to explore an adventure. He saw a big colour in the forest places. He ran and tried against the forest, but he was too small and strong. He counted to reach them and ran fast the windows and the sight.

Sarah tried to help him, but he couldn't find help. He saw a line one day and couldn't reach the line. He tried to reach the big colour, but he was too small and realized the line, but he didn't know. Sarah refused the line that he had to thunder and said, "Sorry, line! I am the line!"

The line couldn't find another colour and said, "What are you doing a good day?" Sarah smiled and said, "Hello, I don't know. You just should like being busy with you."

The line had an idea. He said, "I don't know you can do it from here. I'm not small. I can't find my line to explore. I can do it." He said, "I will be here line, Sarah. I will help you."

Sarah smiled and said, "See, you can reach the big colour. You 

In [15]:
print("\n=== Sample Generations ===")
            # List of different story starters for variety
story_starters = [
                "Once upon a time, ",
                "In a small village, ",
                "The little dog was ",
                "Tom and his friend ",
                "Sarah wanted to ",
                "The magic toy ",
                "One sunny day, ",
                "The big red ball "
            ]
            # Select a random starter
import random
random_starter = random.choice(story_starters)
try:
                # Generate with the random starter
                print(f"Random prompt: \"{random_starter}\"")
                context = torch.tensor([[stoi.get(c, 0) for c in random_starter]], dtype=torch.long, device=device)
                generated = model.generate(context, max_new_tokens=8192, temperature=0.8, top_k=50)
                output_text = decode(generated[0].tolist())
                print(f"Generation:\n{output_text}")
                
                # Also always test with a fixed prompt to measure consistent progress
                fixed_prompt = "Once upon a time, there was a little "
                print(f"\nFixed prompt: \"{fixed_prompt}\"")
                context = torch.tensor([[stoi.get(c, 0) for c in fixed_prompt]], dtype=torch.long, device=device)
                generated = model.generate(context, max_new_tokens=8192, temperature=0.8, top_k=50)
                output_text = decode(generated[0].tolist())
                print(f"Generation:\n{output_text}")
except Exception as e:
                print(f"Generation error: {e}")
print("=======================\n")


=== Sample Generations ===
Random prompt: "Once upon a time, "
Generation:
Once upon a time, there was a little girl named Lily. She loved to play outside in the park. One day, she found a rock on the ground. It was pretty and worried. She ran to the box and picked it up.

Lily found a strange spot in her rock. It was a giant light and wandered it for her. Suddenly, she spotted something in the ground. It was a long tail different colorful pieces. Lily was scared and didn't know what to do.

Suddenly, Lily's mommy came out from her and asked her what was wrong. Her mommy said the light and they said they could play outside and make it better. Lily was sad because she did not want to spot it. She realized that her mommy would be better and never give up and wait until the giant kept on the ground.

From that day on, Lily always remembered to be more careful when she put it in a band-aid on the ground. She would play in the park and splash outside with a smile on her face and was happy 

In [16]:
print("\n=== Sample Generations ===")
            # List of different story starters for variety
story_starters = [
                "Once upon a time, ",
                "In a small village, ",
                "The little dog was ",
                "Tom and his friend ",
                "Sarah wanted to ",
                "The magic toy ",
                "One sunny day, ",
                "The big red ball "
            ]
            # Select a random starter
import random
random_starter = random.choice(story_starters)
try:
                # Generate with the random starter
                print(f"Random prompt: \"{random_starter}\"")
                context = torch.tensor([[stoi.get(c, 0) for c in random_starter]], dtype=torch.long, device=device)
                generated = model.generate(context, max_new_tokens=1024*32, temperature=0.8, top_k=50)
                output_text = decode(generated[0].tolist())
                print(f"Generation:\n{output_text}")
                
                # Also always test with a fixed prompt to measure consistent progress
                fixed_prompt = "Once upon a time, there was a little "
                print(f"\nFixed prompt: \"{fixed_prompt}\"")
                context = torch.tensor([[stoi.get(c, 0) for c in fixed_prompt]], dtype=torch.long, device=device)
                generated = model.generate(context, max_new_tokens=1024*32, temperature=0.8, top_k=50)
                output_text = decode(generated[0].tolist())
                print(f"Generation:\n{output_text}")
except Exception as e:
                print(f"Generation error: {e}")
print("=======================\n")


=== Sample Generations ===
Random prompt: "The magic toy "
Generation:
The magic toy boy who liked to prepare his preparents. He had a big favorite toy that his dad took him to the race. He had a yellow toy that he wanted to prepare his toy.

One day his dad was preparing him so he met a lammad. He said, "Hi dad why that was impressive?"

The lammad smiled and said, "That's a basket toy, I can help you prepare a good boy who come down and with you to stay inside."

The lammad smiled and said, "Well, you can help you prepare me that can help me when you prepare the race."

The lammad smiled and said, "It's very kind to help you. I will be nice and make the boy tomorrow and when you prepared it with the new one lammade the lammade tooes."

The lammad smiled and said, "Thank you, Thank you, Dad."

The lammad smiled and said, "I love you prepare and make a new friend. You are very happy with your preparents if you prepare help you will come down and when you will do and I will make you st

The model can generate long stories but as the length of output increases, we can see that the quality of stories generated starts to deteoriate