In [31]:
import torch
# Quick check of whether PyTorch reports a usable CUDA device (prints True or False).
print(torch.cuda.is_available())


True


In [32]:
!pip install torch transformers



You should consider upgrading via the 'C:\Users\j_san\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [33]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [34]:
# Hyperparameters read by the model classes defined below.
GPT_CONFIG_124M = dict(
    vocab_size=50257,     # vocabulary size
    context_length=256,   # context length (reduced from 1024 to fit smaller GPUs)
    emb_dim=768,          # embedding dimension
    n_heads=12,           # number of attention heads
    n_layers=12,          # number of transformer layers
    drop_rate=0.1,        # dropout rate
    qkv_bias=False,       # whether the query/key/value projections use a bias
)


In [35]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable scale and shift.

    Every vector along the final axis is standardized using its own mean and
    biased variance, then multiplied element-wise by ``scale`` and offset by
    ``shift``.

    Args:
        emb_dim: Length of the ``scale`` and ``shift`` parameters; must match
            the size of the last dimension of the inputs.
    """

    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5  # added to the variance before taking the square root
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Return ``scale * (x - mean) / sqrt(var + eps) + shift``."""
        centered = x - x.mean(dim=-1, keepdim=True)
        variance = x.var(dim=-1, keepdim=True, unbiased=False)
        return self.scale * (centered / torch.sqrt(variance + self.eps)) + self.shift

In [36]:
class GELU(nn.Module):
    """Tanh approximation of the GELU activation.

    Computes ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``
    element-wise.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Apply the activation element-wise to ``x``."""
        sqrt_two_over_pi = torch.sqrt(torch.tensor(2.0 / torch.pi))
        inner = sqrt_two_over_pi * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))

In [37]:
class FeedForward(nn.Module):
    """Two-layer MLP: expand to 4x the embedding width, apply GELU, project back.

    Args:
        cfg: Config mapping; only ``cfg["emb_dim"]`` is read here.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        hidden_dim = 4 * emb_dim
        self.layers = nn.Sequential(
            nn.Linear(emb_dim, hidden_dim),
            GELU(),
            nn.Linear(hidden_dim, emb_dim),
        )

    def forward(self, x):
        """Run ``x`` through the expand -> GELU -> project stack."""
        return self.layers(x)

In [38]:
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    The input is projected to queries, keys and values of width ``d_out``,
    split into ``num_heads`` heads of ``d_out // num_heads`` features each,
    and combined with scaled dot-product attention in which every position is
    masked from attending to later positions. The heads are concatenated and
    passed through a final linear projection.

    Args:
        d_in: Feature size of the input.
        d_out: Total feature size across all heads; must be divisible by
            ``num_heads``.
        context_length: Side length of the stored causal mask, i.e. the
            largest number of tokens the mask covers.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value projections have a bias term.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads

        # Layers are created in the order query, key, value, output so that a
        # fixed random seed produces the same initial weights.
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark the (query, key) pairs where
        # the key lies in the future of the query.
        causal_mask = torch.triu(torch.ones(context_length, context_length), diagonal=1)
        self.register_buffer('mask', causal_mask)

    def _split_heads(self, projected, b, num_tokens):
        """Reshape (b, num_tokens, d_out) to (b, num_heads, num_tokens, head_dim)."""
        per_head = projected.view(b, num_tokens, self.num_heads, self.head_dim)
        return per_head.transpose(1, 2)

    def forward(self, x):
        """Return the attended representation of ``x``, shape (b, num_tokens, d_out)."""
        b, num_tokens, _ = x.shape

        keys = self._split_heads(self.W_key(x), b, num_tokens)
        queries = self._split_heads(self.W_query(x), b, num_tokens)
        values = self._split_heads(self.W_value(x), b, num_tokens)

        # Raw dot-product scores, one (num_tokens x num_tokens) matrix per head.
        attn_scores = queries @ keys.transpose(2, 3)

        # Crop the stored mask to the current sequence length and blank out
        # future positions so that softmax assigns them zero weight.
        future_positions = self.mask.bool()[:num_tokens, :num_tokens]
        attn_scores.masked_fill_(future_positions, -torch.inf)

        scale = keys.shape[-1] ** 0.5
        attn_weights = self.dropout(torch.softmax(attn_scores / scale, dim=-1))

        # Weighted sum of values, then put the head axis back next to the
        # feature axis and flatten the heads into d_out.
        per_head_context = (attn_weights @ values).transpose(1, 2)
        merged = per_head_context.contiguous().view(b, num_tokens, self.d_out)
        return self.out_proj(merged)

In [39]:
class TransformerBlock(nn.Module):
    """Pre-norm transformer block: attention and feed-forward, each with a residual.

    Each sub-layer is applied as ``x = dropout(sublayer(norm(x))) + x``.

    Args:
        cfg: Config mapping providing ``emb_dim``, ``context_length``,
            ``n_heads``, ``drop_rate`` and ``qkv_bias``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        drop_rate = cfg["drop_rate"]

        self.att = MultiHeadAttention(
            d_in=emb_dim,
            d_out=emb_dim,
            context_length=cfg["context_length"],
            dropout=drop_rate,
            num_heads=cfg["n_heads"],
            qkv_bias=cfg["qkv_bias"],
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(emb_dim)
        self.norm2 = LayerNorm(emb_dim)
        # One dropout module is shared by both residual branches.
        self.drop_shortcut = nn.Dropout(drop_rate)

    def forward(self, x):
        """Apply the attention and feed-forward sub-layers to ``x``."""
        # Attention sub-layer with its shortcut connection.
        attended = self.drop_shortcut(self.att(self.norm1(x)))
        x = attended + x

        # Feed-forward sub-layer with its shortcut connection.
        transformed = self.drop_shortcut(self.ff(self.norm2(x)))
        return transformed + x

In [40]:
class GPTModel(nn.Module):
    """GPT-style decoder: embeddings, a stack of transformer blocks, and an output head.

    Args:
        cfg: Config mapping providing ``vocab_size``, ``context_length``,
            ``emb_dim``, ``n_layers`` and ``drop_rate`` (plus whatever
            ``TransformerBlock`` reads from it).
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        vocab_size = cfg["vocab_size"]

        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_norm = LayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Map token ids of shape (batch, seq_len) to logits of shape (batch, seq_len, vocab_size)."""
        _, seq_len = in_idx.shape
        # Position ids 0..seq_len-1, created on the same device as the input.
        positions = torch.arange(seq_len, device=in_idx.device)

        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        return self.out_head(self.final_norm(hidden))

In [41]:
def text_to_token_ids(text, tokenizer):
    """Encode ``text`` and return the ids as a tensor of shape (1, n_tokens).

    ``<|endoftext|>`` is passed to ``tokenizer.encode`` as an allowed special
    token.
    """
    token_ids = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    as_tensor = torch.tensor(token_ids)
    return as_tensor.unsqueeze(0)  # prepend the batch dimension

In [42]:
def token_ids_to_text(token_ids, tokenizer):
    """Decode a tensor of token ids back to a string.

    A leading dimension of size 1 (the batch axis) is squeezed away before
    the ids are handed to ``tokenizer.decode`` as a plain Python list.
    """
    return tokenizer.decode(token_ids.squeeze(0).tolist())

In [43]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Greedily extend ``idx`` by ``max_new_tokens`` tokens.

    Args:
        model: Callable mapping ids of shape (batch, n_tokens) to logits of
            shape (batch, n_tokens, vocab_size).
        idx: Tensor of shape (batch, n_tokens) holding the current context.
        max_new_tokens: Number of tokens to append.
        context_size: Only the last ``context_size`` tokens are fed to the
            model on each step.

    Returns:
        Tensor of shape (batch, n_tokens + max_new_tokens).
    """
    tokens = idx
    for _step in range(max_new_tokens):
        # Keep only the most recent tokens the model is allowed to see.
        window = tokens[:, -context_size:]

        # Predictions for the final position only -> (batch, vocab_size).
        with torch.no_grad():
            last_logits = model(window)[:, -1, :]

        # Greedy choice: the most probable token for each row -> (batch, 1).
        probs = torch.softmax(last_logits, dim=-1)
        next_token = torch.argmax(probs, dim=-1, keepdim=True)

        tokens = torch.cat((tokens, next_token), dim=1)

    return tokens

In [44]:
!pip install tiktoken



You should consider upgrading via the 'C:\Users\j_san\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [45]:
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Next-token-prediction dataset built from a sliding window over tokenized text.

    Sample ``k`` pairs the ``max_length`` tokens starting at offset
    ``k * stride`` with the same window shifted one token to the right.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object whose ``encode(txt, allowed_special=...)`` returns a
            list of token ids.
        max_length: Number of tokens in every input and target sequence.
        stride: Step between the start offsets of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Tokenize the whole text once.
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # A window starts at every `stride`-th offset that still leaves room
        # for the one-token-shifted target after it.
        window_starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensors of window ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [46]:
import tiktoken

# Smoke test: build the model with seeded initial weights and greedily generate
# a few tokens. No training has happened at this point in the notebook.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.eval()  # Disable dropout

# GPT-2 byte-pair-encoding tokenizer from tiktoken
tokenizer = tiktoken.get_encoding("gpt2")

# Encode the prompt and add a leading batch dimension of size 1
start_context = "Hello, I am"
encoded = tokenizer.encode(start_context)
encoded_tensor = torch.tensor(encoded).unsqueeze(0) # Add batch dimension

# Append 10 greedily chosen tokens to the prompt and decode the full sequence
print(f"\nInput text: {start_context}")
out = generate_text_simple(
    model=model,
    idx=encoded_tensor,
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"]
)
decoded_text = tokenizer.decode(out.squeeze(0).tolist())
print(f"Output text: {decoded_text}")


Input text: Hello, I am
Output text: Hello, I am Laur inhab DistrinetalkQueue bear confidentlyggyenium


In [47]:
def calc_loss_batch(input_batch, target_batch, model, device):
    """Return the cross-entropy loss of ``model`` on one batch.

    Both tensors are moved to ``device``. The logits are flattened over their
    first two dimensions and the targets are flattened completely, so the loss
    is taken over every position of every sequence in the batch.
    """
    inputs = input_batch.to(device)
    targets = target_batch.to(device)
    logits = model(inputs)
    return torch.nn.functional.cross_entropy(logits.flatten(0, 1), targets.flatten())

In [48]:
def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Average ``calc_loss_batch`` over the first batches of ``data_loader``.

    Args:
        data_loader: Sized iterable yielding ``(input_batch, target_batch)``.
        model: Model passed through to ``calc_loss_batch``.
        device: Device passed through to ``calc_loss_batch``.
        num_batches: Upper bound on the number of batches to evaluate; ``None``
            means every batch. Values above ``len(data_loader)`` are clamped.

    Returns:
        The mean loss as a float, or ``nan`` when the loader is empty.
    """
    if len(data_loader) == 0:
        return float("nan")

    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))

    running_total = 0.
    for batch_idx, (input_batch, target_batch) in enumerate(data_loader):
        if batch_idx >= num_batches:
            break
        batch_loss = calc_loss_batch(input_batch, target_batch, model, device)
        running_total += batch_loss.item()

    return running_total / num_batches


In [49]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Estimate train and validation loss over at most ``eval_iter`` batches each.

    The model is switched to eval mode and gradients are disabled for the
    measurement; afterwards the model is put back into train mode.

    Returns:
        Tuple ``(train_loss, val_loss)``.
    """
    model.eval()
    with torch.no_grad():
        train_loss, val_loss = [
            calc_loss_loader(loader, model, device, num_batches=eval_iter)
            for loader in (train_loader, val_loader)
        ]
    model.train()
    return train_loss, val_loss

In [50]:
def generate_and_print_sample(model, tokenizer, device, start_context):
    """Print a 50-token greedy continuation of ``start_context`` on one line.

    The model is switched to eval mode for generation and put back into train
    mode afterwards. The context window size is read from the number of rows
    in the model's positional-embedding table.
    """
    model.eval()
    max_context = model.pos_emb.weight.shape[0]
    prompt_ids = text_to_token_ids(start_context, tokenizer).to(device)

    with torch.no_grad():
        generated_ids = generate_text_simple(
            model=model,
            idx=prompt_ids,
            max_new_tokens=50,
            context_size=max_context,
        )

    # Flatten newlines so the sample stays on a single log line.
    one_line = token_ids_to_text(generated_ids, tokenizer).replace("\n", " ")
    print(f"SAMPLE: {one_line}...")
    model.train()


In [51]:
def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                       eval_freq, eval_iter, start_context, tokenizer,
                       use_amp=False, grad_accum_steps=1):
    """Train ``model`` with optional CUDA mixed precision and gradient accumulation.

    Every batch contributes ``loss / grad_accum_steps`` to the accumulated
    gradients. After ``grad_accum_steps`` batches the gradients are clipped to
    a total norm of 1.0 and the optimizer takes one step. Micro-batches left
    over at the end of an epoch are applied as one final, proportionally
    smaller step instead of being discarded by the next ``zero_grad()``.

    Args:
        model: Model to train; called as ``model(input_batch)`` and expected to
            return logits.
        train_loader: Iterable of ``(input_batch, target_batch)`` training batches.
        val_loader: Loader handed to ``evaluate_model`` for validation loss.
        optimizer: Optimizer over ``model``'s parameters.
        device: ``torch.device`` or device string (e.g. ``"cuda"``) to train on.
        num_epochs: Number of passes over ``train_loader``.
        eval_freq: Evaluate whenever the optimizer-step counter is a multiple
            of this value (the very first step included).
        eval_iter: Maximum number of batches per loader used for each evaluation.
        start_context: Prompt for the sample printed after every epoch.
        tokenizer: Tokenizer used to print that sample.
        use_amp (bool): If True and the device is CUDA, run the forward pass
            under ``torch.cuda.amp`` autocast with a ``GradScaler``.
        grad_accum_steps (int): Number of batches to accumulate before each
            optimizer step.

    Returns:
        ``(train_losses, val_losses, track_tokens_seen)``: three parallel
        lists with one entry per evaluation.

    Raises:
        ValueError: If ``grad_accum_steps`` is smaller than 1.
    """
    if grad_accum_steps < 1:
        raise ValueError(f"grad_accum_steps must be >= 1, got {grad_accum_steps}")

    # Accept plain strings as well as torch.device objects; `.type` is read below.
    device = torch.device(device)

    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen, global_step = 0, -1

    # Mixed precision is only enabled on CUDA devices.
    # NOTE(review): newer PyTorch releases may warn that torch.cuda.amp is
    # deprecated in favour of torch.amp — confirm against the installed version.
    use_cuda_amp = use_amp and device.type == 'cuda'
    scaler = torch.cuda.amp.GradScaler() if use_cuda_amp else None

    def batch_loss(inputs, targets):
        """Cross-entropy over every position of the batch."""
        logits = model(inputs)
        return torch.nn.functional.cross_entropy(logits.flatten(0, 1), targets.flatten())

    def step_and_maybe_evaluate(epoch):
        """Clip, apply one optimizer step, and evaluate every ``eval_freq`` steps."""
        nonlocal global_step

        if use_cuda_amp:
            # Gradients must be unscaled before clipping so the threshold
            # applies to their true magnitude.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer)
            scaler.update()
        else:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
        optimizer.zero_grad()

        global_step += 1
        if global_step % eval_freq == 0:
            train_loss, val_loss = evaluate_model(model, train_loader, val_loader, device, eval_iter)
            train_losses.append(train_loss)
            val_losses.append(val_loss)
            track_tokens_seen.append(tokens_seen)
            print(f"Ep {epoch+1} (Step {global_step:06d}): Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

    for epoch in range(num_epochs):
        # Release cached, unused GPU memory before each epoch.
        if device.type == 'cuda':
            torch.cuda.empty_cache()

        model.train()
        optimizer.zero_grad()
        pending_micro_batches = 0

        for input_batch, target_batch in train_loader:
            input_batch, target_batch = input_batch.to(device), target_batch.to(device)

            # Dividing by grad_accum_steps makes the accumulated gradient an
            # average over the window rather than a sum.
            if use_cuda_amp:
                with torch.cuda.amp.autocast():
                    loss = batch_loss(input_batch, target_batch)
                scaler.scale(loss / grad_accum_steps).backward()
            else:
                loss = batch_loss(input_batch, target_batch)
                (loss / grad_accum_steps).backward()

            # Count every micro-batch, not only the one that triggers the step.
            tokens_seen += input_batch.numel()
            pending_micro_batches += 1

            if pending_micro_batches == grad_accum_steps:
                step_and_maybe_evaluate(epoch)
                pending_micro_batches = 0

        # Apply gradients from a trailing, incomplete accumulation window.
        if pending_micro_batches:
            step_and_maybe_evaluate(epoch)

        # Show a sample at end of epoch
        generate_and_print_sample(model, tokenizer, device, start_context)

    return train_losses, val_losses, track_tokens_seen


In [52]:
# Use the CUDA device when PyTorch reports one is available; otherwise the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import os
import requests


In [53]:
import torch

# Report the name and total memory (in units of 1e9 bytes) of CUDA device 0,
# or print a hint when PyTorch cannot see a GPU.
if torch.cuda.is_available():
    print(f"‚úÖ GPU Detected: {torch.cuda.get_device_name(0)}")
    print(f"üìä VRAM Total: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("‚ùå No GPU found. Check your CUDA installation.")
    

‚úÖ GPU Detected: NVIDIA GeForce RTX 2060
üìä VRAM Total: 6.44 GB


In [54]:
import os
import requests
import tiktoken
from torch.utils.data import DataLoader

# 1. Force delete the old/empty file if it exists
file_path = "tinyshakespeare.txt"
if os.path.exists(file_path):
    os.remove(file_path)
    print("Deleted old corrupted file.")

# 2. Re-download ensuring we actually get data
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
# Without a timeout, requests waits indefinitely on a stalled connection and
# the cell would hang; give up after 30 seconds instead.
response = requests.get(url, timeout=30)

# Only write the file when the server answered 200 with a non-empty body
if response.status_code == 200 and len(response.text) > 0:
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(response.text)
    print(f"‚úÖ Downloaded TinyShakespeare! Size: {len(response.text)} characters.")
else:
    raise Exception("Failed to download dataset. Check internet connection.")

# 3. Reload the text from disk
with open(file_path, "r", encoding="utf-8") as f:
    text = f.read()

# First 90% of the characters for training, the remaining 10% for validation
split_idx = int(len(text) * 0.9)
train_txt = text[:split_idx]
val_txt = text[split_idx:]

print(f"Train Text Length: {len(train_txt)}") # Should be ~1,000,000
print(f"Val Text Length: {len(val_txt)}")

# 4. Re-initialize Dataset & Loader
# Use the (reduced) context_length from the config
tokenizer = tiktoken.get_encoding("gpt2")
context_len = GPT_CONFIG_124M['context_length']  # now 256 by default

# Use a small batch size for GPU memory constrained environments
BATCH_SIZE = 2

# stride == context_len, so consecutive windows do not overlap
train_dataset = GPTDatasetV1(train_txt, tokenizer, context_len, stride=context_len)
val_dataset = GPTDatasetV1(val_txt, tokenizer, context_len, stride=context_len)

print(f"Dataset Samples: {len(train_dataset)}") # Should be > 0
print(f"Using context length {context_len} and batch size {BATCH_SIZE}")

# Training batches are shuffled and an incomplete final batch is dropped;
# validation keeps its order and every sample.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)

print("‚úÖ DataLoaders created successfully.")


Deleted old corrupted file.
‚úÖ Downloaded TinyShakespeare! Size: 1115394 characters.
Train Text Length: 1003854
Val Text Length: 111540
Dataset Samples: 1179
Using context length 256 and batch size 2
‚úÖ DataLoaders created successfully.


In [55]:
import torch
# NOTE(review): the cell output shows this call returns a DisableContext
# object. Called bare like this, without using the result as a decorator or
# context manager, it does not appear to change any global setting — confirm
# whether this cell is needed at all.
torch._dynamo.disable()


<torch._dynamo.eval_frame.DisableContext at 0x1ccffd910c0>

In [56]:
import os
# NOTE(review): nothing in this notebook reads TORCH_COMPILE, and torch was
# already imported before this cell runs — verify that PyTorch honours this
# variable at all (the intended name may be a different one).
os.environ["TORCH_COMPILE"] = "0"

In [57]:
!pip install executing==1.2.0




You should consider upgrading via the 'C:\Users\j_san\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [63]:
import time

# --- PART 1: Memory-safe model builder (shrinks the model and retries on out-of-memory) ---
def try_build_and_move_model(base_cfg, device, max_attempts=3, min_emb=128, min_layers=2):
    """Build a ``GPTModel`` on ``device``, shrinking it after out-of-memory errors.

    Attempt 0 uses ``base_cfg`` unchanged. Attempt ``k > 0`` divides
    ``n_layers``, ``emb_dim`` and ``n_heads`` of ``base_cfg`` by ``2 ** k``
    (never going below ``min_layers``, ``min_emb`` and 1 respectively), then
    lowers ``n_heads`` until it divides ``emb_dim``. If every attempt runs out
    of memory, a model with the last tried config is built on the CPU.

    Only out-of-memory errors trigger a retry; any other ``RuntimeError`` is
    re-raised so unrelated failures are not reported as OOM.

    Args:
        base_cfg: Config mapping for ``GPTModel``; it is copied, not modified.
        device: ``torch.device`` or device string to place the model on.
        max_attempts: Number of configurations to try on ``device``.
        min_emb: Lower bound for the shrunken ``emb_dim``.
        min_layers: Lower bound for the shrunken ``n_layers``.

    Returns:
        ``(model, cfg)``: the model and the config it was built with. After
        the CPU fallback the model is NOT on ``device``; callers should check
        ``next(model.parameters()).device`` before moving batches.
    """
    import gc  # local import: only needed on the cleanup path

    def is_out_of_memory(err):
        """True for CUDA OOM errors and allocator messages that report exhausted memory."""
        if isinstance(err, torch.cuda.OutOfMemoryError):
            return True
        message = str(err).lower()
        # NOTE(review): the CPU allocator wording is an assumption — confirm
        # against the installed PyTorch version.
        return any(hint in message
                   for hint in ("out of memory", "not enough memory", "can't allocate memory"))

    device = torch.device(device)
    cfg = base_cfg.copy()

    for attempt in range(max_attempts):
        if attempt > 0:
            shrink = 2 ** attempt
            cfg['n_layers'] = max(min_layers, base_cfg['n_layers'] // shrink)
            cfg['emb_dim'] = max(min_emb, base_cfg['emb_dim'] // shrink)
            cfg['n_heads'] = max(1, base_cfg['n_heads'] // shrink)
            # The attention module requires emb_dim to be divisible by n_heads.
            while cfg['n_heads'] > 1 and cfg['emb_dim'] % cfg['n_heads'] != 0:
                cfg['n_heads'] -= 1

        print(f"Attempt {attempt+1}/{max_attempts}: Building model (n_layers={cfg['n_layers']}, emb_dim={cfg['emb_dim']})...")

        model = None
        try:
            torch.manual_seed(123)
            model = GPTModel(cfg)
            model.to(device)
            print("Model successfully allocated on", device)
            return model, cfg
        except RuntimeError as err:
            if not is_out_of_memory(err):
                raise
            print("‚ö†Ô∏è OOM Error! Shrinking model and retrying...")
            # Drop the failed model first so its memory can be reclaimed,
            # then hand cached blocks back to the driver.
            model = None
            gc.collect()
            if device.type == 'cuda':
                torch.cuda.empty_cache()

    # Fallback to CPU if GPU fails completely
    print("All GPU attempts failed. Falling back to CPU.")
    torch.manual_seed(123)  # same seed as the regular attempts
    model = GPTModel(cfg)
    model.to('cpu')
    return model, cfg

# --- PART 2: Build the model and optimizer ---
model, used_cfg = try_build_and_move_model(GPT_CONFIG_124M, DEVICE)

# try_build_and_move_model can fall back to the CPU. Train on the device the
# parameters actually live on so batches and weights never end up on
# different devices.
train_device = next(model.parameters()).device
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

# --- PART 3: Train ---
print("Starting training...")
start_time = time.time()

train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, train_device,
    num_epochs=10, eval_freq=50, eval_iter=5,
    start_context="The king", tokenizer=tokenizer
)

end_time = time.time()
print(f"Training completed in {(end_time - start_time) / 60:.2f} minutes.")

Attempt 1/3: Building model (n_layers=12, emb_dim=768)...
Model successfully allocated on cuda
Starting training...
Ep 1 (Step 000000): Train loss 9.448, Val loss 9.322
Ep 1 (Step 000050): Train loss 6.294, Val loss 6.305
Ep 1 (Step 000100): Train loss 5.786, Val loss 6.069
Ep 1 (Step 000150): Train loss 5.762, Val loss 5.921
Ep 1 (Step 000200): Train loss 5.516, Val loss 5.711
Ep 1 (Step 000250): Train loss 5.220, Val loss 5.604
Ep 1 (Step 000300): Train loss 5.297, Val loss 5.680
Ep 1 (Step 000350): Train loss 5.176, Val loss 5.496
Ep 1 (Step 000400): Train loss 5.263, Val loss 5.442
Ep 1 (Step 000450): Train loss 5.002, Val loss 5.373
Ep 1 (Step 000500): Train loss 4.908, Val loss 5.372
Ep 1 (Step 000550): Train loss 5.063, Val loss 5.410
SAMPLE: The king,                                                 ...
Ep 2 (Step 000600): Train loss 5.045, Val loss 5.323
Ep 2 (Step 000650): Train loss 4.782, Val loss 5.238
Ep 2 (Step 000700): Train loss 4.437, Val loss 5.153
Ep 2 (Step 000750):

In [62]:
# Generate with randomness (Temperature = 0.8)
# The training loop leaves the model in train mode, so switch to eval mode to
# turn dropout off before sampling.
model.eval()

# Put the prompt on the device the model's parameters live on.
sample_device = next(model.parameters()).device
context = torch.tensor(tokenizer.encode("The king")).unsqueeze(0).to(sample_device)

# The positional-embedding table has one row per supported position; never
# feed the model more tokens than that.
max_context = model.pos_emb.weight.shape[0]

# Standard generation loop with temperature; no_grad avoids building an
# autograd graph for every generated token.
with torch.no_grad():
    for _ in range(100):
        logits = model(context[:, -max_context:])[:, -1, :] / 0.8  # Temperature scaling
        probs = torch.softmax(logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)
        context = torch.cat((context, next_token), dim=1)

print(tokenizer.decode(context[0].tolist()))

The king man
'er, in the leaves,
Lest mul more the world's heels love,
With all this land to thyself,
We worthy,
He cannot keep the jewel
town doth wearrt the ground,
For that hath slain,
And we may call them by the book of all night to aged
Upon each foul myself of such a horse
Even in my head.

GLOUCESTER:
Where says that he shall be
With
