In [57]:
import torch
import numpy as nm

In [58]:
# Toy batch of token ids: 2 rows x 6 columns holding the integers 10..21.
idx = torch.arange(10, 22).reshape(2, 6)

context_size = 3

# Negative slicing keeps only the last `context_size` columns of every row.
idx_cond = idx[:, -context_size:]

print(idx_cond)

tensor([[13, 14, 15],
        [19, 20, 21]])


⸻


# 🐍 Python Iteration Cheatsheet

## 🔁 enumerate()

Use `enumerate()` when you want both the index and the value:

```python
items = ['apple', 'banana', 'cherry']

for i, item in enumerate(items):
    print(i, item)
```

Output:

```
0 apple
1 banana
2 cherry
```


⸻

## 🔗 zip()

Use `zip()` to iterate over multiple lists in parallel:

```python
names = ['Alice', 'Bob', 'Charlie']
scores = [85, 90, 95]

for name, score in zip(names, scores):
    print(name, score)
```

Output:

```
Alice 85
Bob 90
Charlie 95
```


⸻

## 🎯 range()

Use `range()` to create a sequence of numbers:

```python
for i in range(5):
    print(i)

for i in range(1, 10, 2):  # start=1, stop=10, step=2
    print(i)
```


⸻

## 🧼 map()

Apply a function to every item in an iterable:

```python
nums = [1, 2, 3]
squared = list(map(lambda x: x**2, nums))
print(squared)  # [1, 4, 9]
```


⸻

## 🧃 filter()

Filter items using a condition:

```python
nums = [1, 2, 3, 4, 5]
evens = list(filter(lambda x: x % 2 == 0, nums))
print(evens)  # [2, 4]
```


⸻

## 📌 Quick Reference Table

| Function | Use Case |
| --- | --- |
| `enumerate()` | Get index and value while iterating |
| `zip()` | Iterate multiple sequences together |
| `range()` | Generate a sequence of numbers |
| `map()` | Apply function to all elements |
| `filter()` | Select items that meet a condition |

In [59]:
import tiktoken

# Module-level tokenizer built from tiktoken's "gpt2" encoding.
# create_dataloader_v1 reads this global directly rather than taking it as an argument.
tokenizer = tiktoken.get_encoding("gpt2")

In [60]:
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
file_path = "C:/Users/Taha/OneDrive/Desktop/LLM/Book/the-verdict.txt"

# Read the entire file into a single UTF-8 decoded string.
with open(file_path, mode="r", encoding="utf-8") as file:
    text_data = file.read()

In [61]:
# Split the raw text by character position: the first 90% is used for
# training, the remainder for validation.
train_ratio = 0.90
split_idx = int(len(text_data) * train_ratio)
train_data, val_data = text_data[:split_idx], text_data[split_idx:]

In [62]:
from torch.utils.data import Dataset, DataLoader

class GPTDataset(Dataset):
    """Sliding-window dataset of (input, target) token-id pairs.

    The text is encoded once with ``tokenizer.encode``. Each sample is a window
    of ``max_length`` token ids plus the same window shifted right by one
    token, so that ``target[k]`` is the token following ``input[k]``.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object exposing ``encode(str) -> list[int]``.
        max_length: Number of tokens in every input/target window.
        stride: Step between the start positions of consecutive windows.

    If the text has ``max_length`` tokens or fewer, the dataset is empty.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        super().__init__()
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt)

        # A window starting at i reads tokens i .. i + max_length (the target
        # is shifted by one), so the last valid start is
        # len(token_ids) - max_length - 1. range() excludes its stop value,
        # hence the stop is len(token_ids) - max_length; subtracting one more
        # would silently drop the final full window.
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i : i + max_length]
            target_chunk = token_ids[i + 1 : i + 1 + max_length]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``idx``-th ``(input_ids, target_ids)`` tensor pair."""
        return self.input_ids[idx], self.target_ids[idx]


In [63]:
def create_dataloader_v1(
    txt: str,
    batch_size: int,
    max_length: int,
    stride: int,
    shuffle: bool,
    drop_last: bool,
    num_workers: int,
):
    """Wrap ``txt`` in a ``GPTDataset`` and return a ``DataLoader`` over it.

    The text is tokenized with the module-level ``tokenizer``; it is not a
    parameter of this function.

    Args:
        txt: Raw text handed to ``GPTDataset``.
        batch_size: Forwarded to ``DataLoader``.
        max_length: Window length forwarded to ``GPTDataset``.
        stride: Window step forwarded to ``GPTDataset``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        The constructed ``DataLoader``.
    """
    windows = GPTDataset(txt, tokenizer, max_length, stride)
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [64]:
# Fix torch's global seed before building the loaders (train_loader shuffles).
torch.manual_seed(123)

# Training loader: stride equals max_length, batches are shuffled and an
# incomplete final batch is dropped.
train_loader = create_dataloader_v1(
    txt=train_data, batch_size=2, max_length=256, stride=256,
    shuffle=True, drop_last=True, num_workers=0,
)

# Validation loader: same windowing, original order, every batch kept.
val_loader = create_dataloader_v1(
    txt=val_data, batch_size=2, max_length=256, stride=256,
    shuffle=False, drop_last=False, num_workers=0,
)

In [65]:
def calc_loss_batch(input_batch, target_batch, model, device):
    """Return the cross-entropy loss of ``model`` on a single batch.

    Both tensors are moved to ``device`` before the forward pass. The model
    output has its first two dimensions merged and the targets are flattened
    completely before being passed to ``cross_entropy``.
    (Assumes the model returns logits shaped (batch, tokens, vocab) - TODO confirm.)

    Args:
        input_batch: Tensor fed to ``model``.
        target_batch: Tensor of target class indices.
        model: Callable mapping the input tensor to logits.
        device: Device both batches are moved to.

    Returns:
        The loss tensor produced by ``torch.nn.functional.cross_entropy``.
    """
    inputs, targets = input_batch.to(device), target_batch.to(device)
    predictions = model(inputs)
    return torch.nn.functional.cross_entropy(
        input=predictions.flatten(0, 1),
        target=targets.flatten(),
    )

In [66]:
def _calc_loss_loader(data_loader, model, device, num_batches=None):
    """Average ``calc_loss_batch`` over the first ``num_batches`` batches.

    Args:
        data_loader: Sized iterable yielding ``(input_batch, target_batch)``.
        model: Model passed through to ``calc_loss_batch``.
        device: Device passed through to ``calc_loss_batch``.
        num_batches: Maximum number of batches to use; ``None`` uses all of
            them. Values larger than ``len(data_loader)`` are clamped.

    Returns:
        The mean loss as a Python float, or ``nan`` when there is nothing to
        average (empty loader or ``num_batches <= 0``).
    """
    available = len(data_loader)
    num_batches = available if num_batches is None else min(num_batches, available)
    if num_batches <= 0:
        return float("nan")

    total_loss = 0.0
    for batch_idx, (input_batch, target_batch) in enumerate(data_loader):
        if batch_idx >= num_batches:
            break
        # .item() turns the 0-d loss tensor into a plain float so the running
        # sum does not hold on to tensors.
        total_loss += calc_loss_batch(input_batch, target_batch, model, device).item()
    return total_loss / num_batches


def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Estimate the train and validation loss on ``eval_iter`` batches each.

    The model is put into eval mode and gradients are disabled for the
    measurement; the model is switched back to train mode afterwards, even if
    the evaluation raises.

    Args:
        model: Model to evaluate.
        train_loader: Loader yielding training ``(input, target)`` batches.
        val_loader: Loader yielding validation ``(input, target)`` batches.
        device: Device the batches are moved to.
        eval_iter: Maximum number of batches averaged per loader.

    Returns:
        ``(train_loss, val_loss)`` as Python floats.
    """
    model.eval()
    try:
        with torch.no_grad():
            # calc_loss_batch scores one (input, target) pair; a loader has to
            # be iterated and averaged, which is what the helper does.
            train_loss = _calc_loss_loader(
                train_loader, model, device, num_batches=eval_iter
            )
            val_loss = _calc_loss_loader(
                val_loader, model, device, num_batches=eval_iter
            )
    finally:
        model.train()

    return train_loss, val_loss

In [None]:
def text_to_token_ids(text, tokenizer):
    """Encode ``text`` and return the ids as a tensor with a leading batch axis.

    ``'<|endoftext|>'`` is passed to the tokenizer as an allowed special token.

    Args:
        text: String to encode.
        tokenizer: Object exposing ``encode(text, allowed_special=...)``.

    Returns:
        Tensor of shape ``(1, num_tokens)``.
    """
    ids = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    # Indexing with None inserts the batch dimension in front.
    return torch.tensor(ids)[None, :]

def token_ids_to_text(token_ids, tokenizer):
    """Decode a tensor of token ids back into a string.

    A leading dimension of size 1 (the batch axis) is squeezed away before
    the ids are handed to ``tokenizer.decode`` as a plain Python list.

    Args:
        token_ids: Tensor of token ids.
        tokenizer: Object exposing ``decode(list[int]) -> str``.

    Returns:
        The decoded string.
    """
    id_list = token_ids.squeeze(0).tolist()
    return tokenizer.decode(id_list)

In [None]:
def generate_text_simple(model, idx, max_new_tokens=50, context_size=None):
    """Greedily extend ``idx`` by ``max_new_tokens`` tokens.

    On every step the last ``context_size`` tokens are fed to ``model``, the
    logits at the final position are taken, and the highest-scoring token id
    is appended to the sequence.

    Args:
        model: Callable mapping a ``(batch, n_tokens)`` id tensor to logits of
            shape ``(batch, n_tokens, vocab_size)``.
        idx: ``(batch, n_tokens)`` tensor of starting token ids. It is not
            modified in place.
        max_new_tokens: Number of tokens to append.
        context_size: Maximum number of trailing tokens shown to the model.
            ``None`` (default) feeds the whole sequence; pass the model's
            supported context length to crop. The default no longer depends
            on an unrelated module-level variable.

    Returns:
        Tensor of shape ``(batch, n_tokens + max_new_tokens)`` holding the
        original ids followed by the generated ones.
    """
    for _ in range(max_new_tokens):
        idx_cond = idx if context_size is None else idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)

        # Only the prediction at the last position decides the next token.
        # argmax over raw logits equals argmax over softmax probabilities.
        idx_next = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)
        idx = torch.cat((idx, idx_next), dim=1)

    return idx

In [69]:
def generate_and_print_Sample(model, tokenizer, device, start_context):
    """Generate 50 tokens from ``start_context`` and print the decoded text.

    The model is switched to eval mode for generation and always switched
    back to train mode afterwards, so calling this inside a training loop
    does not leave the model in eval mode.

    Args:
        model: Model to sample from. Its context length is read from
            ``model.pos_emb.weight.shape[0]``.
            (Assumes the model exposes a ``pos_emb`` embedding - TODO confirm.)
        tokenizer: Tokenizer used to encode the prompt and decode the result.
        device: Device the encoded prompt is moved to.
        start_context: Prompt string.
    """
    model.eval()
    try:
        context_size = model.pos_emb.weight.shape[0]
        encoded = text_to_token_ids(start_context, tokenizer).to(device)
        with torch.no_grad():
            token_ids = generate_text_simple(
                model, idx=encoded,
                max_new_tokens=50, context_size=context_size
            )
        # Nothing to show if the generator produced no token tensor.
        if token_ids is not None:
            decoded_text = token_ids_to_text(token_ids, tokenizer)
            # Collapse newlines so each sample prints on one line.
            print(decoded_text.replace("\n", " "))
    finally:
        model.train()

In [70]:
def train_model_simple(model, train_loader, val_loader,
                       optimizer, device, num_epochs,
                       eval_freq, eval_iter, start_context, tokenizer):
    """Train ``model`` for ``num_epochs`` epochs with periodic evaluation.

    One optimizer step is taken per batch of ``train_loader``. Every
    ``eval_freq`` steps (starting with the first) the train and validation
    losses are estimated with ``evaluate_model``, logged, and a text sample is
    generated from ``start_context``.

    Args:
        model: Model to train.
        train_loader: Loader yielding training ``(input, target)`` batches.
        val_loader: Loader yielding validation ``(input, target)`` batches.
        optimizer: Optimizer over the model's parameters.
        device: Device the batches are moved to.
        num_epochs: Number of passes over ``train_loader``.
        eval_freq: Evaluate every this many optimizer steps.
        eval_iter: Number of batches used per loss estimate.
        start_context: Prompt used for the generated sample.
        tokenizer: Tokenizer used for the generated sample.

    Returns:
        ``(train_losses, val_losses, track_token_seen)``: three lists with one
        entry per evaluation. The last holds the cumulative number of input
        tokens processed at that point.
    """
    train_losses, val_losses, track_token_seen = [], [], []
    # global_step starts at -1 so the first completed step is step 0 and is
    # evaluated.
    token_seen, global_step = 0, -1

    for epoch in range(num_epochs):
        model.train()
        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward()
            optimizer.step()

            # Advance both counters once per optimizer step; without this the
            # evaluation schedule never fires.
            token_seen += input_batch.numel()
            global_step += 1

            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter
                )

                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_token_seen.append(token_seen)
                print(f"Ep {epoch+1} (Step {global_step:06d}): "
                      f"Train loss {train_loss:.3f}, "
                      f"Val loss {val_loss:.3f}"
                      )
                generate_and_print_Sample(
                    model, tokenizer, device, start_context
                )
                # Sampling runs in eval mode; make sure training resumes in
                # train mode whatever state the helper left behind.
                model.train()

    return train_losses, val_losses, track_token_seen