In [1]:
import torch 

In [2]:
import os
import sys

# Add the parent directory to Python path to import from src
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))

from GPT_model import GPTModel

In [3]:
# Hyperparameters handed to GPTModel throughout this notebook.
GPT_CONFIG_124M = dict(
    vocab_size=50257,     # size of the vocabulary
    context_length=256,   # context length (tokens per sequence)
    emb_dim=768,          # embedding dimension
    n_heads=12,           # attention heads
    n_layers=12,          # layers
    drop_rate=0.1,        # dropout rate
    qkv_bias=False,       # bias on the query/key/value projections
)

In [4]:
# Seed torch's RNG before constructing the model
# (presumably so the weight initialisation is reproducible — confirm in GPT_model).
torch.manual_seed(123)

model = GPTModel(GPT_CONFIG_124M)

# Switch to eval mode before the generation cells below; the trailing ";" hides the cell output.
model.eval();

In [5]:
import tiktoken

from utils.generate_text_simple import generate_text_simple

def text_to_token_ids(text, tokenizer):
    """Encode ``text`` and return the token ids as a tensor with a leading batch axis.

    ``<|endoftext|>`` is passed to ``tokenizer.encode`` as an allowed special
    token. The encoded id list is wrapped in a tensor and ``unsqueeze(0)``
    prepends a dimension of size 1.
    """
    ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
    return torch.tensor(ids).unsqueeze(0)


def token_ids_to_text(token_ids, tokenizer: "tiktoken.Encoding"):
    """Decode a tensor of token ids back into text.

    Args:
        token_ids: Tensor of token ids. ``squeeze(0)`` removes a leading
            dimension of size 1 before the ids are converted to a list.
        tokenizer: Object exposing ``decode(list_of_ids)``, e.g. the encoding
            returned by ``tiktoken.get_encoding``. The annotation is a string
            so that it names the encoding class rather than the ``tiktoken``
            module and is not evaluated when the function is defined.

    Returns:
        The value returned by ``tokenizer.decode`` for the id list.
    """
    return tokenizer.decode(token_ids.squeeze(0).tolist())

In [34]:
from tokenize import TokenInfo


text = "hello how are you doing to day (:"

tokenizer = tiktoken.get_encoding("gpt2")

token_ids =  text_to_token_ids(text , tokenizer)

token_ids

tensor([[31373,   703,   389,   345,  1804,   284,  1110,   357,    25]])

In [None]:
token_ids_to_text(token_ids , tokenizer)

'hello how are you doing to day (:'

In [37]:
token_ids = generate_text_simple(
    model = model,
    idx = text_to_token_ids(text , tokenizer),
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"]
    
)
token_ids

tensor([[31373,   703,   389,   345,  1804,   284,  1110,   357,    25, 43445,
         16737, 43582,  5626, 24716, 45853, 14777, 48882, 12724, 30893]])

In [38]:
token_ids_to_text(token_ids , tokenizer)

'hello how are you doing to day (:Peacerunnerchapter NOTnas cloakedッ747 cleaning Bloody'

# Calculating the text generation loss

In [39]:
# Two toy 3-token examples. Each target row is its input row shifted by one
# position, i.e. the token the model should predict next at every step.
inputs = torch.tensor([
    [16833, 3626, 6100],   # "every effort moves"
    [40, 1107, 588],       # "I really like"
])
targets = torch.tensor([
    [3626, 6100, 345],     # " effort moves you"
    [1107, 588, 11311],    # " really like chocolate"
])

# Forward pass only; gradient tracking is switched off.
with torch.no_grad():
    logits = model(inputs)

# Softmax over the last axis turns the logits into a probability for each vocabulary token.
probas = torch.softmax(logits, dim=-1)
print(probas.shape)  # (batch_size, num_tokens, vocab_size)

torch.Size([2, 3, 50257])


In [42]:
torch.argmax(probas , dim = -1 , keepdim=True)

tensor([[[16657],
         [  339],
         [42826]],

        [[49906],
         [29669],
         [41751]]])

In [43]:
# Probability the model assigns to the correct next token at every position of
# the first example. The position indices are derived from the target length
# instead of being hard-coded as [0, 1, 2], so the cell keeps working if the
# toy sequences change length.
text_idx = 0
target_probas_1 = probas[text_idx, list(range(targets.shape[1])), targets[text_idx]]
print("Text 1:", target_probas_1)



Text 1: tensor([7.4536e-05, 3.1061e-05, 1.1563e-05])


In [44]:
# Probability the model assigns to the correct next token at every position of
# the second example. The position indices are derived from the target length
# instead of being hard-coded as [0, 1, 2], so the cell keeps working if the
# toy sequences change length.
text_idx = 1
target_probas_2 = probas[text_idx, list(range(targets.shape[1])), targets[text_idx]]
print("Text 2:", target_probas_2)

Text 2: tensor([1.0337e-05, 5.6771e-05, 4.7559e-06])


In [49]:
log_prob =  torch.log(torch.cat((target_probas_1 , target_probas_2)))

In [50]:
log_prob

tensor([ -9.5042, -10.3796, -11.3677, -11.4798,  -9.7765, -12.2561])

In [52]:
-1 *  torch.mean(log_prob)

tensor(10.7940)

In [60]:
targets

tensor([[ 3626,  6100,   345],
        [ 1107,   588, 11311]])

In [71]:
# `logits_flat` is only assigned in a later cell, so flatten the batch and
# token axes inline here to keep the notebook runnable from top to bottom.
logits.flatten(0, 1).shape

torch.Size([6, 50257])

In [75]:
# `logits_flat` is only assigned in a later cell, so flatten inline here.
# Row 0 of the flattened logits, entry for token id 345.
logits.flatten(0, 1)[0][345]

tensor(0.0758)

In [59]:
# Merge the batch and token axes so that every row of logits lines up with
# exactly one target id — the layout passed to cross_entropy in the next cell.
logits_flat = logits.flatten(0, 1)
targets_flat = targets.flatten()

# Sanity-check the resulting shapes.
print("Flattened logits:", logits_flat.shape)
print("Flattened targets:", targets_flat.shape)

Flattened logits: torch.Size([6, 50257])
Flattened targets: torch.Size([6])


In [61]:
torch.nn.functional.cross_entropy(logits_flat , targets_flat)

tensor(10.7940)

In [None]:
# torch.tensor() requires a data argument; calling it empty raises TypeError.
# 11.0 reproduces the recorded output below (e**11 ≈ 59874.14).
# NOTE(review): if the perplexity of the loss above was intended, exponentiate
# the cross_entropy result instead — confirm the intent.
torch.exp(torch.tensor(11.0))

tensor(59874.1406)

#  5.1.3 Calculating the training and validation set losses

In [78]:
data_path  = os.path.join(".." , "data" , "cleaned_text.txt")

In [79]:
with open(data_path, "r" , encoding="utf-8" ) as file:
    text_data = file.read()

In [80]:
text_data[:99]

'التزات النبوي صحيح البخارية للإماراتي عبد الله محمد بن اسماعيل البخاري الجيفي رحمه الله تعالى ضبطه '

In [81]:
len(text_data)

2464201

In [82]:
len(tokenizer.encode(text_data))

2480454

In [83]:
from utils.data_loader import create_dataloader_v1


# Hold out the tail of the corpus for validation.
# The split index is computed on the raw string, so it counts characters, not tokens.
train_ratio = 0.80
split_idx  = int(train_ratio * len(text_data))

train_data = text_data[:split_idx]  # first 80% of the characters
val_data = text_data[split_idx:]    # remaining characters


In [90]:
# Seed before building the loaders (the training loader is created with shuffle=True).
torch.manual_seed(123)

# Training loader: shuffled, incomplete final batch dropped.
# max_length and stride are both set to the model's context length
# (presumably yielding non-overlapping windows — confirm in utils.data_loader).
train_loader = create_dataloader_v1(
    train_data,
    batch_size=16,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

# Validation loader: same windowing, but fixed order and the final partial batch is kept.
val_loader = create_dataloader_v1(
    val_data,
    batch_size=16,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)

In [91]:
# Count the input tokens each loader yields in one full pass.
# sum() over a generator replaces the two copy-pasted accumulation loops and
# keeps the per-batch loop variables out of the notebook namespace.
train_tokens = sum(batch_inputs.numel() for batch_inputs, _ in train_loader)
val_tokens = sum(batch_inputs.numel() for batch_inputs, _ in val_loader)

print("Training tokens:", train_tokens)
print("Validation tokens:", val_tokens)
print("All tokens:", train_tokens + val_tokens)

Training tokens: 1982464
Validation tokens: 495872
All tokens: 2478336


In [92]:
def calc_loss_batch(input_batch, target_batch, model, device):
    """Return the cross-entropy loss of ``model`` on one batch.

    Both tensors are moved to ``device`` first. The model output has its first
    two dimensions merged and the targets are flattened to 1-D before they are
    passed to ``torch.nn.functional.cross_entropy``.
    """
    inputs_on_device = input_batch.to(device)
    targets_on_device = target_batch.to(device)
    predictions = model(inputs_on_device)
    return torch.nn.functional.cross_entropy(
        predictions.flatten(0, 1),
        targets_on_device.flatten(),
    )


def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Average ``calc_loss_batch`` over the first ``num_batches`` batches of a loader.

    Args:
        data_loader: Sized iterable yielding ``(input_batch, target_batch)`` pairs.
        model: Forwarded to ``calc_loss_batch``.
        device: Forwarded to ``calc_loss_batch``.
        num_batches: Upper bound on the number of batches to evaluate. ``None``
            evaluates every batch; values larger than ``len(data_loader)`` are
            clamped to the loader length.

    Returns:
        The mean per-batch loss as a Python float, or ``nan`` when there is
        nothing to average (empty loader, or ``num_batches <= 0``).
    """
    available = len(data_loader)
    if available == 0:
        return float("nan")

    if num_batches is None:
        num_batches = available
    else:
        # Never ask for more batches than the loader can provide.
        num_batches = min(num_batches, available)

    if num_batches <= 0:
        # Without this guard a zero count divides by zero and a negative count
        # yields a meaningless -0.0; "no batches" is reported as nan instead.
        return float("nan")

    total_loss = 0.
    # zip() stops as soon as range() is exhausted, so no batch beyond
    # num_batches is pulled from the loader.
    for _, (input_batch, target_batch) in zip(range(num_batches), data_loader):
        total_loss += calc_loss_batch(input_batch, target_batch, model, device).item()
    return total_loss / num_batches

In [94]:
# Prefer a CUDA GPU when one is available; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Note:
# Uncommenting the following lines will allow the code to run on Apple Silicon chips, if applicable,
# which is approximately 2x faster than on an Apple CPU (as measured on an M3 MacBook Air).
# However, the resulting loss values may be slightly different.

#if torch.cuda.is_available():
#    device = torch.device("cuda")
#elif torch.backends.mps.is_available():
#    device = torch.device("mps")
#else:
#    device = torch.device("cpu")
#
# print(f"Using {device} device.")


model.to(device) # no assignment model = model.to(device) necessary for nn.Module classes


torch.manual_seed(123) # For reproducibility due to the shuffling in the data loader

# num_batches = 2 caps each estimate at the first two batches the loader yields,
# so these are quick baseline numbers rather than full-dataset losses.
with torch.no_grad(): # Disable gradient tracking for efficiency because we are not training, yet
    train_loss = calc_loss_loader(train_loader, model, device , num_batches = 2)
    val_loss = calc_loss_loader(val_loader, model, device, num_batches = 2)

print("Training loss:", train_loss)
print("Validation loss:", val_loss)

Training loss: 10.974916934967041
Validation loss: 10.968953132629395


In [95]:
def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                       eval_freq, eval_iter, start_context, tokenizer):
    """Run a basic training loop with periodic evaluation and per-epoch sampling.

    For every batch of ``train_loader`` the gradients are reset, the batch loss
    is back-propagated and the optimizer takes a step. Whenever the step
    counter is a multiple of ``eval_freq`` (the very first step included), the
    train and validation losses are estimated via ``evaluate_model`` using
    ``eval_iter`` batches and printed. After each epoch a text sample that
    continues ``start_context`` is printed.

    Returns:
        ``(train_losses, val_losses, track_tokens_seen)`` — three lists holding
        one entry per evaluation point.
    """
    loss_history_train, loss_history_val, token_history = [], [], []
    tokens_so_far = 0
    global_step = -1  # incremented before use, so the first update is step 0

    for epoch in range(num_epochs):
        model.train()

        for input_batch, target_batch in train_loader:
            # One optimisation step: clear old gradients, backprop, update weights.
            optimizer.zero_grad()
            batch_loss = calc_loss_batch(input_batch, target_batch, model, device)
            batch_loss.backward()
            optimizer.step()

            tokens_so_far += input_batch.numel()
            global_step += 1

            # Only evaluate on every eval_freq-th step.
            if global_step % eval_freq != 0:
                continue

            train_loss, val_loss = evaluate_model(
                model, train_loader, val_loader, device, eval_iter)
            loss_history_train.append(train_loss)
            loss_history_val.append(val_loss)
            token_history.append(tokens_so_far)
            print(f"Ep {epoch+1} (Step {global_step:06d}): "
                  f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

        # Show what the model generates at the end of the epoch.
        generate_and_print_sample(
            model, tokenizer, device, start_context
        )

    return loss_history_train, loss_history_val, token_history


def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Estimate the train and validation loss over at most ``eval_iter`` batches each.

    The model is put into eval mode and gradient tracking is disabled while
    the losses are computed; the model is put back into train mode before
    returning.

    Returns:
        ``(train_loss, val_loss)`` as produced by ``calc_loss_loader``.
    """
    model.eval()
    with torch.no_grad():
        # tuple() consumes the generator right here, inside the no_grad context;
        # the training loader is evaluated first, then the validation loader.
        losses = tuple(
            calc_loss_loader(loader, model, device, num_batches=eval_iter)
            for loader in (train_loader, val_loader)
        )
    model.train()
    return losses


def generate_and_print_sample(model, tokenizer, device, start_context):
    """Generate a continuation of ``start_context`` and print it on one line.

    The model is switched to eval mode for generation (``max_new_tokens=50``,
    gradients disabled) and back to train mode afterwards. The context size is
    read from the number of rows of the model's positional-embedding weight.
    """
    model.eval()
    window = model.pos_emb.weight.shape[0]
    prompt_ids = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        generated = generate_text_simple(
            model=model,
            idx=prompt_ids,
            max_new_tokens=50,
            context_size=window,
        )
    sample = token_ids_to_text(generated, tokenizer)
    # Replace newlines with spaces so the sample prints compactly.
    print(sample.replace("\n", " "))
    model.train()




In [96]:
# Timestamp taken before the model and optimizer are built.
import time
start_time = time.time()

# Re-seed and build a fresh model so training does not start from the instance
# used in the cells above, then move it to the selected device.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.to(device)
# AdamW over all model parameters with the learning rate and weight decay used for this run.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)


In [None]:

# Start the timer in the same cell as the training call so the reported
# duration covers training only, not model setup or the time that passes
# between executing separate cells.
start_time = time.time()

num_epochs = 2
train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context="Every effort moves you", tokenizer=tokenizer
)

# Report the wall-clock duration of the training call above.
end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

Ep 1 (Step 000000): Train loss 8.402, Val loss 8.340
