## Task 3: Training Loop Implementation (40 Points)

---

Finally, create a training loop considering these following requirements:

1. **Single GPU Training Loop:** Your base implementation should be equipped to train your model on a single GPU setup.
2. **Distributed Data Parallel (DDP):** Extend your single GPU training loop to support training across multiple GPUs using DDP. Revisit the [PyTorch's DDP tutorial](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html) for guidance.
3. **Fully Sharded Data Parallel (FSDP):** Implement FSDP as a part of your training loop to shard the model parameters, gradients, and optimizer state. You can follow [Gupta et al., 2020, Training GPT-3 Like Models on a Single Machine](https://arxiv.org/pdf/2101.06840.pdf) for a comprehensive understanding of it.

**Deliverable:** A Python script containing a functional training loop that is compatible with single GPU, DDP, and FSDP options along with a documentation illustrating how the code adapts to each setting.

**Evaluation Scheme:** Each feature implementation will account for:

- Single GPU: 10 points
- DDP: 10 points
- FSDP: 20 points

**Note:** Document your code, approaches, difficulties encountered, and your solutions 
thoroughly. Include any reference materials you used in your report. Focus on clear communication of your methodologies and results.

In [1]:
import math
import torch
from torch import nn
import torch.nn.functional as F
class GPTConfig:
    attn_dropout = 0.1
    embed_dropout = 0.1
    ff_dropout = 0.1
    
    def __init__(
        self, vocab_size, max_len, **kwargs
    ):
        self.vocab_size = vocab_size
        self.max_len = max_len
        for key, value in kwargs.items():
            setattr(self, key, value)

class GPT1Config(GPTConfig):
    num_heads = 12
    num_blocks = 12
    embed_dim = 768

In [2]:
class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        embed_dim = config.embed_dim
        self.max_len = config.max_len
        self.tok_embed = nn.Embedding(
            config.vocab_size, embed_dim
        )
        self.pos_embed = nn.Parameter(
            torch.zeros(1, config.max_len, embed_dim)
        )
        self.dropout = nn.Dropout(config.embed_dropout)
        self.blocks = nn.Sequential(
            *[Block(config) for _ in range(config.num_blocks)]
        )
        self.ln = nn.LayerNorm(embed_dim)
        self.fc = nn.Linear(embed_dim, config.vocab_size)
    
    def forward(self, x, target=None):
        # batch_size = x.size(0)
        seq_len = x.size(1)
        assert seq_len <= self.max_len, "sequence longer than model capacity"
        
        tok_embedding = self.tok_embed(x)
        # tok_embedding.shape == (batch_size, seq_len, embed_dim)
        pos_embedding = self.pos_embed[:, :seq_len, :]
        # pos_embedding.shape == (1, seq_len, embed_dim)
        x = self.dropout(tok_embedding + pos_embedding)
        x = self.blocks(x)
        x = self.ln(x)
        x = self.fc(x)
        # x.shape == (batch_size, seq_len, vocab_size)
        return x
    
class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        embed_dim = config.embed_dim
        self.ln1 = nn.LayerNorm(embed_dim)
        self.ln2 = nn.LayerNorm(embed_dim)
        self.attn = MultiheadAttention(config)
        self.ff = nn.Sequential(
            nn.Linear(embed_dim, embed_dim * 4),
            nn.GELU(),
            nn.Linear(embed_dim * 4, embed_dim),
            nn.Dropout(config.ff_dropout),
        )
    
    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.ff(self.ln2(x))
        return x
    
    
class MultiheadAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        embed_dim = config.embed_dim
        self.num_heads = config.num_heads
        assert embed_dim % self.num_heads == 0, "invalid heads and embedding dimension configuration"
        self.key = nn.Linear(embed_dim, embed_dim)
        self.value = nn.Linear(embed_dim, embed_dim)
        self.query = nn.Linear(embed_dim, embed_dim)
        self.proj = nn.Linear(embed_dim, embed_dim)
        self.attn_dropout = nn.Dropout(config.attn_dropout)
        self.proj_dropout = nn.Dropout(config.ff_dropout)
        self.register_buffer(
            "mask", 
            torch.tril(torch.ones(config.max_len, config.max_len))
            .unsqueeze(0).unsqueeze(0)
        )
    
    def forward(self, x):
        batch_size = x.size(0)
        seq_len = x.size(1)
        # x.shape == (batch_size, seq_len, embed_dim)
        k_t = self.key(x).reshape(batch_size, seq_len, self.num_heads, -1).permute(0, 2, 3, 1)
        v = self.value(x).reshape(batch_size, seq_len, self.num_heads, -1).transpose(1, 2)
        q = self.query(x).reshape(batch_size, seq_len, self.num_heads, -1).transpose(1, 2)
        # shape == (batch_size, num_heads, seq_len, head_dim)
        
        attn = torch.matmul(q, k_t) / math.sqrt(q.size(-1))
        # attn.shape == (batch_size, num_heads, seq_len, seq_len)
        mask = self.mask[:, :, :seq_len, :seq_len]
        attn = attn.masked_fill(mask == 0, float("-inf"))
        attn = self.attn_dropout(attn)
        # attn.shape == (batch_size, num_heads, seq_len, seq_len)
        attn = F.softmax(attn, dim=-1)
        y = torch.matmul(attn, v)
        # y.shape == (batch_size, num_heads, seq_len, head_dim)
        y = y.transpose(1, 2)
        # y.shape == (batch_size, seq_len, num_heads, head_dim)
        y = y.reshape(batch_size, seq_len, -1)
        # y.shape == (batch_size, seq_len, embed_dim)
        y = self.proj_dropout(self.proj(y))
        return y
    
gpt_config = GPT1Config(vocab_size=10000, max_len=512, embed_dim=768, num_heads=12, num_blocks=12)

# Create an instance of the GPT model
gpt_model = GPT(gpt_config)

# Generate a random input sequence for testing
input_sequence = torch.randint(0, gpt_config.vocab_size, (1, gpt_config.max_len))

# Forward pass through the GPT model
output = gpt_model(input_sequence)

# Print the output shape for verification
print("Output shape:", output.shape)

Output shape: torch.Size([1, 512, 10000])


**Task 3 solution**

**Single GPU Training Loop:**


In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Assuming have a random dataset
num_epochs = 10 
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(gpt_model.parameters(), lr=0.001) #gpt_model is defined in task 1

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gpt_model.to(device)

# Generate a random dataset for illustration
data_size = 1000
seq_length = 512
vocab_size = 10000

# Random input sequences
inputs = torch.randint(0, vocab_size, (data_size, seq_length))
# Corresponding random target sequences with the same length as inputs
targets = torch.randint(0, vocab_size, (data_size, seq_length))

# Create a TensorDataset from inputs and targets
dataset = TensorDataset(inputs, targets)

# Define batch size
batch_size = 32

# Create a DataLoader
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


# Training loop
for epoch in range(num_epochs):
    gpt_model.train()
    for batch in dataloader:
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()

        outputs = gpt_model(inputs)
        
        # Make sure targets have the same sequence length as outputs
        targets = targets[:, :outputs.size(1)]  

        loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))

        loss.backward()
        optimizer.step()


ValueError: Expected input batch_size (5120000) to match target batch_size (5120).

**Distributed Data Parallel (DDP):**


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data import DataLoader

# Assuming DistributedSampler and DataLoader as it takes to much ram to run

# Initialize the model on each GPU
model = GPT(gpt_config).to(device)
model = DistributedDataParallel(model)

# Defining loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(num_epochs):
    model.train()
    for batch in dataloader:
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, targets)

        loss.backward()
        optimizer.step()

# Save the trained model
if torch.distributed.get_rank() == 0:
    torch.save(model.module.state_dict(), 'ddp_model.pth')


**Fully Sharded Data Parallel (FSDP):**


In [10]:
import torch
import torch.optim as optim
from torch.distributed.fsdp import FullyShardedDataParallel 

# Assuming set up FSDP and DataLoader as it takes too much space to run

# Initialize the model on each GPU
model = GPT(gpt_config).cuda()
fsdp_model = FullyShardedDataParallel(model)

# Definingloss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(fsdp_model.parameters(), lr=0.001)

# Training loop
for epoch in range(num_epochs):
    fsdp_model.train()
    for batch in dataloader:
        inputs, targets = batch
        inputs, targets = inputs.cuda(), targets.cuda()

        optimizer.zero_grad()

        outputs = fsdp_model(inputs)
        loss = criterion(outputs, targets)

        loss.backward()
        optimizer.step()

# Save the trained model
torch.save(fsdp_model.module.state_dict(), 'fsdp_model.pth')


AssertionError: Torch not compiled with CUDA enabled

There three are the template codes for 
- Single GPU
- DDP
- FSDP

These templates can be used with GPT model made in task 1 with gpu.