# Single GPU Training

In [None]:
import torch
import torchvision
import torchvision.transforms as transforms
from torchvision.models import resnet18

# Train ResNet-18 on CIFAR-10 on one device (the first GPU when available,
# otherwise the CPU).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = resnet18().to(device)

# Keep the pipeline under its own name: rebinding `transforms` would shadow the
# torchvision.transforms module for anything that runs after this cell.
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
)
train_dataset = torchvision.datasets.CIFAR10(
    root="./data", train=True, download=True, transform=transform
)
train_dataloader = torch.utils.data.DataLoader(
    train_dataset, batch_size=128, shuffle=True, num_workers=2
)

# Training configuration used by the loop below.
num_epochs = 10
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

model.train()
for epoch in range(num_epochs):
    running_loss = 0.0  # sum of per-batch mean losses for this epoch
    correct = 0  # number of correctly classified samples for this epoch
    for inputs, labels in train_dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        predicted = outputs.argmax(dim=1)
        correct += predicted.eq(labels).sum().item()

    # Report the metrics that were accumulated over the epoch.
    epoch_loss = running_loss / len(train_dataloader)
    epoch_acc = correct / len(train_dataset)
    print(f"Epoch {epoch + 1}/{num_epochs}  loss={epoch_loss:.4f}  acc={epoch_acc:.4f}")

# Data Parallel

In [None]:
import torch
import torchvision
import torchvision.transforms as transforms
from torchvision.models import resnet18

# Train ResNet-18 on CIFAR-10, replicating the model over every visible GPU
# with nn.DataParallel when more than one is present.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = resnet18()
if torch.cuda.device_count() > 1:
    # DataParallel scatters each input batch over the GPUs and gathers the
    # outputs back on the first one.
    model = torch.nn.DataParallel(model)
model.to(device)

# Keep the pipeline under its own name: rebinding `transforms` would shadow the
# torchvision.transforms module for anything that runs after this cell.
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
)
train_dataset = torchvision.datasets.CIFAR10(
    root="./data", train=True, download=True, transform=transform
)
train_dataloader = torch.utils.data.DataLoader(
    train_dataset, batch_size=128, shuffle=True, num_workers=2
)

# Training configuration used by the loop below. The optimizer is created here
# so that it tracks the parameters of the model built in this cell.
num_epochs = 10
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

model.train()
for epoch in range(num_epochs):
    running_loss = 0.0  # sum of per-batch mean losses for this epoch
    correct = 0  # number of correctly classified samples for this epoch
    for inputs, labels in train_dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        predicted = outputs.argmax(dim=1)
        correct += predicted.eq(labels).sum().item()

    # Report the metrics that were accumulated over the epoch.
    epoch_loss = running_loss / len(train_dataloader)
    epoch_acc = correct / len(train_dataset)
    print(f"Epoch {epoch + 1}/{num_epochs}  loss={epoch_loss:.4f}  acc={epoch_acc:.4f}")

# Distributed Data Parallel

In [None]:
import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torchvision
import torchvision.transforms as transforms
from torch.nn.parallel import DistributedDataParallel as DDP
from torchvision.models import resnet18

# Number of passes every worker makes over its shard of the training set.
num_epochs = 10


def dist_training_loop(rank, world_size, train_dataloader, model, criterion, optimizer):
    """Train `model` with DistributedDataParallel inside one spawned worker.

    Args:
        rank: Index of this worker; also used as its CUDA device index.
        world_size: Total number of workers taking part in training.
        train_dataloader: DataLoader describing the training data. Its dataset,
            batch size and worker count are reused to build a per-rank loader
            backed by a DistributedSampler.
        model: Module to train; it is moved to device `rank` and wrapped in DDP.
        criterion: Callable computing the loss from `(outputs, labels)`.
        optimizer: Optimizer factory (e.g. `torch.optim.SGD`), called as
            `optimizer(params, lr=0.001)` on the DDP-wrapped parameters.
    """
    # The default env:// rendezvous needs an address and a port; keep any
    # values the launcher already exported.
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group(backend="gloo", rank=rank, world_size=world_size)
    try:
        torch.cuda.set_device(rank)
        model.to(rank)
        ddp_model = DDP(model, device_ids=[rank])
        opt = optimizer(ddp_model.parameters(), lr=0.001)

        # Shard the dataset so that the ranks train on disjoint subsets rather
        # than each repeating the full epoch.
        sampler = torch.utils.data.DistributedSampler(
            train_dataloader.dataset, num_replicas=world_size, rank=rank
        )
        loader = torch.utils.data.DataLoader(
            train_dataloader.dataset,
            batch_size=train_dataloader.batch_size,
            sampler=sampler,
            num_workers=train_dataloader.num_workers,
        )

        ddp_model.train()
        for epoch in range(num_epochs):
            # Re-seed the sampler so each epoch uses a different shuffle.
            sampler.set_epoch(epoch)
            running_loss = 0.0
            correct = 0
            seen = 0
            for inputs, labels in loader:
                # Batches must live on this worker's own device.
                inputs, labels = inputs.to(rank), labels.to(rank)

                opt.zero_grad()
                # Go through the DDP wrapper so gradients are synchronised.
                outputs = ddp_model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                opt.step()

                running_loss += loss.item()
                predicted = outputs.argmax(dim=1)
                correct += predicted.eq(labels).sum().item()
                seen += labels.size(0)

            # Metrics are local to this rank's shard.
            print(
                f"[rank {rank}] epoch {epoch + 1}/{num_epochs}  "
                f"loss={running_loss / max(len(loader), 1):.4f}  "
                f"acc={correct / max(seen, 1):.4f}"
            )
    finally:
        # Always tear the process group down, even if training raised.
        dist.destroy_process_group()


if __name__ == "__main__":
    # NOTE(review): mp.spawn pickles the target function, so this likely has to
    # be run as a script rather than from a notebook cell - confirm.
    world_size = 2

    transform = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
    )
    train_dataset = torchvision.datasets.CIFAR10(
        root="./data", train=True, download=True, transform=transform
    )
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=128, shuffle=True, num_workers=2
    )
    model = resnet18()
    criterion = torch.nn.CrossEntropyLoss()

    # Pass the optimizer class, not an instance: every worker builds its own
    # optimizer over its DDP-wrapped parameters.
    mp.spawn(
        dist_training_loop,
        args=(world_size, train_dataloader, model, criterion, torch.optim.SGD),
        nprocs=world_size,
        join=True,
    )

# Model Parallel

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import models, datasets, transforms

# Hyperparameters shared by the model definition and the training loop below
num_classes = 10  # Width of the final classifier layer (number of output logits)
split_size = 32  # Rows per chunk when the model splits an input batch along dim 0
N_EPOCHS = 10  # Number of passes the training loop makes over the dataloader


# AlexNet split across two devices and fed micro-batch by micro-batch
class TwoGPUAlexNet(nn.Module):
    """AlexNet whose convolutional half and classifier live on different devices.

    The feature extractor and average pool (taken from a pretrained AlexNet)
    sit on `dev0`; a freshly initialised classifier sits on `dev1`. Each input
    batch is cut into chunks of `split_size` rows, and the classifier call for
    one chunk is issued before the feature extractor call for the next chunk.

    Args:
        num_classes: Number of output logits of the classifier.
        split_size: Number of rows per chunk when splitting the batch.
        dev0: Device holding `features` and `avgpool`.
        dev1: Device holding `classifier`; outputs are returned on it.
    """

    def __init__(
        self,
        num_classes=num_classes,
        split_size=split_size,
        dev0="cuda:0",
        dev1="cuda:1",
    ):
        super().__init__()
        self.split_size = split_size
        self.dev0 = dev0
        self.dev1 = dev1

        # NOTE(review): newer torchvision releases deprecate `pretrained=` in
        # favour of `weights=`; kept as-is for compatibility - confirm the
        # installed version before switching.
        alexnet = models.alexnet(pretrained=True)

        # Front half (pretrained) on the first device.
        self.features = alexnet.features.to(dev0)
        self.avgpool = alexnet.avgpool.to(dev0)
        # Back half (new, untrained head sized for `num_classes`) on the second.
        self.classifier = nn.Sequential(
            nn.Dropout(),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes),
        ).to(dev1)

    def _front(self, chunk):
        """Run the front half on `dev0` and return flattened features on `dev1`."""
        # Moving the chunk here makes every chunk - including the first - land
        # on `dev0` regardless of where the caller left the batch.
        pooled = self.avgpool(self.features(chunk.to(self.dev0)))
        return pooled.view(pooled.size(0), -1).to(self.dev1)

    def forward(self, x):
        """Return classifier logits for `x`, concatenated along dim 0 on `dev1`."""
        # Cut the batch into micro-batches of `split_size` rows.
        splits = iter(x.split(self.split_size, dim=0))
        s_prev = self._front(next(splits))

        ret = []
        for s_next in splits:
            # Classify the previous chunk, then start the front half for the
            # next one.
            ret.append(self.classifier(s_prev))
            s_prev = self._front(s_next)

        # The last chunk has no successor; classify it on its own.
        ret.append(self.classifier(s_prev))

        return torch.cat(ret)


# Build the two-GPU model together with its loss and its optimizer
model = TwoGPUAlexNet(num_classes=num_classes, split_size=split_size)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)

# Synthetic images, resized to 224x224 and converted to tensors, stand in for
# a real dataset in this illustration
transform = transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor()])
train_dataset = datasets.FakeData(transform=transform)
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Run N_EPOCHS passes over the loader, reporting after each one
for epoch in range(N_EPOCHS):
    for images, targets in train_dataloader:
        # The model consumes its input on GPU 0 and produces logits on GPU 1,
        # so the targets have to sit on GPU 1 for the loss.
        images = images.to("cuda:0")
        targets = targets.to("cuda:1")

        optimizer.zero_grad()
        batch_loss = loss_fn(model(images), targets)
        batch_loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{N_EPOCHS} completed")

# DDP & Model Parallel

In [None]:
class Simple2GPUModel(nn.Module):
    """Two linear layers placed on two devices, with a ReLU in between.

    `net1` (10 -> 10) is kept on `dev0` and `net2` (10 -> 5) on `dev1`; the
    forward pass moves the activations from one device to the other.
    """

    def __init__(self, dev0, dev1):
        super().__init__()
        self.dev0, self.dev1 = dev0, dev1
        self.net1 = nn.Linear(10, 10).to(dev0)
        self.relu = nn.ReLU()
        self.net2 = nn.Linear(10, 5).to(dev1)

    def forward(self, x):
        """Return `net2(relu(net1(x)))`, computed across the two devices."""
        # First stage on dev0.
        hidden = self.relu(self.net1(x.to(self.dev0)))
        # Hand the activations over to dev1 for the second stage.
        return self.net2(hidden.to(self.dev1))


def model_parallel_training(rank, world_size):
    """Train a two-device `Simple2GPUModel` under DDP inside one worker process.

    Each worker owns two consecutive devices: `rank * 2` and `rank * 2 + 1`.

    Args:
        rank: Index of this worker within the process group.
        world_size: Total number of workers in the process group.

    NOTE(review): reads the module-level names `num_epochs` and
    `train_dataloader`; both must be defined before this runs, and the loader
    presumably has to yield 10-feature inputs with class-index labels to match
    the model - confirm against the caller.
    """
    # Imported locally so this cell does not depend on another cell's imports.
    import os

    import torch.distributed as dist
    from torch.nn.parallel import DistributedDataParallel as DDP

    dev0 = rank * 2
    dev1 = rank * 2 + 1

    # The default env:// rendezvous needs an address and a port; keep any
    # values the launcher already exported.
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
    try:
        # A module that spans several devices must be wrapped without
        # `device_ids`; the module itself decides where tensors go.
        ddp_model = DDP(Simple2GPUModel(dev0, dev1))
        loss_fn = nn.CrossEntropyLoss()
        optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

        for _ in range(num_epochs):
            for inputs, labels in train_dataloader:
                # Inputs enter on dev0; outputs come out on dev1, so the labels
                # have to be on dev1 for the loss.
                inputs, labels = inputs.to(dev0), labels.to(dev1)

                optimizer.zero_grad()
                outputs = ddp_model(inputs)
                loss = loss_fn(outputs, labels)
                loss.backward()
                optimizer.step()
    finally:
        # Always tear the process group down, even if training raised.
        dist.destroy_process_group()