In [1]:
import torch
from tqdm import tqdm
import time

from adan_pytorch import Adan

from utils import (resnetModel, cifar10Dataloader, train, accuracy_check)


In [2]:
# Speed-oriented backend switches, grouped by backend. The assignments are
# independent of each other, so their order does not matter.

# cuDNN: keep it enabled, allow TF32 convolutions, autotune conv algorithms.
torch.backends.cudnn.enabled = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cudnn.benchmark = True
# Restrict cuDNN to deterministic algorithms.
# NOTE(review): benchmark mode is also on; autotuning may select different
# algorithms between runs -- confirm whether run-to-run reproducibility is a goal.
torch.backends.cudnn.deterministic = True

# cuBLAS matmuls: allow TF32 and reduced-precision FP16 reductions.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True

In [3]:
# Build the network via the project helper: 10 output classes, no pretrained
# weights, placed on the CUDA device.
model = resnetModel(
    output_size=10,
    Pretrained=False,
    Device="cuda",
)

Using cache found in /home/venom/.cache/torch/hub/pytorch_vision_v0.6.0


In [4]:
# Settings shared by both CIFAR-10 loaders.
loader_kwargs = dict(num_workers=4, batch_size=32, data_dir="./data", pin_memory=True)

# Training split, shuffled.
trainloader = cifar10Dataloader(train=True, shuffle=True, **loader_kwargs)

# Test split, NOT shuffled. The progress bars report 312 test batches of 32
# (9984 of the 10000 images), so a partial batch appears to be dropped; with
# shuffling the dropped images would change on every evaluation and the
# reported accuracy would not be comparable between calls.
testloader = cifar10Dataloader(train=False, shuffle=False, **loader_kwargs)

Files already downloaded and verified
Files already downloaded and verified


Here I use the Adan optimizer instead of Adam, because the Adam optimizer slows down when it is set to capturable mode. Adan also gives promising results, and it provides a decent performance boost when used with CUDA graphs.

In [5]:
# Adan hyperparameters, named so they are easy to locate and tune.
LEARNING_RATE = 0.005
ADAN_BETAS = (0.02, 0.08, 0.01)
WEIGHT_DECAY = 0.02

# Optimizer over all model parameters, plus the classification loss.
optimizer = Adan(model.parameters(), lr=LEARNING_RATE, betas=ADAN_BETAS, weight_decay=WEIGHT_DECAY)
criterion = torch.nn.CrossEntropyLoss()

In [6]:
# Baseline: evaluate on the test loader before any training has been run.
accuracy_check(model, testloader)

Test: 100%|██████████| 312/312 [00:06<00:00, 51.36batch/s, Accuracy=10.1]


In [7]:
# Reference run: five epochs through the project's regular (eager) training
# helper. The return value is discarded.
_ = train(
    model,
    epochs=5,
    TrainLoader=trainloader,
    TestLoader=testloader,
    optimizer=optimizer,
    criterion=criterion,
    device="cuda",
)

Epoch 1: 100%|██████████| 1562/1562 [01:58<00:00, 13.19Batch/s, loss=1.93]
Epoch 2: 100%|██████████| 1562/1562 [01:58<00:00, 13.23Batch/s, loss=1.41]
Epoch 3: 100%|██████████| 1562/1562 [01:58<00:00, 13.22Batch/s, loss=1.75] 
Epoch 4: 100%|██████████| 1562/1562 [01:57<00:00, 13.25Batch/s, loss=1.18] 
Epoch 5: 100%|██████████| 1562/1562 [01:57<00:00, 13.24Batch/s, loss=1.2]  
Test: 100%|██████████| 312/312 [00:04<00:00, 72.72batch/s, Accuracy=66.9]


Usually, the training loop looks like this:

```python
for epoch in range(1,6):
    model.train()
    with tqdm(trainloader, unit="batch") as tepoch:
        for data, target in tepoch:
            tepoch.set_description(f"Epoch {epoch}")
            data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            tepoch.set_postfix(loss=loss.item())
```

In [8]:
# Training with CUDA graphs follows these steps:

# 1. Warmup

# 2. Record

# 3. Replay

In [9]:
# Take one batch from the training loader and move it to the GPU. These two
# tensors are the fixed buffers that the later cells feed to the model
# (`static_input`) and to the loss (`static_output`, i.e. the labels).
first_batch = next(iter(trainloader))
static_input = first_batch[0].cuda()
static_output = first_batch[1].cuda()

In [10]:
# Warmup: run a few eager training steps on a side stream before capture.

# A captured graph replays the kernels of whichever train/eval mode was active
# when it was recorded. Earlier cells ran evaluation helpers whose effect on
# the mode is not visible here, so select training mode explicitly.
model.train()

start = time.perf_counter()

# Side stream for the warmup; it first waits for work already queued on the
# current stream.
trainStream = torch.cuda.Stream()
trainStream.wait_stream(torch.cuda.current_stream())

with torch.cuda.stream(trainStream):
    for _ in range(10):
        optimizer.zero_grad(set_to_none=True)
        output = model(static_input)
        loss = criterion(output, static_output)
        loss.backward()
        optimizer.step()
# Make the current stream wait for the warmup work (this orders the streams
# but does not block the host).
torch.cuda.current_stream().wait_stream(trainStream)

# GPU work is asynchronous: block until it has finished so the timer measures
# completed work rather than just kernel-launch time.
torch.cuda.synchronize()

print("Warmup Time: ", time.perf_counter() - start)

Warmup Time:  0.6811985969543457


In [11]:
# Record: capture one full training step (forward, loss, backward, optimizer
# step) into a CUDA graph that can be replayed later.

# The graph bakes in the train/eval mode that is active during capture, so
# make sure the model is in training mode.
model.train()

trainGraph = torch.cuda.CUDAGraph()
# Drop existing gradients before capture so that backward() inside the capture
# creates the .grad tensors from the graph's private memory pool.
optimizer.zero_grad(set_to_none=True)

with torch.cuda.graph(trainGraph):
    output = model(static_input)
    loss = criterion(output, static_output)
    loss.backward()
    optimizer.step()
    # No zero_grad() here: it is a Python-side call that is not replayed, and
    # each replay re-runs the captured backward kernels, which write fresh
    # gradients. Leaving .grad set also keeps the gradients inspectable.

In [12]:

# Replay: train for five epochs by copying each batch into the static tensors
# and replaying the captured graph.
# NOTE: every batch must have the same shape as the static tensors.
start = time.perf_counter()

for epoch in range(1, 6):
    with tqdm(trainloader, unit="batch") as tepoch:
        # The description is constant within an epoch; set it once.
        tepoch.set_description("Epoch {}".format(epoch))
        for data, target in tepoch:
            # Copy straight from the host batch into the static GPU tensors;
            # this avoids allocating a temporary device tensor per batch.
            static_input.copy_(data)
            static_output.copy_(target)
            trainGraph.replay()
            # `loss` is the tensor created during capture; each replay
            # refreshes its value. .item() waits for the GPU to finish.
            tepoch.set_postfix(Loss=loss.item())

# Make sure all GPU work has completed before reading the timer.
torch.cuda.synchronize()
print("Graph Time: ", time.perf_counter() - start)

Epoch 1: 100%|██████████| 1562/1562 [01:36<00:00, 16.20batch/s, Loss=0.804]
Epoch 2: 100%|██████████| 1562/1562 [01:36<00:00, 16.23batch/s, Loss=0.655]
Epoch 3: 100%|██████████| 1562/1562 [01:36<00:00, 16.23batch/s, Loss=0.652]
Epoch 4: 100%|██████████| 1562/1562 [01:36<00:00, 16.25batch/s, Loss=0.703]
Epoch 5: 100%|██████████| 1562/1562 [01:36<00:00, 16.26batch/s, Loss=0.667]

Graph Time:  481.0689322948456





In [13]:
# Evaluate on the test loader after the five graph-replayed epochs.
accuracy_check(model, testloader)

Test: 100%|██████████| 312/312 [00:04<00:00, 70.83batch/s, Accuracy=72.3]


In [14]:
# without tqdm: five more epochs of graph replay with no progress bar and no
# per-batch loss readback.

start = time.perf_counter()
for _ in range(6, 11):
    for data, target in trainloader:
        # Copy straight from the host batch into the static GPU tensors;
        # this avoids allocating a temporary device tensor per batch.
        static_input.copy_(data)
        static_output.copy_(target)
        trainGraph.replay()

# replay() only enqueues work; wait for the GPU so the last replay is
# included in the measured time.
torch.cuda.synchronize()
print("Graph Time (no tqdm): ", time.perf_counter() - start)

Graph Time (no tqdm):  468.69113993644714


In [15]:
# Evaluate on the test loader after the additional replayed epochs (6-10).
accuracy_check(model, testloader)

Test: 100%|██████████| 312/312 [00:04<00:00, 71.33batch/s, Accuracy=75.6]
