In [1]:
import torch
from tqdm import tqdm
from torchvision import datasets, transforms

In [2]:
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.enabled = True
torch.backends.cudnn.deterministic = True


In [3]:
# Download resnet 100 and put in gpu
model = torch.hub.load('pytorch/vision:v0.6.0', 'resnet101', pretrained=False)

Using cache found in /home/venom/.cache/torch/hub/pytorch_vision_v0.6.0


In [4]:
# Download cifar 10 dataset
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

trainset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
testset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)


Files already downloaded and verified
Files already downloaded and verified


In [5]:
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=64, shuffle=True, num_workers=1)

testloader = torch.utils.data.DataLoader(
    testset, batch_size=64, shuffle=False, num_workers=1)

In [6]:
# Replace last layer with 10 nodes
model.fc = torch.nn.Linear(2048, 10)

In [7]:
# Move model to GPU
model = model.cuda()

In [8]:
scaler = torch.cuda.amp.GradScaler()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.CrossEntropyLoss()

In [9]:
# Train
for epoch in range(10):
    with tqdm(trainloader, unit="batch") as tepoch:
        for data, target in tepoch:
            tepoch.set_description(f"Epoch {epoch}")
            data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            with torch.cuda.amp.autocast():
                output = model(data)
                loss = criterion(output, target)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            tepoch.set_postfix(loss=loss.item())

    # Test
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        with tqdm(testloader, unit="batch") as t2epoch:
            for data, target in t2epoch:
                t2epoch.set_description(f"Epoch {epoch}")
                data, target = data.cuda(), target.cuda()
                output = model(data)
                _, predicted = torch.max(output.data, 1)
                total += target.size(0)
                correct += (predicted == target).sum().item()
                t2epoch.set_postfix(Accuracy=(100 * correct / total))

    print(" ")


Epoch 0: 100%|██████████| 782/782 [01:25<00:00,  9.13batch/s, loss=2.09]
Epoch 0: 100%|██████████| 157/157 [00:03<00:00, 43.63batch/s, Accuracy=38.6]


 


Epoch 1: 100%|██████████| 782/782 [01:19<00:00,  9.88batch/s, loss=1.65]
Epoch 1: 100%|██████████| 157/157 [00:03<00:00, 51.63batch/s, Accuracy=48.3]


 


Epoch 2: 100%|██████████| 782/782 [00:49<00:00, 15.75batch/s, loss=1.07] 
Epoch 2: 100%|██████████| 157/157 [00:02<00:00, 53.59batch/s, Accuracy=54.3]


 


Epoch 3: 100%|██████████| 782/782 [00:45<00:00, 17.10batch/s, loss=1.49] 
Epoch 3: 100%|██████████| 157/157 [00:02<00:00, 53.94batch/s, Accuracy=61.4]


 


Epoch 4: 100%|██████████| 782/782 [00:46<00:00, 16.98batch/s, loss=0.839]
Epoch 4: 100%|██████████| 157/157 [00:02<00:00, 53.23batch/s, Accuracy=61.4]


 


Epoch 5: 100%|██████████| 782/782 [00:46<00:00, 16.96batch/s, loss=1.02] 
Epoch 5: 100%|██████████| 157/157 [00:02<00:00, 53.26batch/s, Accuracy=64.2]


 


Epoch 6: 100%|██████████| 782/782 [00:45<00:00, 17.28batch/s, loss=1.1]  
Epoch 6: 100%|██████████| 157/157 [00:02<00:00, 53.20batch/s, Accuracy=66.5]


 


Epoch 7: 100%|██████████| 782/782 [00:46<00:00, 16.71batch/s, loss=0.628]
Epoch 7: 100%|██████████| 157/157 [00:02<00:00, 53.17batch/s, Accuracy=69]  


 


Epoch 8: 100%|██████████| 782/782 [00:46<00:00, 16.89batch/s, loss=0.546]
Epoch 8: 100%|██████████| 157/157 [00:02<00:00, 53.26batch/s, Accuracy=70.2]


 


Epoch 9: 100%|██████████| 782/782 [00:46<00:00, 16.85batch/s, loss=0.445]
Epoch 9: 100%|██████████| 157/157 [00:02<00:00, 52.98batch/s, Accuracy=70.4]

 





In [10]:
## ADAN

# recreate the model

model = torch.hub.load('pytorch/vision:v0.6.0', 'resnet101', pretrained=False)
model.fc = torch.nn.Linear(2048, 10)
model = model.cuda()

Using cache found in /home/venom/.cache/torch/hub/pytorch_vision_v0.6.0


In [11]:
from adan_pytorch import Adan

optim = Adan(
    model.parameters(),
    # learning rate (can be much higher than Adam, up to 5-10x)
    lr=0.001,
    # beta 1-2-3 as described in paper - author says most sensitive to beta3 tuning
    betas=(0.02, 0.08, 0.01),
    weight_decay=0.02         # weight decay 0.02 is optimal per author
)


In [12]:
# Train
for epoch in range(10):
    with tqdm(trainloader, unit="batch") as tepoch:
        for data, target in tepoch:
            tepoch.set_description(f"Epoch {epoch}")
            data, target = data.cuda(), target.cuda()
            optim.zero_grad()
            with torch.cuda.amp.autocast():
                output = model(data)
                loss = criterion(output, target)
            scaler.scale(loss).backward()
            scaler.step(optim)
            scaler.update()
            tepoch.set_postfix(loss=loss.item())

    # Test
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        with tqdm(testloader, unit="batch") as t2epoch:
            for data, target in t2epoch:
                t2epoch.set_description(f"Epoch {epoch}")
                data, target = data.cuda(), target.cuda()
                output = model(data)
                _, predicted = torch.max(output.data, 1)
                total += target.size(0)
                correct += (predicted == target).sum().item()
                t2epoch.set_postfix(Accuracy=(100 * correct / total))

    print(" ")


Epoch 0: 100%|██████████| 782/782 [01:10<00:00, 11.03batch/s, loss=1.05]
Epoch 0: 100%|██████████| 157/157 [00:02<00:00, 54.34batch/s, Accuracy=49.6]


 


Epoch 1: 100%|██████████| 782/782 [01:07<00:00, 11.51batch/s, loss=0.945]
Epoch 1: 100%|██████████| 157/157 [00:02<00:00, 54.15batch/s, Accuracy=55.8]


 


Epoch 2: 100%|██████████| 782/782 [01:08<00:00, 11.39batch/s, loss=1.27] 
Epoch 2: 100%|██████████| 157/157 [00:02<00:00, 52.89batch/s, Accuracy=62]  


 


Epoch 3: 100%|██████████| 782/782 [01:08<00:00, 11.37batch/s, loss=0.355]
Epoch 3: 100%|██████████| 157/157 [00:02<00:00, 52.88batch/s, Accuracy=68.1]


 


Epoch 4: 100%|██████████| 782/782 [01:09<00:00, 11.18batch/s, loss=1.25] 
Epoch 4: 100%|██████████| 157/157 [00:02<00:00, 53.21batch/s, Accuracy=69.2]


 


Epoch 5: 100%|██████████| 782/782 [01:09<00:00, 11.26batch/s, loss=0.594]
Epoch 5: 100%|██████████| 157/157 [00:02<00:00, 52.70batch/s, Accuracy=70.5]


 


Epoch 6: 100%|██████████| 782/782 [01:10<00:00, 11.06batch/s, loss=0.32] 
Epoch 6: 100%|██████████| 157/157 [00:02<00:00, 52.91batch/s, Accuracy=72.2]


 


Epoch 7: 100%|██████████| 782/782 [01:09<00:00, 11.25batch/s, loss=0.399]
Epoch 7: 100%|██████████| 157/157 [00:02<00:00, 52.64batch/s, Accuracy=72.7]


 


Epoch 8: 100%|██████████| 782/782 [01:09<00:00, 11.26batch/s, loss=0.556]
Epoch 8: 100%|██████████| 157/157 [00:02<00:00, 52.69batch/s, Accuracy=73.1]


 


Epoch 9: 100%|██████████| 782/782 [01:09<00:00, 11.17batch/s, loss=0.165] 
Epoch 9: 100%|██████████| 157/157 [00:03<00:00, 52.32batch/s, Accuracy=73.4]

 





In [None]:
# at epoch 6, it is already better than adam