In [None]:
import torch
import time

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)


def time_matmul(size, device, iters=10, warmup=3):
    """Time ``iters`` multiplications of two random ``size`` x ``size`` matrices.

    Args:
        size: Side length of the square operands.
        device: Device the operands are allocated on (e.g. ``'cuda'`` or ``'cpu'``).
        iters: Number of timed ``torch.mm`` calls.
        warmup: Number of untimed ``torch.mm`` calls issued before the clock starts.

    Returns:
        Elapsed seconds (float) for the ``iters`` timed multiplications.
    """
    a = torch.randn(size, size, device=device)
    b = torch.randn(size, size, device=device)
    # Only synchronise when the operands really live on a CUDA device;
    # calling torch.cuda.synchronize() on a CPU-only setup raises.
    on_cuda = a.is_cuda

    # Untimed warm-up so one-time start-up work is not billed to the
    # measurement (in the recorded run the first size, 512x512, came out
    # slower than 1024x1024 -- presumably for this reason).
    for _ in range(warmup):
        torch.mm(a, b)

    # CUDA kernels are launched asynchronously: drain the queue before
    # starting and before stopping the clock.
    if on_cuda:
        torch.cuda.synchronize()
    start = time.perf_counter()  # monotonic, high-resolution interval timer
    for _ in range(iters):
        torch.mm(a, b)
    if on_cuda:
        torch.cuda.synchronize()
    return time.perf_counter() - start


# Matrix multiplication
sizes = [512, 1024, 2048, 4096]

# __name__ is "__main__" inside a notebook kernel, so the cell still runs the
# benchmark; the guard only keeps it from running if this code is imported.
if __name__ == "__main__":
    for size in sizes:
        elapsed = time_matmul(size, device)
        print(f"Size: {size}x{size}, Time: {elapsed:.4f} sec")


Size: 512x512, Time: 0.0349 sec
Size: 1024x1024, Time: 0.0039 sec
Size: 2048x2048, Time: 0.0270 sec
Size: 4096x4096, Time: 0.2317 sec


In [None]:
import torch
import torchvision.models as models
import torch.nn as nn
import time

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


def train_step(model, data, target, criterion, optimizer):
    """Run one optimisation step (zero grads, forward, loss, backward, update).

    Returns:
        The loss tensor computed for this step.
    """
    optimizer.zero_grad()
    output = model(data)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()
    return loss


def benchmark_training(model, data, target, criterion, optimizer, warmup=5, iters=20):
    """Time ``iters`` training steps after ``warmup`` untimed steps.

    Args:
        model: Module called as ``model(data)``.
        data: Input batch passed to the model.
        target: Labels passed to ``criterion`` together with the model output.
        criterion: Loss callable, invoked as ``criterion(output, target)``.
        optimizer: Optimizer whose ``zero_grad``/``step`` are called each step.
        warmup: Number of untimed steps run before the clock starts.
        iters: Number of timed steps.

    Returns:
        Elapsed seconds (float) for the ``iters`` timed steps.
    """
    # Only synchronise when the batch really lives on a CUDA device;
    # calling torch.cuda.synchronize() on a CPU-only setup raises.
    on_cuda = data.is_cuda

    # Warmup
    for _ in range(warmup):
        train_step(model, data, target, criterion, optimizer)

    # Benchmark. CUDA work is queued asynchronously, so drain the queue
    # before starting and before stopping the clock.
    if on_cuda:
        torch.cuda.synchronize()
    start = time.perf_counter()  # monotonic, high-resolution interval timer
    for _ in range(iters):
        train_step(model, data, target, criterion, optimizer)
    if on_cuda:
        torch.cuda.synchronize()
    return time.perf_counter() - start


# __name__ is "__main__" inside a notebook kernel, so the cell still runs the
# benchmark; the guard only keeps it from running if this code is imported.
if __name__ == "__main__":
    N_ITERS = 20  # single source of truth for the loop count and the message

    model = models.resnet50().to(device)
    model.train()

    # Synthetic batch created directly on the target device.
    data = torch.randn(32, 3, 224, 224, device=device)
    target = torch.randint(0, 1000, (32,), dtype=torch.long, device=device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

    elapsed = benchmark_training(
        model, data, target, criterion, optimizer, warmup=5, iters=N_ITERS
    )
    print(f"Time for {N_ITERS} iterations: {elapsed:.2f} seconds")


Time for 20 iterations: 4.58 seconds
