# Эффективный DL
## Семинар 1.
### Введение. Как работают библиотеки?

In [1]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
import gc

# Как измерить эффективность

In [2]:
device = "cuda"

In [3]:
training_data = datasets.FashionMNIST(
    root="mnist",
    train=True,
    download=True,
    transform=ToTensor(),
)

In [4]:
loader = DataLoader(training_data, batch_size=64)

In [5]:
!nvidia-smi

Mon May  6 17:13:17 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

## Число параметров

In [6]:
class Net(nn.Module):
    """Small convolutional classifier that maps 1-channel images to 10 logits.

    Architecture: two unpadded 3x3 convolutions with ReLU, a 2x2 max-pool,
    dropout, then two fully connected layers with ReLU and dropout between
    them. The first linear layer expects 9216 flattened features.
    """

    def __init__(self):
        super().__init__()
        # Layers are registered in the same order as they are applied, so
        # iteration over parameters() starts with the first convolution.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1)
        self.dropout1 = nn.Dropout(p=0.25)
        self.dropout2 = nn.Dropout(p=0.5)
        # 9216 = 64 channels * 12 * 12, i.e. what a 1x28x28 input becomes
        # after two 3x3 convolutions (28 -> 26 -> 24) and a 2x2 pool (-> 12).
        self.fc1 = nn.Linear(in_features=9216, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=10)

    def forward(self, x):
        """Return the raw (un-normalized) class scores for a batch ``x``."""
        features = F.relu(self.conv1(x))
        features = F.relu(self.conv2(features))
        features = self.dropout1(F.max_pool2d(features, 2))
        # Keep the batch dimension, collapse everything else for the MLP head.
        hidden = F.relu(self.fc1(torch.flatten(features, 1)))
        return self.fc2(self.dropout2(hidden))

In [7]:
net = Net()

In [8]:
net.parameters()

<generator object Module.parameters at 0x79f0e938ccf0>

In [9]:
param1 = next(net.parameters())

In [10]:
param1.numel()

288

In [11]:
total_params = sum(p.numel() for p in net.parameters())
print(f"total params = {total_params:,}")

total params = 1,199,882


## Использование памяти

In [12]:
def print_memory():
    """Print the CUDA memory currently allocated and reserved on ``device``.

    Uses the module-level ``device`` and prints the values returned by
    ``torch.cuda.memory_allocated`` and ``torch.cuda.memory_reserved`` with
    thousands separators, one counter per line.
    """
    allocated = torch.cuda.memory_allocated(device=device)
    reserved = torch.cuda.memory_reserved(device=device)
    print(f"allocated: {allocated:,}")
    print(f" reserved: {reserved:,}")

In [15]:
print_memory()

allocated: 0
 reserved: 0


In [17]:
t = torch.zeros(4096, 4096, device=device)

In [19]:
print_memory()

allocated: 67,108,864
 reserved: 67,108,864


In [20]:
del t
gc.collect()

338

In [21]:
print_memory()

allocated: 0
 reserved: 67,108,864


In [23]:
t = torch.zeros(4096, 4096, device=device)

In [24]:
print_memory()

allocated: 67,108,864
 reserved: 67,108,864


In [25]:
del t
gc.collect()

496

In [26]:
print_memory()

allocated: 0
 reserved: 67,108,864


In [27]:
t = torch.zeros(2096, 4096, device=device)

In [28]:
print_memory()

allocated: 34,340,864
 reserved: 67,108,864


In [29]:
del t
gc.collect()

0

In [30]:
print_memory()

allocated: 0
 reserved: 67,108,864


In [31]:
t = torch.zeros(5096, 4096, device=device)

In [32]:
print_memory()

allocated: 83,886,080
 reserved: 150,994,944


In [33]:
83886080 + 67108864

150994944

In [34]:
del t
gc.collect()

0

In [35]:
print_memory()

allocated: 0
 reserved: 150,994,944


In [36]:
torch.cuda.empty_cache()

In [37]:
print_memory()

allocated: 0
 reserved: 0


In [13]:
def print_max_memory():
    """Print the peak allocated and peak reserved CUDA memory on ``device``.

    Uses the module-level ``device`` and prints the values returned by
    ``torch.cuda.max_memory_allocated`` and ``torch.cuda.max_memory_reserved``
    with thousands separators, one counter per line.
    """
    peak_allocated = torch.cuda.max_memory_allocated(device=device)
    peak_reserved = torch.cuda.max_memory_reserved(device=device)
    print(f"allocated: {peak_allocated:,}")
    print(f" reserved: {peak_reserved:,}")

In [39]:
print_max_memory()

allocated: 83,886,080
 reserved: 150,994,944


In [40]:
torch.cuda.reset_peak_memory_stats(device=device)

In [41]:
print_max_memory()

allocated: 0
 reserved: 0


### На практике

In [14]:
print_memory()

allocated: 0
 reserved: 0


In [15]:
print(f"{total_params * 4:,}")  # expected size of the weights: 4 bytes per float parameter

4,799,528


In [16]:
net = Net().to(device)
print_memory()

allocated: 4,801,024
 reserved: 23,068,672


In [17]:
opt = torch.optim.Adam(net.parameters())
print_memory()

allocated: 4,801,024
 reserved: 23,068,672


In [18]:
loss_fn = nn.CrossEntropyLoss()
print_memory()

allocated: 4,801,024
 reserved: 23,068,672


In [19]:
data, label = next(iter(loader))
data = data.to(device)
label = label.to(device)
print_memory()

allocated: 5,002,240
 reserved: 23,068,672


In [20]:
with torch.no_grad():
    pred = net(data)

print_memory()

allocated: 13,524,480
 reserved: 44,040,192


In [21]:
pred = net(data)
print_memory()

allocated: 36,898,816
 reserved: 65,011,712


In [22]:
loss = loss_fn(pred, label)
print_memory()

allocated: 36,899,840
 reserved: 65,011,712


In [23]:
opt.zero_grad()
print_memory()

allocated: 36,899,840
 reserved: 65,011,712


In [24]:
loss.backward()
print_memory()

allocated: 26,845,696
 reserved: 140,509,184


In [25]:
opt.step()
print_memory()

allocated: 37,103,104
 reserved: 140,509,184


In [26]:
!nvidia-smi

Mon May  6 17:22:27 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   53C    P0              28W /  70W |    291MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

## Время

In [32]:
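# bad-bad: time.time() is a coarse wall clock, and CUDA kernels run asynchronously, so the GPU work is not waited for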

import time
tic = time.time()
pred = net(data)
toc = time.time()
(toc - tic) * 1000

2.01416015625

In [33]:
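# bad: perf_counter() is a better clock, but a single run is noisy and the asynchronous GPU work is still not waited for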

import time
tic = time.perf_counter()
pred = net(data)
toc = time.perf_counter()
(toc - tic) * 1000

2.1640119998664886

In [34]:
# still bad, but closer: %%timeit averages many runs, yet there is still no torch.cuda.synchronize()

%%timeit

pred = net(data)

822 µs ± 2.18 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [35]:
print_memory()

allocated: 59,819,520
 reserved: 165,675,008


In [36]:
# net forward + GPU -> CPU data transfer

%%timeit

pred = net(data)
pred.cpu()

888 µs ± 2.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [38]:
# right way (synchronize before the clock stops), but a dirty forward: autograd is still enabled

%%timeit

pred = net(data)
torch.cuda.synchronize()

884 µs ± 1.66 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [39]:
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x79f06bc4a770>

In [40]:
# clean forward: autograd was disabled in the previous cell, so only inference is timed

%%timeit

pred = net(data)
torch.cuda.synchronize()

879 µs ± 1.98 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [41]:
torch.set_grad_enabled(True)

<torch.autograd.grad_mode.set_grad_enabled at 0x79f06bc4a470>

In [42]:
# full cycle

%%timeit

pred = net(data)
loss = loss_fn(pred, label)
loss.backward()
opt.step()
torch.cuda.synchronize()

3.01 ms ± 31.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# Как устроено автоматическое дифференцирование

In [43]:
x = torch.tensor(torch.pi / 6, requires_grad=True)
x

tensor(0.5236, requires_grad=True)

In [44]:
y = torch.sin(x)
y

tensor(0.5000, grad_fn=<SinBackward0>)

In [45]:
z = torch.pow(y, 3)
z

tensor(0.1250, grad_fn=<PowBackward0>)

In [46]:
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x79f06bb1cf70>

In [47]:
grad1 = z.grad_fn
grad1

<PowBackward0 at 0x79f06bb1f7c0>

In [48]:
def pow_backward(saved_input, saved_exponent):
    """Return the local derivative of ``torch.pow(saved_input, saved_exponent)``.

    Computes ``d(x ** n) / dx = n * x ** (n - 1)`` with respect to the input.
    The upstream gradient is not applied here; the caller multiplies by it.

    Args:
        saved_input: tensor that was raised to the power in the forward pass.
        saved_exponent: exponent used in the forward pass (a Python number,
            or a tensor, which always takes the general formula).

    Returns:
        Tensor holding the derivative evaluated at ``saved_input``.
    """
    exponent_is_tensor = isinstance(saved_exponent, torch.Tensor)
    if not exponent_is_tensor and saved_exponent == 0:
        # x ** 0 is constant, so its derivative is exactly zero. The general
        # formula would evaluate 0 * 0 ** -1 = 0 * inf = nan at x == 0.
        return torch.zeros_like(saved_input)
    return saved_exponent * torch.pow(saved_input, saved_exponent - 1)

In [49]:
grad1._saved_self, grad1._saved_exponent

(tensor(0.5000, grad_fn=<SinBackward0>), 3)

In [50]:
d_z_d_y = pow_backward(grad1._saved_self, grad1._saved_exponent)
d_z_d_y

tensor(0.7500)

In [53]:
grad2 = grad1.next_functions[0][0]
grad2

<SinBackward0 at 0x79f06b9f0f40>

In [54]:
def sin_backward(saved_input):
    """Return the local derivative of ``torch.sin`` at ``saved_input``.

    The derivative of ``sin(x)`` is ``cos(x)``. The upstream gradient is not
    applied here; the caller multiplies by it.
    """
    local_derivative = torch.cos(saved_input)
    return local_derivative

In [55]:
d_y_d_x = sin_backward(grad2._saved_self)
d_y_d_x

tensor(0.8660)

In [56]:
d_z_d_x = d_z_d_y * d_y_d_x
d_z_d_x

tensor(0.6495)

In [57]:
torch.set_grad_enabled(True)

<torch.autograd.grad_mode.set_grad_enabled at 0x79f06b9f2320>

In [58]:
y.retain_grad()
z.retain_grad()
z.backward()

In [59]:
z.grad

tensor(1.)

In [60]:
y.grad

tensor(0.7500)

In [61]:
x.grad

tensor(0.6495)

In [62]:
x = torch.tensor(torch.pi / 6, requires_grad=True)

In [63]:
y = torch.exp(x)

In [64]:
y

tensor(1.6881, grad_fn=<ExpBackward0>)

In [65]:
y.grad_fn._saved_result

tensor(1.6881, grad_fn=<ExpBackward0>)

## На практике

In [66]:
print_memory()

allocated: 59,819,520
 reserved: 220,200,960


In [67]:
seq = torch.randn((20, 128, 1024), device=device)

In [68]:
gru_cell = nn.GRUCell(1024, 1024).to(device)
gru = nn.GRU(1024, 1024).to(device)

In [69]:
print_memory()

allocated: 121,275,904
 reserved: 220,200,960


In [70]:
pred1 = gru(seq)
torch.sum(pred1[0][-1]).backward()

In [74]:
pred1[0].shape

torch.Size([20, 128, 1024])

In [71]:
print_memory()

allocated: 158,066,176
 reserved: 352,321,536


In [72]:
# Unroll the recurrence by hand with GRUCell to compare memory use with nn.GRU.
# NOTE(review): seq[0] is used as the initial hidden state. That only works
# because the input and hidden sizes are both 1024, and it means the loop
# runs 19 cell steps over seq[1:] rather than all 20 entries of seq.
# Confirm this is the intended comparison with gru(seq).
pred2 = seq[0]

for x in seq[1:]:
    # Each step feeds the previous hidden state back into the cell.
    pred2 = gru_cell(x, pred2)

# Backpropagate from the final hidden state through every unrolled step.
torch.sum(pred2).backward()

In [76]:
pred2.shape

torch.Size([128, 1024])

In [73]:
print_memory()

allocated: 183,780,864
 reserved: 360,710,144


Факир был пьян и фокус не удался :(