# Эффективный DL
## Семинар 2.
### Аппаратное обеспечение и низкоуровневые решения

In [15]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
from tqdm.auto import trange

from torch.profiler import profile, ProfilerActivity, schedule

In [3]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing at the first tensor allocation.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
# Two independent 1024x1024 standard-normal matrices, allocated directly on
# `device`; `a` is drawn first, then `b`.
a, b = (torch.randn(1024, 1024, device=device) for _ in range(2))

In [11]:
# Profile a single matrix product of `a` transposed with `b`, collecting both
# CPU and CUDA activity. The results stay in `prof`; this cell does not print
# or export them.
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    c = a.T @ b

In [12]:
# FashionMNIST training split, kept under the local "mnist" directory
# (download enabled); each sample is converted with ToTensor().
training_data = datasets.FashionMNIST(
    "mnist", train=True, transform=ToTensor(), download=True
)
# Batches of 256 samples; every other DataLoader option keeps its default.
loader = DataLoader(dataset=training_data, batch_size=256)

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to mnist/FashionMNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 26421880/26421880 [00:02<00:00, 12054432.11it/s]


Extracting mnist/FashionMNIST/raw/train-images-idx3-ubyte.gz to mnist/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to mnist/FashionMNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 29515/29515 [00:00<00:00, 202597.35it/s]


Extracting mnist/FashionMNIST/raw/train-labels-idx1-ubyte.gz to mnist/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to mnist/FashionMNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 4422102/4422102 [00:01<00:00, 3782390.48it/s]


Extracting mnist/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to mnist/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to mnist/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 5148/5148 [00:00<00:00, 5455350.43it/s]

Extracting mnist/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to mnist/FashionMNIST/raw






In [13]:
class Net(nn.Module):
    """Small CNN classifier: two 3x3 convolutions, max-pooling, two linear layers.

    The input width of ``fc1`` is derived from ``pool_size`` instead of being a
    hard-coded constant, so changing the pooling window cannot silently leave
    ``fc1`` with a mismatching input dimension.

    Args:
        pool_size: Kernel size of the max-pooling step (an int in [1, 24]).
            The default of 2 gives 64 * 12 * 12 = 9216 flattened features.

    Raises:
        ValueError: If ``pool_size`` is outside [1, 24].

    Note:
        The feature count assumes single-channel 28x28 inputs: each unpadded
        3x3 convolution shrinks the spatial side by 2 (28 -> 26 -> 24).
    """

    def __init__(self, pool_size=2):
        super().__init__()
        if not 1 <= pool_size <= 24:
            raise ValueError(f"pool_size must be in [1, 24], got {pool_size}")
        self.pool_size = pool_size
        # Layers are created in a fixed order so parameter order and random
        # initialisation stay stable.
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        # 64 channels times (24 // pool_size)^2 spatial positions after pooling.
        self.fc1 = nn.Linear(64 * (24 // pool_size) ** 2, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Return logits of shape (batch, 10) for a batch of input images."""
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, self.pool_size)
        x = self.dropout1(x)
        # Keep the batch dimension, flatten everything else for the linear layers.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.dropout2(x)
        return self.fc2(x)

In [14]:
# Fresh model moved to `device`, plus the loss and optimizer used for training.
net = Net()
net = net.to(device)
loss_fn = nn.CrossEntropyLoss()
opt = torch.optim.Adam(params=net.parameters())

In [17]:
def on_trace_ready(prof):
    """Profiler callback: export the collected trace to ``trace.json``.

    Args:
        prof: Object providing ``export_chrome_trace(path)``.
    """
    trace_file = "trace.json"
    prof.export_chrome_trace(trace_file)

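Profiler schedule, counted in `prof.step()` calls: `skip_first` + `repeat` * [`wait` (no tracing) + `warmup` (tracing, but not recording) + `active` (recording)]. With the settings used below this is 1 + 1 * (1 + 1 + 3) = 6 steps, so the 7-iteration loop covers one full cycle.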

[Perfetto trace visualizer](https://ui.perfetto.dev)

In [18]:
# Profile a short training run of at most 7 batches. Schedule: step 0 is
# skipped, step 1 waits, step 2 warms up, steps 3-5 are recorded (one cycle).
# Python stacks and input shapes are recorded as well.
with profile(
    on_trace_ready=on_trace_ready,
    record_shapes=True,
    with_stack=True,
    schedule=schedule(skip_first=1, wait=1, warmup=1, active=3, repeat=1),
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
) as prof:
    for _, (data, label) in zip(trange(7), loader):
        # Move the batch to the same device as the model.
        data, label = data.to(device), label.to(device)
        pred = net(data)
        loss = loss_fn(pred, label)
        opt.zero_grad()
        loss.backward()
        opt.step()

        # Tell the profiler that one training step has finished.
        prof.step()

  0%|          | 0/7 [00:00<?, ?it/s]

## Data loading

In [19]:
# Same FashionMNIST training split as before (root "mnist", ToTensor()
# transform), but the loader now uses 4 worker processes.
training_data = datasets.FashionMNIST(
    "mnist", train=True, transform=ToTensor(), download=True
)
loader = DataLoader(dataset=training_data, num_workers=4, batch_size=256)



In [20]:
class Net(nn.Module):
    """Small CNN classifier: two 3x3 convolutions, max-pooling, two linear layers.

    The input width of ``fc1`` is computed from ``pool_size`` rather than
    written as a literal, which keeps the pooling window and the linear layer
    consistent with each other.

    Args:
        pool_size: Kernel size of the max-pooling step (an int in [1, 24]).
            The default of 2 gives 64 * 12 * 12 = 9216 flattened features.

    Raises:
        ValueError: If ``pool_size`` is outside [1, 24].

    Note:
        The feature count assumes single-channel 28x28 inputs: each unpadded
        3x3 convolution shrinks the spatial side by 2 (28 -> 26 -> 24).
    """

    def __init__(self, pool_size=2):
        super().__init__()
        if not 1 <= pool_size <= 24:
            raise ValueError(f"pool_size must be in [1, 24], got {pool_size}")
        self.pool_size = pool_size
        # Layers are created in a fixed order so parameter order and random
        # initialisation stay stable.
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        # 64 channels times (24 // pool_size)^2 spatial positions after pooling.
        self.fc1 = nn.Linear(64 * (24 // pool_size) ** 2, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Return logits of shape (batch, 10) for a batch of input images."""
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, self.pool_size)
        x = self.dropout1(x)
        # Keep the batch dimension, flatten everything else for the linear layers.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.dropout2(x)
        return self.fc2(x)

In [21]:
# Re-create the model on `device` together with a new loss and optimizer.
net = Net()
net = net.to(device)
loss_fn = nn.CrossEntropyLoss()
opt = torch.optim.Adam(params=net.parameters())

In [22]:
def on_trace_ready(prof):
    """Profiler callback: export the collected trace to ``trace.json``.

    Args:
        prof: Object providing ``export_chrome_trace(path)``.
    """
    trace_file = "trace.json"
    prof.export_chrome_trace(trace_file)

In [23]:
# Profile at most 7 training batches read through the multi-worker loader.
# Schedule: step 0 is skipped, step 1 waits, step 2 warms up, steps 3-5 are
# recorded (one cycle). Input shapes are recorded too.
with profile(
    on_trace_ready=on_trace_ready,
    record_shapes=True,
    schedule=schedule(skip_first=1, wait=1, warmup=1, active=3, repeat=1),
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
) as prof:
    for _, (data, label) in zip(trange(7), loader):
        # Move the batch to the same device as the model.
        data, label = data.to(device), label.to(device)
        pred = net(data)
        loss = loss_fn(pred, label)
        opt.zero_grad()
        loss.backward()
        opt.step()

        # Tell the profiler that one training step has finished.
        prof.step()

  0%|          | 0/7 [00:00<?, ?it/s]

  self.pid = os.fork()


## Fewer features for the linear layer

In [25]:
class Net(nn.Module):
    """CNN classifier variant with a wider pooling window and a smaller ``fc1``.

    The input width of ``fc1`` is derived from ``pool_size`` instead of being a
    hard-coded constant, so the pooling window and the linear layer cannot
    drift apart.

    Args:
        pool_size: Kernel size of the max-pooling step (an int in [1, 24]).
            The default of 4 gives 64 * 6 * 6 = 2304 flattened features.

    Raises:
        ValueError: If ``pool_size`` is outside [1, 24].

    Note:
        The feature count assumes single-channel 28x28 inputs: each unpadded
        3x3 convolution shrinks the spatial side by 2 (28 -> 26 -> 24).
    """

    def __init__(self, pool_size=4):
        super().__init__()
        if not 1 <= pool_size <= 24:
            raise ValueError(f"pool_size must be in [1, 24], got {pool_size}")
        self.pool_size = pool_size
        # Layers are created in a fixed order so parameter order and random
        # initialisation stay stable.
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        # 64 channels times (24 // pool_size)^2 spatial positions after pooling.
        self.fc1 = nn.Linear(64 * (24 // pool_size) ** 2, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Return logits of shape (batch, 10) for a batch of input images."""
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, self.pool_size)
        x = self.dropout1(x)
        # Keep the batch dimension, flatten everything else for the linear layers.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.dropout2(x)
        return self.fc2(x)

In [26]:
# Instantiate the current `Net` on `device` with a new loss and optimizer.
net = Net()
net = net.to(device)
loss_fn = nn.CrossEntropyLoss()
opt = torch.optim.Adam(params=net.parameters())

In [27]:
def on_trace_ready(prof):
    """Profiler callback: export the collected trace to ``trace.json``.

    Args:
        prof: Object providing ``export_chrome_trace(path)``.
    """
    trace_file = "trace.json"
    prof.export_chrome_trace(trace_file)

In [28]:
# Profile at most 7 training batches with the current model.
# Schedule: step 0 is skipped, step 1 waits, step 2 warms up, steps 3-5 are
# recorded (one cycle). Input shapes are recorded too.
with profile(
    on_trace_ready=on_trace_ready,
    record_shapes=True,
    schedule=schedule(skip_first=1, wait=1, warmup=1, active=3, repeat=1),
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
) as prof:
    for _, (data, label) in zip(trange(7), loader):
        # Move the batch to the same device as the model.
        data, label = data.to(device), label.to(device)
        pred = net(data)
        loss = loss_fn(pred, label)
        opt.zero_grad()
        loss.backward()
        opt.step()

        # Tell the profiler that one training step has finished.
        prof.step()

  0%|          | 0/7 [00:00<?, ?it/s]

## Smaller batch size

In [30]:
# FashionMNIST training split again (root "mnist", ToTensor() transform);
# the loader keeps 4 workers but the batch size drops to 64.
training_data = datasets.FashionMNIST(
    "mnist", train=True, transform=ToTensor(), download=True
)
loader = DataLoader(dataset=training_data, num_workers=4, batch_size=64)

In [31]:
class Net(nn.Module):
    """CNN classifier variant with a wider pooling window and a smaller ``fc1``.

    The input width of ``fc1`` is computed from ``pool_size`` rather than
    written as a literal, which keeps the pooling window and the linear layer
    consistent with each other.

    Args:
        pool_size: Kernel size of the max-pooling step (an int in [1, 24]).
            The default of 4 gives 64 * 6 * 6 = 2304 flattened features.

    Raises:
        ValueError: If ``pool_size`` is outside [1, 24].

    Note:
        The feature count assumes single-channel 28x28 inputs: each unpadded
        3x3 convolution shrinks the spatial side by 2 (28 -> 26 -> 24).
    """

    def __init__(self, pool_size=4):
        super().__init__()
        if not 1 <= pool_size <= 24:
            raise ValueError(f"pool_size must be in [1, 24], got {pool_size}")
        self.pool_size = pool_size
        # Layers are created in a fixed order so parameter order and random
        # initialisation stay stable.
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        # 64 channels times (24 // pool_size)^2 spatial positions after pooling.
        self.fc1 = nn.Linear(64 * (24 // pool_size) ** 2, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Return logits of shape (batch, 10) for a batch of input images."""
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, self.pool_size)
        x = self.dropout1(x)
        # Keep the batch dimension, flatten everything else for the linear layers.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.dropout2(x)
        return self.fc2(x)

In [32]:
# New model instance on `device`, with its own loss and optimizer.
net = Net()
net = net.to(device)
loss_fn = nn.CrossEntropyLoss()
opt = torch.optim.Adam(params=net.parameters())

In [33]:
def on_trace_ready(prof):
    """Profiler callback: export the collected trace to ``trace.json``.

    Args:
        prof: Object providing ``export_chrome_trace(path)``.
    """
    trace_file = "trace.json"
    prof.export_chrome_trace(trace_file)

In [34]:
# Profile at most 7 training batches drawn from the current `loader`.
# Schedule: step 0 is skipped, step 1 waits, step 2 warms up, steps 3-5 are
# recorded (one cycle). Input shapes are recorded too.
with profile(
    on_trace_ready=on_trace_ready,
    record_shapes=True,
    schedule=schedule(skip_first=1, wait=1, warmup=1, active=3, repeat=1),
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
) as prof:
    for _, (data, label) in zip(trange(7), loader):
        # Move the batch to the same device as the model.
        data, label = data.to(device), label.to(device)
        pred = net(data)
        loss = loss_fn(pred, label)
        opt.zero_grad()
        loss.backward()
        opt.step()

        # Tell the profiler that one training step has finished.
        prof.step()

  0%|          | 0/7 [00:00<?, ?it/s]

## compile

In [35]:
# New model on `device`; `compile()` is called for its effect on `net` itself
# (its return value is not used). Loss and optimizer are created afterwards.
net = Net()
net = net.to(device)
net.compile()
loss_fn = nn.CrossEntropyLoss()
opt = torch.optim.Adam(params=net.parameters())

  self.pid = os.fork()


In [36]:
def on_trace_ready(prof):
    """Profiler callback: export the collected trace to ``trace.json``.

    Args:
        prof: Object providing ``export_chrome_trace(path)``.
    """
    trace_file = "trace.json"
    prof.export_chrome_trace(trace_file)

In [37]:
# Profile at most 7 training batches with the model on which `compile()` was
# called in the setup cell. Schedule: step 0 is skipped, step 1 waits, step 2
# warms up, steps 3-5 are recorded (one cycle). Input shapes are recorded too.
with profile(
    on_trace_ready=on_trace_ready,
    record_shapes=True,
    schedule=schedule(skip_first=1, wait=1, warmup=1, active=3, repeat=1),
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
) as prof:
    for _, (data, label) in zip(trange(7), loader):
        # Move the batch to the same device as the model.
        data, label = data.to(device), label.to(device)
        pred = net(data)
        loss = loss_fn(pred, label)
        opt.zero_grad()
        loss.backward()
        opt.step()

        # Tell the profiler that one training step has finished.
        prof.step()

  0%|          | 0/7 [00:00<?, ?it/s]



## report

In [38]:
# Fresh model on `device` (no `compile()` call here), with loss and optimizer.
net = Net()
net = net.to(device)
loss_fn = nn.CrossEntropyLoss()
opt = torch.optim.Adam(params=net.parameters())

In [40]:
def on_trace_ready(prof):
    """Profiler callback: export the trace and print an aggregated table.

    Writes the Chrome-format trace to ``trace.json``, then prints the averaged
    per-key statistics sorted by ``cuda_time_total``.

    Args:
        prof: Object providing ``export_chrome_trace(path)`` and
            ``key_averages()``.
    """
    trace_file = "trace.json"
    prof.export_chrome_trace(trace_file)
    summary = prof.key_averages().table(sort_by="cuda_time_total")
    print(summary)

In [43]:
# Profile at most 7 training batches, additionally requesting FLOP estimates
# and memory statistics. Schedule: step 0 is skipped, step 1 waits, step 2
# warms up, steps 3-5 are recorded (one cycle).
with profile(
    on_trace_ready=on_trace_ready,
    profile_memory=True,
    with_flops=True,
    record_shapes=True,
    schedule=schedule(skip_first=1, wait=1, warmup=1, active=3, repeat=1),
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
) as prof:
    for _, (data, label) in zip(trange(7), loader):
        # Move the batch to the same device as the model.
        data, label = data.to(device), label.to(device)
        pred = net(data)
        loss = loss_fn(pred, label)
        opt.zero_grad()
        loss.backward()
        opt.step()

        # Tell the profiler that one training step has finished.
        prof.step()

  0%|          | 0/7 [00:00<?, ?it/s]

  self.pid = os.fork()


-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  Total MFLOPs  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
autograd::engine::evaluate_function: ConvolutionBack...         0.32%     135.000us         5.56%       2.323ms     387.167us       0.000us         0.00%       6.021ms       1.004ms           0 