メモリーを使う層、使わない層を確かめる。

In [1]:
from typing import Callable

import numpy as np
import pynvml
import torch
import torch.nn.functional as F
from torch import Tensor, nn, optim
from torchinfo import summary
from tqdm import tqdm


def print_memory_torch(prefix: str):
    """Print memory usage.
    """    
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    info = pynvml.nvmlDeviceGetMemoryInfo(handle)    
    memory_al = torch.cuda.memory_allocated()
    memory_res = torch.cuda.memory_reserved()
    memory_maxal = torch.cuda.max_memory_allocated()

    print(f"{prefix}: allocated = {memory_al/1024**2:.1f} MiB, "
        f"reserved = {memory_res/1024**2:.1f}MiB, "
        f"max allocated = {memory_maxal/1024**2:.1f} MiB, "
        f"used = {int(info.used)/1024**2:.1f} MiB")
    

def is_memoryless(class_name: str) -> bool:
    ''' Return True if the class is memoryless type.
    Activations, normalizations and dropouts perform in-place updates by default
    and does not require additional memory.
    '''
    return any((class_name == "ReLU",
                class_name == "LeakyReLU",
                class_name == "Sigmoid",
                class_name == "Tanh",
                class_name == "ELU",
                class_name == "GLU",
                class_name == "PReLU",
                class_name == "GELU",
                class_name == "Mish",
                class_name == "Softmin",
                class_name == "Softmax",
                class_name == "Softmax2d"))


def print_memory_estimate2(
    model: nn.Module, 
    dim_input: list[int], 
    moment: int, 
    ddp: int=1, 
    mixed_pre: float = 1):
    '''Print theoretical memory usage.
    
    Parameters
    ----------
    model: 
    dim_input: Shape of input data including batch size. e.g. [batch size, channel, width, height]
    moment: Moment use for optimization. SGD: 0, Adagrad, RMSprop: 1, Adam: 2
    ddp: Multiple GPU use. Distributed data parallel: 2, Not: 1
    mixed_pre: Forward outputs memory saving by Mixed precision: 0.5, Not: 1
    '''
    info = summary(model, dim_input, verbose=0)
    dim_output = info.summary_list[-1].output_size[1:]

    num_param = 0
    num_output_shape = 0
    last_layer = len(info.summary_list) -1
    # print("#, Class, Leaf, Memoryless, Output")
    for i, layer in enumerate(info.summary_list):
        # print(f"{i}, {layer.class_name}, {layer.is_leaf_layer}, {is_memoryless(layer.class_name)}, {layer.output_size}")
        if layer.is_leaf_layer:
            num_param += layer.trainable_params
            if i != last_layer and not is_memoryless(layer.class_name):
                num_output_shape += np.prod(layer.output_size)
    
    mem_data = (np.prod(dim_input) + np.prod(dim_output)) * 4
    mem_weight = num_param * 4
    mem_weight_grad = mem_weight * (ddp + moment)
    mem_forward_output = num_output_shape * 4 * mixed_pre
    mem_output_gradient = mem_forward_output + mem_data
    mem_training = mem_data + mem_weight + mem_forward_output + mem_weight_grad + mem_output_gradient
    mem_inference = mem_data + mem_weight + mem_forward_output

    print(f"Data(MiB): {mem_data/1024**2:.1f}")
    print(f"Weight(MiB): {mem_weight/1024**2:.1f}")
    print(f"Forward output(MiB): {mem_forward_output/1024**2:.1f}")
    print(f"Weight gradient(MiB): {mem_weight_grad/1024**2:.1f}")
    print(f"Output gradient(MiB): {mem_output_gradient/1024**2:.1f}")
    print(f"Total for training(MiB): {mem_training/1024**2:.1f}")
    print(f"Total for inference(MiB): {mem_inference/1024**2:.1f}")


def train(
    model:nn.Module, 
    dim_input: list[int], 
    dim_output: list[int],
    batchsize: int, 
    epoch: int, 
    criterion: Callable[..., Tensor],
    optimizer = optim.SGD, 
    device: str = "cuda"):
    """Train model using random dataset.
    
    Parameters
    ----------
    model: 
    dim_input: Shape of input data including data size. e.g. [data size, channel, width, height]
    dim_output: 
    batchsize:
    epoch:
    optimizer:
    device: 
    """
    
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    print_memory_torch("Initial")

    model.to(device)
    print_memory_torch("Model")
    
    data = [[torch.randn([batchsize] + dim_input[1:]), 
             torch.randn([batchsize] + dim_output)] 
             for _ in range(dim_input[0]//batchsize)]

    criterion = F.cross_entropy
    opt = optimizer(model.parameters(), lr=0.01)
    for ep in range(epoch):
        model.train()
        with tqdm(data) as pbar:
            pbar.set_description(f'[Epoch {ep + 1}]')
            for x, y in pbar:
                x = x.to(device)
                y = y.to(device)
                
                opt.zero_grad()
                y_pred = model(x)
                loss = criterion(y_pred, y)
                loss.backward()
                opt.step()
            
        print_memory_torch("Train")
    print_memory_torch("Final")



In [2]:

class Config:
    def __init__(self):
        self.dim_input = [3,224,224]
        self.dim_output = [3,224,224]
        self.datasize = 2000
        self.batchsize = 1000
        self.num_epochs = 2
        self.lr = 1e-2
        self.device = 'cuda'
        self.criterion = F.cross_entropy
        self.optim = optim.SGD
        self.moment = 0         # SGD: 0, Adagrad, RMSprop: 1, Adam: 2
        self.ddp = 1            # Distributed data parallel: 2, Not: 1
        self.mixed_pre = 1      # Mixed precision: 0.5, Not: 1


class Net(nn.Module):
    def __init__(self, dim_c: int, dim_h: int, dim_w: int):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=dim_c, out_channels=dim_c, kernel_size=3, padding=1)
        self.bn = nn.BatchNorm2d(num_features=dim_c)
        self.relu = nn.ReLU(inplace=True)
        self.dropout = nn.Dropout(inplace=False)
        self.softmax = nn.Softmax2d()
        self.conv2 = nn.Conv2d(in_channels=dim_c, out_channels=dim_c, kernel_size=3, padding=1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.conv1(x)
        x = self.bn(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.softmax(x)
        x = self.conv2(x)
        return x


conf = Config()
model_cnn = Net(conf.dim_input[0], conf.dim_input[1], conf.dim_input[2])

result = summary(model_cnn, [conf.batchsize] + conf.dim_input,
            depth=6,
            col_names=["input_size",
                        "output_size",
                        "num_params",
                        "params_percent",
                        "kernel_size",
                        "mult_adds",
                        "trainable"])
print(result)

print("=== Estimated ===")
print_memory_estimate2(model_cnn, [conf.batchsize] + conf.dim_input, 
                    conf.moment, conf.ddp, conf.mixed_pre)

print("=== Real ===")
train(model_cnn, [conf.datasize] + conf.dim_input, conf.dim_output, conf.batchsize, conf.num_epochs,
    conf.criterion, conf.optim, conf.device)

Layer (type:depth-idx)                   Input Shape               Output Shape              Param #                   Param %                   Kernel Shape              Mult-Adds                 Trainable
Net                                      [1000, 3, 224, 224]       [1000, 3, 224, 224]       --                             --                   --                        --                        True
├─Conv2d: 1-1                            [1000, 3, 224, 224]       [1000, 3, 224, 224]       84                         48.28%                   [3, 3]                    4,214,784,000             True
├─BatchNorm2d: 1-2                       [1000, 3, 224, 224]       [1000, 3, 224, 224]       6                           3.45%                   --                        6,000                     True
├─ReLU: 1-3                              [1000, 3, 224, 224]       [1000, 3, 224, 224]       --                             --                   --                        --              

[Epoch 1]: 100%|██████████| 2/2 [00:00<00:00,  5.38it/s]


Train: allocated = 1722.7 MiB, reserved = 5330.0MiB, max allocated = 5312.0 MiB, used = 20550.5 MiB


[Epoch 2]: 100%|██████████| 2/2 [00:00<00:00,  5.12it/s]


Train: allocated = 1722.7 MiB, reserved = 5330.0MiB, max allocated = 5312.0 MiB, used = 20549.8 MiB
Final: allocated = 1722.7 MiB, reserved = 5330.0MiB, max allocated = 5312.0 MiB, used = 20549.8 MiB



|  | Estimated [MiB] | Real [MiB] |
|---|---|---|
| Conv | 1150 | 3445 |
| +Conv | 2298 | 4020 |
| +ReLU | 2298 | 4020 |
| +BachNorm | 3447 | 4594 |
| +Dropout | 4595 | 5312 |
| +Softmax | 4595 | 5312 |

- ReLU、Softmaxを追加しても変化はなかった
- ReLUのinplace=True, Falseの設定を変えても変化はなかった
- 要素を加えていったとき、理論的な増加分1GiBよりも、増加は少なかった
  - そもそも、理論的な増加分はいくらかよくわかっていない？
  - 計算するためにメモリを使っているだけで、Output gradientを生成しているわけではない？つまり理論的には加算不要？
- Conv単独のときが最も誤差が大きく、要素が増えると誤差が減った
  - 余分に確保されるメモリー（増加分）と再利用されるメモリー（減少分）のバランスが変わるからだと思われる。要素が多いほど再利用されるメモリーが増えて、減少方向に修正されていく？