In [1]:
from batchflow import Pipeline, D, B, V, C, R, P
from batchflow.opensets import Imagenette160
from batchflow.models.torch import UNet
from batchflow import GPUMemoryMonitor
from fastai.vision.all import URLs
from batchflow.models.torch import EncoderDecoder
import torch
from train_module import training_functions
import numpy as np

import nvidia_smi

In [2]:
def get_mem_info(device_id):
    nvidia_smi.nvmlInit()
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(device_id)
    info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    free_memory_in_bytes = info.free
    used_memory_in_bytes = info.used
    nvidia_smi.nvmlShutdown()
    return free_memory_in_bytes, used_memory_in_bytes

def get_run_mem(dataset, device_id, model_config, train_pipeline, batch_size=16, n_iters=50):
    with GPUMemoryMonitor(gpu_list=[device_id]) as monitor:
        print("before and after clean")
        print(get_mem_info(device_id))
        torch.cuda.empty_cache()
        print(get_mem_info(device_id))
        train_pipeline.run(batch_size, n_iters=n_iters, bar='n')
    return np.max(monitor.data)

def get_max_batch_size(dataset, device_id, model_config, train_pipeline, init_batch_size, n_iters):
    print(get_mem_info(device_id))
    first_run_memory = get_run_mem(dataset, device_id, model_config, train_pipeline, batch_size=init_batch_size, n_iters=n_iters)
    print(get_mem_info(device_id))
    second_run_memory = get_run_mem(dataset, device_id, model_config, train_pipeline, batch_size=2*init_batch_size, n_iters=n_iters)
    print(get_mem_info(device_id))
    max_batch_size = init_batch_size * (100 - 2 * first_run_memory + second_run_memory)/(second_run_memory - first_run_memory)
    return max_batch_size

In [3]:
dataset = Imagenette160(bar=True)
device_id = 4

model_config = dict(model = UNet)
model_config['device'] = f'cuda:{device_id}'
model_config['loss'] = 'mse'

train_pipeline = (dataset.train.p
                .crop(shape=(160, 160), origin='center')
                .init_variable('loss_history', [])
                .to_array(channels='first', dtype=np.float32)
                .multiply(1./255)
                .init_model('dynamic', UNet, 'unet',
                            config=model_config)
                .train_model('unet', B.images, B.images, 
                             fetches='loss', save_to=V('loss_history', mode='a'), use_lock=True)
)

 50%|█████     | 1/2 [00:05<00:05,  5.22s/it]


In [4]:
init_batch_size = 16
n_iters = 50

print("Max batch size:", get_max_batch_size(dataset, device_id, model_config, train_pipeline, init_batch_size, n_iters))

(11551571968, 3145728)
before and after clean
(11551571968, 3145728)
(11551571968, 3145728)


  0%|                                                                                                         …

  "In future, upsample filters can be made to match decoder block's filters by default.")


(4613144576, 6941573120)
before and after clean
(4613144576, 6941573120)
(5609291776, 5945425920)


  0%|                                                                                                         …

(330760192, 11223957504)


32.1768149882904

In [5]:
init_batch_size = 8
n_iters = 50

print("Max batch size:", get_max_batch_size(dataset, device_id, model_config, train_pipeline, init_batch_size, n_iters))

(330760192, 11223957504)
before and after clean
(330760192, 11223957504)
(5609291776, 5945425920)


  0%|                                                                                                         …

(5607194624, 5947523072)
before and after clean
(5607194624, 5947523072)
(5609291776, 5945425920)


  0%|                                                                                                         …

(4613144576, 6941573120)


5.863047235023041

What happend:

***run_memory = model_size + item_size * batch_size***

We set: ***init_batch_size = 16***
 

So, we have two equations:

***first_run_memory = model_size + init_batch_size * item_size***

***second_run_memory = model_size + 2 * init_batch_size * item_size***

We can get:

***item_size * init_batch_size = second_run_memory - first_run_memory***

***model_size = first_run_memory - item_size * init_batch_size = 2 * first_run_memory - second_run_memory***

We want to know max_batch_size if we have total_memory amount of GPU memory.

***max_batch_size = (total_memory - model_size)/item_size***

It is equal to:

***max_batch_size = (total_memory - model_size)/((second_run_memory - first_run_memory)/init_batch_size)*** 

where init_batch_size=16

or:

***max_batch_size = init_batch_size * (total_memory - model_size)/(second_run_memory - first_run_memory)*** 


Memory is measured as a percentage, so ***total_memory = 100*** %.

What happend with incremental multiply:

***run_memory = model_size + item_size * batch_size***

We set: ***init_batch_size = 16***
 

So, we have two equations:

***first_run_memory = model_size + (i-1) * init_batch_size * item_size***

***second_run_memory = model_size + i * init_batch_size * item_size***

We can get:

***item_size * init_batch_size = second_run_memory - first_run_memory***

***model_size = first_run_memory - (i-1) * item_size * init_batch_size = i * first_run_memory - (i-1) * second_run_memory***

We want to know max_batch_size if we have total_memory amount of GPU memory.

***max_batch_size = (total_memory - model_size)/item_size***

It is equal to:

***max_batch_size = (total_memory - model_size)/((second_run_memory - first_run_memory)/init_batch_size)*** 

where init_batch_size=16

or:

***max_batch_size = init_batch_size * (total_memory - model_size)/(second_run_memory - first_run_memory)*** 


Memory is measured as a percentage, so ***total_memory = 100*** %.