In [21]:
import torch
from torch.profiler import profile, record_function, ProfilerActivity
from bettmensch_ai_examples.gpt_1.src.model import GPT1CoreFast, GPT1CoreKarpathy, DecoderLayerFast, DecoderLayerKarpathy, GPT1Pretrain

In [2]:
def time_model(n, batch_size=5,layer_only: bool = True,karpathy: bool = False, use_gpu=False,sort_by="self_cpu_time_total"):

    mask = torch.ones((batch_size,512),dtype=torch.bool)

    if layer_only:
        inputs = torch.rand(batch_size, 512, 768,dtype=torch.half)
        if not karpathy:
            model = DecoderLayerFast()
        else:
            model = DecoderLayerKarpathy()
        model.set_nest_level()
    else:
        inputs = torch.randint(low=0,high=30000,size=(batch_size, 512))
        if not karpathy:
            model = GPT1CoreFast(30000)
        else:
            model = GPT1CoreKarpathy(30000)

    if use_gpu:
        gpu_device = torch.device("cuda")
        inputs = inputs.to(gpu_device)
        mask = mask.to(gpu_device)
        model.to(gpu_device)

    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True) as prof:
        with record_function("model_inference"):
            for k in range(n):
                model(inputs, mask)

    print(prof.key_averages().table(sort_by=sort_by, row_limit=550))

    return prof

## Layer level comparison

In [3]:
prof = time_model(n=10,batch_size=2,layer_only=True, karpathy=False,use_gpu=False,sort_by="self_cpu_time_total")

---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                       Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                aten::addmm        88.01%       69.355s        88.02%       69.368s        1.156s       69.364s        88.01%       69.373s        1.156s            60  
                  aten::bmm        10.82%        8.525s        10.85%        8.551s      35.628ms        8.418s        10.68%        8.449s      35.204ms           240  
             aten::_softmax         0.57%     451.609ms         0.57%     451.609ms       3.763ms     474.431ms         0.60%     474.431ms       3.9

In [4]:
prof = time_model(n=10,batch_size=2,layer_only=True, karpathy=True,use_gpu=False,sort_by="self_cpu_time_total")

---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                       Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                aten::addmm        88.91%       70.299s        88.92%       70.307s        1.758s       70.264s        88.86%       70.273s        1.757s            40  
                  aten::bmm        10.45%        8.260s        10.48%        8.284s     414.205ms        8.249s        10.43%        8.267s     413.326ms            20  
             aten::_softmax         0.14%     111.079ms         0.14%     111.079ms      11.108ms     106.715ms         0.13%     106.715ms      10.6

In [5]:
prof = time_model(n=100,batch_size=16,layer_only=True, karpathy=False,use_gpu=True,sort_by="self_cuda_time_total")

---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                       Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                aten::addmm         3.58%     113.945ms         3.86%     122.631ms     204.385us        1.100s        34.49%        1.110s       1.851ms           600  
         aten::masked_fill_         1.30%      41.467ms         1.30%      41.467ms      34.556us     244.045ms         7.65%     244.045ms     203.371us          1200  
                  aten::bmm         2.66%      84.472ms         2.66%      84.472ms      35.197us     209.473ms         6.57%     209.473ms      87.2

In [6]:
prof = time_model(n=100,batch_size=16,layer_only=True, karpathy=True,use_gpu=True,sort_by="self_cuda_time_total")

---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                       Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                aten::addmm         1.26%      32.281ms         1.49%      38.218ms      95.545us        1.085s        42.05%        1.093s       2.732ms           400  
       aten::native_dropout         0.81%      20.866ms         1.48%      38.021ms      76.042us     298.140ms        11.56%     340.784ms     681.568us           500  
                aten::copy_        83.42%        2.136s        83.42%        2.136s       2.674ms     271.963ms        10.54%     271.963ms     340.3

## Model level comparison

In [3]:
time_model(n=100,batch_size=16,layer_only=False,karpathy=False,use_gpu=True,sort_by="self_cuda_time_total")

---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                       Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                aten::addmm         1.67%     555.065ms         1.97%     654.029ms      90.837us       10.592s        31.86%       10.713s       1.488ms          7200  
                  aten::bmm         2.75%     915.058ms         2.75%     915.058ms      31.773us        2.311s         6.95%        2.311s      80.245us         28800  
         aten::masked_fill_         0.47%     157.573ms         0.47%     157.573ms      10.943us        2.298s         6.91%        2.298s     159.5

<torch.profiler.profiler.profile at 0x1e2b4eeea10>

In [3]:
time_model(n=100,batch_size=16,layer_only=False,karpathy=True,use_gpu=True,sort_by="self_cuda_time_total")

---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                       Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                aten::addmm         2.00%     863.521ms         2.46%        1.061s     220.938us       17.226s        39.94%       17.313s       3.607ms          4800  
       aten::native_dropout         1.56%     674.079ms         2.81%        1.210s     198.343us        6.467s        14.99%        6.924s       1.135ms          6100  
                aten::copy_        71.44%       30.815s        71.44%       30.815s       3.217ms        4.537s        10.52%        4.537s     473.7

<torch.profiler.profiler.profile at 0x213c375f450>

In [18]:
gpu_device = torch.device("cuda")

model = DecoderLayerKarpathy()
model.set_nest_level()
model.to(gpu_device)

inputs = torch.rand(1, 512, 768,dtype=torch.half).to(gpu_device)
mask = torch.ones((1,512),dtype=torch.bool).to(gpu_device)

outputs = model(inputs,mask)
outputs.size(), outputs

(torch.Size([1, 512, 768]),
 tensor([[[ 0.2047, -1.3193, -0.5762,  ..., -0.1744,  0.4873,  0.0000],
          [ 1.1416,  0.0507,  0.2316,  ...,  1.5371,  0.0154,  0.0261],
          [-0.2751,  1.0518,  1.3643,  ...,  1.3975,  0.0000,  0.1373],
          ...,
          [-0.4155, -1.4805,  1.3574,  ..., -0.3396, -0.0000, -0.0154],
          [-1.3486,  0.2261, -0.1428,  ..., -0.0000,  1.4424, -0.0000],
          [-0.0988, -0.1489,  0.0611,  ...,  0.6558,  1.0010, -1.2832]]],
        device='cuda:0', dtype=torch.float16, grad_fn=<NativeDropoutBackward0>))

In [19]:
gpu_device = torch.device("cuda")

model = GPT1CoreKarpathy(30000)
model.to(gpu_device)

inputs = torch.randint(low=0, high=30000, size=(1, 512)).to(gpu_device)
mask = torch.ones((1,512),dtype=torch.bool).to(gpu_device)

outputs = model(inputs,mask)
outputs.size(), outputs

(torch.Size([1, 512, 768]),
 tensor([[[-1.2393,  0.5342, -1.2715,  ..., -0.1897, -0.4978,  0.0000],
          [-0.0000, -0.0000, -0.0129,  ..., -0.0146,  0.0913,  0.1405],
          [-0.5132, -1.6133,  0.4812,  ...,  0.2190,  0.4424, -1.9727],
          ...,
          [-0.3931, -0.5239, -0.6118,  ...,  0.0322, -1.4443, -0.2976],
          [ 0.0122,  0.6753, -0.0770,  ...,  0.4658,  1.6504, -0.4805],
          [ 0.9688,  0.0000, -0.3105,  ...,  0.1453,  0.1688,  0.9263]]],
        device='cuda:0', dtype=torch.float16, grad_fn=<NativeDropoutBackward0>))

In [22]:
gpu_device = torch.device("cuda")

model = GPT1Pretrain(30000)
model.to(gpu_device)

inputs = torch.randint(low=0, high=30000, size=(1, 512)).to(gpu_device)
mask = torch.ones((1,512),dtype=torch.bool).to(gpu_device)

outputs = model(inputs,mask)
outputs.size(), outputs

(torch.Size([1, 30000, 512]),
 tensor([[[-17.2500,  -7.3594, -11.6797,  ...,  29.9375,  -7.8477, -48.0312],
          [ 29.1719,  44.8438,  47.4062,  ...,  17.5000, -13.0547, -25.0156],
          [-22.0781, -35.4688, -25.7656,  ..., -33.4688,  18.9531,  13.4453],
          ...,
          [ 12.3281,  14.7891,  12.2969,  ...,  36.3438, -15.1797,  -5.1719],
          [ -3.0137,  35.7188, -28.9375,  ...,   4.0078,  -5.6328,  -7.3906],
          [ 44.0000, -49.5312, -15.0391,  ...,  24.2188, -24.1562, -22.8750]]],
        device='cuda:0', dtype=torch.float16, grad_fn=<TransposeBackward0>))