<a href="https://colab.research.google.com/github/WHU-Peter/COMP6200-Project/blob/main/softmax_benchmark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
! /opt/bin/nvidia-smi

Sat Aug 21 23:38:12 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P0    34W / 250W |   1723MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [28]:
import torch
import timeit
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.benchmark as benchmark

# fix random seed for reproducibility
# seed = 7
# torch.manual_seed(seed)
# torch.backends.cudnn.deterministic = True
# torch.backends.cudnn.benchmark = True
# import numpy as np
# np.random.seed(seed)

device = "cuda:0" if torch.cuda.is_available() else "cpu"

cpu

In [29]:
def matrix_multiplication(x, y):
  return x @ y

def softmax(x, y):
  pro = F.softmax(x/0.001, dim=-1)
  nozero = torch.nonzero(pro);
  out = np.zeros((x.shape[0], y.shape[1]))
  out = torch.tensor(out)
  # print(np.array(nozero).shape[1])
  for i in range(x.shape[0]):
    idx = torch.where(nozero[:,0]==i)[0]
    rows = nozero[idx, 1].long()
    out[i] = torch.mean(emb(rows), axis=0)
    return out.float()

# Compare takes a list of measurements which we'll save in results.
results = []

batch_size = 16
target_dim = 1000
hidden_dim = [10, 100, 500, 1000, 5000, 10000, 15000, 20000, 50000]
for h in hidden_dim:
    # label and sub_label are the rows
    # description is the column
    label = 'Batched Matrix Multipication and SoftMax'
    sub_label = f'[{h}]'
    x = torch.randn((batch_size, h))
    y = torch.randn((h, target_dim))
    emb = nn.Embedding(h, target_dim)
    results.append(benchmark.Timer(
            stmt='matrix_multiplication(x, y)',
            setup='from __main__ import matrix_multiplication',
            globals={'x': x, 'y' : y},
            label=label,
            sub_label=sub_label,
            description='matrix_multiplication',
    ).blocked_autorange(min_run_time=1))
    results.append(benchmark.Timer(
            stmt='softmax(x, y)',
            setup='from __main__ import softmax',
            globals={'x': x, 'y' : y},
            label=label,
            sub_label=sub_label,
            description='softmax_look-up_table',
    ).blocked_autorange(min_run_time=1))

compare = benchmark.Compare(results)
compare.print()

[----------- Batched Matrix Multipication and SoftMax ----------]
               |  matrix_multiplication  |  softmax_look-up_table
1 threads: ------------------------------------------------------
      [10]     |             5.9         |           113.4       
      [100]    |            30.5         |           127.2       
      [500]    |           153.7         |           176.6       
      [1000]   |           317.5         |           245.7       
      [5000]   |          1531.0         |           811.2       
      [10000]  |          4546.8         |          1526.0       
      [15000]  |          8827.8         |          2228.6       
      [20000]  |         13780.9         |          3035.4       
      [50000]  |         36299.0         |          6601.2       

Times are in microseconds (us).



gpu

In [30]:
def matrix_multiplication(x, y):
  return x @ y

def softmax(x, y):
  pro = F.softmax(x/0.001, dim=-1)
  nozero = torch.nonzero(pro);
  out = torch.zeros((x.shape[0], y.shape[1]))
  out = torch.tensor(out)
  # print(np.array(nozero).shape[1])
  for i in range(x.shape[0]):
    idx = torch.where(nozero[:,0]==i)[0]
    rows = nozero[idx, 1].long()
    out[i] = torch.mean(emb(rows), axis=0)
    return out.float()

# Compare takes a list of measurements which we'll save in results.
results = []

batch_size = 16
target_dim = 1000
hidden_dim = [10, 100, 500, 1000, 5000, 10000, 15000, 20000, 50000]
for h in hidden_dim:
    # label and sub_label are the rows
    # description is the column
    label = 'Batched Matrix Multipication and SoftMax'
    sub_label = f'[{h}]'
    x = torch.randn((batch_size, h)).to(device)
    y = torch.randn((h, target_dim)).to(device)
    emb = nn.Embedding(h, target_dim).to(device)
    torch.cuda.empty_cache()
    results.append(benchmark.Timer(
            stmt='matrix_multiplication(x, y)',
            setup='from __main__ import matrix_multiplication',
            globals={'x': x, 'y' : y},
            label=label,
            sub_label=sub_label,
            description='matrix_multiplication',
    ).blocked_autorange(min_run_time=1))
    torch.cuda.empty_cache()
    results.append(benchmark.Timer(
            stmt='softmax(x, y)',
            setup='from __main__ import softmax',
            globals={'x': x, 'y' : y},
            label=label,
            sub_label=sub_label,
            description='softmax_look-up_table',
    ).blocked_autorange(min_run_time=1))
    torch.cuda.empty_cache()

compare = benchmark.Compare(results)
compare.print()

  


[----------- Batched Matrix Multipication and SoftMax ----------]
               |  matrix_multiplication  |  softmax_look-up_table
1 threads: ------------------------------------------------------
      [10]     |             6.9         |          250.3        
      [100]    |             6.7         |          250.3        
      [500]    |            15.5         |          252.0        
      [1000]   |            34.2         |          260.9        
      [5000]   |           157.7         |          260.0        
      [10000]  |           308.9         |          269.4        
      [15000]  |           459.4         |          280.7        
      [20000]  |           610.4         |          279.0        
      [50000]  |          1519.4         |          336.7        

Times are in microseconds (us).

