<a href="https://colab.research.google.com/github/WHU-Peter/COMP6200-Project/blob/main/softmax_benchmark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
! /opt/bin/nvidia-smi

Sat Aug 21 23:38:12 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P0    34W / 250W |   1723MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [33]:
import torch
import timeit
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.benchmark as benchmark
from itertools import product

device = "cuda:0" if torch.cuda.is_available() else "cpu"

cpu

In [34]:
def matrix_multiplication(x, y):
  return x @ y

def softmax(x, y):
  pro = F.softmax(x/0.001, dim=-1)
  nozero = torch.nonzero(pro);
  out = np.zeros((x.shape[0], y.shape[1]))
  out = torch.tensor(out)
  # print(np.array(nozero).shape[1])
  for i in range(x.shape[0]):
    idx = torch.where(nozero[:,0]==i)[0]
    rows = nozero[idx, 1].long()
    out[i] = torch.mean(emb(rows), axis=0)
    return out.float()

# Compare takes a list of measurements which we'll save in results.
results = []

batch_size = 16
target_dim = [100, 500, 1000, 5000]
hidden_dim = [100, 500, 1000, 5000, 10000, 15000, 20000, 50000]
for h, t in product(hidden_dim, target_dim):
    # label and sub_label are the rows
    # description is the column
    label = 'Batched Matrix Multipication and SoftMax'
    sub_label = f'[{h}, {t}]'
    x = torch.randn((batch_size, h))
    y = torch.randn((h, t))
    emb = nn.Embedding(h, t)
    results.append(benchmark.Timer(
            stmt='matrix_multiplication(x, y)',
            setup='from __main__ import matrix_multiplication',
            globals={'x': x, 'y' : y},
            label=label,
            sub_label=sub_label,
            description='matrix_multiplication',
    ).blocked_autorange(min_run_time=1))
    results.append(benchmark.Timer(
            stmt='softmax(x, y)',
            setup='from __main__ import softmax',
            globals={'x': x, 'y' : y},
            label=label,
            sub_label=sub_label,
            description='softmax_look-up_table',
    ).blocked_autorange(min_run_time=1))

compare = benchmark.Compare(results)
compare.print()

[-------------- Batched Matrix Multipication and SoftMax -------------]
                     |  matrix_multiplication  |  softmax_look-up_table
1 threads: ------------------------------------------------------------
      [100, 100]     |              5.7        |           108.8       
      [100, 500]     |             16.0        |           124.4       
      [100, 1000]    |             26.9        |           130.3       
      [100, 5000]    |            189.2        |           489.4       
      [500, 100]     |             16.6        |           160.2       
      [500, 500]     |             70.7        |           170.1       
      [500, 1000]    |            153.5        |           189.1       
      [500, 5000]    |            771.8        |           466.4       
      [1000, 100]    |             29.4        |           223.6       
      [1000, 500]    |            150.2        |           224.1       
      [1000, 1000]   |            301.0        |           244.0

gpu

In [37]:
def matrix_multiplication(x, y):
  return x @ y

def softmax(x, y):
  pro = F.softmax(x/0.001, dim=-1)
  nozero = torch.nonzero(pro);
  out = torch.zeros((x.shape[0], y.shape[1]))
  out = torch.tensor(out)
  # print(np.array(nozero).shape[1])
  for i in range(x.shape[0]):
    idx = torch.where(nozero[:,0]==i)[0]
    rows = nozero[idx, 1].long()
    out[i] = torch.mean(emb(rows), axis=0)
    return out.float()

# Compare takes a list of measurements which we'll save in results.
results = []

batch_size = 16
target_dim = [100, 500, 1000, 5000]
hidden_dim = [100, 500, 1000, 5000, 10000, 15000, 20000, 50000]
for h, t in product(hidden_dim, target_dim):
    # label and sub_label are the rows
    # description is the column
    label = 'Batched Matrix Multipication and SoftMax'
    sub_label = f'[{h}, {t}]'
    x = torch.randn((batch_size, h)).to(device)
    y = torch.randn((h, t)).to(device)
    emb = nn.Embedding(h, t).to(device)
    torch.cuda.empty_cache()
    results.append(benchmark.Timer(
            stmt='matrix_multiplication(x, y)',
            setup='from __main__ import matrix_multiplication',
            globals={'x': x, 'y' : y},
            label=label,
            sub_label=sub_label,
            description='matrix_multiplication',
    ).blocked_autorange(min_run_time=1))
    torch.cuda.empty_cache()
    results.append(benchmark.Timer(
            stmt='softmax(x, y)',
            setup='from __main__ import softmax',
            globals={'x': x, 'y' : y},
            label=label,
            sub_label=sub_label,
            description='softmax_look-up_table',
    ).blocked_autorange(min_run_time=1))
    torch.cuda.empty_cache()

compare = benchmark.Compare(results)
compare.print()

  


[-------------- Batched Matrix Multipication and SoftMax -------------]
                     |  matrix_multiplication  |  softmax_look-up_table
1 threads: ------------------------------------------------------------
      [100, 100]     |             7.2         |          241.7        
      [100, 500]     |             7.0         |          251.1        
      [100, 1000]    |             7.0         |          253.1        
      [100, 5000]    |            10.3         |          512.3        
      [500, 100]     |            15.4         |          245.8        
      [500, 500]     |            15.4         |          249.0        
      [500, 1000]    |            15.5         |          268.3        
      [500, 5000]    |            33.0         |          501.8        
      [1000, 100]    |            27.7         |          253.9        
      [1000, 500]    |            27.7         |          262.6        
      [1000, 1000]   |            32.8         |          274.1 