In [None]:
# https://colab.research.google.com/drive/1vMkH8LmiCCOiCo4KTTEcv-NU8_OGn0ie?usp=sharing#scrollTo=s3ucr4kRmBKk

In [1]:
# torch==1.11.0
import warprnnt_pytorch
import torch
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# All GPU-side comparisons in this notebook run on the first CUDA device.
device = torch.device('cuda:0')

In [3]:
# Shared toy problem: one utterance whose logits tensor has shape (1, 2, 3, 5).
logits = torch.tensor(
    [[[[0.1, 0.6, 0.1, 0.1, 0.1],
       [0.1, 0.1, 0.6, 0.1, 0.1],
       [0.1, 0.1, 0.2, 0.8, 0.1]],
      [[0.1, 0.6, 0.1, 0.1, 0.1],
       [0.1, 0.1, 0.2, 0.1, 0.1],
       [0.7, 0.1, 0.2, 0.1, 0.1]]]],
    dtype=torch.float32,
)

# One independent CPU copy per library so each one accumulates its own .grad.
warp_transducer_logits, torchaudio_logits, optimized_transducer_logits = (
    logits.clone() for _ in range(3)
)

# Same again on the GPU.
logits_cuda = logits.to(device)
warp_transducer_logits_cuda, torchaudio_logits_cuda, optimized_transducer_logits_cuda = (
    logits_cuda.clone() for _ in range(3)
)

# Label sequence [1, 2] plus the per-utterance lengths, all int32.
targets = torch.tensor([[1, 2]], dtype=torch.int32)
logit_lengths = torch.tensor([2], dtype=torch.int32)
target_lengths = torch.tensor([2], dtype=torch.int32)

targets_cuda, logit_lengths_cuda, target_lengths_cuda = (
    cpu_tensor.to(device)
    for cpu_tensor in (targets, logit_lengths, target_lengths)
)

In [4]:
# Sanity-check the shapes of the four loss inputs, one per line.
print(
    logits.shape,
    logit_lengths.shape,
    targets.shape,
    target_lengths.shape,
    sep='\n',
)

torch.Size([1, 2, 3, 5])
torch.Size([1])
torch.Size([1, 2])
torch.Size([1])


In [5]:
# Turn every per-library copy into a gradient-tracking leaf
# (requires_grad_() defaults to True). CPU copies first ...
warp_transducer_logits.requires_grad_()
torchaudio_logits.requires_grad_()
optimized_transducer_logits.requires_grad_()

# ... then the CUDA copies. The final call is the cell's last expression, so
# its tensor is what the notebook displays.
warp_transducer_logits_cuda.requires_grad_()
torchaudio_logits_cuda.requires_grad_()
optimized_transducer_logits_cuda.requires_grad_()

tensor([[[[0.1000, 0.6000, 0.1000, 0.1000, 0.1000],
          [0.1000, 0.1000, 0.6000, 0.1000, 0.1000],
          [0.1000, 0.1000, 0.2000, 0.8000, 0.1000]],

         [[0.1000, 0.6000, 0.1000, 0.1000, 0.1000],
          [0.1000, 0.1000, 0.2000, 0.1000, 0.1000],
          [0.7000, 0.1000, 0.2000, 0.1000, 0.1000]]]], device='cuda:0',
       requires_grad=True)

# warp_transducer

In [6]:
# warp_transducer loss on the CPU inputs (blank index 0, fastemit_lambda 0).
warp_transducer_cpu_loss = warprnnt_pytorch.rnnt_loss(
    warp_transducer_logits,
    targets,
    logit_lengths,
    target_lengths,
    blank=0,
    reduction='mean',
    fastemit_lambda=0,
)

# Identical call on the CUDA copies of the same inputs.
warp_transducer_cuda_loss = warprnnt_pytorch.rnnt_loss(
    warp_transducer_logits_cuda,
    targets_cuda,
    logit_lengths_cuda,
    target_lengths_cuda,
    blank=0,
    reduction='mean',
    fastemit_lambda=0,
)

In [7]:
# Show the CPU and CUDA warp_transducer losses side by side.
print(
    f'warp_transducer, cpu_loss: {warp_transducer_cpu_loss}, cuda_loss: {warp_transducer_cuda_loss}'
)

warp_transducer, cpu_loss: tensor([4.4957], grad_fn=<_RNNTBackward>), cuda_loss: tensor([4.4957], device='cuda:0', grad_fn=<_RNNTBackward>)


In [8]:
# CPU and CUDA warp_transducer losses must agree.
# float32 machine epsilon is ~1.19e-7, so rtol=1e-7 would only pass for
# bit-identical results; 1e-6 still pins about six significant digits and
# matches the tolerance used for the torchaudio CPU/CUDA loss check.
np.testing.assert_allclose(
    warp_transducer_cpu_loss.detach().numpy(),
    warp_transducer_cuda_loss.detach().cpu().numpy(),
    rtol=1e-6,
)

In [9]:
# Backpropagate each loss through its own logits copy; the resulting gradients
# are read from warp_transducer_logits.grad and warp_transducer_logits_cuda.grad
# in the following cells.
warp_transducer_cpu_loss.backward()
warp_transducer_cuda_loss.backward()

In [10]:
# CPU gradient of the warp_transducer loss: values, shape and device, one per line.
print(
    warp_transducer_logits.grad,
    warp_transducer_logits.grad.shape,
    warp_transducer_logits.grad.device,
    sep='\n',
)

tensor([[[[-0.1312, -0.3999,  0.1770,  0.1770,  0.1770],
          [-0.1857,  0.1225, -0.1817,  0.1225,  0.1225],
          [-0.3209,  0.0627,  0.0693,  0.1262,  0.0627]],

         [[ 0.0546, -0.2182,  0.0546,  0.0546,  0.0546],
          [ 0.1207,  0.1207, -0.4830,  0.1207,  0.1207],
          [-0.6926,  0.1687,  0.1865,  0.1687,  0.1687]]]])
torch.Size([1, 2, 3, 5])
cpu


In [11]:
# CUDA gradient of the warp_transducer loss, followed by the device it lives on.
print(
    warp_transducer_logits_cuda.grad,
    warp_transducer_logits_cuda.grad.device,
    sep='\n',
)

tensor([[[[-0.1312, -0.3999,  0.1770,  0.1770,  0.1770],
          [-0.1857,  0.1225, -0.1817,  0.1225,  0.1225],
          [-0.3209,  0.0627,  0.0693,  0.1262,  0.0627]],

         [[ 0.0546, -0.2182,  0.0546,  0.0546,  0.0546],
          [ 0.1207,  0.1207, -0.4830,  0.1207,  0.1207],
          [-0.6926,  0.1687,  0.1865,  0.1687,  0.1687]]]], device='cuda:0')
cuda:0


In [12]:
# The CPU and CUDA warp_transducer gradients must match element-wise.
np.testing.assert_allclose(
    actual=warp_transducer_logits.grad.numpy(),
    desired=warp_transducer_logits_cuda.grad.cpu().numpy(),
    rtol=1e-6,
)

# torchaudio

In [13]:
# Upgrade pip itself, then install the pinned torchaudio build used for the comparison.
!pip install --upgrade pip 
# Optional clean-up step, left disabled: !pip uninstall torchaudio -y
# TMPDIR=$PWD sets the temporary directory for this pip run to the current
# working directory; -i selects the Tsinghua PyPI mirror as the package index.
!TMPDIR=$PWD pip install torchaudio==0.11.0 -i https://pypi.tuna.tsinghua.edu.cn/simple

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


In [14]:
import torchaudio

In [15]:
# torchaudio RNN-T loss on the CPU inputs (blank index 0, mean reduction).
torchaudio_cpu_loss = torchaudio.functional.rnnt_loss(
    torchaudio_logits,
    targets,
    logit_lengths,
    target_lengths,
    blank=0,
    reduction='mean',
)

# Identical call on the CUDA copies of the same inputs.
torchaudio_cuda_loss = torchaudio.functional.rnnt_loss(
    torchaudio_logits_cuda,
    targets_cuda,
    logit_lengths_cuda,
    target_lengths_cuda,
    blank=0,
    reduction='mean',
)

In [16]:
# Show the CPU and CUDA torchaudio losses side by side.
print(
    f'torchaudio, cpu_loss: {torchaudio_cpu_loss}, cuda_loss: {torchaudio_cuda_loss}'
)

torchaudio, cpu_loss: 4.495666980743408, cuda_loss: 4.49566650390625


In [17]:
# The CPU and CUDA torchaudio losses must agree to about six significant digits.
np.testing.assert_allclose(
    actual=torchaudio_cpu_loss.detach().numpy(),
    desired=torchaudio_cuda_loss.detach().cpu().numpy(),
    rtol=1e-6,
)

In [18]:
# Backpropagate each torchaudio loss through its own logits copy; the gradients
# are read from torchaudio_logits.grad and torchaudio_logits_cuda.grad below.
torchaudio_cpu_loss.backward()
torchaudio_cuda_loss.backward()

In [19]:
# Gradient of the torchaudio loss with respect to the CPU logits.
print(torchaudio_logits.grad)

tensor([[[[-0.1312, -0.3999,  0.1770,  0.1770,  0.1770],
          [-0.1857,  0.1225, -0.1817,  0.1225,  0.1225],
          [-0.3209,  0.0627,  0.0693,  0.1262,  0.0627]],

         [[ 0.0546, -0.2182,  0.0546,  0.0546,  0.0546],
          [ 0.1207,  0.1207, -0.4830,  0.1207,  0.1207],
          [-0.6926,  0.1687,  0.1865,  0.1687,  0.1687]]]])


In [20]:
# Gradient of the torchaudio loss with respect to the CUDA logits.
print(torchaudio_logits_cuda.grad)

tensor([[[[-0.1312, -0.3999,  0.1770,  0.1770,  0.1770],
          [-0.1857,  0.1225, -0.1817,  0.1225,  0.1225],
          [-0.3209,  0.0627,  0.0693,  0.1262,  0.0627]],

         [[ 0.0546, -0.2182,  0.0546,  0.0546,  0.0546],
          [ 0.1207,  0.1207, -0.4830,  0.1207,  0.1207],
          [-0.6926,  0.1687,  0.1865,  0.1687,  0.1687]]]], device='cuda:0')


In [21]:
# The CPU and CUDA torchaudio gradients must match element-wise.
np.testing.assert_allclose(
    actual=torchaudio_logits.grad.numpy(),
    desired=torchaudio_logits_cuda.grad.cpu().numpy(),
    rtol=1e-6,
)

# Cross-check warp_transducer against torchaudio

In [22]:
# Cross-library loss check, compared device-for-device (CPU vs CPU, CUDA vs
# CUDA) in the same way as the gradient checks that follow.
# rtol is 1e-6 because float32 machine epsilon is ~1.19e-7: a tolerance of 1e-7
# passes only for bit-identical losses, which two independent implementations
# are not guaranteed to produce.
np.testing.assert_allclose(
    warp_transducer_cpu_loss.detach().numpy(),
    torchaudio_cpu_loss.detach().numpy(),
    rtol=1e-6,
)
np.testing.assert_allclose(
    warp_transducer_cuda_loss.detach().cpu().numpy(),
    torchaudio_cuda_loss.detach().cpu().numpy(),
    rtol=1e-6,
)

In [23]:
# CPU gradients: warp_transducer vs torchaudio.
np.testing.assert_allclose(
    actual=warp_transducer_logits.grad.numpy(),
    desired=torchaudio_logits.grad.numpy(),
    rtol=1e-6,
)

In [24]:
# CUDA gradients: warp_transducer vs torchaudio (both copied to host first).
np.testing.assert_allclose(
    actual=warp_transducer_logits_cuda.grad.cpu().numpy(),
    desired=torchaudio_logits_cuda.grad.cpu().numpy(),
    rtol=1e-6,
)

# optimized_transducer
# https://github.com/csukuangfj/optimized_transducer

In [25]:
# Remove any previously installed optimized_transducer (-y skips the
# confirmation prompt) before the install step in the next cell.
!pip uninstall optimized_transducer -y

Found existing installation: optimized-transducer 1.4
Uninstalling optimized-transducer-1.4:
  Successfully uninstalled optimized-transducer-1.4


In [26]:
# Install optimized_transducer with pip. TMPDIR=$PWD sets the temporary
# directory to the current working directory; OT_CMAKE_ARGS and OT_MAKE_ARGS
# carry the CMake and make options for the package build.
# NOTE(review): OT_COMPUTE_ARCHS=37 is requested here, while the Paddle log
# later in this notebook reports compute capability 7.0 for device 0 — confirm
# the architecture setting matches the GPU actually used.
!TMPDIR=$PWD OT_CMAKE_ARGS="-DOT_COMPUTE_ARCHS=37 -DCMAKE_BUILD_TYPE=Release" OT_MAKE_ARGS="-j" pip install --verbose optimized_transducer

Using pip 22.3.1 from /workspace/DeepSpeech-2.x/speechx/venv/lib/python3.7/site-packages/pip (python 3.7)
Collecting optimized_transducer
  Using cached optimized_transducer-1.4-cp37-cp37m-linux_x86_64.whl
Installing collected packages: optimized_transducer
Successfully installed optimized_transducer-1.4


In [27]:
import optimized_transducer

In [28]:
# optimized_transducer is fed the logits flattened to 2-D (rows x vocab size).
# from_log_softmax=False marks the inputs as raw logits.
optimized_transducer_cpu_loss = optimized_transducer.transducer_loss(
    optimized_transducer_logits.reshape(-1, logits.shape[-1]),
    targets,
    logit_lengths,
    target_lengths,
    blank=0,
    from_log_softmax=False,
    reduction='mean',
)

# Identical call on the CUDA copies of the same inputs.
optimized_transducer_cuda_loss = optimized_transducer.transducer_loss(
    optimized_transducer_logits_cuda.reshape(-1, logits.shape[-1]),
    targets_cuda,
    logit_lengths_cuda,
    target_lengths_cuda,
    blank=0,
    from_log_softmax=False,
    reduction='mean',
)

In [29]:
# Show the CPU and CUDA optimized_transducer losses side by side.
print(
    f'optimized_transducer, cpu_loss: {optimized_transducer_cpu_loss}, cuda_loss: {optimized_transducer_cuda_loss}'
)

optimized_transducer, cpu_loss: 4.495666980743408, cuda_loss: 4.495666980743408


In [30]:
# CPU and CUDA optimized_transducer losses must agree.
# float32 machine epsilon is ~1.19e-7, so rtol=1e-7 would only pass for
# bit-identical results; 1e-6 still pins about six significant digits and
# matches the tolerance used for the torchaudio CPU/CUDA loss check.
np.testing.assert_allclose(
    optimized_transducer_cpu_loss.detach().numpy(),
    optimized_transducer_cuda_loss.detach().cpu().numpy(),
    rtol=1e-6,
)

In [31]:
# Backpropagate each optimized_transducer loss through its own logits copy; the
# gradients are read from optimized_transducer_logits.grad and
# optimized_transducer_logits_cuda.grad below.
optimized_transducer_cpu_loss.backward()
optimized_transducer_cuda_loss.backward()

In [32]:
# CPU gradient first, then the CUDA gradient, each on its own line(s).
print(
    optimized_transducer_logits.grad,
    optimized_transducer_logits_cuda.grad,
    sep='\n',
)

tensor([[[[-0.1312, -0.3999,  0.1770,  0.1770,  0.1770],
          [-0.1857,  0.1225, -0.1817,  0.1225,  0.1225],
          [-0.3209,  0.0627,  0.0693,  0.1262,  0.0627]],

         [[ 0.0546, -0.2182,  0.0546,  0.0546,  0.0546],
          [ 0.1207,  0.1207, -0.4830,  0.1207,  0.1207],
          [-0.6926,  0.1687,  0.1865,  0.1687,  0.1687]]]])
tensor([[[[-0.1312, -0.3999,  0.1770,  0.1770,  0.1770],
          [-0.1857,  0.1225, -0.1817,  0.1225,  0.1225],
          [-0.3209,  0.0627,  0.0693,  0.1262,  0.0627]],

         [[ 0.0546, -0.2182,  0.0546,  0.0546,  0.0546],
          [ 0.1207,  0.1207, -0.4830,  0.1207,  0.1207],
          [-0.6926,  0.1687,  0.1865,  0.1687,  0.1687]]]], device='cuda:0')


In [33]:
# The CPU and CUDA optimized_transducer gradients must match element-wise.
np.testing.assert_allclose(
    actual=optimized_transducer_logits.grad.numpy(),
    desired=optimized_transducer_logits_cuda.grad.cpu().numpy(),
    rtol=1e-6,
)

In [34]:
# torchaudio and optimized_transducer gradients must agree within the default
# allclose tolerances: CPU vs CPU, CUDA vs CUDA, and CPU vs CUDA copied to host.
assert torchaudio_logits.grad.allclose(optimized_transducer_logits.grad)
assert torchaudio_logits_cuda.grad.allclose(optimized_transducer_logits_cuda.grad)
assert torchaudio_logits.grad.allclose(optimized_transducer_logits_cuda.grad.cpu())

# paddle rnnt

In [35]:
import paddle

# Run the Paddle reference on the CPU first.
paddle.set_device('cpu')

# Same toy logits as the torch section, rebuilt as a gradient-tracking Paddle tensor.
logits = paddle.to_tensor(
    [[[[0.1, 0.6, 0.1, 0.1, 0.1],
       [0.1, 0.1, 0.6, 0.1, 0.1],
       [0.1, 0.1, 0.2, 0.8, 0.1]],
      [[0.1, 0.6, 0.1, 0.1, 0.1],
       [0.1, 0.1, 0.2, 0.1, 0.1],
       [0.7, 0.1, 0.2, 0.1, 0.1]]]],
    dtype=paddle.float32,
    stop_gradient=False,
)

targets = paddle.to_tensor([[1, 2]], dtype=paddle.int32)
logit_lengths = paddle.to_tensor([2], dtype=paddle.int32)
target_lengths = paddle.to_tensor([2], dtype=paddle.int32)

# Author's note: the CPU path does not apply log_softmax by default, so it is
# applied explicitly before the loss.
logprob = paddle.nn.functional.log_softmax(logits)
rnnt_loss_cpu = paddle.nn.functional.rnnt_loss(
    logprob,
    targets,
    logit_lengths,
    target_lengths,
    blank=0,
    reduction='mean',
    fastemit_lambda=0,
)

print(f'warp_transducer, cpu_loss: {rnnt_loss_cpu}')

# Keep a copy of the CPU gradient; `logits` is rebound in later cells.
rnnt_loss_cpu.backward()
logits_grad_cpu = logits.grad.clone()
print(logits_grad_cpu)

warp_transducer, cpu_loss: Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=False,
       [4.49566650])
Tensor(shape=[1, 2, 3, 5], dtype=float32, place=Place(cpu), stop_gradient=False,
       [[[[-0.13116686, -0.39992690,  0.17703126,  0.17703126,  0.17703126],
          [-0.18572757,  0.12247056, -0.18168412,  0.12247056,  0.12247056],
          [-0.32091251,  0.06269141,  0.06928471,  0.12624499,  0.06269141]],

         [[ 0.05456069, -0.21824276,  0.05456069,  0.05456069,  0.05456069],
          [ 0.12073959,  0.12073959, -0.48295838,  0.12073959,  0.12073959],
          [-0.69258857,  0.16871126,  0.18645476,  0.16871126,  0.16871126]]]])


In [36]:
# Repeat the Paddle run on the GPU.
paddle.set_device('gpu')

# Tensors are re-created so they are allocated on the newly selected device.
logits = paddle.to_tensor(
    [[[[0.1, 0.6, 0.1, 0.1, 0.1],
       [0.1, 0.1, 0.6, 0.1, 0.1],
       [0.1, 0.1, 0.2, 0.8, 0.1]],
      [[0.1, 0.6, 0.1, 0.1, 0.1],
       [0.1, 0.1, 0.2, 0.1, 0.1],
       [0.7, 0.1, 0.2, 0.1, 0.1]]]],
    dtype=paddle.float32,
    stop_gradient=False,
)

targets = paddle.to_tensor([[1, 2]], dtype=paddle.int32)
logit_lengths = paddle.to_tensor([2], dtype=paddle.int32)
target_lengths = paddle.to_tensor([2], dtype=paddle.int32)

# Author's note: the GPU path applies log_softmax by default, so raw logits are
# passed straight to the loss.
rnnt_loss_gpu = paddle.nn.functional.rnnt_loss(
    logits,
    targets,
    logit_lengths,
    target_lengths,
    blank=0,
    reduction='mean',
    fastemit_lambda=0,
)
print(f'warp_transducer, cuda_loss: {rnnt_loss_gpu}')

# Keep a copy of the GPU gradient for the cross-checks that follow.
rnnt_loss_gpu.backward()
logits_grad_gpu = logits.grad.clone()
print(logits_grad_gpu)

warp_transducer, cuda_loss: Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=False,
       [4.49566650])
Tensor(shape=[1, 2, 3, 5], dtype=float32, place=Place(gpu:0), stop_gradient=False,
       [[[[-0.13116689, -0.39992687,  0.17703123,  0.17703123,  0.17703123],
          [-0.18572746,  0.12247057, -0.18168400,  0.12247057,  0.12247057],
          [-0.32091236,  0.06269139,  0.06928468,  0.12624492,  0.06269139]],

         [[ 0.05456069, -0.21824265,  0.05456069,  0.05456069,  0.05456069],
          [ 0.12073954,  0.12073954, -0.48295826,  0.12073954,  0.12073954],
          [-0.69258809,  0.16871123,  0.18645471,  0.16871123,  0.16871123]]]])


W1215 04:46:47.980068 56399 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 10.2, Runtime API Version: 10.2
W1215 04:46:47.986979 56399 gpu_resources.cc:91] device: 0, cuDNN Version: 7.6.


In [37]:
# Paddle CPU vs Paddle GPU loss, then Paddle GPU vs the warp_transducer CUDA loss.
# The losses are float32 values from different backends; assert_allclose's
# default rtol of 1e-7 is below float32 machine epsilon (~1.19e-7) and so only
# passes for bit-identical results. An explicit 1e-6 keeps about six
# significant digits without relying on that.
np.testing.assert_allclose(
    rnnt_loss_cpu.item(),
    rnnt_loss_gpu.item(),
    rtol=1e-6,
)
np.testing.assert_allclose(
    rnnt_loss_gpu.item(),
    warp_transducer_cuda_loss.detach().cpu().numpy(),
    rtol=1e-6,
)

In [38]:
# Paddle GPU gradient vs Paddle CPU gradient.
np.testing.assert_allclose(
    actual=logits_grad_gpu.numpy(),
    desired=logits_grad_cpu.numpy(),
    rtol=1e-6,
)
# warp_transducer (torch, CPU) gradient vs Paddle GPU gradient.
np.testing.assert_allclose(
    actual=warp_transducer_logits.grad.numpy(),
    desired=logits_grad_gpu.numpy(),
    rtol=1e-6,
)

# time cost

## warp_transducer

In [39]:
# Rebuild the torch inputs for the timing runs; the Paddle cells above rebound
# `logits`, `targets` and the length tensors to Paddle tensors.
logits = torch.tensor(
    [[[[0.1, 0.6, 0.1, 0.1, 0.1],
       [0.1, 0.1, 0.6, 0.1, 0.1],
       [0.1, 0.1, 0.2, 0.8, 0.1]],
      [[0.1, 0.6, 0.1, 0.1, 0.1],
       [0.1, 0.1, 0.2, 0.1, 0.1],
       [0.7, 0.1, 0.2, 0.1, 0.1]]]],
    dtype=torch.float32,
)

# Fresh CPU copies, one per library.
warp_transducer_logits, torchaudio_logits, optimized_transducer_logits = (
    logits.clone() for _ in range(3)
)

# Fresh CUDA copies, one per library.
logits_cuda = logits.to(device)
warp_transducer_logits_cuda, torchaudio_logits_cuda, optimized_transducer_logits_cuda = (
    logits_cuda.clone() for _ in range(3)
)

targets = torch.tensor([[1, 2]], dtype=torch.int32)
logit_lengths = torch.tensor([2], dtype=torch.int32)
target_lengths = torch.tensor([2], dtype=torch.int32)

targets_cuda, logit_lengths_cuda, target_lengths_cuda = (
    cpu_tensor.to(device)
    for cpu_tensor in (targets, logit_lengths, target_lengths)
)

In [40]:
%%timeit
warp_transducer_cuda_loss = warprnnt_pytorch.rnnt_loss(warp_transducer_logits_cuda, 
                                                       targets_cuda, 
                                                       logit_lengths_cuda, 
                                                       target_lengths_cuda,
                                                       blank=0,
                                                       reduction='mean',
                                                       fastemit_lambda=0)

260 µs ± 567 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [41]:
%%timeit
warp_transducer_cpu_loss = warprnnt_pytorch.rnnt_loss(warp_transducer_logits, 
                                                      targets, 
                                                      logit_lengths, 
                                                      target_lengths,
                                                      blank=0,
                                                      reduction='mean',
                                                      fastemit_lambda=0)

109 µs ± 5.39 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


## paddle

In [42]:
# Select the GPU and rebuild the toy inputs as Paddle tensors for the timing run.
paddle.set_device('gpu')

logits = paddle.to_tensor(
    [[[[0.1, 0.6, 0.1, 0.1, 0.1],
       [0.1, 0.1, 0.6, 0.1, 0.1],
       [0.1, 0.1, 0.2, 0.8, 0.1]],
      [[0.1, 0.6, 0.1, 0.1, 0.1],
       [0.1, 0.1, 0.2, 0.1, 0.1],
       [0.7, 0.1, 0.2, 0.1, 0.1]]]],
    dtype=paddle.float32,
    stop_gradient=False,
)

targets = paddle.to_tensor([[1, 2]], dtype=paddle.int32)
logit_lengths = paddle.to_tensor([2], dtype=paddle.int32)
target_lengths = paddle.to_tensor([2], dtype=paddle.int32)

In [43]:
%%timeit
# gpu do log_softmax by default
rnnt_loss_gpu = paddle.nn.functional.rnnt_loss(logits, 
                                               targets, 
                                               logit_lengths, 
                                                target_lengths,
                                                blank=0,
                                                reduction='mean',
                                                fastemit_lambda=0)

133 µs ± 304 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [44]:
# Switch back to the CPU and rebuild the toy inputs there for the CPU timing run.
paddle.set_device('cpu')

logits = paddle.to_tensor(
    [[[[0.1, 0.6, 0.1, 0.1, 0.1],
       [0.1, 0.1, 0.6, 0.1, 0.1],
       [0.1, 0.1, 0.2, 0.8, 0.1]],
      [[0.1, 0.6, 0.1, 0.1, 0.1],
       [0.1, 0.1, 0.2, 0.1, 0.1],
       [0.7, 0.1, 0.2, 0.1, 0.1]]]],
    dtype=paddle.float32,
    stop_gradient=False,
)

targets = paddle.to_tensor([[1, 2]], dtype=paddle.int32)
logit_lengths = paddle.to_tensor([2], dtype=paddle.int32)
target_lengths = paddle.to_tensor([2], dtype=paddle.int32)

In [45]:
%%timeit
logprob = paddle.nn.functional.log_softmax(logits)
rnnt_loss_cpu = paddle.nn.functional.rnnt_loss(logprob, 
                                               targets, 
                                               logit_lengths, 
                                                target_lengths,
                                                blank=0,
                                                reduction='mean',
                                                fastemit_lambda=0)

39.8 µs ± 375 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
