In [1]:
# Reload imported modules before each cell runs so edits to project code are picked up live.
%load_ext autoreload
%autoreload 2

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
import torch
import whisper
import numpy as np
import coremltools as ct

from pathlib import Path
from whisper_ane.arch.encoder import AudioEncoderANE
from whisper_ane.export.utils import *

# Print tensors with 4 decimals and without scientific notation.
torch.set_printoptions(precision=4, sci_mode=False)

In [3]:
# EXPORT ARGS -- TO BE INPUT BY USER
# Whisper checkpoint name; it is checked against `whisper.available_models()` further down.
ARCH = "base"
# Export root, derived from the current user's home directory instead of a
# machine-specific absolute path so the notebook runs unchanged on other machines.
BASE_EXPORT_DIR = str(Path.home() / "Desktop" / "Whisper-ANE-Encoder" / ARCH)
# Precision passed to `ct.convert`. Use `ct.precision.FLOAT16` for half precision;
# `None` lets coremltools choose its default, which is FLOAT16 for ML Programs.
COMPUTE_PRECISION = ct.precision.FLOAT32
# `None` lets coremltools pick the deployment target; e.g. `ct.target.macOS12` pins it.
MIN_DEPLOYMENT_TARGET = None

# Input / output names
AUDIO_INPUT_NAME = "logmel_data"
ENCODER_OUTPUT_NAME = "encoded_audio"

In [4]:
# Checks and stuff
# With `convert_to="mlprogram"`, coremltools falls back to FLOAT16 whenever
# `compute_precision` is not explicitly FLOAT32 (including `None`), so the
# file-name suffix must only say fp32 for an explicit FLOAT32 request.
_PRECISION_BITS = 32 if COMPUTE_PRECISION is ct.precision.FLOAT32 else 16
EXPORT_SUFFIX = f"--precision-fp{_PRECISION_BITS}"

# Fail early on a typo in ARCH instead of deep inside `whisper.load_model`.
_AVAILABLE_ARCHS = whisper.available_models()
if ARCH not in _AVAILABLE_ARCHS:
    raise RuntimeError(
        f"Selected arch '{ARCH}' is not available. Available options are: {_AVAILABLE_ARCHS}"
    )

# Normalise to a Path (safe to re-run: Path(Path) is a no-op) and create the export directory.
BASE_EXPORT_DIR = Path(BASE_EXPORT_DIR)
BASE_EXPORT_DIR.mkdir(exist_ok=True, parents=True)

In [5]:
# Quick utils
permute_dims = (0, 3, 1, 2)


def tfm(x):
    """Reorder the axes of `x` by `permute_dims`, then squeeze the last axis if it has size 1."""
    reordered = x.permute(permute_dims)
    return reordered.squeeze(-1)


def abs_diff(x1, x2):
    """Return the sum of element-wise absolute differences between `x1` and `x2`."""
    delta = x1 - x2
    return delta.abs().sum()

### Load Models

In [6]:
model = whisper.load_model(ARCH)
# Dummy log-mel input sized from the checkpoint rather than hard-coded, so
# variants with a different number of mel bins also work. The encoder halves
# the time axis, hence 2 * n_audio_ctx input frames ((1, 80, 3000) for "base").
# A dedicated seeded generator makes the reported diffs reproducible without
# touching the global RNG state.
_n_frames = 2 * model.dims.n_audio_ctx
x = torch.rand(
    1, model.dims.n_mels, _n_frames, generator=torch.Generator().manual_seed(0)
)

In [7]:
# Reference encoder from the loaded checkpoint, and its ANE counterpart built from it.
encoder_stock = model.encoder
encoder_ane = AudioEncoderANE.from_stock_encoder(encoder_stock)

# Put both in eval mode; calling inside a loop keeps the module reprs out of the cell output.
for _encoder in (encoder_stock, encoder_ane):
    _encoder.eval()

In [8]:
# Eager forward pass through both encoders; no gradients needed for a parity check.
with torch.no_grad():
    out_orig, out_ane = encoder_stock(x), encoder_ane(x)

(out_orig.shape, out_ane.shape)

(torch.Size([1, 1500, 512]), torch.Size([1, 512, 1, 1500]))

In [9]:
# Bring the ANE output into the stock layout once, then compare the two.
out_ane_as_stock = tfm(out_ane)
"Abs Diff: ", abs_diff(out_orig, out_ane_as_stock)
"Cos Sim:  ", torch.cosine_similarity(out_orig, out_ane_as_stock).mean()

('Abs Diff: ', tensor(2.8798))

('Cos Sim:  ', tensor(1.))

### JIT Trace

In [10]:
%%time
encoder_ane_jit = torch.jit.trace(encoder_ane, x)
encoder_stock_jit = torch.jit.trace(encoder_stock, x)

  assert inputs.size(1) == self.num_channels
  assert x.shape[2] == 1, f"Expected third dim to be 1. Got {x.shape[2]} instead (full shape -> {x.shape})."
  dim_per_head = dim // self.n_head
  normalize_factor = float(dim_per_head) ** -0.5
  assert x.shape[1:] == self.positional_embedding.shape, "incorrect audio shape"
  scale = (n_state // self.n_head) ** -0.25


CPU times: user 30.1 s, sys: 613 ms, total: 30.7 s
Wall time: 24.1 s


In [11]:
%%time
out_ane_jit = encoder_ane_jit(x)
out_stock_jit = encoder_stock_jit(x)

CPU times: user 6.37 s, sys: 121 ms, total: 6.49 s
Wall time: 5.21 s


In [12]:
# Parity of the traced modules: stock vs ANE (in stock layout), then traced ANE vs eager ANE.
out_ane_jit_as_stock = tfm(out_ane_jit)
"PyTorch jitted stock - PyTorch jitted ANE", abs_diff(out_stock_jit, out_ane_jit_as_stock)
torch.allclose(out_stock_jit, out_ane_jit_as_stock, atol=1e-2)
torch.allclose(out_ane_jit, out_ane)

('PyTorch jitted stock - PyTorch jitted ANE',
 tensor(2.8798, grad_fn=<SumBackward0>))

True

True

### CoreML Util - Add Metadata To Model

In [13]:
from typing import List, Optional

def add_metadata_for_encoder(
    encoder: ct.models.MLModel,
    input_name: str = 'logmel_data',
    output_name: str = 'audio_embedding',
    output_shape: Optional[List[int]] = None,
    arch: Optional[str] = None,
) -> ct.models.MLModel:
    """Return a copy of `encoder` with descriptive metadata and a named, shaped output.

    The model's single output feature is renamed to `output_name`, its static
    shape is set to `output_shape`, and author / license / description fields
    are filled in. The output's current name is read from the model spec, so no
    prediction is run and no notebook-level input tensor is needed.

    Args:
        encoder: Converted Core ML encoder to annotate.
        input_name: Name of the model's audio input feature.
        output_name: New name for the model's single output feature.
        output_shape: Static shape to record for the output (required).
        arch: Whisper variant named in the description; defaults to the
            notebook-level `ARCH`.

    Returns:
        A new `MLModel` built from the updated spec and `encoder.weights_dir`.

    Raises:
        ValueError: If `output_shape` is missing/empty or the model does not
            have exactly one output.
    """
    if not output_shape:
        raise ValueError("`output_shape` required")
    output_shape = [int(dim) for dim in output_shape]
    arch = ARCH if arch is None else arch

    spec = encoder.get_spec()

    # Add top level metadata
    spec.description.metadata.author = "OpenAI / Ozu"  # ...?
    spec.description.metadata.license = "MIT"
    spec.description.metadata.shortDescription = (
        f"'{arch}' variant of OpenAI's Whisper (https://github.com/openai/whisper) "
        "optimised for the ANE using the principles outlined in Apple's repo "
        "(https://github.com/apple/ml-ane-transformers)"
    )

    # rename output feature (the auto-generated name is taken from the spec itself)
    outputs = spec.description.output
    if len(outputs) != 1:
        raise ValueError(f"Expected exactly one model output, found {len(outputs)}")
    ct.utils.rename_feature(spec, outputs[0].name, output_name)

    # add more metadata about inputs & outputs
    # NOTE(review): `find_io_type` is presumably provided by the star import from
    # `whisper_ane.export.utils`; its keyword arguments are kept exactly as before.
    input_type = find_io_type(spec, input_name, search_inputs=True)
    input_type.shortDescription = "Mel spectrogram audio input"

    output_type = find_io_type(spec, output_name, search_outputs=True)
    array_type = output_type.type.multiArrayType
    # Reset first so a spec that already carries a shape does not end up with duplicated dims.
    array_type.ClearField("shape")
    array_type.shape.extend(output_shape)

    # Describe the layout that matches the recorded rank: the ANE encoder emits
    # 4-D output, the stock encoder 3-D.
    if len(output_shape) == 4:
        layout = "(BS, embed_dim, 1, seq_len)"
    elif len(output_shape) == 3:
        layout = "(BS, seq_len, embed_dim)"
    else:
        layout = str(tuple(output_shape))
    output_type.shortDescription = f"Audio embeddings in the shape {layout}"

    return ct.models.MLModel(spec, weights_dir=encoder.weights_dir)

### Export Stock Model To CoreML

In [14]:
# Convert the traced stock encoder to a Core ML ML Program with a fixed input shape.
_stock_input_spec = ct.TensorType(name=AUDIO_INPUT_NAME, shape=x.shape)
_encoder_stock_mlmodel = ct.convert(
    encoder_stock_jit,
    inputs=[_stock_input_spec],
    convert_to="mlprogram",
    compute_precision=COMPUTE_PRECISION,
    minimum_deployment_target=MIN_DEPLOYMENT_TARGET,
)

Converting PyTorch Frontend ==> MIL Ops: 100%|███████████████████████████████████▉| 531/532 [00:00<00:00, 3071.58 ops/s]
Running MIL Common passes: 100%|██████████████████████████████████████████████████| 40/40 [00:00<00:00, 175.62 passes/s]
Running MIL Clean up passes: 100%|████████████████████████████████████████████████| 11/11 [00:00<00:00, 108.89 passes/s]


In [15]:
# Attach metadata and name the output, recording the traced stock output's shape.
_stock_output_shape = list(out_stock_jit.shape)
encoder_stock_mlmodel = add_metadata_for_encoder(
    encoder=_encoder_stock_mlmodel,
    input_name=AUDIO_INPUT_NAME,
    output_name=ENCODER_OUTPUT_NAME,
    output_shape=_stock_output_shape,
)

In [16]:
# Traced stock-encoder output, shown for a visual comparison with the Core ML prediction.
out_stock_jit

tensor([[[ 0.1168, -0.1706, -0.4942,  ..., -0.8114, -0.5475,  0.0136],
         [ 0.6642, -0.0579,  0.0812,  ..., -0.1821, -0.1616,  0.0357],
         [ 1.2810,  0.3786, -0.1048,  ..., -0.3349, -0.8647, -0.2195],
         ...,
         [-0.6410, -0.6419,  0.7206,  ...,  0.3759,  0.2229,  1.0623],
         [-0.3861, -0.3299, -0.6774,  ...,  0.1726,  0.8344,  0.2988],
         [-1.0575,  0.8441, -0.0095,  ...,  0.5406, -0.3654,  0.1805]]],
       grad_fn=<NativeLayerNormBackward0>)

In [17]:
# Run the converted stock model on the same dummy input (Core ML takes NumPy arrays).
_stock_feed = {AUDIO_INPUT_NAME: x.numpy()}
out_stock_mlmod = encoder_stock_mlmodel.predict(_stock_feed)
out_stock_mlmod

{'encoded_audio': array([[[ 0.11680819, -0.1705755 , -0.4942226 , ..., -0.8114355 ,
          -0.54749817,  0.01363501],
         [ 0.6641544 , -0.05793391,  0.08117322, ..., -0.18210214,
          -0.16157283,  0.03564826],
         [ 1.280986  ,  0.37862337, -0.10479611, ..., -0.3349134 ,
          -0.86473936, -0.21953698],
         ...,
         [-0.6410045 , -0.64186215,  0.7205772 , ...,  0.37588137,
           0.22286299,  1.0622998 ],
         [-0.38605604, -0.32986856, -0.6774049 , ...,  0.17257194,
           0.8343568 ,  0.29884678],
         [-1.0575485 ,  0.8441242 , -0.00949121, ...,  0.54062414,
          -0.36536804,  0.18053477]]], dtype=float32)}

In [18]:
# Wrap the Core ML stock prediction as a tensor and compare it with the traced stock output.
_stock_array = out_stock_mlmod[ENCODER_OUTPUT_NAME]
_output = torch.from_numpy(_stock_array)
"PyTorch jitted stock - coreml stock", abs_diff(_output, out_stock_jit)

('PyTorch jitted stock - coreml stock', tensor(4.0219, grad_fn=<SumBackward0>))

In [19]:
# Shape of the traced stock output; the cosine-similarity cell that follows picks its dims from it.
out_stock_jit.shape

torch.Size([1, 1500, 512])

In [20]:
# Mean cosine similarity between the Core ML and traced stock outputs, taken along
# dim 1 and then dim 2. Both stay top-level expressions so each value is displayed.
torch.cosine_similarity(_output, out_stock_jit, dim=1).mean()
torch.cosine_similarity(_output, out_stock_jit, dim=2).mean()

tensor(1., grad_fn=<MeanBackward0>)

tensor(1., grad_fn=<MeanBackward0>)

In [21]:
# Write the stock model package into the export directory.
_stock_package_path = BASE_EXPORT_DIR / f"whisper-{ARCH}-stock{EXPORT_SUFFIX}.mlpackage"
encoder_stock_mlmodel.save(_stock_package_path)

### Export ANE Model To CoreML

In [22]:
# Convert the traced ANE encoder to a Core ML ML Program with a fixed input shape.
_ane_input_spec = ct.TensorType(name=AUDIO_INPUT_NAME, shape=x.shape)
_encoder_ane_mlmodel = ct.convert(
    encoder_ane_jit,
    inputs=[_ane_input_spec],
    convert_to="mlprogram",
    compute_precision=COMPUTE_PRECISION,
    minimum_deployment_target=MIN_DEPLOYMENT_TARGET,
)

Converting PyTorch Frontend ==> MIL Ops: 100%|███████████████████████████████████▉| 892/893 [00:00<00:00, 7905.20 ops/s]
Running MIL Common passes: 100%|██████████████████████████████████████████████████| 40/40 [00:00<00:00, 137.13 passes/s]
Running MIL Clean up passes: 100%|████████████████████████████████████████████████| 11/11 [00:00<00:00, 113.65 passes/s]


In [23]:
# Attach metadata and name the output, recording the traced ANE output's shape.
_ane_output_shape = list(out_ane_jit.shape)
encoder_ane_mlmodel = add_metadata_for_encoder(
    encoder=_encoder_ane_mlmodel,
    input_name=AUDIO_INPUT_NAME,
    output_name=ENCODER_OUTPUT_NAME,
    output_shape=_ane_output_shape,
)

In [24]:
# Show the traced ANE output computed earlier instead of re-running the encoder;
# the module is in eval mode and the input is unchanged, so the values are the same.
out_ane_jit

tensor([[[[ 0.1168,  0.6642,  1.2810,  ..., -0.6410, -0.3861, -1.0575]],

         [[-0.1706, -0.0579,  0.3786,  ..., -0.6419, -0.3299,  0.8441]],

         [[-0.4942,  0.0812, -0.1048,  ...,  0.7206, -0.6774, -0.0095]],

         ...,

         [[-0.8114, -0.1821, -0.3349,  ...,  0.3759,  0.1726,  0.5406]],

         [[-0.5475, -0.1616, -0.8647,  ...,  0.2229,  0.8344, -0.3654]],

         [[ 0.0136,  0.0357, -0.2195,  ...,  1.0623,  0.2988,  0.1805]]]],
       grad_fn=<DifferentiableGraphBackward>)

In [25]:
# Run the converted ANE model on the same dummy input (Core ML takes NumPy arrays).
_ane_feed = {AUDIO_INPUT_NAME: x.numpy()}
out_ane_mlmod = encoder_ane_mlmodel.predict(_ane_feed)
out_ane_mlmod

{'encoded_audio': array([[[[ 0.11680963,  0.6641547 ,  1.2809876 , ..., -0.64100677,
           -0.38605592, -1.057548  ]],
 
         [[-0.17057452, -0.05793391,  0.37862483, ..., -0.6418604 ,
           -0.32986853,  0.8441241 ]],
 
         [[-0.49422303,  0.08117198, -0.10479528, ...,  0.72057784,
           -0.67740405, -0.00949142]],
 
         ...,
 
         [[-0.81143546, -0.18210167, -0.33491337, ...,  0.37588137,
            0.17257176,  0.5406247 ]],
 
         [[-0.5474986 , -0.16157244, -0.8647394 , ...,  0.22286098,
            0.83435595, -0.36536828]],
 
         [[ 0.01363305,  0.03564599, -0.21953641, ...,  1.0623014 ,
            0.29884604,  0.18053426]]]], dtype=float32)}

The stock PyTorch output is treated as the ground truth (GT).


In [26]:
# Wrap the Core ML ANE prediction as a tensor and compare it with the traced ANE output.
_ane_array = out_ane_mlmod[ENCODER_OUTPUT_NAME]
_output_ane = torch.from_numpy(_ane_array)
"PyTorch jitted ANE - coreml ANE", abs_diff(_output_ane, out_ane_jit)

('PyTorch jitted ANE - coreml ANE', tensor(4.9226, grad_fn=<SumBackward0>))

In [27]:
# Shape of the traced ANE output; the cosine-similarity cell that follows picks its dims from it.
out_ane_jit.shape

torch.Size([1, 512, 1, 1500])

In [28]:
# Mean cosine similarity between the Core ML and traced ANE outputs, taken along
# dim 1 and then dim 3. Both stay top-level expressions so each value is displayed.
torch.cosine_similarity(_output_ane, out_ane_jit, dim=1).mean()
torch.cosine_similarity(_output_ane, out_ane_jit, dim=3).mean()

tensor(1., grad_fn=<MeanBackward0>)

tensor(1., grad_fn=<MeanBackward0>)

In [29]:
# Stock vs ANE MLModel diff: how much further the ANE Core ML model drifts from
# its traced PyTorch source than the stock Core ML model drifts from its own.
_ane_conversion_drift = abs_diff(_output_ane, out_ane_jit)
_stock_conversion_drift = abs_diff(_output, out_stock_jit)
_ane_conversion_drift - _stock_conversion_drift

tensor(0.9007, grad_fn=<SubBackward0>)

In [30]:
# Direct comparison of the two Core ML outputs; `tfm` first rearranges the ANE result into the stock layout.
"coreml stock - coreml ANE", abs_diff(tfm(_output_ane), _output)

('coreml stock - coreml ANE', tensor(1.1897))

In [31]:
# Write the ANE model package into the export directory.
_ane_package_path = BASE_EXPORT_DIR / f"whisper-{ARCH}-ane{EXPORT_SUFFIX}.mlpackage"
encoder_ane_mlmodel.save(_ane_package_path)