In [None]:
#  The MIT License (MIT)
#
#  Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
#
#  Permission is hereby granted, free of charge, to any person obtaining a copy
#  of this software and associated documentation files (the 'Software'), to deal
#  in the Software without restriction, including without limitation the rights
#  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#  copies of the Software, and to permit persons to whom the Software is
#  furnished to do so, subject to the following conditions:
#
#  The above copyright notice and this permission notice shall be included in
#  all copies or substantial portions of the Software.
#
#  THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
#  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
#  THE SOFTWARE.

# Whisper

The following example will show how to run `Whisper` with `MIGraphX`.

Install the required dependencies.

In [None]:
# Install dependencies
# (%pip installs into the environment of the running kernel.)
%pip install accelerate datasets optimum[onnxruntime] transformers

We will use optimum to download the model.

The decoder's `attention_mask` is not exposed by default, but it is required to work with MIGraphX.
The following script will do that:

In [None]:
# download and export models
# `export` lives in the local download_whisper module (not shown here).
# NOTE(review): presumably it writes the ONNX files under
# models/whisper-tiny.en_modified, which is where the loader below looks
# for them -- confirm against download_whisper.
from download_whisper import export
export()

Now it is time to load these models with python.

First, we make sure that the MIGraphX module can be found on the Python path.

In [None]:
import sys
# Directory expected to contain the MIGraphX Python module; switch to the
# alternative in the trailing comment when using a local build.
mgx_lib_path = "/opt/rocm/lib/" # or "/code/AMDMIGraphX/build/lib/"
# Append only when missing so re-running this cell does not add duplicates.
if mgx_lib_path not in sys.path:
    sys.path.append(mgx_lib_path)
# Must come after the sys.path update above.
import migraphx as mgx

import numpy as np
import os

Next, a helper method to load and cache the models.

This will use the `models/whisper-tiny.en_modified` path. If you changed it, make sure to update here as well.

In [None]:
def load_mgx_model(name, shapes, model_dir="models/whisper-tiny.en_modified"):
    """Load a MIGraphX model, compiling and caching it on first use.

    The function looks for ``{model_dir}/{name}_model.mxr`` first. When that
    file is missing it parses ``{model_dir}/{name}_model.onnx`` with the given
    input shapes, compiles the result for the "gpu" target and saves it as
    ``.mxr`` (msgpack format) so later calls can skip the compilation.

    Args:
        name: Model name prefix, e.g. "encoder" or "decoder".
        shapes: Mapping of input name to dimension list, forwarded to
            ``mgx.parse_onnx`` as ``map_input_dims``. Only used when the
            model has to be parsed from ONNX.
        model_dir: Directory holding the model files. Defaults to the path
            used throughout this notebook; pass a different directory if the
            models were exported elsewhere.

    Returns:
        The loaded (or freshly compiled) MIGraphX model object.

    Raises:
        SystemExit: With code 1 when neither the ``.mxr`` nor the ``.onnx``
            file exists.
    """
    file = f"{model_dir}/{name}_model"
    mxr_path = f"{file}.mxr"
    onnx_path = f"{file}.onnx"
    print(f"Loading {name} model from {file}")

    # Fast path: a previously compiled model is already cached on disk.
    if os.path.isfile(mxr_path):
        print("Found mxr, loading it...")
        return mgx.load(mxr_path, format="msgpack")

    if not os.path.isfile(onnx_path):
        print(f"No {name} model found. Please download it and re-try.")
        sys.exit(1)

    # Slow path: parse, compile for GPU, then cache the compiled model.
    print("Parsing from onnx file...")
    model = mgx.parse_onnx(onnx_path, map_input_dims=shapes)
    model.compile(mgx.get_target("gpu"))
    print(f"Saving {name} model to mxr file...")
    mgx.save(model, mxr_path, format="msgpack")
    return model

With that, we can load the models. This could take several minutes.

In [None]:
# The shapes below are handed to the ONNX parser as fixed input dimensions
# (see load_mgx_model); they are only used when no cached .mxr file exists.
# NOTE(review): [1, 80, 3000] is presumably (batch, mel bins, frames) -- confirm.
encoder_model = load_mgx_model("encoder", {"input_features": [1, 80, 3000]})
decoder_model = load_mgx_model(
    "decoder", {
        # 448 equals max_length from the model-params cell further down.
        "input_ids": [1, 448],
        "attention_mask": [1, 448],
        # NOTE(review): presumably (batch, encoder positions, hidden size);
        # it has to match the encoder model's output -- confirm.
        "encoder_hidden_states": [1, 1500, 384]
    })

Time to load the processor from the original source.
It will be used to get feature embeddings from the audio data and decode the output tokens.

In [None]:
from transformers import WhisperProcessor
# Used later both to turn audio into input features and to decode token ids
# back into text.
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")

Next, we will define all the steps one by one, to make the last step short and simple.

The first step will be to get audio data.
For testing purposes, we will use Hugging Face's dummy samples.

In [None]:
from datasets import load_dataset
# Dummy test dataset; only the validation split of the "clean" configuration
# is loaded.
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy",
                  "clean",
                  split="validation")

Next step will be to get the input features from the audio data.

In [None]:
def get_input_features_from_sample(sample_data, sampling_rate):
    """Run the Whisper processor on raw audio and return its input features.

    Args:
        sample_data: Audio samples, passed to the processor unchanged.
        sampling_rate: Sampling rate of ``sample_data``.

    Returns:
        The ``input_features`` attribute of the processor output, requested
        as NumPy data (``return_tensors="np"``).
    """
    processed = processor(sample_data,
                          sampling_rate=sampling_rate,
                          return_tensors="np")
    return processed.input_features

We will encode these and use them in the decoding step.

In [None]:
def encode_features(input_features):
    """Run the encoder model on the input features.

    Args:
        input_features: NumPy array of audio features; it is cast to float32
            before being passed to the model.

    Returns:
        The first output of the encoder model, converted to a NumPy array.
    """
    encoder_inputs = {"input_features": input_features.astype(np.float32)}
    encoder_outputs = encoder_model.run(encoder_inputs)
    # Only the first model output is used.
    return np.array(encoder_outputs[0])

The decoding process will be explained later in `generate`.

The decoder model will expect the encoded features, the input ids (decoded tokens), and the attention mask to ignore parts as needed.

In [None]:
def decode_step(input_ids, attention_mask, hidden_states):
    """Run the decoder model once and return its first output.

    Args:
        input_ids: Token id array; cast to int64 for the model.
        attention_mask: Mask array (0 masked, 1 un-masked); cast to int64.
        hidden_states: Encoder output; cast to float32.

    Returns:
        The first output of the decoder model as a NumPy array. The caller
        indexes it as ``logits[0][timestep]``.
    """
    decoder_inputs = {
        "input_ids": input_ids.astype(np.int64),
        "attention_mask": attention_mask.astype(np.int64),
        "encoder_hidden_states": hidden_states.astype(np.float32),
    }
    decoder_outputs = decoder_model.run(decoder_inputs)
    return np.array(decoder_outputs[0])

The following parameters are from [whisper-tiny.en's config](https://huggingface.co/openai/whisper-tiny.en/blob/main/config.json).

You might need to change them if you change the model.

In [None]:
# model params
decoder_start_token_id = 50257  # <|startoftranscript|>
eos_token_id = 50256  # "<|endoftext|>"
notimestamps = 50362  # <|notimestamps|>
# Length of the decoder token buffer; equals the 448 used for the decoder's
# input_ids / attention_mask shapes when the models were loaded.
max_length = 448
# Prompt tokens placed at the start of the decoder input.
sot = [decoder_start_token_id, notimestamps]

To kickstart the decoding, we will provide the `<|startoftranscript|>` and `<|notimestamps|>` tokens.

The remaining positions are filled with `<|endoftext|>` and masked so that the decoder ignores them.

In [None]:
def initial_decoder_inputs():
    """Build the starting token buffer and attention mask for decoding.

    The token row begins with the ``sot`` prompt and is padded with
    ``eos_token_id`` up to ``max_length``. The mask row is 1 over the prompt
    and 0 over the padding.

    Returns:
        Tuple ``(input_ids, attention_mask)``, each a NumPy array holding a
        single row.
    """
    prompt_len = len(sot)
    pad_len = max_length - prompt_len

    token_row = sot + [eos_token_id] * pad_len
    # 0 masked | 1 un-masked
    mask_row = [1] * prompt_len + [0] * pad_len

    return (np.array([token_row]), np.array([mask_row]))

Finally the text generation part.

With each decoding step, we get the scores (logits) for the next token. We greedily pick the highest-scoring token, add it to the decoded tokens, and unmask it.

If the token is `<|endoftext|>`, the transcription is finished.

In [None]:
def generate(input_features):
    """Greedily transcribe one sample, printing the text as it is decoded.

    The features are encoded once. The decoder is then run repeatedly on the
    full token buffer: at every step the highest-scoring token at the current
    position is written to the next slot and un-masked. Decoding stops at
    ``<|endoftext|>`` or when the token buffer is full.

    Args:
        input_features: Audio features as accepted by ``encode_features``.

    Returns:
        None. The transcription is printed to stdout.
    """
    hidden_states = encode_features(input_features)
    input_ids, attention_mask = initial_decoder_inputs()

    # The token predicted at position `timestep` is stored at `timestep + 1`,
    # so the last valid timestep is `max_length - 2`. Iterating up to
    # `max_length - 1` would write past the end of the buffers and raise an
    # IndexError whenever no end-of-text token was produced in time.
    for timestep in range(len(sot) - 1, max_length - 1):
        # get logits for the current timestep
        logits = decode_step(input_ids, attention_mask, hidden_states)
        # greedily get the highest probable token
        new_token = np.argmax(logits[0][timestep])

        # add it to the tokens and unmask it
        input_ids[0][timestep + 1] = new_token
        attention_mask[0][timestep + 1] = 1

        # '\r' keeps rewriting the same output line while decoding.
        print("Transcribing: " + ''.join(
            processor.decode(input_ids[0][:timestep + 1],
                             skip_special_tokens=True)),
              end='\r')

        if new_token == eos_token_id:
            break

    # Finish the progress line on every exit path, including the case where
    # the buffer filled up before an end-of-text token appeared.
    print(flush=True)

To test this, we will get the first audio sample from the dataset.

Feel free to change it and experiment.

In [None]:
# Take the first dataset entry; its "audio" field provides the samples under
# "array" and their rate under "sampling_rate".
sample = ds[0]["audio"]  # or load it from file
data, sampling_rate = sample["array"], sample["sampling_rate"]

In [None]:
# Extract the features for the chosen sample, then transcribe it.
# generate() prints the text itself.
input_features = get_input_features_from_sample(data, sampling_rate)
generate(input_features)

The result should be:

`Transcribing:  Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.`