In [1]:
import tensorflow as tf
from datasets import load_dataset
from transformers import WhisperProcessor, WhisperFeatureExtractor, WhisperTokenizer

In [2]:
# Assemble the Whisper pre/post-processing pipeline from the
# "openai/whisper-small" checkpoint: the feature extractor handles the audio
# side, the tokenizer handles the text side, and the processor bundles both.
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
tokenizer = WhisperTokenizer.from_pretrained(
    "openai/whisper-small",
    predict_timestamps=True,
)
processor = WhisperProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:

# Reference sample: first utterance of the dummy LibriSpeech validation split.
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

# Index the row once and reuse the resulting audio dict; every `ds[0]` access
# goes back through the dataset and decodes the audio again.
audio_sample = ds[0]["audio"]

inputs = feature_extractor(
    audio_sample["array"],
    sampling_rate=audio_sample["sampling_rate"],
    return_tensors="tf",
)
input_features = inputs.input_features

input_features

<tf.Tensor: shape=(1, 80, 3000), dtype=float32, numpy=
array([[[ 1.1933082e-01, -9.4576120e-02, -1.0977852e-01, ...,
         -8.0602670e-01, -8.0602670e-01, -8.0602670e-01],
        [ 4.9346685e-04, -8.9271426e-02, -6.7289710e-02, ...,
         -8.0602670e-01, -8.0602670e-01, -8.0602670e-01],
        [-1.5326309e-01, -2.0803916e-01, -2.2226822e-01, ...,
         -8.0602670e-01, -8.0602670e-01, -8.0602670e-01],
        ...,
        [-8.0602670e-01, -8.0602670e-01, -7.9996610e-01, ...,
         -8.0602670e-01, -8.0602670e-01, -8.0602670e-01],
        [-8.0602670e-01, -7.7210999e-01, -8.0602670e-01, ...,
         -8.0602670e-01, -8.0602670e-01, -8.0602670e-01],
        [-8.0602670e-01, -8.0602670e-01, -8.0602670e-01, ...,
         -8.0602670e-01, -8.0602670e-01, -8.0602670e-01]]], dtype=float32)>

In [4]:
import librosa

audio_path = 'record_out.wav'
# Load (and resample) the recording at the rate the feature extractor is
# configured for, instead of repeating a hard-coded 16000 in two places.
audio_array, sampling_rate = librosa.load(audio_path, sr=feature_extractor.sampling_rate)

# Pass on the rate that librosa actually returned so the samples and the
# declared sampling rate cannot drift apart if the load call is edited later.
inputs = feature_extractor(
    audio_array, sampling_rate=sampling_rate, return_tensors="tf"
)
input_features_wav = inputs.input_features

input_features_wav

<tf.Tensor: shape=(1, 80, 3000), dtype=float32, numpy=
array([[[-0.43277192, -0.43277192, -0.43277192, ..., -0.43277192,
         -0.43277192, -0.43277192],
        [-0.43277192, -0.43277192, -0.43277192, ..., -0.43277192,
         -0.43277192, -0.43277192],
        [-0.43277192, -0.43277192, -0.43277192, ..., -0.43277192,
         -0.43277192, -0.43277192],
        ...,
        [-0.43277192, -0.43277192, -0.43277192, ..., -0.43277192,
         -0.43277192, -0.43277192],
        [-0.43277192, -0.43277192, -0.43277192, ..., -0.43277192,
         -0.43277192, -0.43277192],
        [-0.43277192, -0.43277192, -0.43277192, ..., -0.43277192,
         -0.43277192, -0.43277192]]], dtype=float32)>

In [5]:
# Load the TFLite model from disk and obtain a callable runner for its
# signature (no signature key is passed to get_signature_runner).
tflite_model_path = './sane/whisper-jv-small.tflite'
interpreter = tf.lite.Interpreter(model_path=tflite_model_path)
tflite_generate = interpreter.get_signature_runner()

In [6]:
# Run the TFLite signature on the dataset sample's features, then decode the
# first generated sequence to text with special tokens stripped.
tflite_outputs = tflite_generate(input_features=input_features)
generated_ids = tflite_outputs["sequences"]
decoded_batch = processor.batch_decode(generated_ids, skip_special_tokens=True)
transcription = decoded_batch[0]
transcription

' Mr Quilter is the Apostle of the Middle Classes and we are glad to welcome his Gospel'

In [7]:
# Same inference as for the dataset sample, but on the features extracted from
# the local recording; decode the first generated sequence without special tokens.
tflite_outputs_wav = tflite_generate(input_features=input_features_wav)
generated_ids = tflite_outputs_wav["sequences"]
decoded_batch_wav = processor.batch_decode(generated_ids, skip_special_tokens=True)
transcription = decoded_batch_wav[0]
transcription

' The stale smell of old beer lingers It takes heat to bring out the odor A cold dip restores health and zest A salt pickle tastes fine with ham Tacos al pastor are my favorite A zestful food is the hot cross bun'