In [1]:
import torch
import pandas as pd
import numpy as np
import os
import warnings

from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import torchaudio
import torchaudio.functional as F
import torchaudio.transforms as T
import soundfile
import librosa

In [2]:
processor = AutoProcessor.from_pretrained("openai/whisper-tiny")
model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-tiny")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
os.listdir('data')

['test.wav']

In [5]:
waveform, sample_rate = torchaudio.load("data/sample.wav")

In [6]:
if sample_rate != 16000:
    transform = T.Resample(orig_freq=sample_rate, new_freq=16000)
    waveform = transform(waveform)
    sample_rate = 16000

In [7]:
inputs = processor(waveform.squeeze().numpy(), sampling_rate=sample_rate, return_tensors="pt", padding=True)
inputs

{'input_features': tensor([[[-1.2873, -1.2873, -1.2873,  ..., -1.2873, -1.2873, -1.2873],
         [-1.2873, -1.2873, -1.2873,  ..., -1.2873, -1.2873, -1.2873],
         [-1.2873, -1.2873, -1.2873,  ..., -1.2873, -1.2873, -1.2873],
         ...,
         [-1.2873, -1.2873, -1.2873,  ..., -1.2873, -1.2873, -1.2873],
         [-1.2873, -1.2873, -1.2873,  ..., -1.2873, -1.2873, -1.2873],
         [-1.2873, -1.2873, -1.2873,  ..., -1.2873, -1.2873, -1.2873]]])}

In [8]:
input_features = inputs.input_features
input_features.to(device)

tensor([[[-1.2873, -1.2873, -1.2873,  ..., -1.2873, -1.2873, -1.2873],
         [-1.2873, -1.2873, -1.2873,  ..., -1.2873, -1.2873, -1.2873],
         [-1.2873, -1.2873, -1.2873,  ..., -1.2873, -1.2873, -1.2873],
         ...,
         [-1.2873, -1.2873, -1.2873,  ..., -1.2873, -1.2873, -1.2873],
         [-1.2873, -1.2873, -1.2873,  ..., -1.2873, -1.2873, -1.2873],
         [-1.2873, -1.2873, -1.2873,  ..., -1.2873, -1.2873, -1.2873]]],
       device='cuda:0')

In [9]:
print(input_features.shape)

torch.Size([1, 80, 330])


In [10]:
padding = torch.zeros(1, 80, 3000 - input_features.shape[2])
input_features = torch.cat([input_features, padding], dim=2)
print(input_features.shape)
input_features = input_features.to(device)
model = model.to(device)

torch.Size([1, 80, 3000])


In [12]:
generated_ids = model.generate(input_features, language='en')
generated_ids

tensor([[50258, 50259, 50359, 50363, 15694,   366,  1417,   538,   264,  2853,
            13, 50257]], device='cuda:0')

In [13]:
transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
transcription

' Kids are talking by the door.'