In [None]:
import os
import sys
import torch
import logging
import warnings
import configparser
warnings.filterwarnings("ignore", category=UserWarning)
from datetime import datetime
from pyannote.audio import Pipeline as AudioPipeline
from transformers import (AutoTokenizer,
                          AutoModelForSequenceClassification,
                          AutoModelForSpeechSeq2Seq,
                          AutoProcessor,
                          AutoModelForAudioClassification,
                          Wav2Vec2FeatureExtractor,
                          pipeline as huggingface_pipeline)

In [None]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Path of the audio file handed to the speech-to-text pipeline.
# Left empty here; set it before running the transcription cell.
filename = ''

In [None]:
# Speech-to-text: load the Whisper large-v3 checkpoint and wrap it in a
# Hugging Face automatic-speech-recognition pipeline.
stt_model_id = 'openai/whisper-large-v3'
# Half precision only when CUDA is available; full precision otherwise.
stt_torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
stt_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    stt_model_id, torch_dtype=stt_torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
stt_model.to(device)
# The processor supplies both the tokenizer and the feature extractor used below.
stt_processor = AutoProcessor.from_pretrained(stt_model_id)
stt_pipeline = huggingface_pipeline(
    'automatic-speech-recognition',
    model=stt_model,
    tokenizer=stt_processor.tokenizer,
    feature_extractor=stt_processor.feature_extractor,
    max_new_tokens=128,
    # Chunk long audio into 30-second windows, the input length Whisper
    # operates on (and the value used in the model card example). Much
    # shorter chunks split words across chunk boundaries.
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=stt_torch_dtype,
    device=device,
)

In [None]:
# Transcribe `filename` with the pipeline, forwarding the language and task
# settings to generation.
result = stt_pipeline(
    filename,
    generate_kwargs=dict(language='french', task='transcribe'),
)
# Show the transcribed text as the cell output.
result['text']

Without the pipeline: calling the processor and model directly

In [1]:
# Relative path of the WAV clip that is loaded and transcribed without the pipeline.
audio_sample = (
    '../files/output/20240512/test/audio/speaker_000/part_00000.wav'
)

In [2]:
# Load the Whisper checkpoint directly instead of going through `pipeline`.
# The imports are kept in this cell so this part of the notebook can be run
# on its own.
import torchaudio
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    Wav2Vec2FeatureExtractor,
)

model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-large-v3")
processor = AutoProcessor.from_pretrained("openai/whisper-large-v3")
# Keep a direct handle on the processor's feature extractor; its sampling
# rate is read when the audio is resampled.
feature_extractor = processor.feature_extractor

In [3]:
# Read the clip from disk and resample it to the rate reported by the
# feature extractor, then hand it on as a NumPy array.
model_sampling_rate = feature_extractor.sampling_rate
waveform, audio_sampling_rate = torchaudio.load(audio_sample)
resampler = torchaudio.transforms.Resample(
    orig_freq=audio_sampling_rate,
    new_freq=model_sampling_rate,
)
# NOTE(review): squeeze() only removes size-1 dimensions, so a multi-channel
# file stays 2-D here and is presumably treated as a batch (one item per
# channel) by the processor — confirm this is intended.
resampled_waveform = resampler(waveform).squeeze().numpy()

In [4]:
# Convert the resampled audio into the model's input features, returned as
# PyTorch tensors.
input_features = processor(
    resampled_waveform,
    return_tensors="pt",
    sampling_rate=model_sampling_rate,
)["input_features"]

In [6]:
# Generate token ids from the input features, requesting a French
# transcription.
predicted_ids = model.generate(
    input_features,
    task="transcribe",
    language="french",
)

In [8]:
# Turn the generated token ids back into text, omitting special tokens.
transcription = processor.batch_decode(
    predicted_ids,
    skip_special_tokens=True,
)
# Display the first decoded transcription as the cell output.
transcription[0]

In [15]:
# Check that every decoded transcription is the same text once surrounding
# whitespace is stripped. Comparing the set of stripped texts gives the same
# answer as comparing items 0 and 1 when there are exactly two, and does not
# raise IndexError when only one sequence was decoded.
len({text.strip() for text in transcription}) == 1