In [1]:
import os

import torch
from datasets import load_dataset
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline


# --- Runtime configuration ---------------------------------------------------
# Use the first CUDA GPU with half precision when one is available; otherwise
# fall back to the CPU with full precision.
use_cuda = torch.cuda.is_available()
device = "cuda:0" if use_cuda else "cpu"
torch_dtype = torch.float16 if use_cuda else torch.float32

model_id = "openai/whisper-large-v3"

# Flash Attention 2 only runs on a GPU, so it is requested on the CUDA path
# only. On the CPU path no attention implementation is forced and transformers
# picks its own default, which keeps the CPU fallback above actually usable.
attn_kwargs = {"attn_implementation": "flash_attention_2"} if use_cuda else {}

# --- Model and processor -----------------------------------------------------
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, use_safetensors=True, **attn_kwargs
)
# The model is moved to the target device after loading. On the GPU path
# transformers may warn that Flash Attention 2 was requested before the model
# was on the GPU; this move is the step that warning asks for.
model.to(device)
processor = AutoProcessor.from_pretrained(model_id)

# --- Speech-recognition pipeline ---------------------------------------------
# Reuses the tokenizer and feature extractor bundled in the processor so the
# pipeline and the model agree on vocabulary and audio preprocessing.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,  # long audio is split into 30-second chunks
    batch_size=16,  # number of chunks sent through the model per batch
    torch_dtype=torch_dtype,
    device=device,
)


  from .autonotebook import tqdm as notebook_tqdm
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
# Audio file to transcribe. Set the AUDIO_PATH environment variable to point at
# a different recording; the default is the path this notebook was first run
# with, which only exists on the original author's machine.
audio_path = os.environ.get(
    "AUDIO_PATH",
    r'C:\Users\jvish\OneDrive\Documents\VISH_Stuff\vw-multimodal-backend\vw-MultiModalAI-backend\src\audio.mp3',
)
# Fail early with an actionable message instead of an error from deep inside
# the pipeline when the file is missing.
if not os.path.isfile(audio_path):
    raise FileNotFoundError(
        f"Audio file not found: {audio_path!r}. "
        "Set the AUDIO_PATH environment variable to an existing audio file."
    )

# Transcribe as English and request timestamps, so the result carries a
# "chunks" list of {'timestamp': (start, end), 'text': ...} entries.
result = pipe(
    audio_path,
    generate_kwargs={"language": "english"},
    return_timestamps=True,
)
print(result["chunks"])



[{'timestamp': (0.0, 13.28), 'text': " Let's you and me step out and see How blue the sky, how tall the trees"}, {'timestamp': (13.28, 31.38), 'text': " Let's you and I step out to find How far we go, how deep the sea"}, {'timestamp': (37.0, 54.24), 'text': " Cause forever is a long long way Shkoda Kushak is India's safest family car."}, {'timestamp': (54.24, 59.0), 'text': ' Protect what matters.'}, {'timestamp': (59.0, 59.32), 'text': ' Shkoda'}]
