In [1]:
import os
# Cache directory for Hugging Face downloads. It is exported before the
# `transformers` / `datasets` imports in the following cell run.
hf_cache_dir = '/home/lobby/.cache/huggingface'
os.environ.update({'HF_HOME': hf_cache_dir})

# Echo the value that is actually in the environment as the cell output.
os.environ['HF_HOME']

'/home/lobby/.cache/huggingface'

In [2]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset

In [3]:
# Inference device. Swap in the commented line to use the GPU when one is available.
# device = "cuda:0" if torch.cuda.is_available() else "cpu"
device = "cpu"

# Choose the precision from the device that is actually selected, not from
# whether a GPU merely exists: with `device = "cpu"` on a CUDA-capable machine
# the old check loaded float16 weights and then ran them on the CPU.
torch_dtype = torch.float16 if str(device).startswith("cuda") else torch.float32

# Checkpoint to load; the commented alternative is a much smaller model.
model_id = "openai/whisper-large-v3-turbo"
# model_id = "openai/whisper-tiny"

# Load the speech-to-text model weights in the chosen precision and move
# them onto the inference device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

# The processor bundles the tokenizer and feature extractor used by the pipeline cell.
processor = AutoProcessor.from_pretrained(model_id)

In [4]:
# Report which device the model was placed on.
device_message = f"Using device: {device}"
print(device_message)

Using device: cpu


In [None]:
# Base names (without extension) of the recordings to transcribe.
file_names = [
    "Day 10- Enterprise Architecture certification Preparation - Architecture Governance 4",
    "Day 11 - Enterprise Architecture certification Preparation - Building Blocks 4",
    "Day 11 - Enterprise Architecture certification Preparation - TOGAF reference Models 4",
    "Day 11 - Enterprise Architecture certification Preparation - Views and View Points 4"
]

# Extension of the audio files on disk.
input_file_ext = "mp3"


def _paths_in(directory, extension):
    """Return one `<directory>/<name>.<extension>` path per entry of `file_names`."""
    return [os.path.join(directory, f"{stem}.{extension}") for stem in file_names]


# Each audio file maps to a text file with the same base name.
input_paths = _paths_in("G15/input/audio/en", input_file_ext)
output_paths = _paths_in("G15/output/en", "txt")

input_paths, output_paths

(['G15/input/audio/en/short-1.mp3',
  'G15/input/audio/en/short-2.mp3',
  'G15/input/audio/en/short-3.mp3'],
 ['G15/output/en/short-1.txt',
  'G15/output/en/short-2.txt',
  'G15/output/en/short-3.txt'])

In [6]:


# Wrap the already-loaded model and its processor parts in a speech-recognition
# pipeline; `chunk_length_s=30` is passed so audio is handled in 30-second chunks.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
    chunk_length_s=30,
    # language='en'
)

# Transcribe each recording and save the text next to its matching output path.
for input_path, output_path in zip(input_paths, output_paths):
    # Transcribe first: the output file is only touched once a result exists,
    # so a failed run no longer wipes a previously written transcript.
    result = pipe(
        input_path,
        batch_size=1,
        generate_kwargs={"language": "english"},
        # generate_kwargs={"language": "french", "task": "translate"}
    )

    # Create the destination folder on demand so a fresh checkout does not fail on open().
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # A single 'w' write replaces any old content; no separate truncate step is needed.
    with open(output_path, 'w', encoding='utf-8') as transcript_file:
        transcript_file.write(result["text"])

output_paths

Device set to use cpu
You have passed language=english, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of language=english.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


['G15/output/en/short-1.txt',
 'G15/output/en/short-2.txt',
 'G15/output/en/short-3.txt']