<a href="https://colab.research.google.com/github/Shin-mat/Google-Colab/blob/main/distil_whisper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Upgrade pip itself before installing the packages below.
!pip install --upgrade pip
# Install transformers directly from its GitHub repository, together with
# accelerate and datasets (with the "audio" extra), upgrading any existing copies.
!pip install --upgrade git+https://github.com/huggingface/transformers.git accelerate datasets[audio]

In [None]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset

# Use the first CUDA GPU in half precision when one is available;
# otherwise fall back to the CPU in full precision.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Checkpoint used for both the model weights and the processor.
model_id = "openai/whisper-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

# The processor supplies the tokenizer and feature extractor handed to the
# pipeline below.
processor = AutoProcessor.from_pretrained(model_id)

# Add new tokens to the tokenizer
new_tokens = ["new_token"]  # replace with your new tokens
added_tokens = processor.tokenizer.add_tokens(new_tokens)

# Resize the token embeddings only when the vocabulary actually grew, so the
# loaded embedding matrix is left untouched when nothing was added.
# NOTE(review): embedding rows created for new tokens have not been trained
# with this checkpoint -- confirm the model is fine-tuned before relying on them.
# NOTE(review): the new ids are appended after the existing vocabulary, which
# presumably places them after Whisper's timestamp tokens; verify this does not
# interfere with return_timestamps=True decoding.
if added_tokens > 0:
    model.resize_token_embeddings(len(processor.tokenizer))

# Chunked, batched speech-recognition pipeline running on the same device and
# dtype selected above.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)

import subprocess

# Path of input file
input_file = "XXX.m4a"
# Path of output file
output_file = "XXX.wav"

# Convert the input audio to WAV with ffmpeg.
#   -y          overwrite an existing output file instead of stopping at
#               ffmpeg's interactive confirmation prompt on a rerun
#   check=True  raise subprocess.CalledProcessError when ffmpeg fails, so the
#               pipeline is never run on a missing or stale WAV file
subprocess.run(["ffmpeg", "-y", "-i", input_file, output_file], check=True)

# Transcribe the converted file; the transcript is read from the "text" key
# of the pipeline result.
result = pipe(output_file)
transcript = result["text"]
print(transcript)

# Save the same transcript to disk as UTF-8.
with open('output.txt', 'w', encoding='utf-8') as f:
    f.write(transcript)