In [1]:
import torch
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Path of the demo clip.
# Reference transcript of the clip (Korean):
#   "꽁꽁 얼어붙은 한강 위로 고양이가 걸어다닙니다."
#   ("A cat is walking across the frozen-solid Han River.")
WAV_DATA_PATH = "./data/hangang_kor.wav"

# Single source of truth for the checkpoint, so the processor and the model
# are always loaded from the same repository.
MODEL_ID = "openai/whisper-large-v3"

# Prefer the GPU, but fall back to the CPU so this cell does not crash on a
# machine without CUDA.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

processor = WhisperProcessor.from_pretrained(MODEL_ID)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID).to(DEVICE)

# Clear any forced decoder prompt stored on the model config, so language and
# task are not pinned by the config when generate() is called.
model.config.forced_decoder_ids = None

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
# Load the audio clip. For the demo file this yields a (channels, samples)
# tensor plus the file's sample rate.
waveform, sample_rate = torchaudio.load(WAV_DATA_PATH)  # torch.Size([1, 160000]), 16000
print(f"Waveform shape: {waveform.shape}, Sample rate: {sample_rate}")

# Optional peak normalisation, left disabled:
# waveform = waveform / waveform.abs().max()

# Collapse to a 1-D mono signal. A single channel is just squeezed; several
# channels are averaged, because a 2-D tensor would otherwise reach the
# feature extractor and be misinterpreted.
if waveform.shape[0] == 1:
    waveform = waveform.squeeze(0)
else:
    waveform = waveform.mean(dim=0)
print(f"Squeezed waveform shape: {waveform.shape}")

# The Whisper feature extractor only accepts audio at its own sampling rate,
# so resample when the file uses a different one. `sample_rate` is updated too
# so that later cells which derive lengths from it stay consistent.
target_sample_rate = processor.feature_extractor.sampling_rate
if sample_rate != target_sample_rate:
    waveform = torchaudio.functional.resample(waveform, sample_rate, target_sample_rate)
    sample_rate = target_sample_rate

# Prepare the model input. The extractor documents numpy / list input, so hand
# it a numpy view of the (CPU) tensor rather than the tensor itself.
inputs = processor(
    waveform.numpy(),
    sampling_rate=sample_rate,
    return_tensors="pt",
)

print(inputs["input_features"].shape)  # torch.Size([1, 128, 3000])


Waveform shape: torch.Size([1, 160000]), Sample rate: 16000
Squeezed waveform shape: torch.Size([160000])
torch.Size([1, 128, 3000])


In [14]:
# Run the full encoder-decoder on the whole clip. The features are moved to
# whatever device the model is on instead of a hard-coded "cuda", so this cell
# keeps working if the model is placed elsewhere.
generated_tokens = model.generate(inputs["input_features"].to(model.device))

# Decode the generated token ids back to text.
# NOTE: despite the variable name and the printed label, the stored output is
# the Korean transcript of the clip, i.e. a transcription, not a translation.
translation = processor.batch_decode(generated_tokens, skip_special_tokens=True)
print("Translated Text:", translation)

Translated Text: [' 꽁꽁 얼어붙은 한강 위로 고양이가 걸어다닙니다.']


## Chunk로 나눠서 inference

In [16]:
# Split the 1-D waveform into fixed-length chunks (the last one may be shorter).
CHUNK_SECONDS = 1
chunk_length = CHUNK_SECONDS * sample_rate  # samples per chunk
chunks = [
    waveform[start:start + chunk_length]
    for start in range(0, waveform.shape[0], chunk_length)
]

# Extract features one chunk at a time. This is done inside a comprehension so
# the global `inputs` (the full-clip features from the earlier cell) is NOT
# overwritten; re-running the full-clip generation cell afterwards would
# otherwise silently use only the last chunk.
# Each chunk produces a full 3000-frame feature map, the same size as the
# whole clip, as the printed batch shape shows.
input_features_list = [
    processor(chunk.numpy(), sampling_rate=sample_rate, return_tensors="pt").input_features
    for chunk in chunks
]

# Concatenate the per-chunk (1, mels, frames) features into a single batch.
input_features_batch = torch.cat(input_features_list, dim=0)
print(f"Batch input features shape: {input_features_batch.shape}")

# Run inference on the whole batch.
# - `language="ko"` forces one language for every chunk. Without it the model
#   detects the language per chunk, and generate() raises
#   "Multiple languages detected ..." when the chunks disagree.
#   NOTE(review): the traceback stored under this cell looks like it comes from
#   a run without the forced language - re-run to confirm.
# - `max_length` is capped at the decoder's position limit; the previous
#   hard-coded 512 could exceed `model.config.max_target_positions`.
generated_tokens = model.generate(
    input_features_batch.to(model.device),
    max_length=model.config.max_target_positions,
    language="ko",
)

# Decode the generated token ids back to text, one string per chunk.
translations = processor.batch_decode(generated_tokens, skip_special_tokens=True)
print("Translations:", translations)

Batch input features shape: torch.Size([10, 128, 3000])


ValueError: Multiple languages detected when trying to predict the most likely target language for transcription. It is currently not supported to transcribe to different languages in a single batch. Please make sure to either force a single language by passing `language='...'` or make sure all input audio is of the same language.