In [None]:
import os
import json
import torch
import torchaudio

def get_transcript_from_json(file_stem, label_dir):
    """Return the transcript text stored in a label JSON file.

    Reads ``<label_dir>/<file_stem>.json`` and extracts
    ``data["transcription"]["AnswerLabelText"]``.

    Args:
        file_stem: File name without the ``.json`` extension.
        label_dir: Directory that holds the label JSON files.

    Returns:
        The transcript with surrounding whitespace stripped; an empty string
        when the field is missing, null, or not a string; ``None`` when the
        JSON file does not exist.
    """
    json_path = os.path.join(label_dir, file_stem + ".json")
    if not os.path.exists(json_path):
        return None

    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # ``dict.get`` defaults only cover *missing* keys: a JSON ``null`` (or any
    # non-object value) at either level would make a chained
    # ``.get(...).strip()`` raise AttributeError, so check each level.
    transcription = data.get("transcription") if isinstance(data, dict) else None
    if not isinstance(transcription, dict):
        return ""
    text = transcription.get("AnswerLabelText")
    return text.strip() if isinstance(text, str) else ""

def preprocess(batch):
    """Load the audio at ``batch["path"]`` and store it resampled to 16 kHz.

    Args:
        batch: Mapping with a ``"path"`` entry pointing at an audio file.

    Returns:
        The same ``batch`` object, with ``batch["speech"]`` set to the
        resampled waveform as a NumPy array (size-1 dimensions squeezed out).

    NOTE(review): for multi-channel files ``squeeze()`` leaves the channel
    axis in place, so ``"speech"`` would be 2-D — confirm inputs are mono.
    """
    waveform, source_rate = torchaudio.load(batch["path"])
    to_16k = torchaudio.transforms.Resample(orig_freq=source_rate, new_freq=16000)
    resampled = to_16k(waveform)
    batch["speech"] = resampled.squeeze().numpy()
    return batch

def prepare(batch, processor):
    """Run the processor over one example's audio and transcript.

    Args:
        batch: Mapping with ``"speech"`` (audio samples, treated as 16 kHz)
            and ``"transcript"`` (target text).
        processor: Callable processor that also provides an
            ``as_target_processor()`` context manager.

    Returns:
        The same ``batch`` object, with ``"input_values"`` (first entry of the
        processor's ``input_values`` for the audio) and ``"labels"`` (the
        ``input_ids`` produced for the transcript) added.

    NOTE(review): ``as_target_processor`` is reported as deprecated in recent
    transformers releases — confirm against the installed version.
    """
    audio_features = processor(
        batch["speech"], sampling_rate=16000, return_tensors="pt"
    )
    batch["input_values"] = audio_features.input_values[0]

    # Inside this context the processor call encodes text rather than audio.
    with processor.as_target_processor():
        encoded_text = processor(batch["transcript"])
    batch["labels"] = encoded_text.input_ids
    return batch

def transcribe(audio_path, model, processor):
    """Transcribe an audio file with the given model and processor.

    The audio is loaded, resampled to 16 kHz, turned into ``input_values`` by
    the processor, passed through the model, and decoded by taking the argmax
    over the last logits dimension.

    Args:
        audio_path: Path to the audio file to transcribe.
        model: Callable model whose output exposes ``.logits``.
        processor: Processor used both to build the model input and to decode
            the predicted ids.

    Returns:
        The decoded text for the first item of the prediction batch.
    """
    speech_array, sr = torchaudio.load(audio_path)
    resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
    input_values = processor(
        resampler(speech_array).squeeze().numpy(),
        return_tensors="pt",
        sampling_rate=16000,
    ).input_values

    # Move to the GPU when one is available.
    if torch.cuda.is_available():
        input_values = input_values.to("cuda")
        model = model.to("cuda")

    # Run the forward pass in eval mode so training-only behaviour (e.g.
    # dropout) cannot make the transcript non-deterministic, then restore the
    # caller's mode so an ongoing training loop is not affected.
    was_training = getattr(model, "training", False)
    if was_training:
        model.eval()
    try:
        with torch.no_grad():
            logits = model(input_values).logits
    finally:
        if was_training:
            model.train()

    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.decode(predicted_ids[0])