## 데이터 전처리

- Hugging Face의 `datasets` 라이브러리를 이용해서 데이터를 Whisper 학습 포맷으로 바꾼다.

In [None]:
from datasets import load_dataset, Dataset
import os

def load_custom_dataset(data_dir):
    """Build a ``Dataset`` from paired ``.wav`` / ``.txt`` files in *data_dir*.

    Every file whose name ends in ``.wav`` is paired with the transcript
    file that has the same name but a ``.txt`` extension. The transcript is
    read as UTF-8 and stripped of surrounding whitespace.

    Args:
        data_dir: Directory containing the audio files and their transcripts.
            Only the directory itself is scanned, not sub-directories.

    Returns:
        A ``Dataset`` with one row per audio file, in sorted filename order.
        Each row has the keys ``"path"`` (audio file path), ``"audio"``
        (a dict holding the same path under ``"path"``) and ``"sentence"``
        (the transcript text).

    Raises:
        FileNotFoundError: If an audio file has no matching ``.txt`` file.
    """
    audio_suffix = ".wav"
    data = []
    # os.listdir() returns entries in arbitrary order; sort so that the row
    # order (and therefore any later index-based split) is reproducible.
    for filename in sorted(os.listdir(data_dir)):
        if not filename.endswith(audio_suffix):
            continue
        audio_path = os.path.join(data_dir, filename)
        # Swap only the trailing extension. str.replace() would also rewrite
        # a ".wav" that appears earlier in the name (e.g. "a.wav.b.wav").
        txt_file = filename[: -len(audio_suffix)] + ".txt"
        with open(os.path.join(data_dir, txt_file), "r", encoding="utf-8") as f:
            text = f.read().strip()
        data.append({
            "path": audio_path,
            "audio": {"path": audio_path},
            "sentence": text,
        })
    return Dataset.from_list(data)


## 모델 불러오기

In [None]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Checkpoint identifier shared by both from_pretrained() calls below, so the
# processor and the model are always loaded from the same checkpoint.
model_name = "openai/whisper-tiny"
# Processor for the checkpoint; its feature_extractor is handed to the trainer
# in the fine-tuning cell.
processor = WhisperProcessor.from_pretrained(model_name)
# Pretrained model instance that the fine-tuning cell passes to the trainer.
model = WhisperForConditionalGeneration.from_pretrained(model_name)

## 파인튜닝하기

In [None]:
import torch
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Fine-tuning configuration. Checkpoints and logs are written under output_dir.
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-finetuned-kids",
    per_device_train_batch_size=8,
    # 8 samples x 2 accumulation steps = 16 samples per optimizer step
    # on each device.
    gradient_accumulation_steps=2,
    # NOTE(review): no eval_steps is given; per the Transformers docs the
    # evaluation interval then falls back to logging_steps - confirm that
    # evaluating every 100 steps is intended.
    evaluation_strategy="steps",
    learning_rate=1e-5,
    save_steps=500,
    logging_steps=100,
    num_train_epochs=5,
    # Evaluate by generating text with generate(), capped at 128 tokens.
    predict_with_generate=True,
    generation_max_length=128,
    # Mixed precision is only enabled when a CUDA GPU is present; a
    # hard-coded True is rejected by TrainingArguments on CPU-only machines.
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
)

# train_dataset, eval_dataset and whisper_data_collator must already be
# defined by earlier cells; they are not created in this cell.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # The feature extractor (not the text tokenizer) is passed here,
    # presumably so it is saved alongside the model checkpoints - confirm.
    tokenizer=processor.feature_extractor,
    data_collator=whisper_data_collator,
)

trainer.train()
