In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1, 2, 3"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [2]:
import os
import json
import pandas as pd
import numpy as np
import torch
import evaluate
from datasets import Dataset, Audio
from transformers import (Wav2Vec2ForCTC, Wav2Vec2Processor, Trainer, TrainingArguments, TrainerCallback)
from dataclasses import dataclass
from typing import Dict, List, Union
import re

# 경로 설정
train_audio_dir = "/home/ace3_yongjae/speechRecog/train/training"
train_json_dir = "/home/ace3_yongjae/speechRecog/train/labeling"
valid_audio_dir = "/home/ace3_yongjae/speechRecog/valid/validation" 
valid_json_dir = "/home/ace3_yongjae/speechRecog/valid/labeling"

# JSON 로드 함수
def load_json_files(directory):
    return [json.load(open(os.path.join(directory, f), encoding='utf-8'))
            for f in os.listdir(directory) if f.endswith('.json')]

train_json = load_json_files(train_json_dir)
valid_json = load_json_files(valid_json_dir)

# DataFrame 생성
def create_dataframe(json_data, audio_dir):
    data = []
    for item in json_data:
        file_name = item.get('fileName')
        transcription = item.get('transcription', {})
        answer_text = transcription.get('AnswerLabelText') or transcription.get('ReadingLabelText', '')
        audio_path = os.path.join(audio_dir, file_name)
        if os.path.exists(audio_path) and answer_text:
            data.append({'file_path': audio_path, 'text': answer_text})
    return pd.DataFrame(data)

train_df = create_dataframe(train_json, train_audio_dir)
valid_df = create_dataframe(valid_json, valid_audio_dir)

# 텍스트 정규화
def prepare_korean_text(text):
    text = re.sub(r'[^\uAC00-\uD7A3\s]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

train_df['normalized_text'] = train_df['text'].apply(prepare_korean_text)
valid_df['normalized_text'] = valid_df['text'].apply(prepare_korean_text)

# 모델 로드
MODEL_ID = "kresnik/wav2vec2-large-xlsr-korean"
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(
    MODEL_ID,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id
)
model.freeze_feature_encoder()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# 데이터셋 전처리
def prepare_dataset(df):
    dataset = Dataset.from_pandas(df)
    dataset = dataset.cast_column("file_path", Audio(sampling_rate=16000))
    return dataset

train_dataset = prepare_dataset(train_df)
valid_dataset = prepare_dataset(valid_df)

def prepare_dataset_for_model(batch):
    audio = batch["file_path"]
    array = audio["array"]
    if np.max(np.abs(array)) > 0:
        array = array / np.max(np.abs(array))
    batch["input_values"] = processor(array, sampling_rate=16000).input_values[0]
    with processor.as_target_processor():
        batch["labels"] = processor(batch["normalized_text"]).input_ids
    return batch

train_dataset = train_dataset.map(prepare_dataset_for_model, remove_columns=train_dataset.column_names)
valid_dataset = valid_dataset.map(prepare_dataset_for_model, remove_columns=valid_dataset.column_names)

@dataclass
class DataCollatorCTCWithPadding:
    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_values": f["input_values"]} for f in features]
        label_features = [{"input_ids": f["labels"]} for f in features]
        batch = self.processor.pad(input_features, padding=self.padding, return_tensors="pt")
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(label_features, padding=self.padding, return_tensors="pt")
        batch["labels"] = labels_batch["input_ids"]
        return batch

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

# 평가 메트릭
wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

def compute_metrics(pred):
    pred_ids = np.argmax(pred.predictions, axis=-1)
    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
    return {
        "wer": wer_metric.compute(predictions=pred_str, references=label_str),
        "cer": cer_metric.compute(predictions=pred_str, references=label_str)
    }

# 메모리 모니터링 콜백
class UnfreezeFeatureEncoderCallback(TrainerCallback):
    def on_epoch_begin(self, args, state, control, **kwargs):
        if state.epoch == 5:
            model = kwargs.get('model')
            if model is not None:
                for param in model.wav2vec2.feature_extractor.parameters():
                    param.requires_grad = True
                print("\nFeature encoder unfrozen!")

    def on_epoch_end(self, args, state, control, **kwargs):
        for i in range(torch.cuda.device_count()):
            print(f"GPU {i} - Allocated: {torch.cuda.memory_allocated(i)/1e9:.2f} GB, Reserved: {torch.cuda.memory_reserved(i)/1e9:.2f} GB")
        torch.cuda.empty_cache()

Map: 100%|██████████| 13773/13773 [00:40<00:00, 336.58 examples/s]
Map: 100%|██████████| 1723/1723 [00:04<00:00, 386.40 examples/s]


In [4]:
# 학습 설정
training_args = TrainingArguments(
    output_dir="./wav2vec2-korean-asr",
    group_by_length=True,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    eval_strategy="epoch",
    num_train_epochs=4,
    fp16=False,
    fp16_full_eval=True,
    eval_accumulation_steps=2,
    save_steps=500,
    eval_steps=500,
    logging_steps=100,
    learning_rate=3e-4,
    weight_decay=0.005,
    warmup_steps=1000,
    save_total_limit=2,
    load_best_model_at_end=False,
    metric_for_best_model="wer",
    greater_is_better=False,
    ddp_find_unused_parameters=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=processor,
    callbacks=[UnfreezeFeatureEncoderCallback()]
)

  trainer = Trainer(


In [None]:
# 학습
trainer.train()

# 저장
model.save_pretrained("./final-model")
processor.save_pretrained("./final-model")



Epoch,Training Loss,Validation Loss,Wer,Cer
1,0.6107,0.195249,0.426034,0.154628
2,0.109,-0.04255,0.259833,0.09111
3,-0.0602,-0.146214,0.206438,0.0743
4,-0.1519,-0.22459,0.190126,0.075191


GPU 0 - Allocated: 3.79 GB, Reserved: 14.41 GB
GPU 1 - Allocated: 0.02 GB, Reserved: 11.74 GB
GPU 2 - Allocated: 0.02 GB, Reserved: 12.23 GB




GPU 0 - Allocated: 3.79 GB, Reserved: 15.06 GB
GPU 1 - Allocated: 0.02 GB, Reserved: 12.59 GB
GPU 2 - Allocated: 0.02 GB, Reserved: 12.10 GB
GPU 0 - Allocated: 3.79 GB, Reserved: 14.89 GB
GPU 1 - Allocated: 0.02 GB, Reserved: 12.59 GB
GPU 2 - Allocated: 0.02 GB, Reserved: 12.58 GB




GPU 0 - Allocated: 3.84 GB, Reserved: 15.52 GB
GPU 1 - Allocated: 0.02 GB, Reserved: 13.07 GB
GPU 2 - Allocated: 0.02 GB, Reserved: 13.07 GB




[]

In [9]:
model = Wav2Vec2ForCTC.from_pretrained("final-model")
model = model.float()

# 평가
eval_results = trainer.evaluate()
print("\n========= 평가 결과 =========")
for k, v in eval_results.items():
    print(f"{k}: {v}")

RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/ace3_yongjae/.venv/lib/python3.12/site-packages/torch/nn/parallel/parallel_apply.py", line 97, in _worker
    output = module(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ace3_yongjae/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ace3_yongjae/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ace3_yongjae/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py", line 2226, in forward
    outputs = self.wav2vec2(
              ^^^^^^^^^^^^^^
  File "/home/ace3_yongjae/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ace3_yongjae/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ace3_yongjae/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py", line 1807, in forward
    extract_features = self.feature_extractor(input_values)
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ace3_yongjae/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ace3_yongjae/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ace3_yongjae/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py", line 463, in forward
    hidden_states = conv_layer(hidden_states)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ace3_yongjae/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ace3_yongjae/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ace3_yongjae/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py", line 332, in forward
    hidden_states = self.conv(hidden_states)
                    ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ace3_yongjae/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ace3_yongjae/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ace3_yongjae/.venv/lib/python3.12/site-packages/torch/nn/modules/conv.py", line 375, in forward
    return self._conv_forward(input, self.weight, self.bias)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ace3_yongjae/.venv/lib/python3.12/site-packages/torch/nn/modules/conv.py", line 370, in _conv_forward
    return F.conv1d(
           ^^^^^^^^^
RuntimeError: Input type (float) and bias type (c10::Half) should be the same


In [None]:
# 테스트 추론 예제
import torch
import librosa

def transcribe_audio(audio_path, model, processor):
    audio, rate = librosa.load(audio_path, sr=16000)
    input_values = processor(audio, sampling_rate=rate, return_tensors="pt").input_values
    if torch.cuda.is_available():
        input_values = input_values.to("cuda")
        model = model.to("cuda")
    with torch.no_grad():
        logits = model(input_values).logits
    pred_ids = torch.argmax(logits, dim=-1)
    print(f"Raw pred_ids: {pred_ids}")
    transcription = processor.batch_decode(pred_ids)[0]
    return transcription

# 테스트 예제
test_file = "/home/ace3_yongjae/speechRecog/valid/validation/EN10QC227_EN0101_20211108.wav"
if os.path.exists(test_file):
    transcription = transcribe_audio(test_file, model, processor)
    print(f"인식 결과: {transcription}")
else:
    print(f"테스트 파일을 찾을 수 없습니다: {test_file}")