In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2,3"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [2]:
import json
import pandas as pd
import numpy as np
import torch
import evaluate
from datasets import Dataset, Audio
from transformers import (Wav2Vec2ForCTC, Wav2Vec2Processor, Trainer, TrainingArguments, TrainerCallback)
from dataclasses import dataclass
from typing import Dict, List, Union
import re

# 경로 설정
train_audio_dir = "/home/ace3_yongjae/speechRecog/train/training"
train_json_dir = "/home/ace3_yongjae/speechRecog/train/labeling"
valid_audio_dir = "/home/ace3_yongjae/speechRecog/valid/validation"
valid_json_dir = "/home/ace3_yongjae/speechRecog/valid/labeling"

# JSON 로드 함수
def load_json_files(directory):
    return [json.load(open(os.path.join(directory, f), encoding='utf-8'))
            for f in os.listdir(directory) if f.endswith('.json')]

train_json = load_json_files(train_json_dir)
valid_json = load_json_files(valid_json_dir)

# DataFrame 생성 (라벨 누락 방지)
def create_dataframe(json_data, audio_dir):
    data = []
    for item in json_data:
        file_name = item.get('fileName')
        transcription = item.get('transcription', {})
        answer_text = transcription.get('AnswerLabelText') or transcription.get('ReadingLabelText', '')
        audio_path = os.path.join(audio_dir, file_name)
        # 라벨이 비어있거나 None인 경우 제외
        if os.path.exists(audio_path) and answer_text and isinstance(answer_text, str) and answer_text.strip():
            data.append({'file_path': audio_path, 'text': answer_text})
    return pd.DataFrame(data)

train_df = create_dataframe(train_json, train_audio_dir)
valid_df = create_dataframe(valid_json, valid_audio_dir)

# 텍스트 정규화
def prepare_korean_text(text):
    text = re.sub(r'[^\uAC00-\uD7A3\s]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

train_df['normalized_text'] = train_df['text'].apply(prepare_korean_text)
valid_df['normalized_text'] = valid_df['text'].apply(prepare_korean_text)


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# 모델 로드
MODEL_ID = "kresnik/wav2vec2-large-xlsr-korean"
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(
    MODEL_ID,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id
)
model.freeze_feature_encoder()

# 데이터셋 전처리
def prepare_dataset(df):
    dataset = Dataset.from_pandas(df)
    dataset = dataset.cast_column("file_path", Audio(sampling_rate=16000))
    return dataset

train_dataset = prepare_dataset(train_df)
valid_dataset = prepare_dataset(valid_df)

def prepare_dataset_for_model(batch):
    audio = batch["file_path"]
    array = audio["array"]
    if np.max(np.abs(array)) > 0:
        array = array / np.max(np.abs(array))
    batch["input_values"] = processor(array, sampling_rate=16000).input_values[0]
    with processor.as_target_processor():
        batch["labels"] = processor(batch["normalized_text"]).input_ids
    return batch

train_dataset = train_dataset.map(prepare_dataset_for_model, remove_columns=train_dataset.column_names)
valid_dataset = valid_dataset.map(prepare_dataset_for_model, remove_columns=valid_dataset.column_names)

Map: 100%|██████████| 13773/13773 [00:39<00:00, 344.43 examples/s]
Map: 100%|██████████| 1723/1723 [00:04<00:00, 394.91 examples/s]


In [4]:
@dataclass
class DataCollatorCTCWithPadding:
    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_values": f["input_values"]} for f in features]
        label_features = [{"input_ids": f["labels"]} for f in features]
        batch = self.processor.pad(input_features, padding=self.padding, return_tensors="pt")
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(label_features, padding=self.padding, return_tensors="pt")
        batch["labels"] = labels_batch["input_ids"]
        return batch

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

# 평가 메트릭
wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

def compute_metrics(pred):
    pred_ids = np.argmax(pred.predictions, axis=-1)
    # -100을 pad_token_id로 변환
    label_ids = pred.label_ids.copy()
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(label_ids, group_tokens=False)
    return {
        "wer": wer_metric.compute(predictions=pred_str, references=label_str),
        "cer": cer_metric.compute(predictions=pred_str, references=label_str)
    }

# 메모리 및 Loss 모니터링 콜백
class UnfreezeFeatureEncoderCallback(TrainerCallback):
    def on_epoch_begin(self, args, state, control, **kwargs):
        if state.epoch == 5:
            model = kwargs.get('model')
            if model is not None:
                for param in model.wav2vec2.feature_extractor.parameters():
                    param.requires_grad = True
                print("\nFeature encoder unfrozen!")

    def on_epoch_end(self, args, state, control, **kwargs):
        # Loss 직접 출력
        if hasattr(state, 'log_history') and state.log_history:
            last_log = state.log_history[-1]
            if 'loss' in last_log:
                print(f"Epoch {state.epoch} - Loss: {last_log['loss']:.6f}")
        for i in range(torch.cuda.device_count()):
            print(f"GPU {i} - Allocated: {torch.cuda.memory_allocated(i)/1e9:.2f} GB, Reserved: {torch.cuda.memory_reserved(i)/1e9:.2f} GB")
        torch.cuda.empty_cache()


In [None]:
# 학습 설정
training_args = TrainingArguments(
    output_dir="./wav2vec2-korean-asr",
    group_by_length=True,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    eval_strategy="epoch",
    num_train_epochs=4,
    fp16=False,
    eval_accumulation_steps=2,
    save_steps=500,
    eval_steps=500,
    logging_steps=100,
    learning_rate=3e-4,
    weight_decay=0.005,
    warmup_steps=1000,
    save_total_limit=2,
    load_best_model_at_end=False,
    metric_for_best_model="wer",
    greater_is_better=False,
    ddp_find_unused_parameters=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=processor,
    callbacks=[UnfreezeFeatureEncoderCallback()]
)

  trainer = Trainer(


In [7]:
# 학습
trainer.train()

# 저장
model.save_pretrained("./final-model")
processor.save_pretrained("./final-model")



Epoch,Training Loss,Validation Loss,Wer,Cer
1,0.6129,0.196858,0.426433,0.15441
2,0.1058,-0.044596,0.26005,0.090486
3,-0.0634,-0.148512,0.203937,0.07329
4,-0.1571,-0.229063,0.186573,0.073379


Epoch 1.0 - Loss: 0.612900
GPU 0 - Allocated: 3.79 GB, Reserved: 14.18 GB
GPU 1 - Allocated: 0.02 GB, Reserved: 12.71 GB
GPU 2 - Allocated: 0.02 GB, Reserved: 12.23 GB




Epoch 2.0 - Loss: 0.105800
GPU 0 - Allocated: 3.79 GB, Reserved: 15.36 GB
GPU 1 - Allocated: 0.02 GB, Reserved: 12.25 GB
GPU 2 - Allocated: 0.02 GB, Reserved: 12.10 GB
Epoch 3.0 - Loss: -0.063400
GPU 0 - Allocated: 3.79 GB, Reserved: 15.86 GB
GPU 1 - Allocated: 0.02 GB, Reserved: 12.59 GB
GPU 2 - Allocated: 0.02 GB, Reserved: 11.73 GB




Epoch 4.0 - Loss: -0.157100
GPU 0 - Allocated: 3.84 GB, Reserved: 15.84 GB
GPU 1 - Allocated: 0.02 GB, Reserved: 12.56 GB
GPU 2 - Allocated: 0.02 GB, Reserved: 13.27 GB




[]

In [8]:
# 모델 .pth 파일 저장
pth_save_path = './final-model/model_state_dict.pth'
torch.save(model.state_dict(), pth_save_path)

print(f"PyTorch 모델 state_dict가 {pth_save_path}에 저장완.")

PyTorch 모델 state_dict가 ./final-model/model_state_dict.pth에 저장완.


In [4]:
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import soundfile as sf
import numpy as np

# 모델과 processor 로드
MODEL_ID = "./final-model"
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
model.eval()

# .pth 파일에서 state_dict를 불러올 때
state_dict = torch.load('./final-model/model_state_dict.pth', map_location='cpu')
model.load_state_dict(state_dict)

# 테스트할 오디오 파일 경로
test_audio_path = "/home/ace3_yongjae/speechRecog/valid/validation/EN11RC015_EN0208_20211023.wav"

# 오디오 파일 로드 및 전처리
audio_input, sample_rate = sf.read(test_audio_path)
if sample_rate != 16000:
    import librosa
    audio_input = librosa.resample(audio_input, orig_sr=sample_rate, target_sr=16000)
    sample_rate = 16000

# 입력값 정규화
if np.max(np.abs(audio_input)) > 0:
    audio_input = audio_input / np.max(np.abs(audio_input))

# 입력값 추출
input_values = processor(audio_input, sampling_rate=16000, return_tensors="pt").input_values

# 추론
with torch.no_grad():
    logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]

print("예측 결과:", transcription)

예측 결과: 네 여기 분실물 신고 접수 양식을 작성해 주시겠어요 그리고 지갑의 생김새에 대해 최대한 자세히 작성해 주시면 됩됩니다
