## Install Modules

## Import Modules

In [2]:
# Install the third-party packages used by this notebook into the kernel's environment.
%pip install pandas
%pip install torch torchvision torchaudio
# The extras spec is quoted so the shell passes the square brackets through literally.
%pip install 'transformers[torch]'
%pip install datasets librosa soundfile evaluate jiwer

Collecting pandas
  Using cached pandas-2.1.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting numpy<2,>=1.22.4 (from pandas)
  Using cached numpy-1.26.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2023.3.post1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.1 (from pandas)
  Using cached tzdata-2023.3-py2.py3-none-any.whl (341 kB)
Using cached pandas-2.1.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
Using cached numpy-1.26.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
Using cached pytz-2023.3.post1-py2.py3-none-any.whl (502 kB)
Installing collected packages: pytz, tzdata, numpy, pandas
Successfully installed numpy-1.26.2 pandas-2.1.3 pytz-2023.3.post1 tzdata-2023.3
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
from glob import glob
import IPython.display as ipd

## Fix Seed

In [2]:
import numpy as np
import random
import os
import torch

In [3]:
def seed_everything(seed: int = 42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), exports ``PYTHONHASHSEED`` and configures
    cuDNN for deterministic behaviour.

    Args:
        seed: Value used for every generator. Defaults to 42.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Only affects subprocesses started after this point; the running
    # interpreter's hash seed is fixed at start-up.
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    # The cuDNN autotuner may select different kernels from run to run, which
    # defeats the deterministic flag above, so it must be switched off.
    torch.backends.cudnn.benchmark = False  # type: ignore


seed_everything()

## Read Files

In [4]:
# Dataset directories. Both end with a trailing slash, so file names can be
# appended with plain string concatenation.
TRAIN_PATH = '/mnt/elice/dataset/train/'
TEST_PATH = '/mnt/elice/dataset/test/'

In [5]:
# Load the first CSV file that glob finds in the training directory and display it.
# NOTE(review): glob(...)[0] raises IndexError if no CSV is present, and the
# path literal repeats the value of TRAIN_PATH.
df = pd.read_csv(glob('/mnt/elice/dataset/train/*.csv')[0])
df

Unnamed: 0,filenames,text
0,audio_0.wav,이미 달러 환율 어제보다 올랐어?
1,audio_1.wav,뉴스 보게 텔레비전 좀 틀어 놔라.
2,audio_2.wav,편의시설 뭐가 있나 사이트 검색해서 알려드리겠습니다.
3,audio_3.wav,빨리 원무과까지 길 안내 좀 해 주게.
4,audio_4.wav,목적지 선릉까지 지하철 몇 번 갈아타야 하는지 알려 줘.
...,...,...
13995,audio_13995.wav,시월 삼십일 마지막 수술 몇 시야?
13996,audio_13996.wav,병원 진찰 언제로 예약했었지?
13997,audio_13997.wav,음악 시간 준비물로 뭐가 있는지 알려줘요.
13998,audio_13998.wav,일호선을 얼마나 타고 가야 하는지 안내해 줬으면 해.


In [6]:
# Re-read the transcript table by its explicit file name; index_col=False keeps
# pandas from using the first column as the index. This replaces the `df` above.
df = pd.read_csv(TRAIN_PATH+'texts.csv', index_col=False)
# submission = pd.read_csv('/mnt/elice/dataset/sample_submission.csv', index_col=False)

## EDA

In [7]:
# Show the transcript of the first training row, then an audio player for its file.
print(df['text'][0])
ipd.Audio(TRAIN_PATH + df['filenames'][0]) # load a local WAV file

이미 달러 환율 어제보다 올랐어?


In [8]:
# Same check for the second training row: transcript text plus audio player.
print(df['text'][1])
ipd.Audio(TRAIN_PATH + df['filenames'][1]) # load a local WAV file

뉴스 보게 텔레비전 좀 틀어 놔라.


In [9]:
# Collect every path in the test directory.
# NOTE: sorted() orders the path strings lexicographically, so test_10.wav comes
# before test_2.wav (see the output below). Keep this in mind if predictions
# are later aligned with another file by position.
test_files = sorted(glob(TEST_PATH+'*'))
test_files

['/mnt/elice/dataset/test/test_0.wav',
 '/mnt/elice/dataset/test/test_1.wav',
 '/mnt/elice/dataset/test/test_10.wav',
 '/mnt/elice/dataset/test/test_100.wav',
 '/mnt/elice/dataset/test/test_1000.wav',
 '/mnt/elice/dataset/test/test_1001.wav',
 '/mnt/elice/dataset/test/test_1002.wav',
 '/mnt/elice/dataset/test/test_1003.wav',
 '/mnt/elice/dataset/test/test_1004.wav',
 '/mnt/elice/dataset/test/test_1005.wav',
 '/mnt/elice/dataset/test/test_1006.wav',
 '/mnt/elice/dataset/test/test_1007.wav',
 '/mnt/elice/dataset/test/test_1008.wav',
 '/mnt/elice/dataset/test/test_1009.wav',
 '/mnt/elice/dataset/test/test_101.wav',
 '/mnt/elice/dataset/test/test_1010.wav',
 '/mnt/elice/dataset/test/test_1011.wav',
 '/mnt/elice/dataset/test/test_1012.wav',
 '/mnt/elice/dataset/test/test_1013.wav',
 '/mnt/elice/dataset/test/test_1014.wav',
 '/mnt/elice/dataset/test/test_1015.wav',
 '/mnt/elice/dataset/test/test_1016.wav',
 '/mnt/elice/dataset/test/test_1017.wav',
 '/mnt/elice/dataset/test/test_1018.wav',
 '

In [10]:
# Audio player for the first path in test_files.
ipd.Audio(test_files[0])

## Data Preprocess & Create Dataset

In [11]:
from transformers import WhisperFeatureExtractor, WhisperTokenizer

# Load the feature extractor and tokenizer of the "openai/whisper-base" checkpoint.
# The tokenizer is configured for Korean transcription; prepare_dataset reads
# both objects as globals.
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-base")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base", language="Korean", task="transcribe")

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
from datasets import Dataset, DatasetDict
from datasets import Audio

# Build a Hugging Face dataset from the transcript table. TRAIN_PATH already
# ends with '/', so the file name is appended directly (no doubled slash), in
# the same way as the EDA cells. The "audio" column is cast to an Audio feature
# with a 16 kHz sampling rate.
ds = Dataset.from_dict({
    "audio": [TRAIN_PATH + file_path for file_path in df["filenames"]],
    "transcripts": list(df["text"]),
}).cast_column("audio", Audio(sampling_rate=16000))

# 80/20 train/valid split. The seed is passed explicitly so the split does not
# depend on global RNG state or on the order in which cells were executed.
train_valid = ds.train_test_split(test_size=0.2, seed=42)
train_valid_dataset = DatasetDict({
    "train": train_valid["train"],
    "valid": train_valid["test"]})

In [13]:
def prepare_dataset(batch):
    """Add model inputs and label ids to a single dataset example.

    Reads ``batch['audio']`` (its ``'array'`` and ``'sampling_rate'`` entries)
    and ``batch['transcripts']``, then stores the results under
    ``'input_features'`` and ``'labels'`` on the same mapping.

    Uses the module-level ``feature_extractor`` and ``tokenizer``.

    Returns:
        The same ``batch`` object, updated in place.
    """
    clip = batch['audio']

    # Raw waveform -> features; the first (only) item of the result is kept.
    extracted = feature_extractor(clip['array'], sampling_rate=clip['sampling_rate'])
    batch['input_features'] = extracted.input_features[0]

    # Transcript text -> token ids used as training labels.
    encoded = tokenizer(batch['transcripts'])
    batch['labels'] = encoded.input_ids

    return batch

In [16]:
# Run prepare_dataset over both splits with 4 worker processes. The columns
# listed for the 'train' split are removed from the result, leaving what
# prepare_dataset added.
train_valid_dataset = train_valid_dataset.map(prepare_dataset, remove_columns = train_valid_dataset.column_names['train'], num_proc=4)

Map (num_proc=4): 100%|██████████| 11200/11200 [07:07<00:00, 26.19 examples/s]
Map (num_proc=4): 100%|██████████| 2800/2800 [01:48<00:00, 25.81 examples/s]


In [31]:
# Persist the preprocessed splits to the local 'dataset' directory.
train_valid_dataset.save_to_disk('dataset')

Saving the dataset (22/22 shards): 100%|██████████| 11200/11200 [00:07<00:00, 1433.20 examples/s]
Saving the dataset (6/6 shards): 100%|██████████| 2800/2800 [00:03<00:00, 877.61 examples/s]


## Training

In [None]:
from datasets import load_from_disk

# Reload the splits from the 'dataset' directory written by save_to_disk above.
train_valid_dataset = load_from_disk('dataset')

### Data Collator

In [18]:
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate speech examples into one padded batch.

    Each example is a mapping with ``"input_features"`` and ``"labels"``.
    The two are padded separately: inputs through
    ``processor.feature_extractor.pad`` and labels through
    ``processor.tokenizer.pad``.
    """

    # Object exposing `.feature_extractor` and `.tokenizer`, each with a `pad` method.
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad ``features`` and return the batch with a ``"labels"`` entry added."""
        text_tokenizer = self.processor.tokenizer

        # Audio side: wrap each example's features and let the extractor
        # turn them into PyTorch tensors.
        audio_items = [{"input_features": example["input_features"]} for example in features]
        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")

        # Label side: pad the tokenized label sequences to a common length.
        label_items = [{"input_ids": example["labels"]} for example in features]
        padded = text_tokenizer.pad(label_items, return_tensors="pt")

        # Positions where the attention mask is not 1 are padding; mark them
        # with -100 so the loss ignores them.
        padding_positions = padded.attention_mask.ne(1)
        labels = padded["input_ids"].masked_fill(padding_positions, -100)

        # If every sequence starts with the tokenizer's bos token, drop that
        # first column; the token can be added back later.
        every_row_has_bos = (labels[:, 0] == text_tokenizer.bos_token_id).all().cpu().item()
        if every_row_has_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [19]:
from transformers import WhisperProcessor
# Load the processor, tokenizer and feature extractor of the model to be trained.
# `tokenizer` and `feature_extractor` are loaded with the same arguments as in
# the preprocessing section, so this cell also works when that section is skipped.
processor = WhisperProcessor.from_pretrained("openai/whisper-base", language="Korean", task="transcribe")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base", language="Korean", task="transcribe")
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-base")

In [20]:
# Initialize the data collator with the processor loaded above.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

### Evaluation Metrics

In [25]:
import evaluate

# Load the 'cer' metric from the evaluate library; compute_metrics calls it.
metric = evaluate.load('cer')

In [26]:
def compute_metrics(pred):
    """Compute the 'cer' score, scaled by 100, for a prediction object.

    Uses the module-level ``tokenizer`` and ``metric``.

    Args:
        pred: Object exposing ``predictions`` and ``label_ids``.
            ``label_ids`` is modified in place (see below).

    Returns:
        A dict with the single key ``"cer"``.
    """
    predicted_ids, reference_ids = pred.predictions, pred.label_ids

    # Put the pad token id back wherever the labels hold -100, so the label
    # ids can be decoded. This writes into pred.label_ids itself.
    ignored_positions = reference_ids == -100
    reference_ids[ignored_positions] = tokenizer.pad_token_id

    # Decode both sides without special tokens so they do not affect the score.
    hypotheses = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
    references = tokenizer.batch_decode(reference_ids, skip_special_tokens=True)

    score = metric.compute(predictions=hypotheses, references=references)
    return {"cer": 100 * score}

### Pretrained Checkpoint

In [27]:
from transformers import WhisperForConditionalGeneration

# Load the pretrained "openai/whisper-base" checkpoint as the model to fine-tune.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")

In [28]:
# Clear the forced decoder ids and the suppressed-token list on the model config.
# NOTE(review): presumably this leaves language/task selection to the label
# prefix during training; confirm that generation at evaluation time still
# targets Korean transcription with the installed transformers version.
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

### Training

In [30]:
from transformers import Seq2SeqTrainingArguments

# Training configuration.
#
# Step budget: the train split has 11200 examples, so with a per-device batch
# size of 16 and one epoch there are at most 700 optimizer steps. Evaluation and
# checkpointing therefore have to happen at an interval below 700; otherwise
# no CER is ever computed and load_best_model_at_end has no checkpoint to load.
training_args = Seq2SeqTrainingArguments(
    output_dir="repo_name",  # Enter the repository / output directory name you want.
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # Increase by 2x for every 2x decrease in batch size.
    learning_rate=1e-5,
    warmup_steps=50,
    num_train_epochs=1.0,

    gradient_checkpointing=True,
    fp16=True,

    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    dataloader_num_workers=8,

    # Evaluate and checkpoint at steps 350 and 700 (mid-epoch and end). Both
    # values must stay aligned because load_best_model_at_end is enabled.
    save_steps=350,
    eval_steps=350,
    logging_steps=25,
    # report_to=["wandb"],
    # The string "none" disables all logging integrations; passing None would
    # fall back to the library default instead of turning reporting off.
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="cer",
    greater_is_better=False,  # lower CER is better
)

In [None]:
from transformers import Seq2SeqTrainer

# Wire the model, data splits, collator and metric function into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    # Data: preprocessed splits plus the padding collator.
    data_collator=data_collator,
    train_dataset=train_valid_dataset["train"],
    eval_dataset=train_valid_dataset["valid"],
    # Evaluation: CER computed from generated predictions.
    compute_metrics=compute_metrics,
    # The feature extractor is handed over through the `tokenizer` argument.
    # NOTE(review): presumably so it is saved together with checkpoints - confirm.
    tokenizer=processor.feature_extractor,
)

In [None]:
# Start fine-tuning with the configuration defined above.
trainer.train()

## Submit

In [None]:
# Load the sample submission from the current working directory and display it.
# NOTE(review): the commented-out read near the top of the notebook points to
# '/mnt/elice/dataset/sample_submission.csv'; confirm which location is intended.
submission = pd.read_csv('sample_submission.csv')
submission

In [None]:
# Set every row of the 'text' column to the literal string 'text'.
# NOTE(review): this is a placeholder; no model prediction is written here.
# TODO: replace with transcriptions generated for the test audio files.
submission['text'] = 'text'
submission

In [None]:
# Write the frame back without the index column.
# NOTE(review): this overwrites the same 'sample_submission.csv' that was read above.
submission.to_csv('sample_submission.csv', index=False)