## Install Modules

In [1]:
!pip install datasets>=2.6.1
!pip install git+https://github.com/huggingface/transformers
!pip install evaluate>=0.30
!pip install jiwer
!pip install accelerate -U
!pip install transformers[torch]

^C
[31mERROR: Operation cancelled by user[0m
Note: you may need to restart the kernel to use updated packages.
Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-5uw55khz
  Running command git clone -q https://github.com/huggingface/transformers /tmp/pip-req-build-5uw55khz
^C
[31mERROR: Operation cancelled by user[0m
Note: you may need to restart the kernel to use updated packages.


## Import Modules

In [2]:
import pandas as pd
from glob import glob
import IPython.display as ipd

## Fix Seed

In [3]:
import numpy as np
import random
import os
import torch
import tensorflow as tf

2023-11-29 17:21:22.494799: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-11-29 17:21:23.362691: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64
2023-11-29 17:21:23.362782: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64


In [4]:
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    torch.backends.cudnn.benchmark = True  # type: ignore
seed_everything()

In [5]:
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    tf.random.set_seed(seed)
seed_everything()

## Read Files

In [6]:
TRAIN_PATH = '/mnt/elice/dataset/train/'
TEST_PATH = '/mnt/elice/dataset/test/'

In [8]:
len(glob(f'{TRAIN_PATH}/*'))

3178

In [10]:
df = pd.read_csv(glob(f'{TRAIN_PATH}/*.csv')[0])
df

IndexError: list index out of range

In [None]:
df = pd.read_csv(f'{TRAIN_PATH}/texts.csv', index_col=False)
submission = pd.read_csv(f'/root/dataset_file/sample_submission.csv', index_col=False)

## EDA

In [None]:
print(df['text'][0])
ipd.Audio(TRAIN_PATH + df['filenames'][0]) # load a local WAV file

In [None]:
print(df['text'][1])
ipd.Audio(TRAIN_PATH + df['filenames'][1]) # load a local WAV file

In [None]:
test_files = sorted(glob(TEST_PATH+'*'))
test_files

In [None]:
ipd.Audio(test_files[0])

## Data Preprocess & Create Dataset

In [None]:
from transformers import WhisperTokenizer,  WhisperFeatureExtractor
from transformers import WhisperProcessor

# load feature extractor and tokenizer
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-base")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base", language="Korean", task="transcribe")

In [None]:
from datasets import Dataset, DatasetDict
from datasets import Audio

# create dataset from csv
ds = Dataset.from_dict({"audio": [f'{TRAIN_PATH}/{file_path}' for file_path in df["filenames"]],
                       "transcripts": [text for text in df["text"]]}).cast_column("audio", Audio(sampling_rate=16000))

# train/valid split
train_valid = ds.train_test_split(test_size=0.2)
train_valid_dataset = DatasetDict({
    "train": train_valid["train"],
    "valid": train_valid["test"]})

In [None]:
def prepare_dataset(batch):
    audio = batch['audio']

    # raw form(audio['array']) -> log-Mel spectrogram
    batch['input_features'] = feature_extractor(audio['array'], sampling_rate=audio['sampling_rate']]).input_features[0]
    
    # target text -> label ids(by tokenizer)
    batch['labels'] = tokenizer(batch['transcripts']).input_ids

    return batch

In [None]:
train_valid_dataset = train_valid_dataset.map(prepare_dataset, remove_columns = train_valid_dataset.column_names['train'], num_proc=4)

## Training

### Data Collator

In [None]:
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # 인풋 데이터와 라벨 데이터의 길이가 다르며, 따라서 서로 다른 패딩 방법이 적용되어야 한다. 그러므로 두 데이터를 분리해야 한다.
        # 먼저 오디오 인풋 데이터를 간단히 토치 텐서로 반환하는 작업을 수행한다.
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # Tokenize된 레이블 시퀀스를 가져온다.
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # 레이블 시퀀스에 대해 최대 길이만큼 패딩 작업을 실시한다.
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # 패딩 토큰을 -100으로 치환하여 loss 계산 과정에서 무시되도록 한다.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # 이전 토크나이즈 과정에서 bos 토큰이 추가되었다면 bos 토큰을 잘라낸다.
        # 해당 토큰은 이후 언제든 추가할 수 있다.
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [None]:
# 훈련시킬 모델의 processor, tokenizer, feature extractor 로드
processor = WhisperProcessor.from_pretrained("openai/whisper-base", language="Korean", task="transcribe")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base", language="Korean", task="transcribe")
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-base")

In [None]:
# 데이터 콜레이터 초기화
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

### Evaluation Metrics

In [None]:
import evaluate

metric = evaluate.load('cer')

In [None]:
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # pad_token을 -100으로 치환
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # metrics 계산 시 special token들을 빼고 계산하도록 설정
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    cer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"cer": cer}

### Pretrained Checkpoint

In [None]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")

In [None]:
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

### Training

In [None]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="repo_name",  # 원하는 리포지토리 이름을 임력한다.
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # 배치 크기가 2배 감소할 때마다 2배씩 증가
    learning_rate=1e-5,
    warmup_steps=50,
    max_train_epochs=1.0,

    gradient_checkpointing=True,
    fp16=True,

    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    # report_to=["wandb"],
    load_best_model_at_end=True,
    metric_for_best_model="cer",
    greater_is_better=False,
)

In [None]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=train_valid_dataset["train"],
    eval_dataset=train_valid_dataset["valid"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

In [None]:
trainer.train()

## Predict

In [12]:
!pip install datasets

Collecting datasets
  Using cached datasets-2.15.0-py3-none-any.whl (521 kB)
Collecting pyarrow-hotfix
  Using cached pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting huggingface-hub>=0.18.0
  Using cached huggingface_hub-0.19.4-py3-none-any.whl (311 kB)
Collecting xxhash
  Using cached xxhash-3.4.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (193 kB)
Collecting multiprocess
  Using cached multiprocess-0.70.15-py39-none-any.whl (133 kB)
Collecting huggingface-hub>=0.18.0
  Using cached huggingface_hub-0.19.3-py3-none-any.whl (311 kB)
  Using cached huggingface_hub-0.19.2-py3-none-any.whl (311 kB)
  Using cached huggingface_hub-0.19.1-py3-none-any.whl (311 kB)
  Using cached huggingface_hub-0.19.0-py3-none-any.whl (311 kB)
  Using cached huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
INFO: pip is looking at multiple versions of frozenlist to determine which version is compatible with other requirements. This could take a while.
Collecting frozenlist>=1.1.1
  Using 

In [11]:
from datasets import Dataset, DatasetDict
from datasets import Audio

# create dataset from csv
test_ds = Dataset.from_dict({"audio": [f'{TEST_PATH}/{file_path}' for file_path in submission["filenames"]]})
test_ds = test_ds.cast_column("audio", Audio(sampling_rate=16000))


ModuleNotFoundError: No module named 'datasets'

## Submit

In [None]:
submission = pd.read_csv('sample_submission.csv')
submission

In [None]:
submission['text'] = 'text'
submission

In [None]:
submission.to_csv('sample_submission.csv', index=False)