## Install Modules

In [1]:
%pip install --upgrade pip
%pip install --upgrade datasets transformers accelerate soundfile librosa evaluate jiwer tensorboard gradio

Collecting pip
  Obtaining dependency information for pip from https://files.pythonhosted.org/packages/47/6a/453160888fab7c6a432a6e25f8afe6256d0d9f2cbd25971021da6491d899/pip-23.3.1-py3-none-any.whl.metadata
  Downloading pip-23.3.1-py3-none-any.whl.metadata (3.5 kB)
Downloading pip-23.3.1-py3-none-any.whl (2.1 MB)
   ---------------------------------------- 0.0/2.1 MB ? eta -:--:--
    --------------------------------------- 0.0/2.1 MB 1.4 MB/s eta 0:00:02
   -------- ------------------------------- 0.5/2.1 MB 7.4 MB/s eta 0:00:01
   ---------------- ----------------------- 0.9/2.1 MB 8.0 MB/s eta 0:00:01
   ------------------------ --------------- 1.3/2.1 MB 8.3 MB/s eta 0:00:01
   -------------------------------- ------- 1.7/2.1 MB 8.4 MB/s eta 0:00:01
   ---------------------------------------  2.1/2.1 MB 8.4 MB/s eta 0:00:01
   ---------------------------------------- 2.1/2.1 MB 7.9 MB/s eta 0:00:00
Installing collected packages: pip
Successfully installed pip-23.3.1
Note: you may 

## Import Modules

In [1]:
import pandas as pd
from glob import glob
import IPython.display as ipd

## Fix Seed

In [None]:
import numpy as np
import random
import os
import torch
import tensorflow as tf

In [None]:
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    torch.backends.cudnn.benchmark = True  # type: ignore
seed_everything()

In [None]:
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    tf.random.set_seed(seed)
seed_everything()

## Read Files

In [2]:
TRAIN_PATH = '/mnt/elice/dataset/train/'
TEST_PATH = '/mnt/elice/dataset/test/'

In [3]:
df = pd.read_csv(glob('/mnt/elice/dataset/train/*.csv')[0])
df

IndexError: list index out of range

In [None]:
df = pd.read_csv('/root/rokafnet/dataset_files/train/texts.csv', index_col=False)
submission = pd.read_csv('/root/rokafnet/dataset_files/sample_submission.csv', index_col=False)

## EDA

In [None]:
print(df['text'][0])
ipd.Audio(TRAIN_PATH + df['filenames'][0]) # load a local WAV file

In [None]:
print(df['text'][1])
ipd.Audio(TRAIN_PATH + df['filenames'][1]) # load a local WAV file

In [None]:
test_files = sorted(glob(TEST_PATH+'*'))
test_files

In [None]:
ipd.Audio(test_files[0])

## Data Preprocess & Create Dataset

In [None]:
from transformers import WhisperFeatureExtractor, WhisperTokenizer

# load feature extractor and tokenizer
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-base")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base", language="Korean", task="transcribe")

In [None]:
from datasets import Dataset, DatasetDict
from datasets import Audio

# create dataset from csv
ds = Dataset.from_dict({"audio": [f'{TRAIN_PATH}/{file_path}' for file_path in df["filenames"]],
                       "transcripts": [text for text in df["text"]]}).cast_column("audio", Audio(sampling_rate=16000))

# train/valid split
train_valid = ds.train_test_split(test_size=0.2)
train_valid_dataset = DatasetDict({
    "train": train_valid["train"],
    "valid": train_valid["test"]})

In [None]:
def prepare_dataset(batch):
    audio = batch['audio']

    # raw form(audio['array']) -> log-Mel spectrogram
    batch['input_features'] = feature_extractor(audio['array'], sampling_rate=audio['sampling_rate']]).input_features[0]
    
    # target text -> label ids(by tokenizer)
    batch['labels'] = tokenizer(batch['transcripts']).input_ids

    return batch

In [None]:
train_valid_dataset = train_valid_dataset.map(prepare_dataset, remove_columns = train_valid_dataset.column_names['train'], num_proc=4)

## Submit

In [None]:
submission = pd.read_csv('sample_submission.csv')
submission

In [None]:
submission['text'] = 'text'
submission

In [None]:
submission.to_csv('sample_submission.csv', index=False)