## Import Modules

In [None]:
from glob import glob
import pandas as pd
import numpy as np
import random
import os
import torch
import wandb

import IPython.display as ipd
from scipy.io import wavfile
import noisereduce as nr

from datasets import Dataset, DatasetDict, Audio

from transformers import WhisperTokenizer,  WhisperFeatureExtractor, WhisperProcessor

import re
import librosa
from tqdm.auto import tqdm

## Fix Seed

In [None]:
def seed_everything(seed: int = 42) -> None:
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), and configures cuDNN for deterministic
    behaviour.

    Args:
        seed: Value used to seed every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Only inherited by subprocesses started after this point: hash
    # randomisation of the running interpreter is fixed at start-up.
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    # Also cover every additional GPU when more than one is visible.
    torch.cuda.manual_seed_all(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    # Benchmark mode lets cuDNN auto-tune and pick different kernels from one
    # run to the next, which defeats the deterministic flag set above.
    torch.backends.cudnn.benchmark = False  # type: ignore


seed_everything()

## Config

In [None]:
# Notebook-wide settings:
#   model           - checkpoint name passed to the Whisper feature extractor
#                     and tokenizer
#   sr              - sampling rate (Hz) the audio columns are cast to
#   noise_file_path - WAV file read as the noise reference clip
CFG = dict(
    model='openai/whisper-tiny',
    sr=16000,
    noise_file_path='../files/noise.wav',
)

## Read Files

In [None]:
# Dataset folders; both values end with a trailing slash.
TRAIN_PATH, TEST_PATH = (
    '/mnt/elice/dataset/train/',
    '/mnt/elice/dataset/test/',
)

In [None]:
# Training table; later cells read its 'filenames' and 'text' columns.
# os.path.join avoids the doubled slash that plain formatting produces,
# because TRAIN_PATH already ends with '/'.
df = pd.read_csv(os.path.join(TRAIN_PATH, 'texts.csv'), index_col=False)
# Submission template, read from the working directory; later cells read its
# 'path' column.
submission = pd.read_csv('sample_submission.csv', index_col=False)

## Label Cleaning

In [None]:
def clean_text(text, remove_space=True):
    """Remove ASCII punctuation from ``text``.

    Args:
        text: String to clean.
        remove_space: When true, all whitespace is removed as well; when
            false, whitespace is left exactly as it was.

    Returns:
        The cleaned string.
    """
    no_punct = re.sub(r'[!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~\\\\]','', text)
    if not remove_space:
        return no_punct
    # split() without arguments breaks on any whitespace run, so re-joining
    # the pieces drops every whitespace character.
    return ''.join(no_punct.split())

In [None]:
# Label cleaning: strip punctuation from the transcripts but keep the
# whitespace between words (remove_space=False).
df['text'] = df['text'].apply(clean_text, remove_space=False)

# Drop the one recording treated as an outlier.
df = df[df['filenames'] != 'audio_5497.wav']

# exist_ok=True replaces the separate exists() check, so there is no window
# between checking for the folder and creating it.
os.makedirs('preprocess', exist_ok=True)

df.to_csv('preprocess/clean_df.csv', index=False)

## Split DataFrames into Long/Short Clips

In [None]:
def split_dataframe(df, df_name, is_train=True, max_seconds=30):
    """Split ``df`` by clip duration and write both halves to CSV.

    Rows whose audio lasts ``max_seconds`` or longer are written to
    ``preprocess/long_{df_name}.csv``; all other rows are written to
    ``preprocess/short_{df_name}.csv``. Rows are written unchanged.

    Args:
        df: Table with a ``filenames`` column (resolved against
            ``TRAIN_PATH``) when ``is_train`` is true, otherwise a ``path``
            column that is used as-is.
        df_name: Suffix used in the output file names.
        is_train: Selects which column holds the audio location.
        max_seconds: Duration threshold in seconds (inclusive for "long").
    """
    durations = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        if is_train:
            path = os.path.join(TRAIN_PATH, row['filenames'])
        else:
            path = row['path']
        # sr=None keeps the file's native sampling rate: only the duration is
        # needed, so resampling every clip would be wasted work.
        wav, fs = librosa.load(path, sr=None)
        durations.append(len(wav) / fs)

    is_long = np.asarray(durations, dtype=float) >= max_seconds

    # Do not rely on another cell having created the output folder.
    os.makedirs('preprocess', exist_ok=True)

    df[is_long].to_csv(f'preprocess/long_{df_name}.csv', index=False)
    df[~is_long].to_csv(f'preprocess/short_{df_name}.csv', index=False)

In [None]:
# Write preprocess/{long,short}_df.csv for the training table and
# preprocess/{long,short}_test.csv for the submission table.
split_dataframe(df, df_name='df')
split_dataframe(submission, df_name='test', is_train=False)

## Data Preprocessing & Train Dataset

In [None]:
# load feature extractor and tokenizer
feature_extractor = WhisperFeatureExtractor.from_pretrained(CFG['model'])
tokenizer = WhisperTokenizer.from_pretrained(CFG['model'], language="Korean", task="transcribe")

# Load the noise reference as mono floating-point samples resampled to
# CFG['sr'], the rate the dataset audio is cast to. Reading the raw WAV
# samples instead would discard the file's own sampling rate and sample format.
# NOTE(review): confirm that nr.reduce_noise actually uses y_noise in its
# default (non-stationary) mode; it may only be used with stationary=True.
noise_array, _ = librosa.load(CFG["noise_file_path"], sr=CFG["sr"])

In [None]:
def prepare_dataset(batch):
    """Turn one training record into model inputs and label ids.

    Args:
        batch: Record with an ``audio`` entry (providing ``array`` and
            ``sampling_rate``) and a ``transcripts`` entry.

    Returns:
        The same record with ``input_features`` and ``labels`` added.
    """
    audio = batch['audio']
    # Use the rate that belongs to this array for both steps, so noise
    # reduction and feature extraction can never disagree about it.
    sampling_rate = audio['sampling_rate']
    reduced_noise_audio = nr.reduce_noise(y=audio['array'], sr=sampling_rate, y_noise=noise_array)

    # raw form(reduced_noise_audio) -> log-Mel spectrogram
    batch['input_features'] = feature_extractor(reduced_noise_audio, sampling_rate=sampling_rate).input_features[0]

    # target text -> label ids(by tokenizer)
    batch['labels'] = tokenizer(batch['transcripts']).input_ids

    return batch

In [None]:
def create_train_datasets(df, dir_name='dataset', test_size=0.2, seed=None, num_proc=4):
    """Build, preprocess and save the train/valid datasets.

    Args:
        df: Table with ``filenames`` (relative to ``TRAIN_PATH``) and
            ``text`` columns.
        dir_name: Folder the resulting ``DatasetDict`` is saved to.
        test_size: Share of the records placed in the ``valid`` split.
        seed: Optional seed forwarded to ``train_test_split``; ``None`` keeps
            the library's default behaviour.
        num_proc: Number of worker processes used by ``map``.
    """
    # create dataset from csv
    audio_paths = [os.path.join(TRAIN_PATH, file_name) for file_name in df["filenames"]]
    ds = Dataset.from_dict({"audio": audio_paths, "transcripts": df["text"].tolist()})
    ds = ds.cast_column("audio", Audio(sampling_rate=CFG['sr']))

    # train/valid split
    train_valid = ds.train_test_split(test_size=test_size, seed=seed)
    train_valid_dataset = DatasetDict({
        "train": train_valid["train"],
        "valid": train_valid["test"],
    })

    # Drop the raw columns so only the outputs of prepare_dataset are stored.
    train_valid_dataset = train_valid_dataset.map(
        prepare_dataset,
        remove_columns=train_valid_dataset.column_names['train'],
        num_proc=num_proc,
    )

    # makedirs also handles nested output folders and an existing folder.
    os.makedirs(dir_name, exist_ok=True)

    train_valid_dataset.save_to_disk(dir_name)

In [None]:
# Alternative: build one dataset from the whole cleaned table.
# create_train_datasets(df)

# Build the training dataset from the clips shorter than 30 s, as listed in
# the CSV written by split_dataframe.
short_df = pd.read_csv('preprocess/short_df.csv', index_col=False)
create_train_datasets(df=short_df, dir_name='dataset_short')

# Optional: also build a dataset from the long clips.
# long_df = pd.read_csv('preprocess/long_df.csv', index_col=False)
# create_train_datasets(long_df, dir_name='dataset_long')

## Test Dataset

In [None]:
def prepare_test_dataset(batch):
    """Turn one test record into model inputs.

    Args:
        batch: Record with an ``audio`` entry providing ``array`` and
            ``sampling_rate``.

    Returns:
        The same record with ``input_features`` added.
    """
    audio = batch['audio']
    # Use the rate that belongs to this array for both steps, so noise
    # reduction and feature extraction can never disagree about it.
    sampling_rate = audio['sampling_rate']
    reduced_noise_audio = nr.reduce_noise(y=audio['array'], sr=sampling_rate, y_noise=noise_array)

    # raw form(reduced_noise_audio) -> log-Mel spectrogram
    batch['input_features'] = feature_extractor(reduced_noise_audio, sampling_rate=sampling_rate).input_features[0]

    return batch

In [None]:
def create_test_dataset(df, dir_name='dataset_test', num_proc=4):
    """Build, preprocess and save the test dataset.

    Args:
        df: Table whose ``path`` column holds the audio file locations.
        dir_name: Folder the resulting dataset is saved to.
        num_proc: Number of worker processes used by ``map``.
    """
    # create dataset from csv
    test_dataset = Dataset.from_dict({"audio": df["path"].tolist()})
    test_dataset = test_dataset.cast_column("audio", Audio(sampling_rate=CFG['sr']))

    # test data preprocess; the raw audio column is dropped afterwards
    test_dataset = test_dataset.map(
        prepare_test_dataset,
        remove_columns=test_dataset.column_names,
        num_proc=num_proc,
    )

    # save test dataset; makedirs also handles nested or existing folders
    os.makedirs(dir_name, exist_ok=True)

    test_dataset.save_to_disk(dir_name)

In [None]:
# Build the test dataset from the full submission table.
create_test_dataset(df=submission)

# Optional: build separate long/short test datasets from the CSV files
# written by split_dataframe.
# long_test = pd.read_csv('preprocess/long_test.csv', index_col=False)
# short_test = pd.read_csv('preprocess/short_test.csv', index_col=False)

# create_test_dataset(long_test, dir_name='dataset_long_test')
# create_test_dataset(short_test, dir_name='dataset_short_test')