In [11]:
import os
import ast
import pickle

from datasets import load_dataset, load_metric
from transformers import Wav2Vec2Processor, Wav2Vec2FeatureExtractor

In [12]:
# CSV manifest for each split, keyed by the split name handed to `load_dataset`.
data_files = dict(
    train="../../KEMDy20_v1_1/Splitting/Train.csv",
    test="../../KEMDy20_v1_1/Splitting/Test.csv",
    train_aug="../../KEMDy20_v1_1/Splitting/Train_aug.csv",
)

In [13]:
# Read all CSV manifests in one call; the result is indexed by split name.
dataset = load_dataset("csv", data_files=data_files)

# Pull the individual splits out under their own names for later cells.
train_dataset, test_dataset, train_aug_dataset = (
    dataset["train"],
    dataset["test"],
    dataset["train_aug"],
)

# Show the summary of each split, one per line.
print(train_dataset, test_dataset, train_aug_dataset, sep="\n")

Found cached dataset csv (C:/Users/Yechani/.cache/huggingface/datasets/csv/default-d6b4b419863ca901/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/3 [00:00<?, ?it/s]

Dataset({
    features: ['Emotion', 'Path'],
    num_rows: 10391
})
Dataset({
    features: ['Emotion', 'Path'],
    num_rows: 2437
})
Dataset({
    features: ['Emotion', 'Path'],
    num_rows: 29765
})


In [14]:
import torch
import torchaudio
import torchaudio.transforms as T
import librosa
import IPython.display as ipd
import numpy as np

# Pretrained checkpoint identifier passed to `from_pretrained`.
model_name = "kresnik/wav2vec2-large-xlsr-korean"

# Dataset column that holds the path of each audio file.
input_column = "Path"

In [15]:
# Load the processor that belongs to the pretrained checkpoint.
processor = Wav2Vec2Processor.from_pretrained(model_name)

# Sampling rate the feature extractor is configured for; loaded audio is
# resampled to this rate before feature extraction.
target_sampling_rate = processor.feature_extractor.sampling_rate

# Only `input_values` are consumed later, so do not emit an attention mask.
processor.feature_extractor.return_attention_mask = False

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [16]:
def speech_file_to_array_fn(path, fixed_length=100000):
    """Load an audio file as a mono waveform with exactly ``fixed_length`` samples.

    The file is read with ``torchaudio.load``, down-mixed to a single channel,
    resampled to the module-level ``target_sampling_rate`` and then zero-padded
    at the end or truncated so that every clip has the same length.

    Args:
        path: Location of the audio file to read.
        fixed_length: Number of samples in the returned waveform, counted
            after resampling. Defaults to 100000, the value this notebook
            has always used.

    Returns:
        A 1-D ``numpy.ndarray`` of length ``fixed_length``.
    """
    speech_array, sampling_rate = torchaudio.load(path)

    # Multi-channel files come back as (channels, frames). Average the channels
    # so the length logic below always works on the time axis; a bare squeeze()
    # would leave stereo input 2-D and break padding/truncation.
    if speech_array.dim() > 1:
        speech_array = speech_array.mean(dim=0, keepdim=True)

    resampler = torchaudio.transforms.Resample(sampling_rate, target_sampling_rate)
    # reshape(-1) rather than squeeze(): a one-sample clip must stay 1-D.
    speech = resampler(speech_array).reshape(-1)

    speech_length = speech.size(0)
    if speech_length < fixed_length:
        # Right-pad with zeros (silence) up to the requested length.
        speech = torch.nn.functional.pad(speech, (0, fixed_length - speech_length))
    else:
        # Keep only the leading samples; a no-op when the length already matches.
        speech = speech[:fixed_length]
    return speech.numpy()

def preprocess_function(examples):
    """Turn the audio paths of ``examples`` into model-ready features.

    Every path found under ``examples[input_column]`` is loaded with
    ``speech_file_to_array_fn``; the resulting waveforms are then handed to the
    module-level ``processor`` together with ``target_sampling_rate``.

    Args:
        examples: Mapping-like object indexable by ``input_column`` that yields
            the audio file paths.

    Returns:
        Whatever ``processor`` returns for the list of waveforms.
    """
    waveforms = list(map(speech_file_to_array_fn, examples[input_column]))
    return processor(waveforms, sampling_rate=target_sampling_rate)

In [17]:
# Eagerly load and featurize every audio file of each split.
# Kept as one statement per split so that results already computed stay bound
# in the kernel if a later split fails part-way through.
train_dataset_ex = preprocess_function(train_dataset)
test_dataset_ex = preprocess_function(test_dataset)
train_aug_dataset_ex = preprocess_function(train_aug_dataset)

In [18]:
# Stack the per-clip `input_values` of each split into a single array,
# in the order train, test, augmented train.
train_x, test_x, train_aug_x = (
    np.vstack(features["input_values"])
    for features in (train_dataset_ex, test_dataset_ex, train_aug_dataset_ex)
)

In [19]:
# Directory that receives the extracted feature archive.
save_path = "../../KEMDy20_v1_1/Extract/"

# makedirs(..., exist_ok=True) creates missing parent directories as well and
# does not fail when the directory already exists, which removes the
# check-then-create race of `os.path.exists` followed by `os.mkdir`.
os.makedirs(save_path, exist_ok=True)

In [20]:
# Write features and labels of all three splits into one .npz archive.
# Each mapping key becomes the name of the array inside the archive.
np.savez(
    save_path + "Dataset",
    **{
        "train_x": train_x,
        "train_y": train_dataset["Emotion"],
        "test_x": test_x,
        "test_y": test_dataset["Emotion"],
        "train_aug_x": train_aug_x,
        "train_aug_y": train_aug_dataset["Emotion"],
    },
)