## Package load

In [3]:
import os
import pickle
import torchaudio
import librosa

from datasets import load_dataset
from transformers import Wav2Vec2Processor

## Path setting

In [4]:
# CSV manifests for each split, keyed by the split name used to index the loaded dataset.
data_files = dict(
    train="../../KEMDy20_v1_1/Splitting/Train.csv",
    test="../../KEMDy20_v1_1/Splitting/Test.csv",
)

## Data load

In [5]:
# Load both CSV manifests; the result is indexed by split name.
dataset = load_dataset("csv", data_files=data_files)
train_dataset, test_dataset = dataset["train"], dataset["test"]

# Show the columns and row count of each split.
print(train_dataset, test_dataset, sep="\n")

Downloading and preparing dataset csv/default to C:/Users/user/.cache/huggingface/datasets/csv/default-61270952849715ef/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to C:/Users/user/.cache/huggingface/datasets/csv/default-61270952849715ef/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Dataset({
    features: ['Emotion', 'Path'],
    num_rows: 25890
})
Dataset({
    features: ['Emotion', 'Path'],
    num_rows: 6312
})


## Columns definition

In [6]:
# Manifest columns: audio file path (model input) and emotion label (target).
input_column, output_column = "Path", "Emotion"

## Check classes

In [7]:
# Collect the distinct emotion labels found in the training split of the SER
# dataset; sorting keeps the label -> index mapping identical across runs.
label_list = sorted(train_dataset.unique(output_column))
num_labels = len(label_list)
print(f"A classification problem with {num_labels} classes: {label_list}")

A classification problem with 7 classes: ['angry', 'disqust', 'fear', 'happy', 'neutral', 'sad', 'surprise']


## Pretrained model name definition

In [8]:
# Pretrained checkpoint identifier; passed to Wav2Vec2Processor.from_pretrained below.
model_name_or_path = "kresnik/wav2vec2-large-xlsr-korean"

## Define the processor and check its sampling rate

In [9]:
# Load the processor that belongs to the checkpoint and read the sampling rate
# its feature extractor is configured with; audio is resampled to this rate.
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)
target_sampling_rate = processor.feature_extractor.sampling_rate
print(f"The target sampling rate: {target_sampling_rate}")

Downloading (…)rocessor_config.json:   0%|          | 0.00/214 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)okenizer_config.json:   0%|          | 0.00/161 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/2.31k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/18.2k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


The target sampling rate: 16000


## Extract function definition

In [10]:
def speech_file_to_array_fn(path):
    """Load an audio file as a mono 1-D NumPy array at ``target_sampling_rate``.

    Args:
        path: Location of the audio file, as accepted by ``torchaudio.load``.

    Returns:
        A 1-D NumPy array of samples at the module-level
        ``target_sampling_rate``.
    """
    speech_array, sampling_rate = torchaudio.load(path)

    # torchaudio.load returns a (channels, frames) tensor. Average the channels
    # so a multi-channel file becomes a 1-D signal; a plain squeeze() would
    # leave it 2-D and the processor would treat it as a single utterance.
    if speech_array.dim() > 1:
        speech_array = speech_array.mean(dim=0)

    # Only build a resampler when the file is not already at the target rate.
    if sampling_rate != target_sampling_rate:
        resampler = torchaudio.transforms.Resample(sampling_rate, target_sampling_rate)
        speech_array = resampler(speech_array)

    return speech_array.numpy()

def label_to_id(label, label_list):
    """Translate a label into its position within ``label_list``.

    Returns the index of ``label`` in ``label_list``, or -1 when the label is
    not present. When ``label_list`` is empty, ``label`` is returned unchanged.
    """
    if not label_list:
        return label

    try:
        return label_list.index(label)
    except ValueError:
        # list.index raises ValueError for a missing item; report it as -1.
        return -1

def preprocess_function(examples):
    """Turn a batch of manifest rows into processor outputs with label ids.

    Args:
        examples: Batch mapping that holds the audio paths under
            ``input_column`` and the emotion labels under ``output_column``.

    Returns:
        The processor output for the batch, with the integer label ids stored
        under the ``"labels"`` key.
    """
    waveforms = []
    for audio_path in examples[input_column]:
        waveforms.append(speech_file_to_array_fn(audio_path))

    label_ids = [label_to_id(name, label_list) for name in examples[output_column]]

    batch = processor(waveforms, sampling_rate=target_sampling_rate)
    batch["labels"] = label_ids
    return batch

## Audio feature extraction for the train and test splits

In [None]:
# Apply preprocess_function to each split in batches of 100 rows; every audio
# file referenced by a split is loaded during this step.
train_dataset = train_dataset.map(preprocess_function, batched=True, batch_size=100)

test_dataset = test_dataset.map(preprocess_function, batched=True, batch_size=100)

## Save train_dataset, test_dataset

In [12]:
# Directory that receives the processed splits.
save_path = "../../KEMDy20_v1_1/Dataset/"

# makedirs(exist_ok=True) creates any missing parent directories and does not
# fail when the directory already exists, unlike an exists() check + mkdir().
os.makedirs(save_path, exist_ok=True)

train_dataset.save_to_disk(os.path.join(save_path, "train_dataset"))
test_dataset.save_to_disk(os.path.join(save_path, "test_dataset"))

Saving the dataset (0/39 shards):   0%|          | 0/25890 [00:00<?, ? examples/s]

Saving the dataset (0/10 shards):   0%|          | 0/6312 [00:00<?, ? examples/s]

## Zip train_dataset & test_dataset

In [2]:
import zipfile

# Dataset folders to compress
train_dataset_path = "../../KEMDy20_v1_1/Dataset/train_dataset"
test_dataset_path = "../../KEMDy20_v1_1/Dataset/test_dataset"

# Destination of the compressed archive
zip_file_path = "../../KEMDy20_v1_1/Dataset.zip"


def _archive_name(file_path):
    """Return a zip member name for ``file_path`` without ``.``/``..`` parts.

    The dataset paths start with ``../../``; writing them without an explicit
    ``arcname`` stores those parent-directory components in the archive.
    Dropping them keeps the remaining folder layout
    (``KEMDy20_v1_1/Dataset/...``) and yields plain relative member names.
    """
    relative = os.path.splitdrive(os.path.normpath(file_path))[1]
    parts = relative.split(os.sep)
    return "/".join(p for p in parts if p not in ("", os.curdir, os.pardir))


# Create the archive
with zipfile.ZipFile(zip_file_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
    # Add every file found under the train and test dataset folders
    for dataset_path in (train_dataset_path, test_dataset_path):
        for root, _dirs, files in os.walk(dataset_path):
            for file in files:
                file_path = os.path.join(root, file)
                zipf.write(file_path, arcname=_archive_name(file_path))