In [14]:
import os

import librosa
import pandas as pd
from transformers import Wav2Vec2FeatureExtractor

In [5]:
# Tag of the dataset variant; it names the output directories further down and
# is the value the (disabled) per-dataset filter would compare against.
ds = 'all_6'

# Build the metadata table in one chain:
#   1. read the CSV index of the custom database,
#   2. expose the file column under the name `path`,
#   3. drop the first two characters of every stored path (presumably a leading
#      './' -- confirm against df.csv) and prefix the database directory,
#   4. discard every row labelled 'surprise'.
df = (
    pd.read_csv('../Datasets/custom_db/df.csv')
    .rename(columns={'file': 'path'})
    .assign(path=lambda frame: frame['path'].map(lambda rel: '../Datasets/custom_db/' + rel[2:]))
    # Optional restriction to a single source dataset (currently disabled):
    # .loc[lambda frame: frame['dataset'] == ds]
    .loc[lambda frame: frame['emotion'] != 'surprise']
)

In [6]:
# Preview the first rows of the filtered metadata table.
df.head()

Unnamed: 0,path,emotion,split,dataset,augmentation
0,../Datasets/custom_db/tess/train/angry_1068.wav,angry,train,tess,none
2,../Datasets/custom_db/tess/train/fear_248.wav,fear,train,tess,none
3,../Datasets/custom_db/tess/train/happy_2790.wav,happy,train,tess,none
5,../Datasets/custom_db/tess/train/sad_1820.wav,sad,train,tess,none
6,../Datasets/custom_db/tess/train/happy_2658.wav,happy,train,tess,none


In [7]:
# Show which emotion labels remain after filtering ...
print("Labels: ", df["emotion"].unique())
print()
# ... and how many rows with a non-null path each label has (class balance).
df.groupby("emotion")[["path"]].count()

Labels:  ['angry' 'fear' 'happy' 'sad' 'neutral' 'disgust']



Unnamed: 0_level_0,path
emotion,Unnamed: 1_level_1
angry,8654
disgust,8656
fear,8646
happy,8646
neutral,8514
sad,8654


In [8]:
# Directory that receives the tab-separated split files.
save_path = "./"

# Rows tagged 'augment' are trained on together with the plain 'train' rows;
# 'val' and 'test' rows are kept as separate frames. Each frame gets a fresh
# 0..n-1 index.
train_df = df[df['split'].isin(['train', 'augment'])].reset_index(drop=True)
val_df = df[df['split'] == 'val'].reset_index(drop=True)
test_df = df[df['split'] == 'test'].reset_index(drop=True)

# os.path.join yields the same file whether or not `save_path` ends with a
# separator (the previous f"{save_path}/train.csv" produced ".//train.csv"),
# so the path matches what the loading cells build.
train_df.to_csv(os.path.join(save_path, "train.csv"), sep="\t", encoding="utf-8", index=False)
val_df.to_csv(os.path.join(save_path, "val.csv"), sep="\t", encoding="utf-8", index=False)
test_df.to_csv(os.path.join(save_path, "test.csv"), sep="\t", encoding="utf-8", index=False)


# Sanity check: (rows, columns) of every split.
print(train_df.shape)
print(val_df.shape)
print(test_df.shape)

(48316, 5)
(1152, 5)
(2302, 5)


## Prepare Data for Training

In [9]:
# Loading the created dataset using datasets
# NOTE(review): `load_metric` is not used by any cell visible here; it is kept
# only in case a later cell relies on it -- confirm and drop if unused.
from datasets import load_dataset, load_metric


# Build the split paths with os.path.join so they resolve correctly whether or
# not `save_path` ends with a separator. The previous f"{save_path}train.csv"
# only worked because `save_path` happened to be "./".
data_files = {
    "train": os.path.join(save_path, "train.csv"),
    "validation": os.path.join(save_path, "val.csv"),
}

# The split files were written tab-separated, so the delimiter must match.
dataset = load_dataset("csv", data_files=data_files, delimiter="\t")
train_dataset = dataset["train"]
eval_dataset = dataset["validation"]

print(train_dataset)
print(eval_dataset)

Using custom data configuration default-e76973a27f2b5779


Downloading and preparing dataset csv/default to /home/sam/.cache/huggingface/datasets/csv/default-e76973a27f2b5779/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /home/sam/.cache/huggingface/datasets/csv/default-e76973a27f2b5779/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Dataset({
    features: ['path', 'emotion', 'split', 'dataset', 'augmentation'],
    num_rows: 48316
})
Dataset({
    features: ['path', 'emotion', 'split', 'dataset', 'augmentation'],
    num_rows: 1152
})


In [10]:
# Names of the dataset columns used by the preprocessing step:
# `input_column` holds the audio file path, `output_column` the emotion label.
input_column = "path"
output_column = "emotion"

In [11]:
# Collect the distinct emotion labels of the training split. They are sorted so
# that the label -> integer id mapping is the same on every run.
label_list = sorted(train_dataset.unique(output_column))
num_labels = len(label_list)
print(f"A classification problem with {num_labels} classes: {label_list}")

A classification problem with 6 classes: ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad']


# Preprocess Data

In [12]:
# Pretrained checkpoint whose feature-extractor configuration is loaded in the next cell.
model_name_or_path = "facebook/hubert-base-ls960"
# Alternative checkpoints (disabled), with the original author's notes:
# model_name_or_path = 'facebook/hubert-large-ls960-ft'
# model_name_or_path = "facebook/hubert-xlarge-ll60k" # Needs too much memory
# model_name_or_path = "facebook/hubert-large-ll60k" # Need to try again

# NOTE(review): `pooling_mode` is not referenced by any cell visible here --
# presumably consumed by the training code; confirm before removing.
pooling_mode = "mean"

In [16]:
# Load the feature extractor that belongs to the chosen checkpoint and record
# its sampling rate; the audio loaders below reject files with a different rate.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path,)
target_sampling_rate = feature_extractor.sampling_rate
print(f"The target sampling rate: {target_sampling_rate}")

The target sampling rate: 16000


In [17]:
def speech_file_to_array_fn(path):
    """Load one audio file and return its samples.

    The file is read with ``sr=None`` so librosa reports the file's own
    sampling rate instead of resampling.

    Args:
        path: Location of the audio file.

    Returns:
        The sample array produced by ``librosa.load``.

    Raises:
        ValueError: If the file's sampling rate is not equal to the
            module-level ``target_sampling_rate``.
    """
    waveform, sampling_rate = librosa.load(path, sr=None)

    # Happy path first: the file already has the rate the extractor expects.
    if sampling_rate == target_sampling_rate:
        return waveform

    raise ValueError(f"Sampling rate mismatch between file and target sampling rate. {sampling_rate} != {target_sampling_rate}")

def label_to_id(label, label_list):
    """Map a label to its position in ``label_list``.

    Args:
        label: The label to look up.
        label_list: Ordered collection of known labels.

    Returns:
        The index of ``label`` in ``label_list``; ``-1`` if the label is not
        in a non-empty ``label_list``; the label itself, unchanged, when
        ``label_list`` is empty.
    """
    # No vocabulary available: pass the label through untouched.
    if len(label_list) == 0:
        return label

    # Unknown labels are signalled with -1 rather than an exception.
    if label not in label_list:
        return -1

    return label_list.index(label)

def preprocess_function(examples):
    """Turn a batch of dataset rows into feature-extractor output plus label ids.

    Args:
        examples: A batch indexable by column name; the ``input_column`` entry
            is iterated as audio file paths and the ``output_column`` entry as
            labels.

    Returns:
        Whatever ``feature_extractor`` returns for the loaded audio, with an
        additional ``"labels"`` entry holding one integer id per example
        (``-1`` for labels missing from ``label_list``).

    Raises:
        ValueError: Propagated from ``speech_file_to_array_fn`` when a file's
            sampling rate differs from ``target_sampling_rate``.
    """
    # Load every audio file of the batch.
    waveforms = []
    for audio_path in examples[input_column]:
        waveforms.append(speech_file_to_array_fn(audio_path))

    # Encode the string labels as positions in the sorted label list.
    label_ids = []
    for label_name in examples[output_column]:
        label_ids.append(label_to_id(label_name, label_list))

    encoded = feature_extractor(waveforms, sampling_rate=target_sampling_rate)
    encoded["labels"] = label_ids
    return encoded

In [18]:
# Both splits are preprocessed with identical settings: batches of 100 rows,
# spread over 4 worker processes.
map_kwargs = dict(batch_size=100, batched=True, num_proc=4)

train_dataset = train_dataset.map(preprocess_function, **map_kwargs)

eval_dataset = eval_dataset.map(preprocess_function, **map_kwargs)



        

#0:   0%|          | 0/121 [00:00<?, ?ba/s]

#1:   0%|          | 0/121 [00:00<?, ?ba/s]

#3:   0%|          | 0/121 [00:00<?, ?ba/s]

#2:   0%|          | 0/121 [00:00<?, ?ba/s]

2022-11-12 07:16:46.799159: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-12 07:16:46.842654: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-12 07:16:46.856697: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the ap

       

#0:   0%|          | 0/3 [00:00<?, ?ba/s]

#1:   0%|          | 0/3 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/3 [00:00<?, ?ba/s]

#3:   0%|          | 0/3 [00:00<?, ?ba/s]

2022-11-12 07:29:29.255567: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-12 07:29:30.260435: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-11-12 07:29:30.260456: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-11-12 07:29:30.286591: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-12 07:29:30.692928: I tensorflow/core/platform/cpu_feature_g

In [19]:
# Persist the preprocessed train and validation splits under the directory
# named after the dataset tag `ds`.
train_dataset.save_to_disk(f'../Datasets/hf_datasets/{ds}/base/train')
eval_dataset.save_to_disk(f'../Datasets/hf_datasets/{ds}/base/val')

In [20]:
# Read the test split from the directory the split cell wrote it to
# (`save_path`) instead of a hard-coded "./test.csv", so that changing
# `save_path` keeps writer and reader in sync. The file is tab-separated.
test_dataset = load_dataset(
    "csv",
    data_files={"test": os.path.join(save_path, "test.csv")},
    delimiter="\t",
)["test"]
test_dataset

Using custom data configuration default-64db17d769ad2788


Downloading and preparing dataset csv/default to /home/sam/.cache/huggingface/datasets/csv/default-64db17d769ad2788/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /home/sam/.cache/huggingface/datasets/csv/default-64db17d769ad2788/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['path', 'emotion', 'split', 'dataset', 'augmentation'],
    num_rows: 2302
})

In [21]:
# model_name_or_path = "m3hrdadfi/hubert-base-greek-speech-emotion-recognition"
# config = AutoConfig.from_pretrained(model_name_or_path)
# feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
# model = HubertForSpeechClassification.from_pretrained(model_name_or_path).to(device)

In [22]:
def _load_speech_array(path):
    """Read one audio file at its own sampling rate; reject it unless the rate equals ``target_sampling_rate``."""
    speech_array, sampling_rate = librosa.load(path, sr=None)
    if sampling_rate != target_sampling_rate:
        raise ValueError(f"Sampling rate mismatch between file and target sampling rate. {sampling_rate} != {target_sampling_rate}")
    return speech_array


def speech_file_to_array_fn(batch):
    """Attach the loaded audio to a dataset example.

    This definition replaces an earlier function of the same name that took a
    file path and returned the sample array; ``preprocess_function`` still
    calls it that way. To keep that caller working after this cell has run
    (e.g. when the preprocessing cell is re-executed), a plain ``str`` argument
    is treated as a path and handled exactly like the earlier function.

    Args:
        batch: Either a single example indexable by ``"path"`` (the form used
            with ``Dataset.map``), or a ``str`` file path.

    Returns:
        For an example: the same object with a new ``"speech"`` entry holding
        the loaded samples. For a ``str`` path: the loaded sample array.

    Raises:
        ValueError: If the file's sampling rate differs from
            ``target_sampling_rate``.
    """
    # Path-style call, as issued by preprocess_function.
    if isinstance(batch, str):
        return _load_speech_array(batch)

    batch["speech"] = _load_speech_array(batch["path"])
    return batch


# def predict(batch):
#     features = feature_extractor(batch["speech"], sampling_rate=feature_extractor.sampling_rate, return_tensors="pt", padding=True)

#     input_values = features.input_values.to(device)

#     with torch.no_grad():
#         logits = model(input_values).logits 

#     pred_ids = torch.argmax(logits, dim=-1).detach().cpu().numpy()
#     batch["predicted"] = pred_ids
#     return batch

In [23]:
# Load the audio of every test example into a `speech` column, one example at
# a time (no batching; the feature extractor is not applied here).
test_dataset = test_dataset.map(speech_file_to_array_fn)

  0%|          | 0/2302 [00:00<?, ?ex/s]

In [24]:
# Persist the test split next to the train/val splits of the same dataset tag.
test_dataset.save_to_disk(f'../Datasets/hf_datasets/{ds}/base/test')