In [1]:
import os

paths = []
labels = []
counter = 0

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        if counter < 4:
            counter += 1
            continue
        paths.append(os.path.join(dirname, filename))
        label = filename.split('-')[4]
        labels.append(label)

print('Dataset is Loaded')

Dataset is Loaded


In [2]:
paths[:5]

['/kaggle/input/basic-arabic-vocal-emotions-dataset/remake/remake/2/51-m-20-2-2-767.wav',
 '/kaggle/input/basic-arabic-vocal-emotions-dataset/remake/remake/2/56-f-40-2-0-627.wav',
 '/kaggle/input/basic-arabic-vocal-emotions-dataset/remake/remake/2/56-f-40-2-1-639.wav',
 '/kaggle/input/basic-arabic-vocal-emotions-dataset/remake/remake/2/55-m-16-2-2-809.wav',
 '/kaggle/input/basic-arabic-vocal-emotions-dataset/remake/remake/2/4-m-20-2-0-699.wav']

In [3]:
labels[:5]

['2', '0', '1', '2', '0']

In [4]:
import pandas as pd
## Create a dataframe
df = pd.DataFrame()
df['speech'] = paths
df['label'] = labels
df.head()

Unnamed: 0,speech,label
0,/kaggle/input/basic-arabic-vocal-emotions-data...,2
1,/kaggle/input/basic-arabic-vocal-emotions-data...,0
2,/kaggle/input/basic-arabic-vocal-emotions-data...,1
3,/kaggle/input/basic-arabic-vocal-emotions-data...,2
4,/kaggle/input/basic-arabic-vocal-emotions-data...,0


In [5]:
label2id = {'tired/exhausted': 0, 'neutral': 1, 'positive/negative': 2}
id2label = {0: 'tired/exhausted', 1: 'neutral', 2: 'positive/negative'}

# Replace 'label' column with integer values
df['label'] = df['label'].replace({'0': 0, '1': 1, '2': 2})
# Create 'emotion' column based on 'labels' column
df['emotion'] = df['label'].map(id2label)

df.head()

Unnamed: 0,speech,label,emotion
0,/kaggle/input/basic-arabic-vocal-emotions-data...,2,positive/negative
1,/kaggle/input/basic-arabic-vocal-emotions-data...,0,tired/exhausted
2,/kaggle/input/basic-arabic-vocal-emotions-data...,1,neutral
3,/kaggle/input/basic-arabic-vocal-emotions-data...,2,positive/negative
4,/kaggle/input/basic-arabic-vocal-emotions-data...,0,tired/exhausted


In [6]:
print(df["label"].dtype)

int64


In [7]:
df.shape

(3870, 3)

In [8]:
from sklearn.model_selection import train_test_split

# We need to specify the input and output column
input_column = "speech"
output_column = "label"

train_dataframe, eval_dataframe = train_test_split(df, test_size=0.2, random_state=101, stratify=df["label"])

train_dataframe = train_dataframe.reset_index(drop=True)
eval_dataframe = eval_dataframe.reset_index(drop=True)

from datasets import Dataset

# Convert the dataframe to a Dataset object
train_dataset = Dataset.from_pandas(train_dataframe)
eval_dataset = Dataset.from_pandas(eval_dataframe)

In [9]:
print(train_dataset)
print(eval_dataset)

Dataset({
    features: ['speech', 'label', 'emotion'],
    num_rows: 3096
})
Dataset({
    features: ['speech', 'label', 'emotion'],
    num_rows: 774
})


wav2vec2 Model

In [10]:
from transformers import AutoConfig, Wav2Vec2Processor

In [11]:
import warnings

warnings.filterwarnings("ignore")

In [12]:
model_name_or_path = "elgeish/wav2vec2-large-xlsr-53-arabic"
pooling_mode = "mean"
num_labels = 3

In [13]:
# config
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
    finetuning_task="wav2vec2_clf",
)
setattr(config, 'pooling_mode', pooling_mode)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

In [14]:
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path,)
target_sampling_rate = processor.feature_extractor.sampling_rate
print(f"The target sampling rate: {target_sampling_rate}")

Downloading (…)rocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/303 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

The target sampling rate: 16000


In [15]:
import librosa
# Define the speech_file_to_array_fn function
def speech_file_to_array_fn(path):
    signal, sr = librosa.load(path, sr=target_sampling_rate)
    return signal

# Define the preprocess_function function
def preprocess_function(examples):
    speech_list = [speech_file_to_array_fn(path) for path in examples[input_column]]
    target_list = [label for label in examples[output_column]]
    result = processor(speech_list, sampling_rate=target_sampling_rate, return_attention_mask=True)
    return result

In [16]:
train_dataset = train_dataset.map(
    preprocess_function,
    batch_size=64,
    batched=True,
    num_proc=4
);

eval_dataset = eval_dataset.map(
    preprocess_function,
    batch_size=64,
    batched=True,
    num_proc=4
);

        

#1:   0%|          | 0/13 [00:00<?, ?ba/s]

#0:   0%|          | 0/13 [00:00<?, ?ba/s]

#2:   0%|          | 0/13 [00:00<?, ?ba/s]

#3:   0%|          | 0/13 [00:00<?, ?ba/s]

       

#0:   0%|          | 0/4 [00:00<?, ?ba/s]

#1:   0%|          | 0/4 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/4 [00:00<?, ?ba/s]

#3:   0%|          | 0/4 [00:00<?, ?ba/s]

In [17]:
train_dataset

Dataset({
    features: ['speech', 'label', 'emotion', 'input_values', 'attention_mask'],
    num_rows: 3096
})

Building classifier

In [18]:
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
from transformers.file_utils import ModelOutput


@dataclass
class SpeechClassifierOutput(ModelOutput):
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None

In [19]:
import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2PreTrainedModel,
    Wav2Vec2Model
)


class Wav2Vec2ClassificationHead(nn.Module):
    """Head for wav2vec classification task."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.pooling_mode = config.pooling_mode
        self.config = config

        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = Wav2Vec2ClassificationHead(config)

        self.init_weights()

    def freeze_feature_extractor(self):
        self.wav2vec2.feature_extractor._freeze_parameters()

    def merged_strategy(
            self,
            hidden_states,
            mode="mean"
    ):
        if mode == "mean":
            outputs = torch.mean(hidden_states, dim=1)
        elif mode == "sum":
            outputs = torch.sum(hidden_states, dim=1)
        elif mode == "max":
            outputs = torch.max(hidden_states, dim=1)[0]
        else:
            raise Exception(
                "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")

        return outputs

    def forward(
            self,
            input_values,
            attention_mask=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
            labels=None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.wav2vec2(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = outputs[0]
        hidden_states = self.merged_strategy(hidden_states, mode=self.pooling_mode)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SpeechClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

Padding strategy 

In [20]:
from dataclasses import dataclass
from typing import Dict, List, Optional, Union
import torch

import transformers
from transformers import Wav2Vec2Processor


@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [feature["label"] for feature in features]

        d_type = torch.long if isinstance(label_features[0], int) else torch.float

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        batch["labels"] = torch.tensor(label_features, dtype=d_type)

        return batch

In [21]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

Evaluation metrics 

In [22]:
is_regression = False

In [23]:
import numpy as np
from transformers import EvalPrediction


def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)

    if is_regression:
        return {"mse": ((preds - p.label_ids) ** 2).mean().item()}
    else:
        return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}

Building Model

In [24]:
model = Wav2Vec2ForSpeechClassification.from_pretrained(
    model_name_or_path,
    config=config,
)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Some weights of the model checkpoint at elgeish/wav2vec2-large-xlsr-53-arabic were not used when initializing Wav2Vec2ForSpeechClassification: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing Wav2Vec2ForSpeechClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSpeechClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSpeechClassification were not initialized from the model checkpoint at elgeish/wav2vec2-large-xlsr-53-arabic and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stre

In [25]:
model

Wav2Vec2ForSpeechClassification(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (2): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (3): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), 

In [26]:
model.freeze_feature_extractor()

Building Trainer

In [27]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="/content/wav2vec2-xlsr-arabic-speech-emotion-recognition",
    # output_dir="/content/gdrive/MyDrive/wav2vec2-xlsr-greek-speech-emotion-recognition"
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    evaluation_strategy="epoch",
    num_train_epochs=6.0,
    learning_rate=1e-4,
    fp16 = True
)

In [28]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor.feature_extractor,
)

Using cuda_amp half precision backend


In [29]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: emotion, speech. If emotion, speech are not expected by `Wav2Vec2ForSpeechClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3096
  Num Epochs = 6
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 1158
  Number of trainable parameters = 312281219
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy
0,No log,0.284101,0.919897
1,No log,0.332221,0.908269
2,0.382500,0.203857,0.944444
3,0.382500,0.173628,0.959948
4,0.382500,0.171092,0.963824
5,0.113800,0.151354,0.963824


The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: emotion, speech. If emotion, speech are not expected by `Wav2Vec2ForSpeechClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 774
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: emotion, speech. If emotion, speech are not expected by `Wav2Vec2ForSpeechClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 774
  Batch size = 8
Saving model checkpoint to /content/wav2vec2-xlsr-arabic-speech-emotion-recognition/checkpoint-500
Configuration saved in /content/wav2vec2-xlsr-arabic-speech-emotion-recognition/checkpoint-500/config.json
Model weights saved in /content/wav2vec2-xlsr-arabic-speech-emotion-recognition/checkpoi

TrainOutput(global_step=1158, training_loss=0.22453994948629272, metrics={'train_runtime': 1879.2024, 'train_samples_per_second': 9.885, 'train_steps_per_second': 0.616, 'total_flos': 1.377238465255247e+18, 'train_loss': 0.22453994948629272, 'epoch': 6.0})