In [1]:
import ast
import pickle

from datasets import load_dataset, load_metric
from transformers import AutoConfig, Wav2Vec2Processor

In [2]:
data_files = {
    "train": "../../KEMDy20_v1_1/Splitting/Train_Prob.csv",
    "test": "../../KEMDy20_v1_1/Splitting/Test_Prob.csv"
}

In [3]:
dataset = load_dataset("csv", data_files = data_files)
train_dataset = dataset["train"]
test_dataset = dataset["test"]

print(train_dataset)
print(test_dataset)

Found cached dataset csv (C:/Users/Yechani/.cache/huggingface/datasets/csv/default-937532780f5a0cf9/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/2 [00:00<?, ?it/s]

Dataset({
    features: ['Segment ID', 'path', 'emotion'],
    num_rows: 10897
})
Dataset({
    features: ['Segment ID', 'path', 'emotion'],
    num_rows: 2565
})


In [4]:
import torchaudio
import librosa
import IPython.display as ipd
import numpy as np

idx = np.random.randint(0, len(train_dataset))
sample = train_dataset[idx]
path = sample["path"]
label = sample["emotion"]


print(f"ID Location: {idx}")
print(f"      Label: {label}")
print()

speech, sr = torchaudio.load(path)
speech = speech[0].numpy().squeeze()
speech = librosa.resample(y=np.asarray(speech), orig_sr=sr, target_sr=16000)
ipd.Audio(data=np.asarray(speech), autoplay=True, rate=16000)

ID Location: 8053
      Label: [0.0, 0.0, 0.0, 0.4, 0.6, 0.0, 0.0]



In [5]:
input_column = "path"
output_column = "emotion"

In [6]:
model_name = "jonatasgrosman/wav2vec2-large-xlsr-53-japanese"
pooling_mode = "mean"

In [7]:
with open("../../KEMDy20_v1_1/Splitting/Label.txt", "rb") as f:
    label_list = pickle.load(f)

In [8]:
num_labels = len(label_list)

In [9]:
config = AutoConfig.from_pretrained(
    model_name,
    num_labels = num_labels,
    label2id={label: i for i, label in enumerate(label_list)},
    id2label={i: label for i, label in enumerate(label_list)},
    finetuning_task="wav2vec2_clf"
)

setattr(config, "pooling_mode", pooling_mode)



In [10]:
processor = Wav2Vec2Processor.from_pretrained(model_name,)
target_sampling_rate = processor.feature_extractor.sampling_rate
print(f"The target sampling rate: {target_sampling_rate}")

The target sampling rate: 16000


In [11]:
def speech_file_to_array_fn(path):
    speech_array, sampling_rate = torchaudio.load(path)
    resampler = torchaudio.transforms.Resample(sampling_rate, target_sampling_rate)
    speech = resampler(speech_array).squeeze().numpy()
    return speech

def preprocess_function(examples):
    speech_list = [speech_file_to_array_fn(path) for path in examples[input_column]]
    target_list = [ast.literal_eval(label) for label in examples[output_column]]

    result = processor(speech_list, sampling_rate=target_sampling_rate)
    result["labels"] = target_list

    return result

In [12]:
train_dataset = train_dataset.map(
    preprocess_function,
    batch_size=100,
    batched=True
)

test_dataset = test_dataset.map(
    preprocess_function,
    batch_size=100,
    batched=True
)

Loading cached processed dataset at C:\Users\Yechani\.cache\huggingface\datasets\csv\default-937532780f5a0cf9\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-e3868190237e2eb3.arrow
Loading cached processed dataset at C:\Users\Yechani\.cache\huggingface\datasets\csv\default-937532780f5a0cf9\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-7727a0ed096d6943.arrow


In [13]:
idx = 0
print(f"Training input_values: {train_dataset[idx]['input_values']}")
print(f"Training attention_mask: {train_dataset[idx]['attention_mask']}")
print(f"Training labels: {train_dataset[idx]['labels']} - {train_dataset[idx]['emotion']}")

Training input_values: [-0.0752476155757904, -0.058158989995718, -0.05531088262796402, -0.05673493444919586, -0.04676656797528267, -0.04961467534303665, -0.036798201501369476, -0.04534251615405083, -0.0410703606903553, -0.03252604603767395, -0.015437417663633823, -0.006893103010952473, -0.021133627742528915, -0.009741207584738731, -0.01686147041618824, -0.009741207584738731, -0.001196893397718668, -0.021133627742528915, -0.018285522237420082, -0.011165260337293148, -0.01686147041618824, -0.019709574058651924, -0.02540578320622444, 0.0030752636957913637, 0.005923368502408266, 0.00022715893283020705, 0.005923368502408266, 0.02301199547946453, 0.024436049163341522, 0.017315786331892014, 0.04010062292218208, 0.041524674743413925, 0.03582846745848656, 0.03155630826950073, 0.027284152805805206, 0.02301199547946453, 0.027284152805805206, 0.018739838153123856, 0.03298036381602287, 0.027284152805805206, 0.02158794365823269, -0.001196893397718668, 0.0016512112924829125, 0.00022715893283020705, -

In [14]:
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
from transformers.file_utils import ModelOutput

@dataclass
class SpeechClassifierOutput(ModelOutput):
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None

In [15]:
import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2PreTrainedModel,
    Wav2Vec2Model
)

class Wav2Vec2ClassificationHead(nn.Module):
    """Head for wav2vec classification task."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.pooling_mode = config.pooling_mode
        self.config = config

        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = Wav2Vec2ClassificationHead(config)

        self.init_weights()

    def freeze_feature_extractor(self):
        self.wav2vec2.feature_extractor._freeze_parameters()

    def merged_strategy(
            self,
            hidden_states,
            mode="mean"
    ):
        if mode == "mean":
            outputs = torch.mean(hidden_states, dim=1)
        elif mode == "sum":
            outputs = torch.sum(hidden_states, dim=1)
        elif mode == "max":
            outputs = torch.max(hidden_states, dim=1)[0]
        else:
            raise Exception(
                "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")

        return outputs

    def forward(
            self,
            input_values,
            attention_mask=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
            labels=None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.wav2vec2(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = outputs[0]
        hidden_states = self.merged_strategy(hidden_states, mode=self.pooling_mode)
        logits = self.classifier(hidden_states)
        logits = torch.softmax(logits, dim=-1)
        
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.num_labels), labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SpeechClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [16]:
from dataclasses import dataclass
from typing import Dict, List, Optional, Union
import torch

import transformers
from transformers import Wav2Vec2Processor


@dataclass
class DataCollatorCTCWithPadding:

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [feature["labels"] for feature in features]

        d_type = torch.long if isinstance(label_features[0], int) else torch.float

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        batch["labels"] = torch.tensor(label_features, dtype=d_type)

        return batch

In [17]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [18]:
import numpy as np
from transformers import EvalPrediction
from sklearn.metrics import precision_recall_fscore_support

def compute_metrics(p: EvalPrediction):
    y_pred = p.predictions.astype(np.float32)
    y_true = p.label_ids.astype(np.float32)
    
    precision, recall, f1_score, support = precision_recall_fscore_support(y_true, y_pred, average="weighted")
    
    return {"micro_f1": f1_score}

In [19]:
model = Wav2Vec2ForSpeechClassification.from_pretrained(
    model_name,
    config=config,
)

Some weights of the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-japanese were not used when initializing Wav2Vec2ForSpeechClassification: ['lm_head.weight', 'lm_head.bias']
- This IS expected if you are initializing Wav2Vec2ForSpeechClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSpeechClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSpeechClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-japanese and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this mo

In [20]:
model.freeze_feature_extractor()

In [22]:
import os
from transformers import TrainingArguments

output_path = "../../KEMDy20_v1_1/content/"
if not os.path.exists(output_path):
    os.mkdir(output_path)

training_args = TrainingArguments(
    output_dir=output_path,
    logging_dir="../../KEMDy20_v1_1/log/",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=64,
    evaluation_strategy="epoch",
    num_train_epochs=10,
    fp16=True,
    logging_steps = 50,
    save_steps= 50,
    save_total_limit=2
)

In [23]:
from typing import Any, Dict, Union

import torch
from packaging import version
from torch import nn

from transformers import (
    Trainer,
    is_apex_available,
)

if is_apex_available():
    from apex import amp

if version.parse(torch.__version__) >= version.parse("1.6"):
    _is_native_amp_available = True
    from torch.cuda.amp import autocast

class CTCTrainer(Trainer):
    def __init__(
        self,
        use_amp: Optional[bool] = None,
        *args, **kwargs
    ):
        super().__init__(*args, **kwargs)
        self.use_amp = self.args.fp16 if use_amp is None else use_amp

    def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:

        model.train()
        inputs = self._prepare_inputs(inputs)

        if self.use_amp:
            with autocast():
                loss = self.compute_loss(model, inputs)

        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps

        if self.use_amp:
            self.scaler.scale(loss).backward()

        return loss.detach()

In [24]:
trainer = CTCTrainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=processor.feature_extractor,
    use_amp=True
)

In [25]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss
