In [1]:
import ast
import pickle
import torchaudio
import librosa
import IPython.display as ipd
import numpy as np

from datasets import load_dataset, load_metric
from transformers import AutoConfig, Wav2Vec2Processor

In [2]:
data_files = {
    "train": "../../KEMDy20_v1_1/Splitting/Train.csv",
    "test": "../../KEMDy20_v1_1/Splitting/Test.csv"
}

In [3]:
dataset = load_dataset("csv", data_files = data_files)
train_valid_dataset = dataset["train"].train_test_split(test_size=0.1)
train_dataset = train_valid_dataset["train"]
valid_dataset = train_valid_dataset["test"]
test_dataset = dataset["test"]

print(train_dataset)
print(valid_dataset)
print(test_dataset)

Found cached dataset csv (C:/Users/Yechani/.cache/huggingface/datasets/csv/default-1ac06c035d845d89/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/2 [00:00<?, ?it/s]

Dataset({
    features: ['Emotion', 'Path'],
    num_rows: 23301
})
Dataset({
    features: ['Emotion', 'Path'],
    num_rows: 2589
})
Dataset({
    features: ['Emotion', 'Path'],
    num_rows: 6312
})


In [4]:
input_column = "Path"
output_column = "Emotion"

In [5]:
# we need to distinguish the unique labels in our SER dataset
label_list = train_dataset.unique(output_column)
label_list.sort()  # Let's sort it for determinism
num_labels = len(label_list)
print(f"A classification problem with {num_labels} classes: {label_list}")

Flattening the indices:   0%|          | 0/23301 [00:00<?, ? examples/s]

A classification problem with 7 classes: ['angry', 'disqust', 'fear', 'happy', 'neutral', 'sad', 'surprise']


In [6]:
#model_name = "jungjongho/wav2vec2-xlsr-korean-speech-emotion-recognition3"
model_name_or_path = "kresnik/wav2vec2-large-xlsr-korean"
pooling_mode = "mean"

In [7]:
# config
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    label2id={label: i for i, label in enumerate(label_list)},
    id2label={i: label for i, label in enumerate(label_list)},
    finetuning_task="wav2vec2_clf",
)
setattr(config, 'pooling_mode', pooling_mode)

In [8]:
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path,)
target_sampling_rate = processor.feature_extractor.sampling_rate
print(f"The target sampling rate: {target_sampling_rate}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


The target sampling rate: 16000


In [32]:
def speech_file_to_array_fn(path):
    speech_array, sampling_rate = torchaudio.load(path)
    resampler = torchaudio.transforms.Resample(sampling_rate, target_sampling_rate)
    speech = resampler(speech_array).squeeze().numpy()[::-1]
    return speech

def label_to_id(label, label_list):

    if len(label_list) > 0:
        return label_list.index(label) if label in label_list else -1

    return label

def preprocess_function(examples):
    speech_list = [speech_file_to_array_fn(path) for path in examples[input_column]]
    target_list = [label_to_id(label, label_list) for label in examples[output_column]]

    result = processor(speech_list, sampling_rate=target_sampling_rate)
    result["labels"] = list(target_list)

    return result

In [34]:
train_dataset = train_dataset.map(
    preprocess_function,
    batch_size=100,
    batched=True
)

valid_dataset = valid_dataset.map(
    preprocess_function,
    batch_size=100,
    batched=True
)

test_dataset = test_dataset.map(
    preprocess_function,
    batch_size=100,
    batched=True
)

Map:   0%|          | 0/23301 [00:00<?, ? examples/s]

{'input_values': [array([0.51769584, 0.5156637 , 0.52277625, ..., 0.9271746 , 0.91802996,
       0.92412645], dtype=float32), array([0.01465948, 0.02883624, 0.00432533, ..., 0.00959696, 0.00959696,
       0.00959696], dtype=float32), array([-0.24285209, -0.24022847, -0.24547572, ..., -0.8567794 ,
       -0.86071485, -0.8817038 ], dtype=float32), array([-0.3159414 , -0.36037534, -0.31190196, ...,  0.4677118 ,
        0.52830356,  0.4838696 ], dtype=float32), array([ 0.0035406 , -0.0006901 ,  0.00550707, ...,  0.00835217,
        0.00835217,  0.00835217], dtype=float32), array([0.11408135, 0.09368349, 0.09671021, ..., 0.0119238 , 0.0119238 ,
       0.0119238 ], dtype=float32), array([-0.00330544, -0.00229159, -0.05217878, ...,  0.00409706,
        0.00409706,  0.00409706], dtype=float32), array([-0.07691821, -0.084099  , -0.09127979, ..., -0.3300411 ,
       -0.3354267 , -0.3408123 ], dtype=float32), array([0.1758685 , 0.16997437, 0.16994512, ..., 0.00813515, 0.00813515,
       0.0081351

{'input_values': [array([0.25988823, 0.24320638, 0.24320638, ..., 0.01607661, 0.00837729,
       0.00581085], dtype=float32), array([-0.04228264, -0.02734047, -0.01475935, ...,  0.00893792,
        0.00893792,  0.00893792], dtype=float32), array([ 3.1612590e-01,  2.7015790e-01,  3.0121893e-01, ...,
       -6.9177302e-05, -6.9177302e-05, -6.9177302e-05], dtype=float32), array([0.08710042, 0.12526874, 0.08233121, ..., 0.01385036, 0.01385036,
       0.01385036], dtype=float32), array([-0.09417423, -0.08778615, -0.09843295, ..., -0.00261168,
        0.01442321,  0.01761725], dtype=float32), array([-0.00294533, -0.01463289, -0.02982241, ..., -0.00028789,
       -0.00028789, -0.00028789], dtype=float32), array([ 0.2216569 ,  0.22545166,  0.21786214, ..., -0.33237806,
       -0.29822522, -0.2526881 ], dtype=float32), array([8.8067343e-03, 8.4282085e-03, 1.0089314e-02, ..., 3.1134048e-05,
       3.1134048e-05, 3.1134048e-05], dtype=float32), array([ 1.6484423e-01,  2.8221494e-01,  2.5966647e-0

{'input_values': [array([-0.21279766, -0.22454466, -0.19635187, ...,  0.64003414,
        0.62828714,  0.63298595], dtype=float32), array([ 0.02132689, -0.01401411,  0.00327089, ...,  0.05209477,
        0.05209477,  0.05209477], dtype=float32), array([-0.22936429, -0.22593178, -0.23005079, ...,  0.04935558,
        0.0575936 ,  0.06926414], dtype=float32), array([ 0.00695042,  0.00398294, -0.01283278, ...,  0.06531086,
        0.04898972,  0.06630001], dtype=float32), array([-0.21590956, -0.20690991, -0.2107669 , ..., -0.02884542,
       -0.02755976, -0.03013108], dtype=float32), array([ 1.3173641e-02,  2.1290559e-02,  2.2120321e-02, ...,
       -6.9208632e-05, -6.9208632e-05, -6.9208632e-05], dtype=float32), array([-6.0906755e-03, -5.2137244e-03, -2.1345185e-03, ...,
       -1.3543960e-05, -1.3543960e-05, -1.3543960e-05], dtype=float32), array([ 0.00683861,  0.0050679 ,  0.01167454, ..., -0.00011322,
       -0.00011322, -0.00011322], dtype=float32), array([ 0.00297631,  0.0037397 ,  

{'input_values': [array([-2.7533149e-04, -4.8781661e-03, -2.4873964e-03, ...,
        1.3629973e-05,  1.3629973e-05,  1.3629973e-05], dtype=float32), array([-1.8127989e-03, -2.1275938e-03, -3.3239943e-03, ...,
       -1.1817620e-05, -1.1817620e-05, -1.1817620e-05], dtype=float32), array([ 0.00174121,  0.00045433, -0.00054941, ...,  0.00012631,
        0.00012631,  0.00012631], dtype=float32), array([0.1554816 , 0.16160037, 0.15140243, ..., 0.8244667 , 0.842823  ,
       0.8265063 ], dtype=float32), array([ 9.2136683e-03, -6.4590480e-03, -3.3389992e-06, ...,
        2.3599638e-05,  2.3599638e-05,  2.3599638e-05], dtype=float32), array([ 0.5396157 ,  0.57690674,  0.5636722 , ..., -0.00130983,
       -0.00130983, -0.00130983], dtype=float32), array([-4.2237461e-01, -3.9321974e-01, -4.1060838e-01, ...,
        1.9400162e-04,  1.9400162e-04,  1.9400162e-04], dtype=float32), array([-0.02621468, -0.02751873, -0.02308119, ..., -0.00010184,
       -0.00010184, -0.00010184], dtype=float32), arra

KeyboardInterrupt: 

In [16]:
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
from transformers.file_utils import ModelOutput

@dataclass
class SpeechClassifierOutput(ModelOutput):
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None

In [17]:
import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2PreTrainedModel,
    Wav2Vec2Model
)


class Wav2Vec2ClassificationHead(nn.Module):
    """Head for wav2vec classification task."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.pooling_mode = config.pooling_mode
        self.config = config

        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = Wav2Vec2ClassificationHead(config)

        self.init_weights()

    def freeze_feature_extractor(self):
        self.wav2vec2.feature_extractor._freeze_parameters()

    def merged_strategy(
            self,
            hidden_states,
            mode="mean"
    ):
        if mode == "mean":
            outputs = torch.mean(hidden_states, dim=1)
        elif mode == "sum":
            outputs = torch.sum(hidden_states, dim=1)
        elif mode == "max":
            outputs = torch.max(hidden_states, dim=1)[0]
        else:
            raise Exception(
                "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")

        return outputs

    def forward(
            self,
            input_values,
            attention_mask=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
            labels=None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.wav2vec2(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = outputs[0]
        hidden_states = self.merged_strategy(hidden_states, mode=self.pooling_mode)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SpeechClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [18]:
from dataclasses import dataclass
from typing import Dict, List, Optional, Union
import torch

import transformers
from transformers import Wav2Vec2Processor


@dataclass
class DataCollatorCTCWithPadding:

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [feature["labels"] for feature in features]

        d_type = torch.long if isinstance(label_features[0], int) else torch.float

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        batch["labels"] = torch.tensor(label_features, dtype=d_type)

        return batch

In [19]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [20]:
is_regression = False

In [21]:
import numpy as np
from transformers import EvalPrediction

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)

    if is_regression:
        return {"mse": ((preds - p.label_ids) ** 2).mean().item()}
    else:
        return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}

In [22]:
model = Wav2Vec2ForSpeechClassification.from_pretrained(
    model_name_or_path,
    config=config,
)

Some weights of the model checkpoint at kresnik/wav2vec2-large-xlsr-korean were not used when initializing Wav2Vec2ForSpeechClassification: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing Wav2Vec2ForSpeechClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSpeechClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSpeechClassification were not initialized from the model checkpoint at kresnik/wav2vec2-large-xlsr-korean and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream tas

In [23]:
model.freeze_feature_extractor()

In [24]:
import os
from transformers import TrainingArguments

output_path = "../../KEMDy20_v1_1/content"
if not os.path.exists(output_path):
    os.mkdir(output_path)

training_args = TrainingArguments(
    output_dir=output_path,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=16,
    gradient_checkpointing= True,
    evaluation_strategy="steps",
    num_train_epochs=1.0,
    fp16=True,
    save_steps=200,
    eval_steps=200,
    logging_steps=200,
    learning_rate=2e-4,
    save_total_limit=2
)

In [25]:
from typing import Any, Dict, Union

import torch
from packaging import version
from torch import nn

from transformers import (
    Trainer,
    is_apex_available,
)

if is_apex_available():
    from apex import amp

if version.parse(torch.__version__) >= version.parse("1.6"):
    _is_native_amp_available = True
    from torch.cuda.amp import autocast

class CTCTrainer(Trainer):
    def __init__(
        self,
        use_amp: Optional[bool] = None,
        *args, **kwargs
    ):
        super().__init__(*args, **kwargs)
        self.use_amp = self.args.fp16 if use_amp is None else use_amp

    def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:

        model.train()
        inputs = self._prepare_inputs(inputs)

        if self.use_amp:
            with autocast():
                loss = self.compute_loss(model, inputs)
        else:
            loss = self.compute_loss(model, inputs)

        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps

        if self.use_amp:
            self.scaler.scale(loss).backward()
        else:
            loss.backward()

        return loss.detach()

In [27]:
trainer = CTCTrainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=processor.feature_extractor
)

In [28]:
import gc
gc.collect()
torch.cuda.empty_cache() 

In [29]:
trainer.train()



Step,Training Loss,Validation Loss,Accuracy
200,1.0795,0.759974,0.766705
400,0.7843,0.639639,0.801854
600,0.6879,0.608169,0.812283
800,0.6216,0.559118,0.828505
1000,0.5504,0.485726,0.848204
1200,0.5129,0.460861,0.861723
1400,0.4886,0.435372,0.869448


TrainOutput(global_step=1456, training_loss=0.6672310265865955, metrics={'train_runtime': 10445.5386, 'train_samples_per_second': 2.231, 'train_steps_per_second': 0.139, 'total_flos': 4.07648732942101e+18, 'train_loss': 0.6672310265865955, 'epoch': 1.0})

In [31]:
# 학습한 모델 저장
model_dir = "../../KEMDy20_v1_1/Pretrained_Model2/"
if not os.path.exists(model_dir):
    os.mkdir(model_dir)

trainer.save_model(model_dir)