In [1]:
# audio_dataset.ipynb 에서 만든 데이터셋 불러오기

import datasets

minds = datasets.load_from_disk("/mnt/c/Users/user/angry_level_classification/data/dataset_01")

In [2]:
minds

Dataset({
    features: ['audio', 'intent_class'],
    num_rows: 5501
})

In [3]:
minds['intent_class'][:2]

[1, 1]

In [4]:
label2id = {'angry':'1','neutral':'0'}
id2label = {'1':'angry','0':'neutral'}
num_labels = len(id2label)

In [5]:
minds = minds.train_test_split(test_size=0.2)
minds

DatasetDict({
    train: Dataset({
        features: ['audio', 'intent_class'],
        num_rows: 4400
    })
    test: Dataset({
        features: ['audio', 'intent_class'],
        num_rows: 1101
    })
})

In [6]:
minds["train"][0]

{'audio': {'array': [-0.00341796875,
   -0.003570556640625,
   -0.0035400390625,
   -0.00408935546875,
   -0.004486083984375,
   -0.0045166015625,
   -0.00457763671875,
   -0.004669189453125,
   -0.004791259765625,
   -0.0054931640625,
   -0.00531005859375,
   -0.005157470703125,
   -0.005035400390625,
   -0.0052490234375,
   -0.00506591796875,
   -0.005462646484375,
   -0.00543212890625,
   -0.00628662109375,
   -0.006011962890625,
   -0.00628662109375,
   -0.00640869140625,
   -0.00628662109375,
   -0.00640869140625,
   -0.00634765625,
   -0.00653076171875,
   -0.006134033203125,
   -0.0067138671875,
   -0.006805419921875,
   -0.0068359375,
   -0.006561279296875,
   -0.006561279296875,
   -0.0064697265625,
   -0.0067138671875,
   -0.006378173828125,
   -0.006622314453125,
   -0.006500244140625,
   -0.006378173828125,
   -0.006134033203125,
   -0.006103515625,
   -0.005615234375,
   -0.0054931640625,
   -0.00482177734375,
   -0.004669189453125,
   -0.004608154296875,
   -0.00381469726

In [7]:
id2label[str(0)]

'neutral'

In [8]:
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")



In [9]:
from datasets import Audio

minds = minds.cast_column("audio", Audio(sampling_rate=16_000))

In [10]:
minds

DatasetDict({
    train: Dataset({
        features: ['audio', 'intent_class'],
        num_rows: 4400
    })
    test: Dataset({
        features: ['audio', 'intent_class'],
        num_rows: 1101
    })
})

In [11]:
minds["train"][0]

{'audio': {'path': None,
  'array': array([-0.00341797, -0.00357056, -0.00354004, ..., -0.00021362,
         -0.00021362, -0.00021362]),
  'sampling_rate': 16000},
 'intent_class': 1}

In [12]:
def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = feature_extractor(
        audio_arrays, sampling_rate=feature_extractor.sampling_rate, max_length=16000, truncation=True
    )
    return inputs

In [13]:
encoded_minds = minds.map(preprocess_function, remove_columns="audio", batched=True)
encoded_minds = encoded_minds.rename_column("intent_class", "label")

Map:   0%|          | 0/4400 [00:00<?, ? examples/s]

Map:   0%|          | 0/1101 [00:00<?, ? examples/s]

In [14]:
import evaluate

accuracy = evaluate.load("accuracy")

In [15]:
import numpy as np


def compute_metrics(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=eval_pred.label_ids)

In [16]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base", num_labels=num_labels, label2id=label2id, id2label=id2label
)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['projector.weight', 'projector.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
training_args = TrainingArguments(
    output_dir="mnt/c/Users/user/angry_level_classification/data/model-1",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    per_device_eval_batch_size=16,
    num_train_epochs=50,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # push_to_hub=True,
    # no_cuda =True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_minds["train"],
    eval_dataset=encoded_minds["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

trainer.train()

  return F.conv1d(input, weight, bias, self.stride,


Epoch,Training Loss,Validation Loss,Accuracy
0,0.5347,0.359535,0.897366
2,0.3118,0.307172,0.887375
2,0.2599,0.314483,0.871935
4,0.2306,0.221261,0.91644
4,0.1688,0.218388,0.915531
6,0.1812,0.210887,0.918256
6,0.1681,0.202692,0.921889
8,0.187,0.229854,0.91644
8,0.1453,0.195321,0.926431
9,0.1324,0.195881,0.928247


TrainOutput(global_step=340, training_loss=0.24424813389778138, metrics={'train_runtime': 594.2151, 'train_samples_per_second': 74.047, 'train_steps_per_second': 0.572, 'total_flos': 3.9379476939264e+17, 'train_loss': 0.24424813389778138, 'epoch': 9.86})