In [59]:
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor, pipeline
from datasets import Dataset, ClassLabel, Sequence, Audio
import os
import pandas as pd
import torch

In [2]:
pipe = pipeline("audio-classification", model="sarthak712/my_awesome_model")

config.json:   0%|          | 0.00/2.64k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

In [12]:
mapping = {'W': "angry", "E": "disgust", 'A': "fear", "F": "happy", 'T': "sad", 'N': "neutral"}

In [13]:
mapping

{'W': 'angry',
 'E': 'disgust',
 'A': 'fear',
 'F': 'happy',
 'T': 'sad',
 'N': 'neutral'}

In [14]:
directory = '/kaggle/input/berlin-database-of-emotional-speech-emodb/wav'

In [126]:
audio_path = []
label = []
for x in os.listdir(directory):
    if(x[5] != 'L'):
        audio_path.append(directory + "/" + x)
        label.append(mapping[x[5]])

In [127]:
df = pd.DataFrame({"path": audio_path, "label": label})

In [128]:
df

Unnamed: 0,path,label
0,/kaggle/input/berlin-database-of-emotional-spe...,happy
1,/kaggle/input/berlin-database-of-emotional-spe...,happy
2,/kaggle/input/berlin-database-of-emotional-spe...,angry
3,/kaggle/input/berlin-database-of-emotional-spe...,angry
4,/kaggle/input/berlin-database-of-emotional-spe...,angry
...,...,...
449,/kaggle/input/berlin-database-of-emotional-spe...,angry
450,/kaggle/input/berlin-database-of-emotional-spe...,neutral
451,/kaggle/input/berlin-database-of-emotional-spe...,sad
452,/kaggle/input/berlin-database-of-emotional-spe...,neutral


In [129]:
new_mapping = {'neutral': '0',
 'disgust': '1',
 'fear': '2',
 'sad': '3',
 'happy': '4',
 'angry': '5'}

In [130]:
new_mapping.keys()

dict_keys(['neutral', 'disgust', 'fear', 'sad', 'happy', 'angry'])

In [131]:
df["mapping"] = df["label"].apply(lambda x: new_mapping[x])

In [132]:
df

Unnamed: 0,path,label,mapping
0,/kaggle/input/berlin-database-of-emotional-spe...,happy,4
1,/kaggle/input/berlin-database-of-emotional-spe...,happy,4
2,/kaggle/input/berlin-database-of-emotional-spe...,angry,5
3,/kaggle/input/berlin-database-of-emotional-spe...,angry,5
4,/kaggle/input/berlin-database-of-emotional-spe...,angry,5
...,...,...,...
449,/kaggle/input/berlin-database-of-emotional-spe...,angry,5
450,/kaggle/input/berlin-database-of-emotional-spe...,neutral,0
451,/kaggle/input/berlin-database-of-emotional-spe...,sad,3
452,/kaggle/input/berlin-database-of-emotional-spe...,neutral,0


In [133]:
dataset = Dataset.from_dict({"audio": df["path"].to_list(), "label": df["mapping"].to_list()}).cast_column("audio", Audio(sampling_rate=16_000))

In [134]:
dataset = dataset.cast_column("label", ClassLabel(names=['neutral', 'disgust', 'fear', 'sad', 'happy', 'angry']))

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

In [135]:
dataset.features

{'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None),
 'label': ClassLabel(num_classes=6, names=['neutral', 'disgust', 'fear', 'sad', 'happy', 'angry'], id=None)}

In [136]:
dataset[0]

{'audio': {'path': '/kaggle/input/berlin-database-of-emotional-speech-emodb/wav/15a04Fd.wav',
  'array': array([ 0.0000000e+00,  6.1035156e-05,  2.1362305e-04, ...,
          1.2207031e-04,  4.2724609e-04, -1.2207031e-04], dtype=float32),
  'sampling_rate': 16000},
 'label': 4}

In [137]:
dataset["audio"][:2]

[{'path': '/kaggle/input/berlin-database-of-emotional-speech-emodb/wav/15a04Fd.wav',
  'array': array([ 0.0000000e+00,  6.1035156e-05,  2.1362305e-04, ...,
          1.2207031e-04,  4.2724609e-04, -1.2207031e-04], dtype=float32),
  'sampling_rate': 16000},
 {'path': '/kaggle/input/berlin-database-of-emotional-speech-emodb/wav/13b02Fb.wav',
  'array': array([ 9.1552734e-05,  9.1552734e-05,  9.1552734e-05, ...,
         -1.8310547e-04, -3.0517578e-04, -5.1879883e-04], dtype=float32),
  'sampling_rate': 16000}]

In [138]:
prediction = []
for a in dataset["audio"]:
    audio_file = a["path"]
    result = pipe(audio_file)
    prediction.append(result[0]['label'])

In [139]:
preds = []
for i in prediction:
    preds.append(int(new_mapping[i]))

In [140]:
true_values = df["mapping"].astype(int).to_list()

In [141]:
true_values[:10]

[4, 4, 5, 5, 5, 2, 5, 3, 5, 5]

In [142]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report

In [143]:
print(classification_report(true_values, preds, target_names=['neutral', 'disgust', 'fear', 'sad', 'happy', 'angry']))

              precision    recall  f1-score   support

     neutral       0.98      0.57      0.72        79
     disgust       1.00      0.33      0.49        46
        fear       1.00      0.04      0.08        69
         sad       0.48      1.00      0.65        62
       happy       0.43      0.90      0.58        71
       angry       0.93      0.81      0.87       127

    accuracy                           0.64       454
   macro avg       0.80      0.61      0.56       454
weighted avg       0.82      0.64      0.61       454



In [144]:
accuracy_score(true_values, preds)

0.6431718061674009