In [1]:
import os
import pandas as pd
import librosa
import numpy as np
import torch
from torch.serialization import safe_globals
from datasets import Dataset
import random
from sklearn.model_selection import train_test_split
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import GenerationConfig
from tabulate import tabulate

In [2]:
#Load Data
df = pd.read_csv('multimodal_dataset_normalized.csv')
display(df.head())

Unnamed: 0,Audio_Song,Lyric_Song,Arousal,Valence,Quadrant,Emotion,lyric_id,word_count,unique_word_count,lexical_diversity,...,rms_mean,rms_std,beat_strength,low_energy_ratio,energy_entropy,brightness,warmth,activity,harmonic_energy_ratio,harmonicity
0,A005,L055,0.7875,0.6875,Q1,Surprise,L055,0.583846,-0.032136,-1.136493,...,-1.315662,-1.165455,0.896397,-0.700513,-0.145042,1.086334,-0.092941,[0.06471955],-1.951364,-0.583475
1,A011,L061,0.68125,0.85625,Q1,Happiness,L061,-0.54914,-0.97493,-1.286263,...,0.107522,-0.408386,0.006682,0.068631,0.750471,0.147696,0.04022,[0.18521025],0.472448,-0.078961
2,A014,L064,0.8625,0.725,Q1,Surprise,L064,0.415581,-0.162177,-1.133997,...,0.361195,-0.265328,0.710477,-0.673247,0.854382,1.970159,-0.798261,[0.17745368],-0.949336,-0.454885
3,A019,L069,0.78125,0.81875,Q1,Excitement,L069,-0.229436,-0.308472,-0.401603,...,-0.219121,-0.979168,-0.061622,-0.34051,1.062094,0.474343,0.110013,[0.13742129],0.24476,-0.150335
4,A022,L072,0.76875,0.8375,Q1,Excitement,L072,-0.599619,-0.828634,-0.755787,...,0.916902,0.959027,-0.008507,-0.482841,0.29892,-0.169472,0.269772,[0.22606403],-0.078616,-0.234134


In [3]:
#Emotion Class
unique_items = df["Quadrant"].unique()
print(unique_items)

all_classes = unique_items
class_to_idx = {cls: i for i, cls in enumerate(all_classes)}
print("Class to index mapping:", class_to_idx)

['Q1' 'Q2' 'Q3' 'Q4']
Class to index mapping: {'Q1': 0, 'Q2': 1, 'Q3': 2, 'Q4': 3}


In [4]:
#Extract path and emotion
data_dir = "MERGE_Bimodal_Complete/audio_wav"
labels_dict = {f"{data_dir}/{row['Quadrant']}/{row['Audio_Song']}.wav": row['Quadrant'] for _, row in df.iterrows()}
print(labels_dict)

{'MERGE_Bimodal_Complete/audio_wav/Q1/A005.wav': 'Q1', 'MERGE_Bimodal_Complete/audio_wav/Q1/A011.wav': 'Q1', 'MERGE_Bimodal_Complete/audio_wav/Q1/A014.wav': 'Q1', 'MERGE_Bimodal_Complete/audio_wav/Q1/A019.wav': 'Q1', 'MERGE_Bimodal_Complete/audio_wav/Q1/A022.wav': 'Q1', 'MERGE_Bimodal_Complete/audio_wav/Q1/A024.wav': 'Q1', 'MERGE_Bimodal_Complete/audio_wav/Q1/A039.wav': 'Q1', 'MERGE_Bimodal_Complete/audio_wav/Q1/A042.wav': 'Q1', 'MERGE_Bimodal_Complete/audio_wav/Q1/A043.wav': 'Q1', 'MERGE_Bimodal_Complete/audio_wav/Q1/A047.wav': 'Q1', 'MERGE_Bimodal_Complete/audio_wav/Q1/A050.wav': 'Q1', 'MERGE_Bimodal_Complete/audio_wav/Q1/A064-116.wav': 'Q1', 'MERGE_Bimodal_Complete/audio_wav/Q1/A077-122.wav': 'Q1', 'MERGE_Bimodal_Complete/audio_wav/Q1/A086-123.wav': 'Q1', 'MERGE_Bimodal_Complete/audio_wav/Q1/A092-96.wav': 'Q1', 'MERGE_Bimodal_Complete/audio_wav/Q1/A094-110.wav': 'Q1', 'MERGE_Bimodal_Complete/audio_wav/Q1/A095-113.wav': 'Q1', 'MERGE_Bimodal_Complete/audio_wav/Q1/A100-124.wav': 'Q1', 

In [5]:
X_list = list(labels_dict.keys())
y_list = [labels_dict[fp] for fp in X_list]

X_trainval, X_test, y_trainval, y_test = train_test_split(X_list, y_list, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.25, random_state=42)

print("Train size:", len(X_train), len(y_train))
print("Val size:", len(X_val), len(y_val))
print("Test size:", len(X_test), len(y_test))

def make_dataset(paths, labels):
    return Dataset.from_dict({"audio": paths, "label": labels})

train_ds = make_dataset(X_train, y_train)
eval_ds = make_dataset(X_val, y_val)
test_ds = make_dataset(X_test, y_test)

print("Train shape:", train_ds.shape)
print("Val shape:", eval_ds.shape)
print("Test shape:", test_ds.shape)

Train size: 1329 1329
Val size: 443 443
Test size: 444 444
Train shape: (1329, 2)
Val shape: (443, 2)
Test shape: (444, 2)


In [6]:
feature_extractor = AutoFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
label_list = sorted(set(y_list))
label2id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for l, i in label2id.items()}

In [7]:
def preprocess(example):
    audio_array, _ = librosa.load(example["audio"], sr=16000)
    inputs = feature_extractor(audio_array, sampling_rate=16000)
    inputs["label"] = label2id[example["label"]]
    return inputs

train_ds = train_ds.map(preprocess)
eval_ds = eval_ds.map(preprocess)
test_ds = test_ds.map(preprocess)

Map:   0%|          | 0/1329 [00:00<?, ? examples/s]

Map:   0%|          | 0/443 [00:00<?, ? examples/s]

Map:   0%|          | 0/444 [00:00<?, ? examples/s]

In [8]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)  # Single-label prediction
    return {
        "accuracy": accuracy_score(labels, predictions),
        "precision": precision_score(labels, predictions, average="macro", zero_division=1),
        "recall": recall_score(labels, predictions, average="macro", zero_division=1),
        "f1": f1_score(labels, predictions, average="macro", zero_division=1),
    }

In [9]:
class AudioCollator:
    def __call__(self, batch):
        input_values = [torch.tensor(b["input_values"]).squeeze() for b in batch]
        labels = [b["label"] for b in batch]
        return {
            "input_values": torch.stack(input_values),
            "labels": torch.tensor(labels)
        }

In [10]:
model = AutoModelForAudioClassification.from_pretrained(
    "MIT/ast-finetuned-audioset-10-10-0.4593",
    num_labels=len(label_list),
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True  # if your label count differs
)

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
training_args = TrainingArguments(
    output_dir="./ast-emotion2",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
    data_collator=AudioCollator(),
)

trainer.train()

  trainer = Trainer(


KeyError: 'input_values'

In [12]:
# Check results on the test set
metrics = trainer.evaluate(eval_dataset=test_ds)
print(tabulate(metrics.items(), headers=["Metric", "Value"], tablefmt="pretty"))

+-------------------------+--------------------+
|         Metric          |       Value        |
+-------------------------+--------------------+
|        eval_loss        | 1.6291677951812744 |
|      eval_accuracy      | 0.7072072072072072 |
|     eval_precision      | 0.6938133253554661 |
|       eval_recall       | 0.6962662860488947 |
|         eval_f1         | 0.6937526554939218 |
|      eval_runtime       |      73.1952       |
| eval_samples_per_second |       6.066        |
|  eval_steps_per_second  |       0.765        |
|          epoch          |        10.0        |
+-------------------------+--------------------+


In [11]:
# Save the whole model to be loaded easily later
model_save_path = "models/ast_lyrics_saved"
model.save_pretrained(model_save_path)
feature_extractor.save_pretrained(model_save_path)

['models/ast_lyrics_saved\\preprocessor_config.json']