In [1]:
!pip install transformers datasets scikit-learn scipy librosa



In [2]:
import librosa
import numpy as np
import datasets
from datasets import load_dataset, Dataset
from tqdm.auto import tqdm
from google.colab import drive
import torch # We need this for the manual loop

from transformers import (
    AutoFeatureExtractor,
    AutoModelForAudioClassification,
    Trainer,
    TrainingArguments
)
from sklearn.metrics import mean_absolute_error
from scipy.stats import spearmanr
from torch.nn import MSELoss

In [3]:
# 1. Mount your Google Drive
drive.mount('/content/drive')

# 2. --- !! IMPORTANT !! ---
#    Update this path to point to your dataset folder
dataset_path = "/content/drive/MyDrive/ASLP-SongEval"

# 3. Load in STREAMING mode
dataset = load_dataset(dataset_path, split='train', streaming=True)

# 4. Tell it NOT to decode audio
dataset = dataset.cast_column("audio", datasets.Audio(decode=False))

print(f"--- Successfully loaded dataset from {dataset_path} in STREAMING mode ---")

Mounted at /content/drive


Resolving data files:   0%|          | 0/890 [00:00<?, ?it/s]

--- Successfully loaded dataset from /content/drive/MyDrive/ASLP-SongEval in STREAMING mode ---


In [4]:
# This is the Wav2Vec2 feature extractor.
# It also expects 16kHz audio, so our function is fine.
feature_extractor = AutoFeatureExtractor.from_pretrained(
    "facebook/wav2vec2-base"
)
print("--- Loaded Wav2Vec2 Feature Extractor ---")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

--- Loaded Wav2Vec2 Feature Extractor ---




In [5]:
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base",
    num_labels=5,
    ignore_mismatched_sizes=True
)
print("--- Loaded Wav2Vec2 Model (with new regression head) ---")

pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


--- Loaded Wav2Vec2 Model (with new regression head) ---


In [15]:
import os

root = "/content/drive/MyDrive/ASLP-SongEval/Data/mp3"
file_count = 0
file_list = []

for path, dirs, files in os.walk(root):
    for f in files:
        file_count += 1
        file_list.append(os.path.join(path, f))

file_count, file_list

(884,
 ['/content/drive/MyDrive/ASLP-SongEval/Data/mp3/0.mp3',
  '/content/drive/MyDrive/ASLP-SongEval/Data/mp3/10.mp3',
  '/content/drive/MyDrive/ASLP-SongEval/Data/mp3/1001.mp3',
  '/content/drive/MyDrive/ASLP-SongEval/Data/mp3/100.mp3',
  '/content/drive/MyDrive/ASLP-SongEval/Data/mp3/1003.mp3',
  '/content/drive/MyDrive/ASLP-SongEval/Data/mp3/1002.mp3',
  '/content/drive/MyDrive/ASLP-SongEval/Data/mp3/1000.mp3',
  '/content/drive/MyDrive/ASLP-SongEval/Data/mp3/1005.mp3',
  '/content/drive/MyDrive/ASLP-SongEval/Data/mp3/1004.mp3',
  '/content/drive/MyDrive/ASLP-SongEval/Data/mp3/1006.mp3',
  '/content/drive/MyDrive/ASLP-SongEval/Data/mp3/1007.mp3',
  '/content/drive/MyDrive/ASLP-SongEval/Data/mp3/1008.mp3',
  '/content/drive/MyDrive/ASLP-SongEval/Data/mp3/1.mp3',
  '/content/drive/MyDrive/ASLP-SongEval/Data/mp3/1009.mp3',
  '/content/drive/MyDrive/ASLP-SongEval/Data/mp3/1010.mp3',
  '/content/drive/MyDrive/ASLP-SongEval/Data/mp3/1011.mp3',
  '/content/drive/MyDrive/ASLP-SongEval/Dat

In [26]:
import os

N_EXAMPLES = 500
small_stream = dataset.take(5000)   # take a large pool, we will stop after 20 processed

all_input_values = []
all_labels = []

target_sr = 16000
target_sec = 10
max_length_samples = target_sr * target_sec

processed_count = 0
skipped_count = 0

print(f"--- Starting MANUAL processing loop aiming for {N_EXAMPLES} processed samples ---")

for example in small_stream:
    if processed_count >= N_EXAMPLES:
        break  # stop only when 20 *valid* samples are processed

    try:
        audio_path = example["audio"]["path"]

        # --- Skip if missing path ---
        if not audio_path or not isinstance(audio_path, str):
            skipped_count += 1
            print(f"[SKIP] Missing audio path")
            continue

        # --- Skip if file does not exist ---
        if not os.path.exists(audio_path):
            skipped_count += 1
            print(f"[SKIP] File not found: {audio_path}")
            continue

        print(f"[PROCESS] Loading file: {audio_path}")

        # --- Load the audio ---
        duration = librosa.get_duration(path=audio_path)
        start_time = 0.0
        if duration > target_sec:
            start_time = np.random.uniform(0.0, duration - target_sec)

        chunk, sr = librosa.load(
            audio_path,
            sr=target_sr,
            offset=start_time,
            duration=target_sec
        )

        # --- Zero or tiny audio check ---
        if chunk is None or len(chunk) == 0:
            skipped_count += 1
            print(f"[SKIP] Empty audio data: {audio_path}")
            continue

        # --- Pad if shorter than 10s ---
        if len(chunk) < max_length_samples:
            chunk = np.pad(chunk, (0, max_length_samples - len(chunk)), 'constant')

        # --- Feature extraction ---
        inputs = feature_extractor(
            chunk,
            sampling_rate=target_sr,
            padding="max_length",
            max_length=max_length_samples,
            return_tensors="pt"
        )

        # --- Extract labels ---
        annotations = example['annotation']
        scores = {
            'Coherence': [], 'Memorability': [], 'Naturalness': [],
            'Clarity': [], 'Musicality': []
        }
        for ann in annotations:
            for k in scores.keys():
                scores[k].append(ann[k])

        labels = [np.mean(scores[k]) for k in scores]

        # --- Save sample ---
        all_input_values.append(inputs['input_values'].squeeze(0).cpu().numpy())
        all_labels.append(labels)

        processed_count += 1
        print(f"[OK] Processed #{processed_count}: {audio_path}")

    except Exception as e:
        skipped_count += 1
        print(f"[ERROR] {audio_path}: {e}")

print("\n--- Finished ---")
print("Successfully processed:", processed_count)
print("Skipped:", skipped_count)

# ------------------------------------------------------
# --------------- CREATE PROCESSED DATASET -------------
# ------------------------------------------------------

if processed_count > 0:
    processed_dataset = Dataset.from_dict({
        'input_values': all_input_values,
        'labels': all_labels
    })

    processed_dataset.set_format("torch")

    # Split into train/test
    processed_dataset = processed_dataset.train_test_split(test_size=0.1, seed=42)

    print("\n--- SUCCESSFULLY CREATED, FORMATTED, AND SPLIT DATASET ---")
    print(processed_dataset)
else:
    print("\n--- No valid samples processed. Dataset not created. ---")


--- Starting MANUAL processing loop aiming for 500 processed samples ---
[PROCESS] Loading file: /content/drive/MyDrive/ASLP-SongEval/Data/mp3/0.mp3
[OK] Processed #1: /content/drive/MyDrive/ASLP-SongEval/Data/mp3/0.mp3
[PROCESS] Loading file: /content/drive/MyDrive/ASLP-SongEval/Data/mp3/1.mp3
[OK] Processed #2: /content/drive/MyDrive/ASLP-SongEval/Data/mp3/1.mp3
[SKIP] File not found: /content/drive/MyDrive/ASLP-SongEval/Data/mp3/2.mp3
[SKIP] File not found: /content/drive/MyDrive/ASLP-SongEval/Data/mp3/3.mp3
[SKIP] File not found: /content/drive/MyDrive/ASLP-SongEval/Data/mp3/4.mp3
[SKIP] File not found: /content/drive/MyDrive/ASLP-SongEval/Data/mp3/5.mp3
[SKIP] File not found: /content/drive/MyDrive/ASLP-SongEval/Data/mp3/6.mp3
[SKIP] File not found: /content/drive/MyDrive/ASLP-SongEval/Data/mp3/7.mp3
[SKIP] File not found: /content/drive/MyDrive/ASLP-SongEval/Data/mp3/8.mp3
[SKIP] File not found: /content/drive/MyDrive/ASLP-SongEval/Data/mp3/9.mp3
[PROCESS] Loading file: /content/

In [27]:
class RegressionTrainer(Trainer):
    # We include 'num_items_in_batch' to fix the old transformers version bug
    def compute_loss(self, model, inputs, num_items_in_batch, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = MSELoss()
        loss = loss_fct(logits, labels.float())
        return (loss, outputs) if return_outputs else loss

In [28]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    mae = mean_absolute_error(labels, predictions)
    srcc_scores = []
    for i in range(labels.shape[1]):
        correlation, p_value = spearmanr(labels[:, i], predictions[:, i])
        srcc_scores.append(correlation)
    avg_srcc = np.mean(srcc_scores)

    return {
        "mean_absolute_error": mae,
        "average_spearman_correlation": avg_srcc,
        "srcc_coherence": srcc_scores[0],
        "srcc_memorability": srcc_scores[1],
        "srcc_naturalness": srcc_scores[2],
        "srcc_clarity": srcc_scores[3],
        "srcc_musicality": srcc_scores[4]
    }

In [29]:
# This is our version-safe, "bulletproof" set of arguments
# with the GPU memory fixes (batch=2, accum=4)
training_args = TrainingArguments(
    output_dir="./SongEval-Wav2Vec2-Baseline",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=5e-5,
    warmup_ratio=0.1,
    logging_steps=5,
)
print("--- Using BULLETPROOF TrainingArguments. ---")

--- Using BULLETPROOF TrainingArguments. ---


In [30]:
trainer = RegressionTrainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset["train"],
    eval_dataset=processed_dataset["test"],
    compute_metrics=compute_metrics,
)

In [31]:
trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mahsanadil[0m ([33mahsanadil-nust[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
5,13.4917
10,10.5034
15,7.3152
20,8.0325
25,6.5558
30,6.1044
35,5.0299
40,4.7962
45,4.9198
50,3.4157


TrainOutput(global_step=171, training_loss=3.2727468432041635, metrics={'train_runtime': 371.246, 'train_samples_per_second': 3.636, 'train_steps_per_second': 0.461, 'total_flos': 1.22562539856e+17, 'train_loss': 3.2727468432041635, 'epoch': 3.0})

In [32]:
print("\n--- Training finished. Starting manual evaluation... ---")
eval_results = trainer.evaluate()

print("\n--- FINAL EVALUATION RESULTS (Wav2Vec2) ---")
print(eval_results)


--- Training finished. Starting manual evaluation... ---



--- FINAL EVALUATION RESULTS (Wav2Vec2) ---
{'eval_loss': 1.1989234685897827, 'eval_mean_absolute_error': 0.9360284805297852, 'eval_average_spearman_correlation': 0.09966406817436248, 'eval_srcc_coherence': 0.4002948261510761, 'eval_srcc_memorability': 0.23905403822279683, 'eval_srcc_naturalness': 0.0655058173341579, 'eval_srcc_clarity': 0.10685440849928297, 'eval_srcc_musicality': -0.3133887493355015, 'eval_runtime': 2.8077, 'eval_samples_per_second': 17.808, 'eval_steps_per_second': 8.904, 'epoch': 3.0}
