In [None]:
# !pip install scikit-learn librosa datasets tqdm

In [None]:
import librosa
import numpy as np
import datasets
from datasets import load_dataset
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from scipy.stats import spearmanr
from google.colab import drive

In [None]:
# Colab only: attach Google Drive so the dataset folder is reachable on disk.
drive.mount('/content/drive')

# --- !! IMPORTANT !! ---
# Edit this path so it points at your own copy of the dataset folder.
dataset_path = "/content/drive/MyDrive/ASLP-SongEval"

# Open the 'train' split in streaming mode, then mark the "audio" column as
# not-to-be-decoded; the processing loop reads each file itself via the
# example's audio "path".
dataset = load_dataset(
    dataset_path,
    split='train',
    streaming=True,
).cast_column("audio", datasets.Audio(decode=False))

print(f"--- Successfully loaded dataset from {dataset_path} in STREAMING mode ---")

Mounted at /content/drive


Resolving data files:   0%|          | 0/2405 [00:00<?, ?it/s]

--- Successfully loaded dataset from /content/drive/MyDrive/ASLP-SongEval in STREAMING mode ---


In [None]:
# Block 4: Manual processing loop
#
# Walks the first N_EXAMPLES items of the stream and, for each one, builds
#   * a feature vector: time-averaged MFCC + chroma + spectral-contrast values
#     computed on a (padded) 10 s chunk of audio, and
#   * a label vector: the per-dimension mean over all annotations.
# Examples that fail are reported and skipped; the results accumulate in
# X_features / y_labels for the following cells.

# 1. Take 500 examples (a good size for RF)
N_EXAMPLES = 500
small_stream = dataset.take(N_EXAMPLES)

# 2. Accumulators for our features (X) and labels (y)
X_features = []
y_labels = []

target_sr = 16000
target_sec = 10
max_length_samples = target_sr * target_sec

# Seeded generator so the random crop offsets (and therefore the extracted
# features) are identical on every Restart & Run All. 42 matches the
# random_state used for the split and the model later in the notebook.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

# Annotation keys, in the order the label vector is laid out.
SCORE_KEYS = ("Coherence", "Memorability", "Naturalness", "Clarity", "Musicality")


def _mean_annotation_labels(annotations):
    """Average every score dimension over all annotations of one example.

    Args:
        annotations: sequence of mappings, each holding one value per key in
            SCORE_KEYS.

    Returns:
        List of five means, ordered like SCORE_KEYS.

    Raises:
        ValueError: if there are no annotations; np.mean of an empty list
            would otherwise put NaN labels into the training data.
        KeyError: if an annotation lacks one of the SCORE_KEYS.
    """
    if annotations is None or len(annotations) == 0:
        raise ValueError("example has no annotations")
    return [np.mean([ann[key] for ann in annotations]) for key in SCORE_KEYS]


def _load_chunk(audio_path):
    """Load up to target_sec seconds of audio at target_sr, zero-padded to full length.

    Files longer than target_sec are cropped at a random offset drawn from the
    seeded generator; shorter files are read from the start and right-padded
    with zeros up to max_length_samples.

    Returns:
        (chunk, sr) as returned by librosa.load, with chunk padded if needed.
    """
    duration = librosa.get_duration(path=audio_path)
    start_time = 0.0
    if duration > target_sec:
        start_time = rng.uniform(0.0, duration - target_sec)

    chunk, sr = librosa.load(
        audio_path,
        sr=target_sr,
        offset=start_time,
        duration=target_sec
    )

    if len(chunk) < max_length_samples:
        chunk = np.pad(chunk, (0, max_length_samples - len(chunk)), 'constant')
    return chunk, sr


def _extract_features(chunk, sr):
    """Return one flat feature vector: each feature averaged over the time axis."""
    mfccs = np.mean(librosa.feature.mfcc(y=chunk, sr=sr, n_mfcc=20), axis=1)
    chroma = np.mean(librosa.feature.chroma_stft(y=chunk, sr=sr), axis=1)
    contrast = np.mean(librosa.feature.spectral_contrast(y=chunk, sr=sr), axis=1)
    return np.concatenate([mfccs, chroma, contrast])


print(f"--- Starting MANUAL processing loop for {N_EXAMPLES} examples ---")

# 3. Manually loop through the stream
# KeyboardInterrupt derives from BaseException, so `except Exception` can never
# see it. It is handled by the outer try, which also covers an interrupt that
# arrives while the stream is fetching the next example.
try:
    for example in tqdm(small_stream, total=N_EXAMPLES):
        try:
            # Labels first: they are cheap, so a bad example is rejected
            # before any audio is read.
            labels = _mean_annotation_labels(example['annotation'])
            chunk, sr = _load_chunk(example["audio"]["path"])
            features = _extract_features(chunk, sr)
        except Exception as e:
            # `or {}` keeps the report itself from failing when "audio" is None.
            print(f"Error processing {(example.get('audio') or {}).get('path', 'unknown')}: {e}")
            continue

        # Append both together so X_features and y_labels stay aligned.
        X_features.append(features)
        y_labels.append(labels)
except KeyboardInterrupt:
    # Keep whatever was collected so far and carry on to the summary.
    print("\n--- Loop Interrupted by User ---")

print(f"\n--- Successfully processed {len(X_features)} / {N_EXAMPLES} examples ---")

--- Starting MANUAL processing loop for 500 examples ---


  0%|          | 0/500 [00:00<?, ?it/s]

  return pitch_tuning(



--- Successfully processed 500 / 500 examples ---


In [None]:
# Turn the per-example lists into 2-D arrays for scikit-learn.
if len(X_features) == 0:
    # Fail loudly here: merely printing would let "Run All" continue into the
    # split/training cells, which would then either hit a NameError or silently
    # reuse stale X / y left over from an earlier run of this kernel.
    raise RuntimeError("--- No data processed, cannot continue. ---")

X = np.array(X_features)
y = np.array(y_labels)
print(f"Feature matrix 'X' shape: {X.shape}")
print(f"Label matrix 'y' shape: {y.shape}")

Feature matrix 'X' shape: (500, 39)
Label matrix 'y' shape: (500, 5)


In [None]:
# Hold out 20% of the examples for evaluation; the fixed random_state keeps
# the partition the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the resulting array shapes as a quick sanity check.
print(f"Train shapes: X={X_train.shape}, y={y_train.shape}")
print(f"Test shapes:  X={X_test.shape}, y={y_test.shape}")

Train shapes: X=(400, 39), y=(400, 5)
Test shapes:  X=(100, 39), y=(100, 5)


In [None]:
# A single RandomForestRegressor is fitted on all label columns at once
# (it handles multi-output targets out of the box).
# n_jobs=-1 asks scikit-learn to use every available core.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
)

print("--- Training Random Forest Regressor ---")
model.fit(X_train, y_train)
print("--- Training Complete ---")

--- Training Random Forest Regressor ---
--- Training Complete ---


In [None]:
print("--- Evaluating model ---")
preds = model.predict(X_test)

# Mean absolute error between held-out labels and predictions.
mae = mean_absolute_error(y_test, preds)
print(f"Mean Absolute Error: {mae}")

# Spearman rank correlation (SRCC), computed separately for every label
# column; transposing lets us walk the columns of truth and prediction in step.
srcc_scores = []
for true_col, pred_col in zip(y_test.T, preds.T):
    rho, _ = spearmanr(true_col, pred_col)
    srcc_scores.append(rho)

avg_srcc = np.mean(srcc_scores)
print(f"Average Spearman Correlation: {avg_srcc}")
print(f"All SRCC scores (Co, Me, Na, Cl, Mu): {srcc_scores}")

--- Evaluating model ---
Mean Absolute Error: 0.6562405
Average Spearman Correlation: 0.5395993392031205
All SRCC scores (Co, Me, Na, Cl, Mu): [np.float64(0.5326818662138679), np.float64(0.5243678216462144), np.float64(0.5136950007350268), np.float64(0.5647702158913969), np.float64(0.5624817915290966)]
