In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score
import xgboost as xgb
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import librosa  # or torchaudio for loading audio
import numpy as np
import os
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

def get_all_wav_files(directory):
    """Recursively collect the paths of all WAV files under ``directory``.

    Parameters
    ----------
    directory : str
        Root folder to search; every sub-folder is visited via ``os.walk``.

    Returns
    -------
    list of str
        One path per matching file, built as ``dirpath + '/' + filename`` and
        returned in sorted order. The list is empty when nothing matches.
    """
    # The extension is compared case-insensitively so files named '*.WAV'
    # are not silently skipped. Paths are always joined with '/' so they have
    # the same shape regardless of platform.
    wav_files = [
        dirpath + '/' + filename
        for dirpath, _, filenames in os.walk(directory)
        for filename in filenames
        if filename.lower().endswith('.wav')
    ]
    # os.walk yields entries in filesystem order, which is not guaranteed to be
    # stable; sorting makes any later seeded shuffling/splitting reproducible.
    return sorted(wav_files)

import librosa

# Pre-trained Wav2Vec2 checkpoint: the processor prepares raw waveforms for
# the network, the model turns them into hidden-state embeddings.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
wav2vec = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")

# Prefer CUDA when it is present, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Place the weights on the chosen device and switch to evaluation mode
# (the model is only used for feature extraction here, never trained).
wav2vec.to(device).eval()

def extract_wav2vec_features(audio_path, sr=16000, model=wav2vec, proc=processor, device=device):
    """Compute per-frame Wav2Vec2 embeddings for a single audio file.

    At most the first 20 seconds of the file are read; ``librosa.load`` is
    asked to resample to ``sr`` and to down-mix to mono.

    Parameters
    ----------
    audio_path : path of the audio file to load.
    sr : sampling rate used both for loading and for the processor call.
    model : network run on the prepared input; its ``last_hidden_state``
        is what gets returned.
    proc : processor that converts the waveform into ``input_values``.
    device : torch device the input tensor is moved to before the forward
        pass. NOTE(review): the model itself is not moved here, so it is
        presumably expected to already live on this device - confirm.

    Returns
    -------
    numpy.ndarray
        The last hidden state with the batch axis squeezed out, i.e.
        ``(time_steps, hidden_size)`` for one clip.
    """
    # librosa returns (samples, rate); the rate is not needed afterwards.
    samples, _ = librosa.load(audio_path, sr=sr, mono=True, duration=20)

    # The processor wraps the single waveform into a batch of size one.
    encoded = proc(samples, sampling_rate=sr, return_tensors="pt", padding=True)
    batch = encoded["input_values"].to(device)

    # Pure inference: no gradients are tracked for the forward pass.
    with torch.no_grad():
        last_layer = model(batch).last_hidden_state

    # Drop the batch axis and hand the result back as a CPU numpy array.
    return last_layer.squeeze(0).cpu().numpy()

def clean_labels(s):
    """Return the part of ``s`` that precedes its first '.'.

    When ``s`` contains no '.', the whole string is returned unchanged.
    """
    head, _, _ = s.partition('.')
    return head

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Genre name -> integer class id used as the classification target.
labelMap = {
    "blues": 0,
    "classical": 1,
    "country": 2,
    "disco": 3,
    "hiphop": 4,
    "jazz": 5,
    "metal": 6,
    "pop": 7,
    "reggae": 8,
    "rock": 9
}

files = get_all_wav_files('../data/genres/')
entries = []  # one fixed-length embedding vector per successfully processed file
labels = []   # genre name for the matching position in `entries`

for file in files:
    try:
        print(file)
        filename = os.path.basename(file)
        hidden_states = extract_wav2vec_features(file, model=wav2vec, proc=processor)
        # The extractor returns one row per time frame. Average over the time
        # axis so every clip contributes exactly one vector: this keeps
        # `entries` aligned one-to-one with `labels` and avoids holding every
        # frame of every clip in memory.
        clip_embedding = np.asarray(hidden_states, dtype=np.float32).mean(axis=0)
        entries.append(clip_embedding)
        labels.append(clean_labels(filename))
    except Exception as e:
        # Best effort: report the failing file and continue with the rest.
        print(e)
        print(f"Could not process {file}")

../data/genres/blues/blues.00000.wav
../data/genres/blues/blues.00001.wav
../data/genres/blues/blues.00002.wav


In [None]:
# Build the feature matrix with exactly one row per audio clip.
if not entries:
    raise ValueError("No audio features were extracted; nothing to train on.")

clip_vectors = []
for entry in entries:
    vector = np.asarray(entry, dtype=np.float32)
    # An entry may be a per-frame matrix (time_steps, hidden_size). Collapse it
    # to a single vector by averaging over time; otherwise stacking would
    # produce one row per frame while there is only one label per clip.
    if vector.ndim == 2:
        vector = vector.mean(axis=0)
    clip_vectors.append(vector)

# np.stack replaces np.row_stack, which is deprecated in NumPy 2.0.
df = pd.DataFrame(np.stack(clip_vectors))
assert len(df) == len(labels), "feature rows and labels are out of sync"

X = df
# Translate genre names into their integer class ids.
y = [labelMap[name] for name in labels]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1122)

# Fit the scaler on the training rows only, then reuse it for the test rows.
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Class ids and display names, both in labelMap order so position i of one
# corresponds to position i of the other. They get their own names so the
# per-file `labels` list is not overwritten and the cell that maps it through
# labelMap can still be re-run.
class_ids = list(labelMap.values())
class_names = list(labelMap)

# Hold out part of the training data for early stopping. XGBoost monitors the
# last entry of eval_set, so putting the test split there would tune the number
# of trees on test data and bias the accuracy reported below.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train_scaled, y_train, test_size=0.2, stratify=y_train, random_state=1122
)

clf = xgb.XGBClassifier(
    objective='multi:softmax',
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=3,
    random_state=1122,
    eval_metric='mlogloss',
    early_stopping_rounds=5,
    n_jobs=-1
)

# Training loss is logged first; the validation split comes last because that
# is the one early stopping watches.
eval_set = [(X_fit, y_fit), (X_val, y_val)]

clf.fit(X_fit, y_fit, verbose=1, eval_set=eval_set)

# The test split is only touched here, for the final score.
predictions = clf.predict(X_test_scaled)
print('Accuracy:', accuracy_score(y_test, predictions))

import matplotlib.pyplot as plt
import seaborn as sns

def plot_confusion_matrix(y_true, y_pred, labels, class_ids=None):
    """Draw a heatmap of the confusion matrix for a multiclass prediction.

    Parameters
    ----------
    y_true : true class values.
    y_pred : predicted class values.
    labels : tick labels shown on both axes of the heatmap.
    class_ids : optional list of class values forwarded to
        ``confusion_matrix(labels=...)``. When given, it fixes the row/column
        order and keeps a row/column for classes that do not occur in the
        data, so the matrix always matches ``labels``. ``None`` keeps
        scikit-learn's default (classes that appear, in sorted order).
    """
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt='d', xticklabels=labels, yticklabels=labels, cmap='Blues', ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title('Multiclass Confusion Matrix')
    plt.show()

# Call function: tick the axes with genre names rather than numeric ids.
plot_confusion_matrix(y_test, predictions, class_names, class_ids=class_ids)