In [1]:
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import librosa  # or torchaudio for loading audio
import numpy as np
import os
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import ast

In [2]:
# Load pre-trained processor (tokenizer) and model
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
wav2vec = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
wav2vec.to(device)
wav2vec.eval()  # evaluation mode (no training in typical usage)

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Wav2Vec2Model(
  (feature_extractor): Wav2Vec2FeatureEncoder(
    (conv_layers): ModuleList(
      (0): Wav2Vec2GroupNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (activation): GELUActivation()
        (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
      )
      (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): Wav2Vec2FeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): Wav2Vec2Encoder(
    (pos_conv_embed): Wav2Vec2PositionalConvEmbedding(
  

In [3]:
import librosa

def extract_wav2vec_features(audio_path, sr=16000, model=wav2vec, proc=processor, device=device):
    # 1. Load raw audio
    waveform, original_sr = librosa.load(audio_path, sr=sr, mono=True, duration=20)

    # 2. Wav2Vec2 expects a batch of waveforms in float32
    inputs = proc(waveform, sampling_rate=sr, return_tensors="pt", padding=True)
    # => inputs is a dict with 'input_values' (batch_size, seq_len)

    # 3. Move to device (GPU/CPU)
    input_values = inputs["input_values"].to(device)  # shape: (1, seq_len)

    # 4. Forward pass through Wav2Vec2
    with torch.no_grad():
        outputs = model(input_values)
        # outputs.last_hidden_state: shape (batch_size, time_steps, hidden_size)
        # e.g., for facebook/wav2vec2-base-960h => hidden_size=768

    # Typically, you might use:
    # - outputs.last_hidden_state (per-frame embeddings, shape: (1, T, 768))
    # - outputs.hidden_states (if you want all layer outputs)
    # Let's use the last hidden state
    hidden_states = outputs.last_hidden_state.squeeze(0).cpu().numpy()  # shape: (time_steps, 768)
    return hidden_states

In [4]:
class AudioTransformerClassifier(nn.Module):
    def __init__(self, feature_dim, model_dim, num_heads, num_layers, num_classes, max_seq_len=1000):
        """
        A simple Transformer-based classifier:
          - Each time step = 1 segment
          - Each segment has feature_dim features
          - Map segments to embeddings of size model_dim
          - Learn positional embeddings up to max_seq_len
          - TransformerEncoder of `num_layers` layers
          - Classification head (pooling + linear)
        """
        super().__init__()
        self.feature_dim = feature_dim
        self.model_dim = model_dim
        self.max_seq_len = max_seq_len

        # 1. Convert segment features -> model_dim
        self.input_proj = nn.Linear(feature_dim, model_dim)

        # 2. Positional embeddings
        self.pos_embedding = nn.Embedding(max_seq_len, model_dim)

        # 3. Transformer Encoder
        encoder_layer = nn.TransformerEncoderLayer(d_model=model_dim, nhead=num_heads, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # 4. Classification head
        self.classifier = nn.Sequential(
            nn.Linear(model_dim, model_dim),
            nn.ReLU(),
            nn.Linear(model_dim, num_classes)
        )

    def forward(self, x):
        """
        x shape: (batch_size, seq_len, feature_dim)
        Returns: (batch_size, num_classes)
        """
        batch_size, seq_len, _ = x.shape
        print(x.shape)
        
        # Project input features to model_dim
        x = self.input_proj(x)  # (batch_size, seq_len, model_dim)

        # Add positional embeddings
        # We assume seq_len <= max_seq_len
        positions = torch.arange(seq_len, device=x.device).unsqueeze(0)  # shape (1, seq_len)
        pos_emb = self.pos_embedding(positions)  # shape (1, seq_len, model_dim)
        x = x + pos_emb  # broadcast add

        # Transformer encoding
        x = self.transformer_encoder(x)  # (batch_size, seq_len, model_dim)

        # Pooling: take the mean (or the last token, or [CLS]-like approach)
        x = x.mean(dim=1)  # shape (batch_size, model_dim)

        # Classification
        logits = self.classifier(x)  # (batch_size, num_classes)
        return logits
    
def get_all_wav_files(directory):
    wav_files = []
    for dirpath, _, filenames in os.walk(directory):
        for filename in filenames:
            if filename.endswith('.wav'):
                wav_files.append(dirpath + '/' + filename)
    return wav_files


class GTZANSegmentsDataset(Dataset):
    def __init__(self, files, labels):
        self.files = files
        self.labels = labels

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        row = self.files[idx]

        # Convert the string to an actual Python object (list or list of lists)
        filename = row.split('/')[-1]
        print(filename)
        
        hidden_states = extract_wav2vec_features(row, model=wav2vec, proc=processor)

        # Convert to a NumPy array or directly to torch.Tensor
        X_np = np.array(hidden_states, dtype=np.float32)
        segments_tensor = torch.tensor(X_np, dtype=torch.float)

        label = labelMap[filename.split(".")[0]]
        label_tensor = torch.tensor(label)

        return {
            "segments": segments_tensor,
            "label": label_tensor
        }

labelMap = {
    "blues": 0,
    "classical": 1,
    "country": 2,
    "disco": 3,
    "hiphop": 4,
    "jazz": 5,
    "metal": 6,
    "pop": 7,
    "reggae": 8,
    "rock": 9
}

files = get_all_wav_files('../data/genres/')
labels = list(map(lambda x: labelMap[x.split('/')[-1].split('.')[0]], files))
X_train, X_test, y_train, y_test = train_test_split(files,labels, random_state=1122)

train_dataset = GTZANSegmentsDataset(X_train, y_train)
test_dataset = GTZANSegmentsDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=12, shuffle=True, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=12, shuffle=False, drop_last=False)

In [5]:
import torch.optim as optim
import torch.nn.functional as F
from sklearn.metrics import classification_report

from sklearn.metrics import precision_score, recall_score, f1_score

num_classes = 10
feature_dim = 768
model_dim = 16
num_heads = 2
num_layers = 4

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AudioTransformerClassifier(feature_dim, model_dim, num_heads, num_layers, num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

preds_dict = {}

# Example training loop
for epoch in range(100):
    model.train()
    total_loss = 0
    
    for batch in train_loader:
        segments = batch["segments"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        logits = model(segments)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Train Loss: {avg_loss:.4f}")

    # Evaluate on test set
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in test_loader:
            segments = batch["segments"].to(device)  # (batch_size, seq_len, feature_dim)
            labels = batch["label"].to(device)       # (batch_size,)
            
            # Forward pass
            logits = model(segments)
            # Predicted class indices
            preds = torch.argmax(logits, dim=-1)
            
            # Move predictions & labels back to CPU and store
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    # Convert lists to numpy arrays
    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)
    
    genres_list = ["blues", "classical", "country", "disco", "hiphop", 
               "jazz", "metal", "pop", "reggae", "rock"]
    report = classification_report(all_preds, all_labels, target_names=genres_list)
    print(report)
    
    preds_dict[epoch] = (all_preds, all_labels)

pop.00055.wav
hiphop.00014.wav
blues.00054.wav
country.00046.wav
jazz.00051.wav
country.00099.wav
rock.00090.wav
hiphop.00033.wav
country.00012.wav
hiphop.00029.wav
disco.00058.wav
metal.00098.wav
torch.Size([12, 999, 768])
rock.00084.wav
reggae.00011.wav
country.00088.wav
blues.00045.wav
blues.00044.wav
metal.00085.wav
jazz.00008.wav
blues.00038.wav
rock.00019.wav
disco.00053.wav
blues.00025.wav
classical.00085.wav
torch.Size([12, 999, 768])
jazz.00067.wav
pop.00012.wav
reggae.00057.wav
classical.00033.wav
rock.00021.wav
pop.00042.wav
metal.00008.wav
rock.00074.wav
disco.00033.wav
metal.00061.wav
metal.00051.wav
classical.00068.wav
torch.Size([12, 999, 768])
classical.00081.wav
pop.00047.wav
disco.00086.wav
reggae.00062.wav
blues.00033.wav
jazz.00094.wav
metal.00078.wav
rock.00071.wav
rock.00080.wav
country.00042.wav
disco.00090.wav
metal.00047.wav
torch.Size([12, 999, 768])
blues.00061.wav
disco.00055.wav
classical.00031.wav
hiphop.00055.wav
rock.00082.wav
country.00080.wav
jazz.0004

  waveform, original_sr = librosa.load(audio_path, sr=sr, mono=True, duration=20)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


NoBackendError: 