In [1]:
import os
import numpy as np
import librosa
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [2]:
def load_data(data_folder):
    file_list = os.listdir(data_folder)
    data = []
    labels = []
    for filename in file_list:
        filepath = os.path.join(data_folder, filename)
        audio, sr = librosa.load(filepath, sr=44100)
        spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr)
        spectrogram = librosa.power_to_db(spectrogram, ref=np.max)
        data.append(spectrogram)
        labels.append(int(filename.split('.')[0]))  # Extract the label from the filename
    return data, labels

In [11]:
def pad_spectrograms(spectrograms):
    max_len = max(spec.shape[1] for spec in spectrograms)
    padded_specs = []
    for spec in spectrograms:
        pad_width = max_len - spec.shape[1]
        padded_spec = np.pad(spec, ((0, 0), (0, pad_width)), mode='constant')
        padded_specs.append(padded_spec)
    # Convert to PyTorch tensor
    return torch.tensor(padded_specs, dtype=torch.float32)

In [4]:
def create_lstm_model(input_size, hidden_size, num_classes):
    model = nn.Sequential(
        nn.LSTM(input_size, hidden_size, batch_first=True),
        nn.Linear(hidden_size, num_classes)
    )
    return model

In [5]:
def train_model(model, criterion, optimizer, dataloader, num_epochs=10):
    for epoch in range(num_epochs):
        for spectrograms, labels in dataloader:
            spectrograms = pad_spectrograms(spectrograms.numpy())
            spectrograms = torch.tensor(spectrograms, dtype=torch.float32)
            labels = labels.long()  # Convert labels to long tensor
            
            optimizer.zero_grad()
            outputs = model(spectrograms)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

In [12]:
data_folder = 'D:\\vs_code\\DL\\proj\\resources\\fma_small_edited_truncated'
data, labels = load_data(data_folder)

# Step 7: Create DataLoader
dataset = list(zip(data, labels))
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Step 8: Prepare model and optimizer
input_size = data[0].shape[0]
hidden_size = 128
num_classes = len(np.unique(labels))
model = create_lstm_model(input_size, hidden_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Step 9: Train the model
train_model(model, criterion, optimizer, dataloader)

RuntimeError: stack expects each tensor to be equal size, but got [128, 2585] at entry 0 and [128, 2582] at entry 1