In [30]:
import os

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import *
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import librosa
import librosa.display
import numpy as np

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

print(librosa.__version__)



0.10.1


In [47]:
class GTZANDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.classes = os.listdir(root_dir)
        self.class_to_idx = {cls: i for i, cls in enumerate(self.classes)}
        self.transform = transform
        self.file_paths, self.labels = self.load_dataset()
    
    def load_dataset(self):
        file_paths = []
        labels = []
        for i, cls in enumerate(self.classes):
            class_dir = os.path.join(self.root_dir, cls)
            for filename in os.listdir(class_dir):
                file_paths.append(os.path.join(class_dir, filename))
                labels.append(i)
        return file_paths, labels

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        audio_path = self.file_paths[idx]
        label = self.labels[idx]
        y, sr = librosa.load(audio_path, mono=True, duration=30)
        mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr)
        log_mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)
        print(f"{audio_path}\t {log_mel_spectrogram.size}")
        if self.transform:
            log_mel_spectrogram = self.transform(log_mel_spectrogram)
        return log_mel_spectrogram, label


        


In [32]:
class AudioCNN(nn.Module):
    def __init__(self, num_classes):
        super(AudioCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3)
        self.fc1 = nn.Linear (616320, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2(x), 2))
        expected_size = 64 * x.size(2) * x.size(3)
        x = torch.flatten(x,1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)


In [48]:
num_epochs = 10
batch_size = 2
learning_rate = 0.001
num_classes = 10

# Load your dataset and create data loaders
gtzan_path = '../data/genres_original/'

gtzan_dataset = GTZANDataset(root_dir=gtzan_path)

train_size = int(0.8 * len(gtzan_dataset))
test_size = len(gtzan_dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(gtzan_dataset, [train_size, test_size])

train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)


In [39]:
model = AudioCNN(num_classes)
optimizer = optim.Adadelta(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

In [49]:
x= next(iter(train_loader))
y, z  = x

../data/genres_original/blues\blues.00009.wav	 
../data/genres_original/jazz\jazz.00088.wav	 


In [36]:
for data in train_loader:
    inputs, labels = data
    print(f"input:{inputs.size()}, label:{labels.size()}")

../data/genres_original/country\country.00024.wav
../data/genres_original/blues\blues.00050.wav


TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'NoneType'>

In [None]:
for epoch in range(num_epochs):
    for data in train_loader:
        inputs, labels = data
        #print(inputs.size())
        optimizer.zero_grad()
        outputs = model(inputs.unsqueeze(1).float())
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}')

RuntimeError: stack expects each tensor to be equal size, but got [128, 1292] at entry 0 and [128, 1290] at entry 11

In [None]:
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for data in test_loader:
        inputs, labels = data
        outputs = model(inputs.unsqueeze(1).float())
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    print('Accuracy of the network on the test audio clips: %d %%' % (100 * correct / total))