## Installs and Imports

In [1]:
!pip install librosa
!pip install torch
!pip install torchvision



In [2]:
import os
import numpy as np
import librosa
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
from torchvision import transforms
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torchvision.transforms import Normalize
import torch.optim as optim
import torch
import torch.nn as nn
from torchvision import models
from sklearn.model_selection import train_test_split


In [3]:
# Mount Google Drive at /content/drive; the dataset cells below copy their
# archives from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# GTZAN dataset import.
# Prerequisite: create a shortcut to the archive in your own Drive: https://drive.google.com/file/d/12Ls4CNeMIPsOZTEqztT06RChdfBRjUBm/view?usp=sharing
# Copy the archive out of the mounted Drive, then extract it into the current
# working directory.
# NOTE(review): later cells read /content/Data/genres_original, so this
# presumably runs with /content as the working directory -- confirm.
!cp /content/drive/MyDrive/archive.zip /content
!unzip /content/archive.zip

In [None]:
# FMA Small dataset import (pre-computed mel spectrograms).
# Prerequisite: create a shortcut to the archive in your own Drive: https://drive.google.com/file/d/1Z5jTAKZW3ng4ztg9OkOMY5FkLeZ-tpkt/view
# Copy the archive out of the mounted Drive, then extract it into the current
# working directory.
# NOTE(review): later cells read /content/kaggle/working/mel_spectrograms, so
# the archive presumably unpacks to that path when run from /content -- confirm.
!cp /content/drive/MyDrive/mel_spectograms.zip /content
!unzip /content/mel_spectograms.zip

## Dataset Prep

In [7]:
# This Code is used to declare the class to prepare the GTZAN Dataset
class GenreDatasetGTZAN(Dataset):
    """GTZAN genre dataset that converts ``.wav`` clips to mel-spectrogram tensors.

    ``root_dir`` must contain one sub-directory per genre named in
    ``genre_to_label``. Only file paths and integer labels are kept in memory;
    audio is decoded on demand in ``__getitem__``.

    Args:
        root_dir: Directory holding the per-genre folders.
        transform: Optional callable applied to each spectrogram tensor before
            it is returned.
    """

    # Seconds of audio decoded per clip.
    CLIP_SECONDS = 30
    # Maximum number of mel rows kept from the spectrogram.
    NUM_MEL_BANDS = 128
    # Every spectrogram is padded or truncated to this many time frames.
    NUM_FRAMES = 2048

    def __init__(self, root_dir, transform=None):
        self.data = []
        self.labels = []
        self.genre_to_label = {
            "blues": 0, "classical": 1, "country": 2, "disco": 3,
            "hiphop": 4, "jazz": 5, "metal": 6, "pop": 7, "reggae": 8, "rock": 9
        }
        self.transform = transform

        for genre, label in self.genre_to_label.items():
            genre_dir = os.path.join(root_dir, genre)
            # Sorted so that sample indices -- and therefore any seeded
            # train/test split computed over them -- do not depend on the
            # arbitrary order in which the filesystem lists files.
            for file in sorted(os.listdir(genre_dir)):
                if not file.endswith(".wav"):
                    continue
                file_path = os.path.join(genre_dir, file)
                try:
                    # Decode only the first second: enough to detect files
                    # that cannot be read at all.
                    librosa.load(file_path, duration=1)
                except Exception:
                    print(f"Skipping invalid file: {file_path}")
                    continue
                self.data.append(file_path)
                self.labels.append(label)

    def __len__(self):
        """Return the number of clips that passed the readability check."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(spectrogram, label)`` for clip ``idx``.

        The spectrogram is a float32 tensor with a leading channel axis, at
        most ``NUM_MEL_BANDS`` rows and exactly ``NUM_FRAMES`` columns.
        If the clip cannot be decoded, ``(None, None)`` is returned so that a
        collate function can discard the sample.
        """
        file_path = self.data[idx]
        label = self.labels[idx]
        try:
            y_audio, sr = librosa.load(file_path, duration=self.CLIP_SECONDS)
        except Exception:
            return None, None

        mel_spec = librosa.feature.melspectrogram(y=y_audio, sr=sr)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
        # NOTE(review): fix_length pads with zeros by default; with ref=np.max
        # 0 dB is the loudest bin, so the padding sits at peak level rather
        # than silence -- confirm this is intended.
        mel_spec_resized = librosa.util.fix_length(mel_spec_db, size=self.NUM_FRAMES, axis=1)
        mel_spec_resized = mel_spec_resized[:self.NUM_MEL_BANDS, :]
        sample = torch.tensor(mel_spec_resized, dtype=torch.float32).unsqueeze(0)
        if self.transform:
            sample = self.transform(sample)

        return sample, label





In [None]:
# This Code is used to declare the class to prepare the Free Music Archive Dataset
class GenreDatasetFMA(Dataset):
    """Free Music Archive dataset backed by pre-computed ``.npy`` spectrograms.

    Each sub-directory of ``root_dir`` is treated as one genre; labels are the
    indices of the genre names in sorted order. Non-directory entries in
    ``root_dir`` are ignored.

    Args:
        root_dir: Directory holding one folder of ``.npy`` files per genre.
        transform: Optional callable applied to each spectrogram tensor before
            it is returned.
    """

    def __init__(self, root_dir, transform=None):
        self.data = []
        self.labels = []
        # Only directories are genres: a stray file (README, archive, ...)
        # would otherwise receive a label and then break os.listdir below.
        genre_names = sorted(
            entry for entry in os.listdir(root_dir)
            if os.path.isdir(os.path.join(root_dir, entry))
        )
        self.genre_to_label = {genre: idx for idx, genre in enumerate(genre_names)}
        self.transform = transform

        # Load files
        for genre, label in self.genre_to_label.items():
            genre_dir = os.path.join(root_dir, genre)
            # Sorted so that sample indices, and any seeded split over them,
            # are independent of filesystem listing order.
            for file in sorted(os.listdir(genre_dir)):
                if file.endswith(".npy"):
                    self.data.append(os.path.join(genre_dir, file))
                    self.labels.append(label)

    def __len__(self):
        """Return the number of ``.npy`` files discovered."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(spectrogram, label)`` for sample ``idx``.

        The stored array is cropped to at most 128 rows and 2048 columns and
        returned as a float32 tensor with a leading channel axis. Any failure
        while loading or transforming yields ``(None, None)`` so that a collate
        function can discard the sample.

        NOTE(review): arrays narrower than 2048 columns are not padded, so
        they cannot be stacked with full-width samples -- confirm that every
        stored spectrogram is at least 2048 frames wide.
        """
        file_path = self.data[idx]
        label = self.labels[idx]

        try:
            mel_spec = np.load(file_path)
            mel_spec = mel_spec[:128, :2048]
            sample = torch.tensor(mel_spec, dtype=torch.float32).unsqueeze(0)

            if self.transform:
                sample = self.transform(sample)

            return sample, label

        except Exception:
            return None, None

# We are using a custom collate Function since we encountered a few problematic files in the FMA Dataset
def custom_collate(batch):
    """Collate ``(input, label)`` samples, discarding the unusable ones.

    A sample is dropped when it is ``None``, when its input or label is
    ``None`` (the datasets return ``(None, None)`` for unreadable files), or
    when its input tensor contains NaN values.

    Args:
        batch: Sequence of ``(input_tensor, label)`` pairs produced by a dataset.

    Returns:
        ``(inputs, labels)`` where ``inputs`` is the stacked input tensors and
        ``labels`` a tensor of the matching labels, or ``(None, None)`` when no
        usable sample remains or the remaining samples cannot be stacked
        (for example because their shapes differ).
    """
    try:
        kept_inputs = []
        kept_labels = []
        for item in batch:
            if item is None:
                continue
            input_tensor, label = item
            # The datasets signal a failed load with (None, None); testing only
            # `item is None` would let those tuples through and make the
            # NaN check below raise, discarding the entire batch.
            if input_tensor is None or label is None:
                continue
            if torch.isnan(input_tensor).any():
                continue
            kept_inputs.append(input_tensor)
            kept_labels.append(label)

        if not kept_inputs:
            return None, None

        return torch.stack(kept_inputs), torch.tensor(kept_labels)

    except (RuntimeError, TypeError, ValueError):
        # Best effort: malformed items or mismatched shapes skip the batch
        # instead of aborting the whole epoch.
        return None, None

## RESNET 18


In [None]:
# Define Resnet
class ResNet18GenreClassifier(nn.Module):
    """ResNet-18 with ImageNet weights, adapted to one-channel spectrograms.

    The first convolution is replaced by a one-input-channel layer and the
    final fully connected layer by one with ``num_classes`` outputs. Both
    replacement layers start from fresh (non-pretrained) weights.

    Args:
        num_classes: Number of genre classes to predict.
    """

    def __init__(self, num_classes):
        super().__init__()
        # `pretrained=True` is deprecated in recent torchvision releases in
        # favour of the `weights` enum; IMAGENET1K_V1 is the checkpoint that
        # `pretrained=True` used to select. Older releases without the enum
        # fall back to the legacy flag.
        if hasattr(models, "ResNet18_Weights"):
            self.resnet = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
        else:
            self.resnet = models.resnet18(pretrained=True)
        # Same geometry as the stock stem, but with 1 input channel instead of 3.
        self.resnet.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, num_classes)

    def forward(self, x):
        """Return the class logits produced by the wrapped ResNet for ``x``."""
        return self.resnet(x)


In [8]:

#Code to Train Model and Evaluate Model - Used for Both ResNet and Traditional CNN
def train_model(model, train_loader, criterion, optimizer, device, epochs=10):
    """Train ``model`` for ``epochs`` passes over ``train_loader``.

    Batches whose inputs or labels are ``None`` are skipped. The loss printed
    for an epoch is the mean over the batches that were actually used, so
    skipped batches no longer dilute it.

    Args:
        model: Module to optimise; it is switched to training mode.
        train_loader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        optimizer: Optimizer that updates the parameters of ``model``.
        device: Device the batches are moved to before the forward pass.
        epochs: Number of passes over ``train_loader``.

    Returns:
        List with one mean loss per epoch; an epoch without any usable batch
        contributes ``float("nan")``.
    """
    model.train()
    epoch_losses = []
    for epoch in range(epochs):
        running_loss = 0.0
        batches_used = 0
        for inputs, labels in train_loader:
            if inputs is None or labels is None:
                continue

            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            batches_used += 1

        if batches_used == 0:
            epoch_losses.append(float("nan"))
            print(f"Epoch {epoch + 1}/{epochs}, no valid batches")
            continue

        # Divide by the batches that contributed, not by len(train_loader),
        # which would also count the skipped ones.
        mean_loss = running_loss / batches_used
        epoch_losses.append(mean_loss)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {mean_loss:.4f}")

    return epoch_losses

def evaluate_model(model, test_loader, device):
    """Measure the classification accuracy of ``model`` on ``test_loader``.

    Batches whose inputs or labels are ``None`` are skipped. The model is
    switched to evaluation mode and gradients are disabled during the pass.

    Args:
        model: Module to evaluate.
        test_loader: Iterable yielding ``(inputs, labels)`` batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Fraction of correctly classified samples as a float, or ``None`` when
        no usable sample was found.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            if inputs is None or labels is None:
                continue

            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            # Index of the largest logit along the class axis is the prediction.
            _, predicted = torch.max(outputs, 1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if total == 0:
        print("No valid test samples found")
        return None

    accuracy = correct / total
    print(f"Test Accuracy: {accuracy:.4f}")
    # Callers assign the result (`accuracy = evaluate_model(...)`), so hand
    # the value back instead of only printing it.
    return accuracy


### GTZAN - ResNet

In [None]:
# Split dataset into train and test sets for GTZAN
dataset = GenreDatasetGTZAN(root_dir='/content/Data/genres_original', transform=None)
# The split is made over sample indices; both loaders share the one dataset
# object and differ only in the sampler that selects their subset.
train_indices, test_indices = train_test_split(list(range(len(dataset))), test_size=0.2, random_state=42)
train_sampler = torch.utils.data.SubsetRandomSampler(train_indices)
test_sampler = torch.utils.data.SubsetRandomSampler(test_indices)
# GenreDatasetGTZAN.__getitem__ returns (None, None) for a clip that fails to
# decode. The default collate function raises on None, so batches are routed
# through custom_collate, which copes with such entries instead of crashing.
train_loader = DataLoader(dataset, batch_size=32, sampler=train_sampler, num_workers=2, collate_fn=custom_collate)
test_loader = DataLoader(dataset, batch_size=32, sampler=test_sampler, num_workers=2, collate_fn=custom_collate)


In [None]:
# Train and evaluate the ResNet-18 classifier on GTZAN.
EPOCHS = 15
# Initialize model, criterion, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# 10 outputs: one per genre listed in GenreDatasetGTZAN.genre_to_label.
model = ResNet18GenreClassifier(num_classes=10).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=5e-5)
train_model(model, train_loader, criterion, optimizer, device, epochs=EPOCHS)
# evaluate_model prints the test accuracy.
accuracy = evaluate_model(model, test_loader, device)


Epoch 1/15, Loss: 1.5494
Epoch 2/15, Loss: 0.8599
Epoch 3/15, Loss: 0.5928
Epoch 4/15, Loss: 0.3803
Epoch 5/15, Loss: 0.2446
Epoch 6/15, Loss: 0.1742
Epoch 7/15, Loss: 0.1336
Epoch 8/15, Loss: 0.0880
Epoch 9/15, Loss: 0.0636
Epoch 10/15, Loss: 0.0435
Epoch 11/15, Loss: 0.0361
Epoch 12/15, Loss: 0.0403
Epoch 13/15, Loss: 0.0459
Epoch 14/15, Loss: 0.0264
Epoch 15/15, Loss: 0.0205
Test Accuracy: 0.8350


In [None]:
# Save only the weights (state_dict) of the GTZAN-trained ResNet; the relative
# path places the file in the current working directory.
save_path = "resnet_genre_classifier_gtzan.pth"
torch.save(model.state_dict(), save_path)

### FMA - ResNet

In [None]:
# Split dataset into train and test sets for FMA
class NormalizeSpectrogram:
    """Callable transform that standardises one spectrogram tensor.

    The tensor's own mean is subtracted and the result is divided by the
    tensor's own standard deviation plus a small epsilon.
    """

    def __call__(self, sample):
        centred = sample - sample.mean()
        # The epsilon keeps the division finite for a constant-valued sample.
        spread = sample.std() + 1e-5
        return centred / spread

# Build the FMA dataset with per-sample normalisation applied on load.
transform = NormalizeSpectrogram()
dataset = GenreDatasetFMA(root_dir='/content/kaggle/working/mel_spectrograms', transform=transform)

# 80/20 split over sample indices with a fixed seed; both loaders share the
# one dataset object and differ only in the sampler that selects their subset.
train_indices, test_indices = train_test_split(list(range(len(dataset))), test_size=0.2, random_state=42)
train_sampler = torch.utils.data.SubsetRandomSampler(train_indices)
test_sampler = torch.utils.data.SubsetRandomSampler(test_indices)
# custom_collate handles the samples that the dataset could not load.
train_loader = DataLoader(dataset, batch_size=32, sampler=train_sampler, num_workers=2, collate_fn=custom_collate)
test_loader = DataLoader(dataset, batch_size=32, sampler=test_sampler, num_workers=2, collate_fn=custom_collate)


In [None]:
# Train and evaluate a fresh ResNet-18 classifier on FMA. This rebinds the
# global `model`, replacing the GTZAN-trained network from the earlier cell.
EPOCHS = 10
# Initialize model, criterion, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): 8 is presumably the number of genre folders under the FMA
# root directory -- confirm against dataset.genre_to_label.
model = ResNet18GenreClassifier(num_classes=8).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=5e-5)
train_model(model, train_loader, criterion, optimizer, device, epochs=EPOCHS)
# evaluate_model prints the test accuracy.
accuracy = evaluate_model(model, test_loader, device)


Epoch 1/10, Loss: 1.3384
Epoch 2/10, Loss: 1.0144
Epoch 3/10, Loss: 0.8264
Epoch 4/10, Loss: 0.6363
Epoch 5/10, Loss: 0.4668
Epoch 6/10, Loss: 0.2924
Epoch 7/10, Loss: 0.1697
Epoch 8/10, Loss: 0.0981
Epoch 9/10, Loss: 0.0538
Epoch 10/10, Loss: 0.0342
Test Accuracy: 0.6044


In [None]:
# Save only the weights (state_dict) of the FMA-trained ResNet; the relative
# path places the file in the current working directory.
save_path = "resnet_genre_classifier_fma.pth"
torch.save(model.state_dict(), save_path)

## Traditional CNN


In [10]:
class ConvNeuralNetwork(nn.Module):
    """Plain five-stage convolutional classifier for one-channel spectrograms.

    Every stage is a 3x3 convolution (padding 1), a ReLU and a 2x2 max-pool
    with stride 2, so each stage halves both spatial dimensions. The classifier
    head flattens the feature map and applies three linear layers. The first
    linear layer is sized for a 64 x 4 x 64 feature map, which is what a
    128 x 2048 input yields after five halvings.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()

        # Channel width entering the network, then leaving each stage.
        channel_plan = (1, 8, 16, 32, 64, 64)
        stages = []
        for in_channels, out_channels in zip(channel_plan[:-1], channel_plan[1:]):
            stages.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1))
            stages.append(nn.ReLU())
            stages.append(nn.MaxPool2d(kernel_size=2, stride=2))
        self.conv_layers = nn.Sequential(*stages)

        flattened_features = 64 * 4 * 64
        head = [
            nn.Flatten(),
            nn.Linear(flattened_features, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, num_classes),
        ]
        self.fc_layers = nn.Sequential(*head)

    def forward(self, x):
        """Run the convolutional stages, then the classifier head."""
        features = self.conv_layers(x)
        return self.fc_layers(features)


### Traditional CNN - GTZAN

In [None]:
# Split dataset into train and test sets for GTZAN
dataset = GenreDatasetGTZAN(root_dir='/content/Data/genres_original', transform=None)
# The split is made over sample indices; both loaders share the one dataset
# object and differ only in the sampler that selects their subset.
train_indices, test_indices = train_test_split(list(range(len(dataset))), test_size=0.2, random_state=42)
train_sampler = torch.utils.data.SubsetRandomSampler(train_indices)
test_sampler = torch.utils.data.SubsetRandomSampler(test_indices)
# GenreDatasetGTZAN.__getitem__ returns (None, None) for a clip that fails to
# decode. The default collate function raises on None, so batches are routed
# through custom_collate, which copes with such entries instead of crashing.
train_loader = DataLoader(dataset, batch_size=32, sampler=train_sampler, num_workers=2, collate_fn=custom_collate)
test_loader = DataLoader(dataset, batch_size=32, sampler=test_sampler, num_workers=2, collate_fn=custom_collate)


In [None]:
# Train and evaluate the plain CNN on GTZAN. This rebinds the global `model`.
EPOCHS = 10
# Initialize model, criterion, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# 10 outputs: one per genre listed in GenreDatasetGTZAN.genre_to_label.
model = ConvNeuralNetwork(num_classes=10).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=5e-4)
train_model(model, train_loader, criterion, optimizer, device, epochs=EPOCHS)
# evaluate_model prints the test accuracy.
accuracy = evaluate_model(model, test_loader, device)


Epoch 1/10, Loss: 2.3133
Epoch 2/10, Loss: 2.2409
Epoch 3/10, Loss: 2.0828
Epoch 4/10, Loss: 1.9959
Epoch 5/10, Loss: 1.7912
Epoch 6/10, Loss: 1.6417
Epoch 7/10, Loss: 1.5343
Epoch 8/10, Loss: 1.4093
Epoch 9/10, Loss: 1.3440
Epoch 10/10, Loss: 1.2545
Test Accuracy: 0.5000


In [None]:
# Continue training the same `model` (its weights carry over from the previous
# cell) for 10 more epochs with a newly created Adam optimizer at lr=1e-4.
optimizer = optim.Adam(model.parameters(), lr=1e-4)
train_model(model, train_loader, criterion, optimizer, device, epochs=10)
accuracy = evaluate_model(model, test_loader, device)

Epoch 1/10, Loss: 0.8533
Epoch 2/10, Loss: 0.7868
Epoch 3/10, Loss: 0.7644
Epoch 4/10, Loss: 0.7440
Epoch 5/10, Loss: 0.7197
Epoch 6/10, Loss: 0.7051
Epoch 7/10, Loss: 0.6933
Epoch 8/10, Loss: 0.6741
Epoch 9/10, Loss: 0.6473
Epoch 10/10, Loss: 0.6320
Test Accuracy: 0.5750


In [None]:
# Third training round on the same `model`: 5 more epochs with a newly created
# Adam optimizer at lr=2e-4.
optimizer = optim.Adam(model.parameters(), lr=2e-4)
train_model(model, train_loader, criterion, optimizer, device, epochs=5)
accuracy = evaluate_model(model, test_loader, device)

Epoch 1/5, Loss: 0.5125
Epoch 2/5, Loss: 0.5236
Epoch 3/5, Loss: 0.4925
Epoch 4/5, Loss: 0.4239
Epoch 5/5, Loss: 0.3944
Test Accuracy: 0.6250


### Traditional CNN - FMA


In [None]:
# Split dataset into train and test sets for FMA
class NormalizeSpectrogram:
    """Callable transform that standardises one spectrogram tensor.

    The tensor's own mean is subtracted and the result is divided by the
    tensor's own standard deviation plus a small epsilon.
    """

    def __call__(self, sample):
        centred = sample - sample.mean()
        # The epsilon keeps the division finite for a constant-valued sample.
        spread = sample.std() + 1e-6
        return centred / spread

# Build the FMA dataset with per-sample normalisation applied on load.
transform = NormalizeSpectrogram()
dataset = GenreDatasetFMA(root_dir='/content/kaggle/working/mel_spectrograms', transform=transform)

# 80/20 split over sample indices with a fixed seed; both loaders share the
# one dataset object and differ only in the sampler that selects their subset.
train_indices, test_indices = train_test_split(list(range(len(dataset))), test_size=0.2, random_state=42)
train_sampler = torch.utils.data.SubsetRandomSampler(train_indices)
test_sampler = torch.utils.data.SubsetRandomSampler(test_indices)
# custom_collate handles the samples that the dataset could not load.
train_loader = DataLoader(dataset, batch_size=32, sampler=train_sampler, num_workers=2, collate_fn=custom_collate)
test_loader = DataLoader(dataset, batch_size=32, sampler=test_sampler, num_workers=2, collate_fn=custom_collate)


In [None]:
# Train and evaluate the plain CNN on FMA. The network is bound to
# `fma_cnn_model`, so the global `model` keeps pointing at the GTZAN CNN.
# NOTE(review): the output recorded below this cell reports 25 epochs, not 55;
# the cell was presumably edited after its last run -- confirm which value is
# intended before re-running.
EPOCHS = 55
# Initialize model, criterion, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): 8 is presumably the number of genre folders under the FMA
# root directory -- confirm against dataset.genre_to_label.
fma_cnn_model = ConvNeuralNetwork(num_classes=8).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(fma_cnn_model.parameters(), lr=5e-5)
train_model(fma_cnn_model, train_loader, criterion, optimizer, device, epochs=EPOCHS)
# evaluate_model prints the test accuracy.
accuracy = evaluate_model(fma_cnn_model, test_loader, device)


Epoch 1/25, Loss: 1.9342
Epoch 2/25, Loss: 1.8092
Epoch 3/25, Loss: 1.6857
Epoch 4/25, Loss: 1.6192
Epoch 5/25, Loss: 1.5767
Epoch 6/25, Loss: 1.5412
Epoch 7/25, Loss: 1.5187
Epoch 8/25, Loss: 1.5027
Epoch 9/25, Loss: 1.4738
Epoch 10/25, Loss: 1.4650
Epoch 11/25, Loss: 1.4455
Epoch 12/25, Loss: 1.4405
Epoch 13/25, Loss: 1.4398
Epoch 14/25, Loss: 1.4117
Epoch 15/25, Loss: 1.4063
Epoch 16/25, Loss: 1.4041
Epoch 17/25, Loss: 1.3832
Epoch 18/25, Loss: 1.3711
Epoch 19/25, Loss: 1.3529
Epoch 20/25, Loss: 1.3534
Epoch 21/25, Loss: 1.3373
Epoch 22/25, Loss: 1.3208
Epoch 23/25, Loss: 1.3087
Epoch 24/25, Loss: 1.3010
Epoch 25/25, Loss: 1.2927
Test Accuracy: 0.4688


In [None]:
# Continue training the same `fma_cnn_model` for 5 more epochs with a newly
# created Adam optimizer at the higher learning rate of 1e-3.
optimizer = optim.Adam(fma_cnn_model.parameters(), lr=1e-3)
train_model(fma_cnn_model, train_loader, criterion, optimizer, device, epochs=5)
accuracy = evaluate_model(fma_cnn_model, test_loader, device)

Epoch 1/5, Loss: 1.5855
Epoch 2/5, Loss: 1.3792
Epoch 3/5, Loss: 1.2639
Epoch 4/5, Loss: 1.1623
Epoch 5/5, Loss: 1.0031
Test Accuracy: 0.5027


In [None]:
# Save only the weights (state_dict) of the FMA-trained CNN. That network is
# bound to `fma_cnn_model`; the global `model` still refers to the 10-class
# GTZAN CNN, so saving `model` here would write the wrong network under the
# FMA file name.
save_path = "cnn_genre_classifier_fma.pth"
torch.save(fma_cnn_model.state_dict(), save_path)