# music genre classification

In [None]:
import os
import pandas as pd
import numpy as np
import h5py
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

# Mount Google Drive at /content/drive so the dataset archive stored there can
# be read by later cells. `google.colab` is a Colab-specific module.
from google.colab import drive
drive.mount('/content/drive')

# Use the GPU when PyTorch reports one as available; otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'device: {device}')

Mounted at /content/drive
device: cuda


In [None]:
# Location of the zipped dataset on the mounted Drive.
spectrograms_path = '/content/drive/MyDrive/sound_classifier/dataset.zip'

# Copy the archive into the current working directory, then extract it there
# (-q suppresses unzip's per-file listing).
!cp '{spectrograms_path}' .
!unzip -q dataset.zip

## dataset & data loader

In [None]:
# dataset
class GTZAN_Dataset(Dataset):
    """Dataset of saved spectrogram tensors listed in a CSV index.

    Each row of the CSV is read positionally: column 0 is a file path relative
    to ``dir`` that is loaded with ``torch.load``, and column 1 is the label.
    dtype hints are supplied for columns named ``file`` (str) and ``category``
    (int).

    Args:
        dataset: Path of the CSV index file.
        dir: Directory that the file paths in the CSV are joined onto.
        transform: When truthy, each loaded tensor is converted with
            ``split_complex_numbers``. When it is additionally a callable, it
            is applied to the converted tensor (``True`` keeps the plain
            conversion only).
        target_transform: Optional callable applied to the label.
    """

    def __init__(self, dataset, dir, transform=None, target_transform=None):
        self.labels = pd.read_csv(dataset, dtype={'file':str, 'category':int})
        self.dir = dir
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Return the number of rows in the CSV index."""
        return len(self.labels)

    def split_complex_numbers(self, x):
        """Turn the first entry of a complex tensor into a 2-channel real tensor.

        Args:
            x: Complex tensor; only ``x[0]`` is used.

        Returns:
            Tensor of shape ``(2, *x[0].shape)`` in the default dtype, where
            channel 0 holds the real part and channel 1 the imaginary part.
        """
        plane = x[0]
        # Size the buffer from the input instead of a fixed 258x258 so that
        # spectrograms of any size can be converted.
        y = torch.empty((2, *plane.shape))
        y[0], y[1] = plane.real, plane.imag
        return y

    def __getitem__(self, idx):
        """Load the ``idx``-th spectrogram and its label.

        Returns:
            ``(spec, label)`` after the configured transforms were applied.
        """
        filepath = os.path.join(self.dir, self.labels.iloc[idx,0])
        # NOTE(review): torch.load unpickles the file; only point this dataset
        # at files from a trusted source.
        spec = torch.load(filepath)
        label = self.labels.iloc[idx,1]

        if self.transform:
            spec = self.split_complex_numbers(spec)
            # A callable transform used to be silently ignored; apply it so
            # that it behaves like target_transform does for labels.
            if callable(self.transform):
                spec = self.transform(spec)
        if self.target_transform:
            label = self.target_transform(label)

        return spec, label


# data loader
# CSV index files and the tensors they reference are resolved relative to root_path.
root_path = '/content'
gtzan_trn = GTZAN_Dataset('dataset_files_train.csv', root_path, transform=True)
gtzan_tst = GTZAN_Dataset('dataset_files_test.csv', root_path, transform=True)
# Training batches are reshuffled every epoch.
trn_dataloader = DataLoader(gtzan_trn, batch_size=64, shuffle=True)
# Evaluation gains nothing from shuffling, and the reported average loss is a
# mean of per-batch means, so a fixed order keeps it reproducible between runs.
tst_dataloader = DataLoader(gtzan_tst, batch_size=64, shuffle=False)

## model

In [None]:
class CNN(nn.Module):
    """Four conv -> tanh -> max-pool stages followed by a two-layer linear head.

    The flattened feature size of 12*12*128 expected by the head corresponds
    to a 2-channel 258x258 input; the size comments below assume that input.
    The network outputs 10 values per sample.
    """

    def __init__(self):
        super().__init__()

        # (in_channels, out_channels, kernel_size) for each stage, with the
        # spatial size after the convolution and after the 2x2 pooling.
        stage_specs = (
            (2, 16, 7),     # 252 -> 126
            (16, 32, 5),    # 122 -> 61
            (32, 64, 6),    # 56 -> 28
            (64, 128, 5),   # 24 -> 12
        )

        feature_modules = []
        for in_ch, out_ch, kernel in stage_specs:
            feature_modules.extend([
                nn.Conv2d(in_ch, out_ch, kernel_size=kernel, stride=1),
                nn.Tanh(),
                nn.MaxPool2d(2),
            ])
        # Collapse (channels, height, width) into one feature vector per sample.
        feature_modules.append(nn.Flatten())
        self.cnn_layers = nn.Sequential(*feature_modules)

        # NOTE(review): there is no activation between these two Linear
        # layers, so together they amount to a single affine map.
        self.linear_layers = nn.Sequential(
            nn.Linear(12*12*128, 2048),
            nn.Linear(2048, 10)
        )

    def forward(self, x):
        """Map a batch of inputs to a batch of 10-dimensional outputs."""
        features = self.cnn_layers(x)
        flat = features.view(features.size(0), -1)
        return self.linear_layers(flat)

In [None]:
# Instantiate the network and print its layer structure.
model = CNN()
print(model)

CNN(
  (cnn_layers): Sequential(
    (0): Conv2d(2, 16, kernel_size=(7, 7), stride=(1, 1))
    (1): Tanh()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(16, 32, kernel_size=(5, 5), stride=(1, 1))
    (4): Tanh()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(32, 64, kernel_size=(6, 6), stride=(1, 1))
    (7): Tanh()
    (8): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (9): Conv2d(64, 128, kernel_size=(5, 5), stride=(1, 1))
    (10): Tanh()
    (11): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (12): Flatten(start_dim=1, end_dim=-1)
  )
  (linear_layers): Sequential(
    (0): Linear(in_features=18432, out_features=2048, bias=True)
    (1): Linear(in_features=2048, out_features=10, bias=True)
  )
)


In [None]:
# Step size passed to the SGD optimizer below.
learning_rate = 1e-4

# Move the model to the selected device before building the optimizer over
# its parameters.
model = model.to(device)
# CrossEntropyLoss is applied directly to the model's raw output scores.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [None]:
def train_loop(dataloader, model, loss_fn, optimizer):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        dataloader: Yields ``(X, y)`` batches and exposes ``.dataset`` (its
            length is used only for the progress message).
        model: Module to train; it is switched to training mode.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar loss tensor.
        optimizer: Optimizer that updates ``model``'s parameters.

    Prints the loss of every 100th batch (starting with the first).
    """
    size = len(dataloader.dataset)

    # Send batches to wherever the model's parameters live; only fall back to
    # the notebook-level `device` for a model without parameters.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else device

    # Make sure train-time behaviour (e.g. dropout, batch norm statistics) is
    # active even if an earlier evaluation left the model in eval mode.
    model.train()

    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(run_device), y.to(run_device)
        pred = model(X)
        loss = loss_fn(pred, y)

        # Clear gradients from the previous step before accumulating new ones.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss_value, current = loss.item(), batch * len(X)
            print(f"loss: {loss_value:>7f}  [{current:>5d}/{size:>5d}]")


def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [None]:
# Run 300 epochs; each epoch is one training pass over the training loader
# followed by an evaluation pass over the test loader.
for t in range(300):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(trn_dataloader, model, loss_fn, optimizer)
    test_loop(tst_dataloader, model, loss_fn)
print("Done!")

In [None]:
# Save the trained model to the mounted Drive. Absolute paths are used, as
# elsewhere in this notebook, so the save does not depend on the current
# working directory being /content.
# The first file holds only the parameters (state_dict); the second pickles
# the whole module object, which can only be loaded where the CNN class
# definition is available.
torch.save(model.state_dict(), '/content/drive/MyDrive/sound_classifier/nn_classifier_statedict.pt')
torch.save(model, '/content/drive/MyDrive/sound_classifier/nn_classifier_full.pt')