In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['data']


### Ομοίως με πριν, φορτώνουμε τα δεδομένα μας σε Data Loaders.

In [2]:
import copy
import gzip
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset
from torch.utils.data import SubsetRandomSampler, DataLoader

class_mapping = {
    'Rock': 'Rock',
    'Psych-Rock': 'Rock',
    'Indie-Rock': None,
    'Post-Rock': 'Rock',
    'Psych-Folk': 'Folk',
    'Folk': 'Folk',
    'Metal': 'Metal',
    'Punk': 'Metal',
    'Post-Punk': None,
    'Trip-Hop': 'Trip-Hop',
    'Pop': 'Pop',
    'Electronic': 'Electronic',
    'Hip-Hop': 'Hip-Hop',
    'Classical': 'Classical',
    'Blues': 'Blues',
    'Chiptune': 'Electronic',
    'Jazz': 'Jazz',
    'Soundtrack': None,
    'International': None,
    'Old-Time': None
}


def torch_train_val_split(
        dataset, batch_train, batch_eval,
        val_size=.2, shuffle=True, seed=17):
    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    val_split = int(np.floor(val_size * dataset_size))
    if shuffle:
        np.random.seed(seed)
        np.random.shuffle(indices)
    train_indices = indices[val_split:]
    val_indices = indices[:val_split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    val_sampler = SubsetRandomSampler(val_indices)

    train_loader = DataLoader(dataset,
                              batch_size=batch_train,
                              sampler=train_sampler)
    val_loader = DataLoader(dataset,
                            batch_size=batch_eval,
                            sampler=val_sampler)
    return train_loader, val_loader


def read_spectrogram(spectrogram_file, chroma=True):
    with gzip.GzipFile(spectrogram_file, 'r') as f:
        spectrograms = np.load(f)
    # spectrograms contains a fused mel spectrogram and chromagram
    # Decompose as follows
    return spectrograms.T


class LabelTransformer(LabelEncoder):
    def inverse(self, y):
        try:
            return super(LabelTransformer, self).inverse_transform(y)
        except:
            return super(LabelTransformer, self).inverse_transform([y])

    def transform(self, y):
        try:
            return super(LabelTransformer, self).transform(y)
        except:
            return super(LabelTransformer, self).transform([y])

        
class PaddingTransform(object):
    def __init__(self, max_length, padding_value=0):
        self.max_length = max_length
        self.padding_value = padding_value

    def __call__(self, s):
        if len(s) == self.max_length:
            return s

        if len(s) > self.max_length:
            return s[:self.max_length]

        if len(s) < self.max_length:
            s1 = copy.deepcopy(s)
            pad = np.zeros((self.max_length - s.shape[0], s.shape[1]), dtype=np.float32)
            s1 = np.vstack((s1, pad))
            return s1

        
class SpectrogramDataset(Dataset):
    def __init__(self, path, class_mapping=None, train=True, max_length=-1):
        t = 'train' if train else 'test'
        p = os.path.join(path, t)
        self.index = os.path.join(path, "{}_labels.txt".format(t))
        self.files, labels = self.get_files_labels(self.index, class_mapping)
        self.feats = [read_spectrogram(os.path.join(p, f)) for f in self.files]
        self.feat_dim = self.feats[0].shape[1]
        self.lengths = [len(i) for i in self.feats]
        self.max_length = max(self.lengths) if max_length <= 0 else max_length
        self.zero_pad_and_stack = PaddingTransform(self.max_length)
        self.label_transformer = LabelTransformer()
        if isinstance(labels, (list, tuple)):
            self.labels = np.array(self.label_transformer.fit_transform(labels)).astype('int64')

    def get_files_labels(self, txt, class_mapping):
        with open(txt, 'r') as fd:
            lines = [l.rstrip().split('\t') for l in fd.readlines()[1:]]
        files, labels = [], []
        for l in lines:
            label = l[1]
            if class_mapping:
                label = class_mapping[l[1]]
            if not label:
                continue
            files.append(l[0])
            labels.append(label)
        return files, labels

    def __getitem__(self, item):
        l = min(self.lengths[item], self.max_length)
        return self.zero_pad_and_stack(self.feats[item]), self.labels[item], l

    def __len__(self):
        return len(self.labels)
      
if __name__ == '__main__':
    specs = SpectrogramDataset('../input/data/data/fma_genre_spectrograms_beat', train=True, class_mapping=class_mapping, max_length=-1)
    train_loader, val_loader = torch_train_val_split(specs, 32 ,32, val_size=.33)
    test_loader = DataLoader(SpectrogramDataset('../input/data/data/fma_genre_spectrograms_beat', train=False, class_mapping=class_mapping, max_length=-1), 32)


## **Ορισμός CNN model** 
### Αποτελείται από 4 layers για την εξαγωγή χαρακτηριστικών (2d Convolution, Batch Normalisation, ReLU activation function & 2d Max Pooling) και από ένα linear layer-dropout- linear layer για το classification.

In [3]:
import torch.nn as nn
import torch
torch.set_default_tensor_type(torch.DoubleTensor)

class BasicCNN(nn.Module):
    def __init__(self, input_dim, output_dim, channels=[12, 24, 48, 96], kernels=[3, 3, 3, 3]):
        super(BasicCNN, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        
        self.layer1 = nn.Sequential(nn.Conv2d(in_channels=self.input_dim, out_channels=channels[0], kernel_size=kernels[0]),
                                    nn.BatchNorm2d(channels[0]),
                                    nn.ReLU(inplace=True),
                                    nn.MaxPool2d(kernel_size=kernels[0]))
        self.layer2 = nn.Sequential(nn.Conv2d(in_channels=channels[0], out_channels=channels[1], kernel_size=kernels[1]),
                                    nn.BatchNorm2d(channels[1]),
                                    nn.ReLU(inplace=True),
                                    nn.MaxPool2d(kernel_size=kernels[1]))
        self.layer3 = nn.Sequential(nn.Conv2d(in_channels=channels[1], out_channels=channels[2], kernel_size=kernels[2]),
                                    nn.BatchNorm2d(channels[2]),
                                    nn.ReLU(inplace=True),
                                    nn.MaxPool2d(kernel_size=kernels[2]))
        self.layer4 = nn.Sequential(nn.Conv2d(in_channels=channels[2], out_channels=channels[3], kernel_size=kernels[3]),
                                    nn.BatchNorm2d(channels[3]),
                                    nn.ReLU(inplace=True),
                                    nn.MaxPool2d(kernel_size=kernels[3]))
        self.linear = nn.Sequential(nn.Linear(channels[3], channels[3]),
                                    nn.Dropout(inplace=True),
                                    nn.Linear(channels[3], self.output_dim))
    
    def forward(self, x):
        x = x.view(x.shape[0], self.input_dim, x.shape[2], x.shape[1])
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = x.view(x.shape[0], -1)
        x = self.linear(x)
        
        return x

## Εκπαίδευση και αξιολόγηση μοντέλου

In [4]:
def accuracy(model, data_loader):
    model.eval()
    correct, total = 0, 0
    for feats, labels, lens in data_loader:
        with torch.no_grad():
            scores = model(feats)
            prediction = scores.argmax(dim=1)
            correct += (prediction == labels).sum().item()
            total += feats.shape[0]
    return 100 * correct / total

model = BasicCNN(input_dim=1, output_dim=10)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0007)

for epoch in range(20):
    if epoch == 5:
        optimizer = torch.optim.Adam(model.parameters(), lr=0.0001) # fine tuning
    model.train()
    for feats, labels, lens in (train_loader):
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()
        optimizer.zero_grad()

        # Step 3. Run our forward pass.
        pred_labels = model(feats)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(pred_labels, labels)
        #print("Loss is: " + str(loss))
        loss.backward()
        optimizer.step()
    print("Epoch " + str(epoch) + " | Accuracy (on validation set) is: " + str(accuracy(model, val_loader)))

Epoch 0 | Accuracy (on validation set) is: 21.44736842105263
Epoch 1 | Accuracy (on validation set) is: 29.736842105263158
Epoch 2 | Accuracy (on validation set) is: 29.342105263157894
Epoch 3 | Accuracy (on validation set) is: 37.63157894736842
Epoch 4 | Accuracy (on validation set) is: 21.842105263157894
Epoch 5 | Accuracy (on validation set) is: 38.421052631578945
Epoch 6 | Accuracy (on validation set) is: 38.026315789473685
Epoch 7 | Accuracy (on validation set) is: 38.55263157894737
Epoch 8 | Accuracy (on validation set) is: 38.68421052631579
Epoch 9 | Accuracy (on validation set) is: 40.526315789473685
Epoch 10 | Accuracy (on validation set) is: 39.21052631578947
Epoch 11 | Accuracy (on validation set) is: 38.1578947368421
Epoch 12 | Accuracy (on validation set) is: 40.13157894736842
Epoch 13 | Accuracy (on validation set) is: 39.60526315789474
Epoch 14 | Accuracy (on validation set) is: 39.473684210526315
Epoch 15 | Accuracy (on validation set) is: 38.421052631578945
Epoch 16 | 

### Ξεκινάμε αρχικά με σχετικά μεγάλο learning rate, αλλά μετά από ένα μικρό αριθμό εποχών το χαμηλώνουμε. Με 20 εποχές πετυχαίνουμε σημαντική βελτιώση στο validation set (σε σχέση με το LSTM, περί το 40%), ωστόσο, όπως βλέπουμε παρακάτω, τα αποτελέσματα στο training set είναι χειρότερα απ' τα αναμενόμενα.

In [5]:
print("Accuracy on test set: " + str(accuracy(model, test_loader)))

Accuracy on test set: 14.08695652173913
