In [1]:
import torch
from sklearn.model_selection import train_test_split
from models.AST import AST
import os
import librosa
import numpy as np
from torchinfo import summary
from torch import nn
from torch.optim import Adam
from Utils import create_dataloader, k_fold_cross_validation
from torcheval.metrics import MulticlassAUROC, MulticlassF1Score

# limit GPU usage

In [2]:
torch.cuda.set_per_process_memory_fraction(0.625)

# Read Data 

In [3]:
root = '../Data/genres_original'
genres = os.listdir(root)
x = []
y = []
length = []
sr = 16*1000
for genre in genres:
    genre_root = os.path.join(root, genre)
    audios = os.listdir(genre_root)
    for audio in audios:
        audio_path = os.path.join(genre_root, audio)
        signal, sr = librosa.load(audio_path, sr=sr)
        x.append(signal)
        length.append(len(signal))
        y.append(genres.index(genre))
min_length = min(length)
print("finsh reading data")

finsh reading data


# Conversion, Compression and Normalisation

In [4]:
top_db = 80
for i in range(len(x)):
    signal = x[i][:min_length]
    mel_spect = librosa.feature.melspectrogram(y=signal,sr=sr,n_fft=1024) # convert signals to mel spectrogram
    mel_spect = librosa.power_to_db(mel_spect, ref=np.max, top_db=top_db) # log compression
    x[i] = mel_spect/-top_db # normalisation
print("finish conversion and compression")

finish conversion and compression


# Split Data

In [5]:
x = np.asarray(x)
x = x.transpose((0,2,1))
y = np.asarray(y)
print(x.shape)

(1000, 936, 128)


In [6]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                    stratify=y,shuffle=True)
# k-fold cross validation
k = 5
xs_train, ys_train, xs_valid, ys_valid = k_fold_cross_validation(x_train,y_train,k)
print("finish splitting data")

finish splitting data


In [7]:
batch_size = 16
dataloaders_train = []
dataloaders_valid = []
for i in range(k):
    dataloaders_train.append(create_dataloader(xs_train[i], ys_train[i], batch_size=batch_size))
    dataloaders_valid.append(create_dataloader(xs_valid[i], ys_valid[i], batch_size=batch_size))
dataloader_test = create_dataloader(x_test, y_test, batch_size=batch_size)
print("finish creating dataloaders")

finish creating dataloaders


In [8]:
model = AST(10,input_fdim=x.shape[2],input_tdim=x.shape[1])
model.cuda()
loss_function = nn.CrossEntropyLoss()
opt = Adam(model.parameters(), lr=1e-5)
summary(model,[(batch_size,x.shape[1],x.shape[2])])

frequncey stride=10, time stride=10
number of patches=1116


  x = F.scaled_dot_product_attention(


Layer (type:depth-idx)                        Output Shape              Param #
AST                                           [16, 10]                  2,398,160
├─VisionTransformerDistilled: 1-3             --                        (recursive)
│    └─ASTPatchEmbed: 2-1                     [16, 1116, 768]           --
│    │    └─Conv2d: 3-1                       [16, 768, 12, 93]         197,376
│    └─Dropout: 2-2                           [16, 1118, 768]           --
├─ModuleList: 1-2                             --                        --
│    └─Sequential: 2-3                        [16, 1118, 768]           --
│    │    └─Block: 3-2                        [16, 1118, 768]           7,087,872
│    │    └─Dropout: 3-3                      [16, 1118, 768]           --
│    └─Sequential: 2-4                        [16, 1118, 768]           --
│    │    └─Block: 3-4                        [16, 1118, 768]           7,087,872
│    │    └─Dropout: 3-5                      [16, 1118, 768

In [9]:
for i in range(20):
    print("-------epoch  {} -------".format(i + 1))
    epoch_loss = 0
    epoch_accuracy = 0
    for j in range(k):
        print(f'fold {j+1}:')
        loss_train = 0
        accuracy_train = 0
        train_size = 0
        for batch_idx, (data, target) in enumerate(dataloaders_train[j]):
            model.train()
            output = model(data)
            loss = loss_function(output, target)
            opt.zero_grad()
            loss.backward()
            opt.step()
            loss_train += loss.item()*len(data)
            accuracy = (output.argmax(1) == target).sum()
            accuracy_train += accuracy
            train_size += len(data)
        print("train set loss: {}".format(loss_train/train_size))
        print("train set accuracy: {}".format(accuracy_train /train_size))

        loss_valid = 0
        accuracy_valid = 0
        valid_size = 0
        for batch_idx, (data, target) in enumerate(dataloaders_valid[j]):
            model.eval()
            with torch.no_grad():
                output = model(data)
                loss = loss_function(output, target)
                loss_valid += loss.item()*len(data)
                accuracy = (output.argmax(1) == target).sum()
                accuracy_valid += accuracy
                valid_size += len(data)
        print("valid set loss: {}".format(loss_valid/valid_size))
        print("valid set accuracy: {}".format(accuracy_valid/valid_size))
        epoch_loss += loss_valid/valid_size
        epoch_accuracy += accuracy_valid/valid_size
    print("epoch loss: {}".format(epoch_loss/k))
    print("epoch accuracy: {}".format(epoch_accuracy/k))
print("finish training")

-------epoch  1 -------
fold 1:
train set loss: 2.31796875
train set accuracy: 0.09687500447034836
valid set loss: 2.374609375
valid set accuracy: 0.08125000447034836
fold 2:
train set loss: 2.318603515625
train set accuracy: 0.10000000149011612
valid set loss: 2.3205078125
valid set accuracy: 0.11250000447034836
fold 3:
train set loss: 2.31259765625
train set accuracy: 0.11562500149011612
valid set loss: 2.32109375
valid set accuracy: 0.10000000149011612
fold 4:
train set loss: 2.31669921875
train set accuracy: 0.10312499850988388
valid set loss: 2.2873046875
valid set accuracy: 0.13125000894069672
fold 5:
train set loss: 2.303955078125
train set accuracy: 0.09218750149011612
valid set loss: 2.321875
valid set accuracy: 0.11874999850988388
epoch loss: 2.325078125
epoch accuracy: 0.10875000059604645
-------epoch  2 -------
fold 1:
train set loss: 2.2982421875
train set accuracy: 0.12812499701976776
valid set loss: 2.3455078125
valid set accuracy: 0.08749999850988388
fold 2:
train set l

In [10]:
loss_test = 0
accuracy_test = 0
AUC_test = 0
f1_score_test = 0
test_size = 0
for batch_idx, (data, target) in enumerate(dataloader_test):
    model.eval()
    with torch.no_grad():
        output = model(data)
        loss = loss_function(output, target)
        loss_test += loss.item()*len(data)
        accuracy = (output.argmax(1) == target).sum()
        accuracy_test += accuracy
        test_size += len(data)
        auc = MulticlassAUROC(num_classes=10)
        auc.update(output, target)
        AUC_test += auc.compute()*len(data)
        f1 = MulticlassF1Score(num_classes=10)
        f1.update(output,target)
        f1_score_test += f1.compute()*len(data)        
print("test set loss: {}".format(loss_test/test_size))
print("test set accuracy: {}".format(accuracy_test/test_size))
print("test set AUC: {}".format(AUC_test/test_size))
print("test set f1-score: {}".format(f1_score_test/test_size))

test set loss: 2.04
test set accuracy: 0.3050000071525574
test set AUC: 0.7442171573638916
test set f1-score: 0.3050000071525574
