In [1]:
import torch
from sklearn.model_selection import train_test_split
from models.ResNet18 import ResNet18
import os
import librosa
import numpy as np
from torchinfo import summary
from torch import nn
from torch.optim import Adam
from Utils import create_dataloader, k_fold_cross_validation
from torcheval.metrics import MulticlassAUROC, MulticlassF1Score

# limit GPU usage

In [2]:
torch.cuda.set_per_process_memory_fraction(0.625)

# Read Data 

In [3]:
root = '../Data/genres_original'
genres = os.listdir(root)
x = []
y = []
length = []
sr = 16*1000
for genre in genres:
    genre_root = os.path.join(root, genre)
    audios = os.listdir(genre_root)
    for audio in audios:
        audio_path = os.path.join(genre_root, audio)
        signal, sr = librosa.load(audio_path, sr=sr)
        x.append(signal)
        length.append(len(signal))
        y.append(genres.index(genre))
min_length = min(length)
print("finsh reading data")

finsh reading data


# Conversion and Compression

In [4]:
for i in range(len(x)):
    signal = x[i][:min_length]
    mel_spect = librosa.feature.melspectrogram(y=signal,sr=sr) # convert signals to mel spectrogram
    mel_spect = librosa.power_to_db(mel_spect, ref=np.max) # log compression
    x[i] = mel_spect
print("finish conversion and compression")

finish conversion and compression


# Split Data

In [5]:
x = np.asarray(x)
x = x.transpose((0,2,1))
x = x.reshape(x.shape[0],1,x.shape[1],x.shape[2])
y = np.asarray(y)

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                    stratify=y,shuffle=True)
# k-fold cross validation
k = 5
xs_train, ys_train, xs_valid, ys_valid = k_fold_cross_validation(x_train,y_train,k)
print("finish splitting data")

finish splitting data


In [7]:
batch_size = 16
dataloaders_train = []
dataloaders_valid = []
for i in range(k):
    dataloaders_train.append(create_dataloader(xs_train[i], ys_train[i], batch_size=batch_size))
    dataloaders_valid.append(create_dataloader(xs_valid[i], ys_valid[i], batch_size=batch_size))
dataloader_test = create_dataloader(x_test, y_test, batch_size=batch_size)
print("finish creating dataloaders")

finish creating dataloaders


# Model Construction

In [8]:
model = ResNet18(10,in_channels=1)
model.cuda()
loss_function = nn.CrossEntropyLoss()
opt = Adam(model.parameters(), lr=0.01)
summary(model,[(16,1,x.shape[2],x.shape[3])])

Layer (type:depth-idx)                   Output Shape              Param #
ResNet18                                 [16, 10]                  --
├─Sequential: 1-1                        [16, 64, 234, 32]         --
│    └─Conv2d: 2-1                       [16, 64, 467, 63]         3,200
│    └─BatchNorm2d: 2-2                  [16, 64, 467, 63]         128
│    └─ReLU: 2-3                         [16, 64, 467, 63]         --
│    └─MaxPool2d: 2-4                    [16, 64, 234, 32]         --
├─ResidualBlock: 1-2                     [16, 64, 234, 32]         --
│    └─Conv2d: 2-5                       [16, 64, 234, 32]         36,928
│    └─BatchNorm2d: 2-6                  [16, 64, 234, 32]         128
│    └─ReLU: 2-7                         [16, 64, 234, 32]         --
│    └─Conv2d: 2-8                       [16, 64, 234, 32]         36,928
│    └─BatchNorm2d: 2-9                  [16, 64, 234, 32]         128
├─ResidualBlock: 1-3                     [16, 64, 234, 32]         --
│

# Train

In [9]:
for i in range(20):
    print("-------epoch  {} -------".format(i + 1))
    for j in range(k):
        print(f'fold {j+1}:')
        loss_train = 0
        accuracy_train = 0
        train_size = 0
        for batch_idx, (data, target) in enumerate(dataloaders_train[j]):
            model.train()
            output = model(data)
            loss = loss_function(output, target)
            opt.zero_grad()
            loss.backward()
            opt.step()
            loss_train += loss.item()*len(data)
            accuracy = (output.argmax(1) == target).sum()
            accuracy_train += accuracy
            train_size += len(data)
        print("train set loss: {}".format(loss_train/train_size))
        print("train set accuracy: {}".format(accuracy_train /train_size))

        loss_valid = 0
        accuracy_valid = 0
        valid_size = 0
        for batch_idx, (data, target) in enumerate(dataloaders_valid[j]):
            model.eval()
            with torch.no_grad():
                output = model(data)
                loss = loss_function(output, target)
                loss_valid += loss.item()*len(data)
                accuracy = (output.argmax(1) == target).sum()
                accuracy_valid += accuracy
                valid_size += len(data)
        print("valid set loss: {}".format(loss_valid/valid_size))
        print("valid set accuracy: {}".format(accuracy_valid/valid_size))
print("finish training")

-------epoch  1 -------
fold 1:
train set loss: 2.2375601828098297
train set accuracy: 0.15625
valid set loss: 2.3207841873168946
valid set accuracy: 0.08125000447034836
fold 2:
train set loss: 2.1745291352272034
train set accuracy: 0.28593751788139343
valid set loss: 2.2214985609054567
valid set accuracy: 0.20624999701976776
fold 3:
train set loss: 2.107090723514557
train set accuracy: 0.3359375
valid set loss: 2.30118305683136
valid set accuracy: 0.08749999850988388
fold 4:
train set loss: 2.064554062485695
train set accuracy: 0.37187501788139343
valid set loss: 2.337942290306091
valid set accuracy: 0.16875000298023224
fold 5:
train set loss: 2.023384374380112
train set accuracy: 0.38593751192092896
valid set loss: 2.3215286254882814
valid set accuracy: 0.17499999701976776
-------epoch  2 -------
fold 1:
train set loss: 2.005623322725296
train set accuracy: 0.4046874940395355
valid set loss: 2.295549178123474
valid set accuracy: 0.125
fold 2:
train set loss: 1.9922253221273423
train 

# Test

In [10]:
loss_test = 0
accuracy_test = 0
AUC_test = 0
f1_score_test = 0
test_size = 0
for batch_idx, (data, target) in enumerate(dataloader_test):
    model.eval()
    with torch.no_grad():
        output = model(data)
        loss = loss_function(output, target)
        loss_test += loss.item()*len(data)
        accuracy = (output.argmax(1) == target).sum()
        accuracy_test += accuracy
        test_size += len(data)
        auc = MulticlassAUROC(num_classes=10)
        auc.update(output, target)
        AUC_test += auc.compute()*len(data)
        f1 = MulticlassF1Score(num_classes=10)
        f1.update(output,target)
        f1_score_test += f1.compute()*len(data)        
print("test set loss: {}".format(loss_test/test_size))
print("test set accuracy: {}".format(accuracy_test/test_size))
print("test set AUC: {}".format(AUC_test/test_size))
print("test set f1-score: {}".format(f1_score_test/test_size))

test set loss: 1.8205833005905152
test set accuracy: 0.5099999904632568
test set AUC: 0.8730062246322632
test set f1-score: 0.5099999904632568
