In [1]:
import torch
from copy import deepcopy
from torch import nn
from sklearn.metrics import confusion_matrix
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam, SGD, AdamW
import numpy as np
import torchaudio
from IPython.display import Audio
import matplotlib.pyplot as plt
import os
import torchvision.transforms as vtransforms
import deepfool
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

In [2]:
device = "cuda"
directory = "DIRECTORY HERE"   ## This must include the parent directory of folds
csv_file = np.genfromtxt("UrbanSound8K.csv",delimiter=",",dtype=str)  ## csv file must be in the present directory
inds = []
for i in range(1,11):
    inds += [ np.where(csv_file[:,5] == f"{i}")[0] ]

In [3]:
def load_file(i): # /media/carlos/5E48D2B648D28BE1/Users/pedro/Desktop/audio/
    file,fold = csv_file[i,0],csv_file[i,5]
    return torchaudio.load(directory + f"fold{fold}/{file}")

## PRE-PROCESS
def mono_channel_equal_size(wave,new_freq):
    waveform = wave.mean(0)
    waveform = np.resize(waveform,4 * new_freq) ## resize to 4 seconds. some do not have 44100 sampling rate and will have less time.
    return torch.from_numpy(waveform).to(device)

In [4]:
class Fold_Set(Dataset):
    def __init__(self,fold, transform=None, shorten_factor = 15):
        super().__init__()
        self.fold = fold
        self.dir = directory+f"fold{fold}/"
        self.inds_fold = inds[fold-1]
        self.new_freq = 44100 // shorten_factor
        self.shorten_factor = shorten_factor
        
    def __len__(self):
        return len(self.inds_fold)
    def __getitem__(self,i):
        label = csv_file[self.inds_fold[i],6]
        audio,sample_rate = load_file(self.inds_fold[i])
        resampler = torchaudio.transforms.Resample(sample_rate,self.new_freq)
        audio_preprocessed = resampler(audio)
        audio_preprocessed = mono_channel_equal_size(audio_preprocessed, self.new_freq)
        audio_preprocessed /= audio_preprocessed.max()
        
        spectre = torchaudio.transforms.MelSpectrogram(sample_rate = self.new_freq, 
                                                       normalized = True, n_fft = 16384 ,win_length = int(400e-3*self.new_freq), 
                                                       hop_length = int(31.5e-3*self.new_freq), n_mels = 128 ).to(device)
        audio_spectro_mel = torch.unsqueeze(spectre(audio_preprocessed), dim= 0)
        to_DB = torchaudio.transforms.AmplitudeToDB(top_db = 80)
        audio_spectro_mel = to_DB(audio_spectro_mel)
        return audio_spectro_mel , int(label)

In [5]:
folds = []
for i in range(10):
    folds += [Fold_Set(i+1,shorten_factor = 2)]

In [6]:
## SAVE MEL SPECROGRAMS TO AVOID REDUNDANT RECALCULATIONS
for j in range(10):
    for i in range(len(folds[j])):
        torch.save(folds[j][i],f"./preprocessed/fold{j+1}/sample{i}.pk")   ##in the present directory there must be preprocessed/fold1, etc... directories for the 10 folds

In [24]:
class Fold_PreprocessedSet(Dataset):
    def __init__(self,fold,aug = True): ## CHANGE DIR IN THE END
        super().__init__()
        self.fold = fold
        self.dir = lambda i: f"./preprocessed/fold{fold}/sample{i}.pk"
        self.transform = nn.Sequential(
            vtransforms.RandomApply(transforms=[torchaudio.transforms.FrequencyMasking(10),torchaudio.transforms.TimeMasking(30)]),
            vtransforms.RandomApply(transforms=[vtransforms.RandomRotation(5),vtransforms.RandomVerticalFlip() ,vtransforms.RandomHorizontalFlip()]),
            vtransforms.Normalize(.3,0.4)
                                        )
        self.transform_test = vtransforms.Normalize(.3,0.4)
        self.aug = aug
    def __len__(self):
        return len(inds[self.fold-1])
    def __getitem__(self,i):
        sample, label = torch.load(self.dir(i))
        if self.aug == False:
            return self.transform_test(sample),label
        return self.transform(sample),label

In [25]:
folds = [] # redefine this list with the proper preprocessed dataset
fold_loaders = [] # define the loaders for training
val_loaders = [] # loaders for validation
test_loaders = [] # loader for testing
batch_size = 64
def custom_collate(batch):
    a,b = torch.utils.data.default_collate(batch)
    return a,b.to(device)  ## collate function to use proper device at all times
for i in range(10):
    folds += [Fold_PreprocessedSet(i+1)]
for i in range(10):
    full_train = folds[(i+2) % 10]
    for j in range(3,10):
        full_train += folds[(i+j) % 10]
    fold_loaders += [DataLoader(full_train,batch_size = batch_size, shuffle = True, collate_fn = custom_collate)] 
    folds_i_val = Fold_PreprocessedSet( (i+1) % 10 + 1 , aug = False)  ## no augmentation in val/test
    folds_i_test = Fold_PreprocessedSet( i % 10 + 1 , aug = False)
    val_loaders += [DataLoader(folds_i_val,batch_size = batch_size, shuffle = True, collate_fn = custom_collate)]
    test_loaders += [DataLoader(folds_i_test,batch_size = batch_size, shuffle = True, collate_fn = custom_collate)]

In [26]:
class BasicBlock(nn.Module):
    def __init__(self,in_c,out_c,expansion,stride=1,downsample=None):
        
        super().__init__()
        self.downsample = downsample
        self.conv1 = nn.Conv2d(in_c,out_c,kernel_size = 3,stride = stride,padding = 1,bias = False)
        self.bn1 = nn.BatchNorm2d(out_c)
        self.relu = nn.ReLU(inplace = True)
        self.conv2 = nn.Conv2d(out_c,out_c*expansion,kernel_size = 3,padding = 1,bias = False)
        self.bn2 = nn.BatchNorm2d(out_c*expansion)

    def forward(self,x):
        Id = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.downsample:
            Id = self.downsample(x)
    
        out += Id
        out = self.relu(out)
        return out
        
        
class ResNet(nn.Module):
    def __init__(
        self, 
        block,
        img_channels: int = 1,
        num_classes: int  = 10
    ) -> None:
        super(ResNet, self).__init__()
            # The following `layers` list defines the number of `BasicBlock` 
            # to use to build the network and how many basic blocks to stack
            # together.
        layers = [5, 10]
        self.expansion = 1
        
        self.in_channels = 64
        
        self.conv1 = nn.Conv2d( ##(64,134,134)
            in_channels=img_channels,
            out_channels=self.in_channels,
            kernel_size=7, 
            stride=2,
            padding=3,
            bias=False
        )
        self.bn1 = nn.BatchNorm2d(self.in_channels)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) ## (64,68,68)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        # self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        # self.layer4 = self._make_layer(block, 512, layers[3], stride=2) # Disregard ResNet18 architecture
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(128*self.expansion, num_classes)
    def _make_layer(
        self, 
        block,
        out_channels: int,
        blocks: int,
        stride: int = 1
    ) -> nn.Sequential:
        downsample = None
        if stride != 1:
            downsample = nn.Sequential(
                nn.Conv2d(
                    self.in_channels, 
                    out_channels*self.expansion,
                    kernel_size=1,
                    stride=stride,
                    bias=False 
                ),
                nn.BatchNorm2d(out_channels * self.expansion),
            )
        layers = []
        layers.append(
            block(
                self.in_channels, out_channels, self.expansion, stride, downsample
            )
        )
        
        for i in range(1, blocks):
            self.in_channels = out_channels * self.expansion
            layers.append(block(
                self.in_channels,
                out_channels,
                expansion=self.expansion
            ))
        return nn.Sequential(*layers)
    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.layer1(x)
        x = self.layer2(x)
        # x = self.layer3(x)
        # x = self.layer4(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x

In [27]:
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        
        self.Bottom = nn.Sequential(
            nn.Linear(128*128 , 512, bias = False), ##Normalization nullifies any bias preceding it directly, therefore not needed
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Linear(512,256, bias = False),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Linear(256,128, bias = False),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Linear(128,128, bias = False),
            nn.BatchNorm1d(128),
            nn.Tanh(),
            nn.Linear(128,256, bias = False),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Linear(256,128, bias = False),
            nn.BatchNorm1d(128),
            nn.ReLU(),

            
            nn.Linear(128,10),
            nn.ReLU()
                            )
    def forward(self,x):
        x = self.flatten(x)
        out = self.Bottom ( x  )
        return out

In [37]:
loss = nn.CrossEntropyLoss(label_smoothing = 5e-3)
ConvModel = ResNet(BasicBlock).to(device)
MLP_Model = MLP().to(device)

In [38]:
optimizer_Conv = AdamW(ConvModel.parameters(),amsgrad=True,lr =1e-5,betas = (.99,.999),weight_decay=1e-1)
optimizer_MLP = AdamW(MLP_Model.parameters(),amsgrad=True,lr =1e-5,betas = (.99,.999),weight_decay=1e-1)

In [39]:
def train_NN(start,loader,loss_fn,optimizer,NNmodel, writer):
    torch.backends.cudnn.benchmark = True
    NNmodel.train() ##Training mode: important for Dropout
    size = len(loader.dataset)
    losses = []
    r_loss = 0
    epochs = [start*size]
    for i,(x,y) in enumerate(loader): ## will complete an Epoch

        # y = y.to(device)
        NetForward = NNmodel.forward(x)
        loss = loss_fn(NetForward,y)

        ## Keep track of training loss
        losses += [loss.item()]
        epochs += [epochs[i]+len(x)] #divide by size in the end

        r_loss += losses[i]
        
        if ( (i+1) % 5 == 0):
            avg_loss, r_loss = r_loss / 5, 0
            writer.add_scalar("Loss/Train",avg_loss,epochs[i+1])
        
        #Gradient descent with optimizer
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = ""
    return (losses, epochs[1:])

def test_loop(dataloader, model, loss_fn):

    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "pooled"
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0
    losses = []
    epochs = [0]

    with torch.no_grad():
        for i,(X, y) in enumerate(dataloader):
            
            pred = model(X)
            # y = y.to(device)
            loss = loss_fn(pred, y).item()
            test_loss += loss

            losses += [loss]
            epochs += [ epochs[i]+len(X)/size ]
            
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "" ##HELPS MEMORY ALLOCATION BY CUDA
    
    correct /= size
    print(f"Accuracy:{correct}, Loss: {test_loss}")
    return test_loss, correct

In [40]:
def conf_mat(test_fold,model):
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "pooled"
    model.eval()
    dataloader = test_loaders[test_fold]
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    y_pred,y_true = [],[]
    with torch.no_grad():
        for i,(X, y) in enumerate(dataloader):        
            pred = model(X)
            pred = pred.argmax(1).cpu().numpy() ##list of preds
            # y = y.to(device)
            y_pred.extend(pred)
            y_true.extend(y.cpu().numpy())
    return confusion_matrix(y_true,y_pred)
def test_robustness(test_fold,model):
    
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "pooled"
    dataset = test_loaders[test_fold].dataset
    size = len(dataset)
    running_robustness = 0.0

    for i in range(size):
        image,_ = dataset[i]
        perturbation,_,_,_,_ = deepfool.deepfool(image[None,:], model)
        image_norm = torch.linalg.norm(image)
        perturbation_norm = torch.linalg.norm(perturbation)
        running_robustness += perturbation_norm / image_norm
    return running_robustness/size

In [42]:
def train_cross_fold(test_fold,loss,optimizer,model,writer,writer_val):
    # test_fold with 0 indexing
    n_epochs_target = 12
    start_epoch = 0
    best_loss = 1e29
    
    val_size = len(folds[(test_fold +1) % 10])
    test_size = len(folds[test_fold])
    train_size = 8732 - val_size - test_size   ## total size - val_size - test_size
    losses = np.array([])
    epochs = np.array([0])
    epochs_val = np.array([])
    losses_val = np.array([])
    
    
    for i in range(n_epochs_target):
        losses_j, epochs_j = train_NN(i+start_epoch,fold_loaders[test_fold],loss,optimizer,model,writer)
        epochs = np.append(epochs,  np.array(epochs_j)/train_size )
        losses = np.append(losses, np.array(losses_j))
        val_loss,acc = test_loop(val_loaders[test_fold],model,loss)
        losses_val = np.append( losses_val , val_loss  )
        epochs_val = np.append( epochs_val, start_epoch+i+1 )
        writer_val.add_scalar("Loss/Train",losses_val[-1],epochs_val[-1]*train_size) ## Use tensorboard, if you want, but you can plot with matplotlib too
        if val_loss < best_loss: ##Choose model based on best val accuracy since weight decay is employed
            best_acc = acc
            best_loss = val_loss
            best_model = deepcopy(model)
    epochs = epochs[1:]

    print("TEST")
    test_loss,test_acc = test_loop(test_loaders[test_fold],best_model,loss)
    ConfM = conf_mat(test_fold,best_model)

    return test_loss,test_acc,best_loss,best_acc,ConfM,best_model


In [43]:
## CROSS FOLD VALIDATION FOR RESNET
val_loss_avg = 0
val_acc_avg = 0
test_loss_avg = 0
test_accs = torch.zeros(10)
rob_avg = 0
ConfM_avg = np.zeros((10,10),dtype = "float32")
for i in range(10):
    ConvModel = ResNet(BasicBlock).to(device)
    optimizer_Conv = AdamW(ConvModel.parameters(),amsgrad=True,lr =1e-5,betas = (.99,.999),weight_decay=1e-1)
    
    writer = SummaryWriter(f'runs/treinoConv_fold{i+1}')
    writer_val = SummaryWriter(f'runs/valConv_fold{i+1}')
    test_loss,test_acc,val_loss,val_acc,ConfM,best_model = train_cross_fold(i,loss,optimizer_Conv,ConvModel,writer,writer_val)
    
    rob_avg += test_robustness(i,best_model)
    ConfM_avg += ConfM
    test_loss_avg += test_loss
    val_loss_avg += val_loss
    test_accs[i] = test_acc
    val_acc_avg += val_acc

test_loss_avg /= 10
val_loss_avg /= 10
test_acc_avg = test_accs.mean()
test_acc_std = test_accs.std()
val_acc_avg /= 10
ConfM_avg /= 10
rob_avg /= 10

Accuracy:0.4144144144144144, Loss: 1.7198339785848344
Accuracy:0.5213963963963963, Loss: 1.5483811157090324
Accuracy:0.5833333333333334, Loss: 1.2985307318823678
Accuracy:0.6024774774774775, Loss: 1.222632212298257
Accuracy:0.6295045045045045, Loss: 1.194753987448556
Accuracy:0.6227477477477478, Loss: 1.208483099937439
Accuracy:0.634009009009009, Loss: 1.1604133588927132
Accuracy:0.6565315315315315, Loss: 1.146526962518692
Accuracy:0.6509009009009009, Loss: 1.061855388539178
Accuracy:0.661036036036036, Loss: 1.1493688992091589
Accuracy:0.6779279279279279, Loss: 1.059755619083132
Accuracy:0.6666666666666666, Loss: 0.9907143328871045
TEST
Accuracy:0.6781214203894617, Loss: 1.0441330543586187
Accuracy:0.3567567567567568, Loss: 1.970622984568278
Accuracy:0.44432432432432434, Loss: 1.6019457976023357
Accuracy:0.5589189189189189, Loss: 1.3396233161290487
Accuracy:0.5989189189189189, Loss: 1.1112003763516745
Accuracy:0.5913513513513513, Loss: 1.1829696377118428
Accuracy:0.6432432432432432, Lo

In [44]:
## RESNET RESULTS
print(f"ResNet Test Loss: {test_loss_avg}")
print(f"ResNet Test Accuracy: {test_acc_avg} with std of {test_acc_std}")
print(f"ResNet Validation Loss: {val_loss_avg}")
print(f"ResNet Validation Accuracy: {val_acc_avg}")
print(f"ResNet Test Robustness: {rob_avg}")
print(f"ResNet Test Confusion Matrix:\n {ConfM_avg}")

ResNet Test Loss: 1.0349416095793467
ResNet Test Accuracy: 0.6861258745193481 with std of 0.03806876018643379
ResNet Validation Loss: 0.9339929308925138
ResNet Validation Accuracy: 0.6993360857710706
ResNet Test Robustness: 0.004365378059446812
ResNet Test Confusion Matrix:
 [[49.9  1.5  4.3  7.3  6.  12.9  0.1 12.1  2.6  3.3]
 [ 1.4 28.9  0.3  0.3  2.9  0.1  0.   1.1  1.2  6.7]
 [ 1.3  0.  77.9  8.3  1.3  2.3  1.   1.1  2.7  4.1]
 [ 3.2  0.2  8.5 76.8  2.   1.4  1.8  0.2  4.5  1.4]
 [ 3.1  0.5  1.7  1.6 69.4  2.5  1.3 15.5  3.5  0.9]
 [20.6  0.3  3.6  0.6  6.5 55.6  0.1  9.8  2.   0.9]
 [ 0.3  0.   0.4  1.4  0.7  0.3 33.5  0.5  0.1  0.2]
 [ 9.3  0.   1.   0.  14.4 11.3  0.1 63.5  0.3  0.1]
 [ 1.9  3.3  8.2  3.1  3.9  1.6  0.   0.4 65.1  5.4]
 [ 4.6  4.3  7.2  1.3  2.2  0.4  0.1  1.1  0.9 77.9]]


In [45]:
## CROSS FOLD VALIDATION FOR MLP
val_loss_avg = 0
val_acc_avg = 0
test_loss_avg = 0
test_accs = torch.zeros(10)
ConfM_avg = np.zeros((10,10),dtype = "float32")
for i in range(10):
    MLP_Model = MLP().to(device)
    optimizer_MLP = AdamW(MLP_Model.parameters(),amsgrad=True,lr =1e-5,betas = (.99,.999),weight_decay=1e-1)
    
    writer = SummaryWriter(f'runs/treinoMLP_fold{i+1}')
    writer_val = SummaryWriter(f'runs/valMLP_fold{i+1}')
    test_loss,test_acc,val_loss,val_acc,ConfM,best_model = train_cross_fold(i,loss,optimizer_MLP,MLP_Model,writer,writer_val)
    
    rob_avg += test_robustness(i,best_model)
    ConfM_avg += ConfM
    test_loss_avg += test_loss
    val_loss_avg += val_loss
    test_accs[i] = test_acc
    val_acc_avg += val_acc

test_loss_avg /= 10
val_loss_avg /= 10
test_acc_avg = test_accs.mean()
test_acc_std = test_accs.std()
val_acc_avg /= 10
ConfM_avg /= 10
rob_avg /= 10

Accuracy:0.35135135135135137, Loss: 2.0478422897202626
Accuracy:0.4189189189189189, Loss: 1.9306338429450989
Accuracy:0.44481981981981983, Loss: 1.8684255906513758
Accuracy:0.4527027027027027, Loss: 1.8221396889005388
Accuracy:0.45495495495495497, Loss: 1.8110312904630388
Accuracy:0.45382882882882886, Loss: 1.7867813791547502
Accuracy:0.4752252252252252, Loss: 1.739909793649401
Accuracy:0.48536036036036034, Loss: 1.7082099063055856
Accuracy:0.48761261261261263, Loss: 1.7115002274513245
Accuracy:0.48986486486486486, Loss: 1.6950820173536028
Accuracy:0.49211711711711714, Loss: 1.668157500880105
Accuracy:0.4797297297297297, Loss: 1.6485908712659563
TEST
Accuracy:0.5223367697594502, Loss: 1.6377319608415877
Accuracy:0.39135135135135135, Loss: 2.0149319728215533
Accuracy:0.45297297297297295, Loss: 1.893131637573242
Accuracy:0.4821621621621622, Loss: 1.7832216421763103
Accuracy:0.4735135135135135, Loss: 1.7129541556040446
Accuracy:0.4627027027027027, Loss: 1.7178573290506998
Accuracy:0.45729

In [46]:
## MLP RESULTS
print(f"MLP Test Loss: {test_loss_avg}")
print(f"MLP Test Accuracy: {test_acc_avg} with std of {test_acc_std}")
print(f"MLP Validation Loss: {val_loss_avg}")
print(f"MLP Validation Accuracy: {val_acc_avg}")
print(f"ResNet Test Robustness: {rob_avg}")
print(f"MLP Test Confusion Matrix:\n {ConfM_avg}")

MLP Test Loss: 1.5746764532926973
MLP Test Accuracy: 0.5281551480293274 with std of 0.052598241716623306
MLP Validation Loss: 1.5567683270098742
MLP Validation Accuracy: 0.5208700715586367
ResNet Test Robustness: 0.013501264154911041
MLP Test Confusion Matrix:
 [[43.6  0.5  4.   3.9  6.9 13.7  1.1  8.7  6.5 11.1]
 [ 1.9 26.4  2.9  1.3  1.1  1.2  0.1  4.1  1.2  2.7]
 [ 9.   1.  45.6 10.9  2.5  7.3  1.8  3.2  5.  13.7]
 [ 4.5  1.2 11.  61.4  1.6  2.6  2.1  0.7  7.2  7.7]
 [ 4.7  2.3  9.1  3.2 43.5  5.   0.2 21.2  5.6  5.2]
 [11.2  0.3  9.1  0.5  2.4 54.7  0.1  7.1  8.5  6.1]
 [ 0.8  0.   2.9  2.3  0.7  0.9 25.9  1.2  0.5  2.2]
 [10.3  0.3  3.5  0.3 17.6  2.2  0.  45.   9.5 11.3]
 [ 4.7  0.2  7.6  7.   0.5  2.9  0.   2.6 65.7  1.7]
 [10.   0.8 14.3  5.7  3.9  4.5  0.1  5.6  4.9 50.2]]
