In [55]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, random_split, Dataset, IterableDataset
from torchvision import transforms, datasets
import pytorch_lightning as pl
import pandas as pd
import numpy as np

import torchaudio

import wandb
from pytorch_lightning.loggers import WandbLogger

In [56]:
wandb.login()



True

# Dataset Module

In [57]:

class BirdVoxDataset(Dataset):

    # Argument list
    # path to the BirdVox-20k csv file
    # path to the BirdVox-20k audio files
    
    def __init__(self, csv_path,file_path):
        
        csvData = pd.read_csv(csv_path,dtype = {'hasbird':np.float32})
        self.file_names = []
        self.labels = []
        
        for i in range( 0,len(csvData) ):
            self.file_names.append(csvData.iloc[i,0])
            self.labels.append(csvData.iloc[i,2])
            
        #tutaj label na float    
        self.file_path = file_path
        self.mel_spectogram = torchaudio.transforms.MelSpectrogram(sample_rate=44100,n_fft=1261, n_mels=80, 
                                                                   window_fn=torch.hamming_window,
                                                                   f_min=50, f_max = 12000)
        self.amplitude_to_db = torchaudio.transforms.AmplitudeToDB()
    
    def __len__(self):
        
        return len(self.file_names)
    
    def __getitem__(self, index):
        
        path = self.file_path+"/"+self.file_names[index]+".wav"
        
        #Load audio file into torch.Tensor object. 
        waveform, sample_rate = torchaudio.load(path)
        # utworzenie Mal Spektogramu
        specgram = self.mel_spectogram(waveform)
        # transformacja za skali amplitud do decybeli
        transformedAmpToDB = self.amplitude_to_db(specgram)

        # normalizacja
        tensor_minusmean = transformedAmpToDB - transformedAmpToDB.mean()
        soundFormatted = tensor_minusmean/tensor_minusmean.abs().max()

        return soundFormatted,self.labels[index], self.file_names[index]

In [58]:
class BirdVoxDataModule(pl.LightningDataModule):
    
    def __init__(self, csv_path, file_path, batch_size, num_workers):
        super().__init__()
        self.batch_size = batch_size
        self.csv_path = csv_path
        self.file_path = file_path
        self.num_workers = num_workers
    
    def setup(self, stage=None):
        birdvox_dataset = BirdVoxDataset(self.csv_path, self.file_path)
        self.train_set, self.val_set, self.test_set = torch.utils.data.random_split(birdvox_dataset, [16000,3000,1000], generator=torch.Generator().manual_seed(42))

    def train_dataloader(self):
        return DataLoader(self.train_set, batch_size = self.batch_size, num_workers= self.num_workers)
    
    def val_dataloader(self):
        return DataLoader(self.val_set, batch_size = self.batch_size, num_workers= self.num_workers)

    def test_dataloader(self):
        return DataLoader(self.test_set, batch_size = self.batch_size, num_workers= self.num_workers) 
    

In [59]:
# nie do końca wiem, jak działa inicjalizacja wag
def weights_init_kaiming(m):
    classname = m.__class__.__name__

    if classname.find('Conv') != -1:
        torch.nn.init.kaiming_normal(m.weight.data, a=0, mode='fan_in')
    elif classname.find('Linear') != -1:
        torch.nn.init.kaiming_normal(m.weight.data, a=0, mode='fan_in')
    elif classname.find('BatchNorm') != -1:
        torch.nn.init.normal(m.weight.data, 1.0, 0.02)
        torch.nn.init.constant(m.bias.data, 0.0)

#typ xavier i orthogonal nie jest używany
# sprawdzić -jak działa typ kaiming (bo to on jest używany)
def init_weights(net, init_type='normal'):
    if init_type == 'normal':
        net.apply(weights_init_normal)
#     elif init_type == 'xavier':
#         net.apply(weights_init_xavier)
    elif init_type == 'kaiming':
        net.apply(weights_init_kaiming)
#     elif init_type == 'orthogonal':
#         net.apply(weights_init_orthogonal)
    else:
        raise NotImplementedError('initialization method [%s] is not implemented' % init_type)


In [60]:
# dla przypomnienia jakie argumenty są podawane w CNN_Audio_Model
# argumenty uzupełnione na podstawie podanego przykładu zastosowania, w:
#    https://github.com/ozan-oktay/Attention-Gated-Networks/blob/master/models/networks/sononet_grid_attention.py
#  self.compatibility_score1 = AttentionBlock2D(in_channels = filters[2], 
#                                                      gating_channels = filters[3],
#                                                      inter_channels = filters[3],
#                                                      sub_sample_factor = (1,1),
#                                                      mode = 'concatenation',
#                                                      use_W=False,
#                                                      use_phi=True, use_theta=True
#                                                      use_psi = True,
#                                                      nonlinearity1 = 'relu'
#                                                     )

class _GridAttentionBlockND_TORR(torch.nn.Module):
    def __init__(self, in_channels, gating_channels, inter_channels=None, dimension=2, mode='concatenation',
                 sub_sample_factor=(1,1,1), bn_layer=True, use_W=True, use_phi=True, use_theta=True, use_psi=True, nonlinearity1='relu'):
        
        super(_GridAttentionBlockND_TORR, self).__init__()

        assert dimension==2
        assert mode in ['concatenation', 'concatenation_softmax',
                        'concatenation_sigmoid', 'concatenation_mean',
                        'concatenation_range_normalise', 'concatenation_mean_flow']

        # Default parameter set
        self.mode = mode
        
        #w sumie to u mnie akurat dimension jest zawsze 2... można to potem zrefaktoryzować ;) 
        self.dimension = dimension
        
        #chyba nie do końca wiem, czym jest sub_sample_factor
        self.sub_sample_factor = sub_sample_factor if isinstance(sub_sample_factor, tuple) else tuple([sub_sample_factor])*dimension
        self.sub_sample_kernel_size = self.sub_sample_factor

        # Number of channels (pixel dimensions)
        self.in_channels = in_channels
        self.gating_channels = gating_channels
        self.inter_channels = inter_channels

        ## to w naszym przypadku nie jest potrzebne - in_channels są podane
        ##     aczkolwiek dobrze byłoby się dowiedzieć, dlaczego w podanym przypadku to inter_channels = gating_channels (a nie wejście na pół...)
        if self.inter_channels is None:
            self.inter_channels = in_channels // 2
            if self.inter_channels == 0:
                self.inter_channels = 1

                
        conv_nd = nn.Conv2d
        bn = nn.BatchNorm2d
        self.upsample_mode = 'bilinear'


        # initialise id functions
        # Theta^T * x_ij + Phi^T * gating_signal + bias
        self.W = lambda x: x
        self.theta = lambda x: x
        self.psi = lambda x: x
        self.phi = lambda x: x
        self.nl1 = lambda x: x

        # use_W jest podane jako False
        
#         if use_W:
#             if bn_layer:
#                 self.W = nn.Sequential(
#                     conv_nd(in_channels=self.in_channels, out_channels=self.in_channels, kernel_size=1, stride=1, padding=0),
#                     bn(self.in_channels),
#                 )
#             else:
#                 self.W = conv_nd(in_channels=self.in_channels, out_channels=self.in_channels, kernel_size=1, stride=1, padding=0)

        #czy to use_theta jest równoznaczne z używaniem W^T ?
        if use_theta:
            self.theta = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels,
                                 kernel_size=self.sub_sample_kernel_size, stride=self.sub_sample_factor, padding=0, bias=False)

        # # # # # # # # # # #
        if use_phi:
            self.phi = conv_nd(in_channels=self.gating_channels, out_channels=self.inter_channels,
                               kernel_size=self.sub_sample_kernel_size, stride=self.sub_sample_factor, padding=0, bias=False)

        if use_psi:
            self.psi = conv_nd(in_channels=self.inter_channels, out_channels=1, kernel_size=1, stride=1, padding=0, bias=True)


        if nonlinearity1:
            if nonlinearity1 == 'relu':
                self.nl1 = lambda x: F.relu(x, inplace=True)

        if 'concatenation' in mode:
            self.operation_function = self._concatenation
        else:
            raise NotImplementedError('Unknown operation function.')

        # Initialise weights
        # Co to jest self.children ?
        # z dokumentacji: Returns an iterator over immediate children modules.
        for m in self.children():
            init_weights(m, init_type='kaiming')

            
        # to też nie jest używane w podanym przypadku
        
#         if use_psi and self.mode == 'concatenation_sigmoid':
#             nn.init.constant(self.psi.bias.data, 3.0)

#         if use_psi and self.mode == 'concatenation_softmax':
#             nn.init.constant(self.psi.bias.data, 10.0)


        # if use_psi and self.mode == 'concatenation_mean':
        #     nn.init.constant(self.psi.bias.data, 3.0)

        # if use_psi and self.mode == 'concatenation_range_normalise':
        #     nn.init.constant(self.psi.bias.data, 3.0)

        
        # trochę bez sensu fragment kodu - czy tutaj powinno być jakoś inaczej?
#         parallel = False
#         if parallel:
#             if use_W: self.W = nn.DataParallel(self.W)
#             if use_phi: self.phi = nn.DataParallel(self.phi)
#             if use_psi: self.psi = nn.DataParallel(self.psi)
#             if use_theta: self.theta = nn.DataParallel(self.theta)

    def forward(self, x, g):
        '''
        :param x: (b, c, t, h, w)
        :param g: (b, g_d)
        :return:
        '''

        output = self.operation_function(x, g)  ## w naszym przypadku _concatenation(x ,g )
        return output

    def _concatenation(self, x, g):
        input_size = x.size()
        batch_size = input_size[0]
        assert batch_size == g.size(0)

        #############################
        # compute compatibility score

        # theta => (b, c, t, h, w) -> (b, i_c, t, h, w)
        # phi   => (b, c, t, h, w) -> (b, i_c, t, h, w)
        theta_x = self.theta(x)
        theta_x_size = theta_x.size()

        #  nl(theta.x + phi.g + bias) -> f = (b, i_c, t/s1, h/s2, w/s3)

        # to chyba przepróbkowanie g, żeby było wielkości theta
        # pytanie: dlaczego na g jest jeszcze raz przez conv2d ?
        phi_g = F.upsample(self.phi(g), size=theta_x_size[2:], mode=self.upsample_mode)

        f = theta_x + phi_g
        f = self.nl1(f)  #sigmoid1 - relu

        psi_f = self.psi(f)  #conv2d
        
        #koniec pierwszej części równania

        ############################################
        # Zaraz, zaraz...
        # Tam było podane jako mode concatenation - a to zdaje się powoduje w tym miejscu błąd...
        # To jest po prostu normalizacja (w równaniu oznaczona sigma2) 
        # zastosuję wersję dla concatenation_range_normalise
        
#         # normalisation -- scale compatibility score
#         #  psi^T . f -> (b, 1, t/s1, h/s2, w/s3)
#         if self.mode == 'concatenation_softmax':
#             sigm_psi_f = F.softmax(psi_f.view(batch_size, 1, -1), dim=2)
#             sigm_psi_f = sigm_psi_f.view(batch_size, 1, *theta_x_size[2:])
#         elif self.mode == 'concatenation_mean':
#             psi_f_flat = psi_f.view(batch_size, 1, -1)
#             psi_f_sum = torch.sum(psi_f_flat, dim=2)#clamp(1e-6)
#             psi_f_sum = psi_f_sum[:,:,None].expand_as(psi_f_flat)

#             sigm_psi_f = psi_f_flat / psi_f_sum
#             sigm_psi_f = sigm_psi_f.view(batch_size, 1, *theta_x_size[2:])
#         elif self.mode == 'concatenation_mean_flow':
#             psi_f_flat = psi_f.view(batch_size, 1, -1)
#             ss = psi_f_flat.shape
#             psi_f_min = psi_f_flat.min(dim=2)[0].view(ss[0],ss[1],1)
#             psi_f_flat = psi_f_flat - psi_f_min
#             psi_f_sum = torch.sum(psi_f_flat, dim=2).view(ss[0],ss[1],1).expand_as(psi_f_flat)

#             sigm_psi_f = psi_f_flat / psi_f_sum
#             sigm_psi_f = sigm_psi_f.view(batch_size, 1, *theta_x_size[2:])
#         elif self.mode == 'concatenation_range_normalise':
#             psi_f_flat = psi_f.view(batch_size, 1, -1)
#             ss = psi_f_flat.shape
#             psi_f_max = torch.max(psi_f_flat, dim=2)[0].view(ss[0], ss[1], 1)
#             psi_f_min = torch.min(psi_f_flat, dim=2)[0].view(ss[0], ss[1], 1)

#             sigm_psi_f = (psi_f_flat - psi_f_min) / (psi_f_max - psi_f_min).expand_as(psi_f_flat)
#             sigm_psi_f = sigm_psi_f.view(batch_size, 1, *theta_x_size[2:])

#         elif self.mode == 'concatenation_sigmoid':
#             sigm_psi_f = F.sigmoid(psi_f)
#         else:
#             raise NotImplementedError

        # na coś muszę ustawić sigm_psi_f, więc dam to co w 'concatenation_range_normalise'
        # to jest zdaje się drugie równanie, czyli na tym, co wcześniej zostało policzone dokonywana jest NORMALIZACJA
        psi_f_flat = psi_f.view(batch_size, 1, -1)
        ss = psi_f_flat.shape
        psi_f_max = torch.max(psi_f_flat, dim=2)[0].view(ss[0], ss[1], 1)
        psi_f_min = torch.min(psi_f_flat, dim=2)[0].view(ss[0], ss[1], 1)

        sigm_psi_f = (psi_f_flat - psi_f_min) / (psi_f_max - psi_f_min).expand_as(psi_f_flat)
        sigm_psi_f = sigm_psi_f.view(batch_size, 1, *theta_x_size[2:])
        

        # sigm_psi_f is attention map! upsample the attentions and multiply
        sigm_psi_f = F.upsample(sigm_psi_f, size=input_size[2:], mode=self.upsample_mode)
        y = sigm_psi_f.expand_as(x) * x
        W_y = self.W(y)

        return W_y, sigm_psi_f

#AttentionBlock2D
class GridAttentionBlock2D_TORR(_GridAttentionBlockND_TORR):
        def __init__(self, in_channels, gating_channels, inter_channels=None, mode='concatenation',
                 sub_sample_factor=(1,1), bn_layer=True,
                 use_W=True, use_phi=True, use_theta=True, use_psi=True,
                 nonlinearity1='relu'):
            super(GridAttentionBlock2D_TORR, self).__init__(in_channels,
                                                   inter_channels=inter_channels,
                                                   gating_channels=gating_channels,
                                                   dimension=2, mode=mode,
                                                   sub_sample_factor=sub_sample_factor,
                                                   bn_layer=bn_layer,
                                                   use_W=use_W,
                                                   use_phi=use_phi,
                                                   use_theta=use_theta,
                                                   use_psi=use_psi,
                                                   nonlinearity1=nonlinearity1)


# opis sieci
##### Input -700x80x1
##### Convolution (3x3) -698x78x16
##### Pool (3x3) -232x26x16
        
##### Convolution (3x3) -230x24x16
##### Pool (3x3) -76x8x16
        
##### Convolution (3x3) -74x6x16
##### Pool (3x1) -24x6x16
        
##### Convolution (3x3) -22x4x16
##### Pool (3x1)-7x4x16
        
##### Dense (256) -256
##### Dense (32) -32
##### Dense (1) -1

In [61]:
class CNN_Audio_Model(pl.LightningModule):
    def __init__(self):
        super().__init__()
        
        ##templatka:
        ## Conv2d(int(in_channels), int(n_filters), kernel_size)
        
        #nie wiem w sumie czy robienie tych filters ma jakiś głębszy sens w tym przypadku, skoro wszędzie jest 16
        filters = [16,16,16,16]
        
        self.prediction_changed_by_AG = 0
        self.prediction_changed_by_AG_training = []       
        self.prediction_changed_by_AG_validation = []     
        self.prediction_changed_by_AG_testing = []
        
        # # # # # # # # # # # # # # # # # # # # # # # #
        #convolution layers
        self.layer1 = torch.nn.Sequential(torch.nn.Conv2d(1,16,kernel_size=3),
                                          torch.nn.BatchNorm2d(16),
                                          torch.nn.LeakyReLU(0.001),
                                          torch.nn.MaxPool2d((3,3)) )
        
        self.layer2 = torch.nn.Sequential(torch.nn.Conv2d(16,16,kernel_size=3),
                                          torch.nn.BatchNorm2d(16),
                                          torch.nn.LeakyReLU(0.001),
                                          torch.nn.MaxPool2d((3,3)) )
        
        self.layer3 = torch.nn.Sequential(torch.nn.Conv2d(16,16,kernel_size=3),
                                          torch.nn.BatchNorm2d(16),
                                          torch.nn.LeakyReLU(0.001),
                                          torch.nn.MaxPool2d((1,3)))

        self.layer4 = torch.nn.Sequential(torch.nn.Conv2d(16,16,kernel_size=3),
                                          torch.nn.BatchNorm2d(16),
                                          torch.nn.LeakyReLU(0.001),
                                          torch.nn.MaxPool2d((1,3))
                                         # torch.nn.Flatten()
                                         )
        
        ## czy tu wystarczy tak to rozdzielić, czy trzeba g pozyskiwać jeszcze nieco wcześniej?
        # czy to końcowe Flatten powinno już być w dense_layers
        self.flatten_afterConv4 = torch.nn.Flatten()
        
        #dense layers
        self.dense_input_size = 7*4*16
        self.dropout = torch.nn.Dropout()
        self.fc1 = torch.nn.Linear(self.dense_input_size,256)
        self.batch1 = torch.nn.BatchNorm1d(256) 
        self.leakyReLU = torch.nn.LeakyReLU(0.001)
        
        self.fc2 = torch.nn.Linear(256,32)
        self.batch2 = torch.nn.BatchNorm1d(32) #i na tym leakyRelu
        
        self.fc3 = torch.nn.Linear(32,1) #i na tym sigmoid
        self.sigmoid = torch.nn.Sigmoid()
        
        self.flatten = torch.nn.Flatten(start_dim=0)
        
        # # # # # # # # # # # # # # # # # # # # # # # #
        #attention maps
        
        #trochę nie do końca wiem czym jest szereg tych argumentów
        self.compatibility_score1 = GridAttentionBlock2D_TORR(in_channels = filters[2], 
                                                     gating_channels = filters[3],
                                                     inter_channels = filters[3],
                                                     sub_sample_factor = (1,1),
                                                     mode = 'concatenation',
                                                     use_W=False,
                                                     use_phi=True, use_theta = True,
                                                     use_psi = True,
                                                     nonlinearity1 = 'relu'
                                                    )
        self.compatibility_score2 = GridAttentionBlock2D_TORR(in_channels = filters[3], 
                                                     gating_channels = filters[3],
                                                     inter_channels = filters[3],
                                                     sub_sample_factor = (1,1),
                                                     mode = 'concatenation',
                                                     use_W=False,
                                                     use_phi=True,use_theta = True,
                                                     use_psi = True,
                                                     nonlinearity1 = 'relu'
                                                    )
        
        # # # # # # # # # # # # # # # # # # # # # # # #
        # Aggragation Strategies 
        
        self.attention_filter_sizes = [filters[2],filters[3]]
        n_classes = 2
        
        self.classifier = nn.Linear(filters[2]+filters[3]+filters[3], n_classes)
        self.classifier1 = nn.Linear(filters[2], 1)
        self.classifier2 = nn.Linear(filters[3], 1)
        self.classifier3 = nn.Linear(filters[3], 1)
        self.classifiers = [self.classifier1, self.classifier2, self.classifier3]

        self.aggregate = self.aggregation_concat_with_dense
        
        # # # # # # # # # # # # # # # # # # # # # # # #
        # initialise weights
        #to, co jest tam, a czego do końca nie rozumiem... (i co to self.modules? )
        # The self.modules() method returns an iterable to the many layers or “modules” defined in the model class.
        for m in self.modules():
            if isinstance(m, torch.nn.Conv2d):
                init_weights(m, init_type = 'kaiming')
            elif isinstance(m, torch.nn.BatchNorm2d):
                init_weights(m, init_type = 'kaiming')
        
        
        # compute the accuracy -- no need to roll your own!
        self.train_acc = pl.metrics.Accuracy()
        self.valid_acc = pl.metrics.Accuracy()
        self.test_acc = pl.metrics.Accuracy()
        
        self.validation_wrong_classified = []
        self.validation_wrong_classified_epoch = []
        
    # ***************************************************** #
        
    # attended_maps - (g1,g2,g)
    
    #to jest agragacja, gdy po AG na wyjściu robimy dense_layers
    # liczona jest średnia z trzech wyjść
    def aggregation_concat_with_dense(self, *attended_maps):

    #jeśli jest dense
        a1 = attended_maps[0].reshape( (1,int( attended_maps[0].size()[0] ) ) )
        a2 = attended_maps[1].reshape((1,int( attended_maps[1].size()[0]) )) 
        a3 = attended_maps[2].reshape((1,int( attended_maps[2].size()[0]) ))
        aggregation_cat = torch.cat((a1,a2,a3), dim=0)

        aggregation_mean = torch.mean(aggregation_cat, 0)


        for idx,mean_v in enumerate(aggregation_mean):
            if round(float(mean_v)) != round(float(attended_maps[2][idx])):
                self.prediction_changed_by_AG += 1

        return aggregation_mean
    
    
    # ***************************************************** #
    # próby przeprowadzenia agregacji, gdy warstwy po AG nie przechodzą przez dense_layers
    # lecz są to raczej próby - jak na razie nie działa
    def aggregation_concat_standard(self, *attended_maps):
        print("a1: ", attended_maps[0])
        print("a2: ", attended_maps[1])
        print("a3: ", attended_maps[2])
        c1 = self.classifier1(attended_maps[0])
        c2 = self.classifier2(attended_maps[1])
#         c3 = self.classifier3(attended_maps[2])
        
        print("c1: ", c1)
        print("c2: ", c2)
#         print("c3: ", attended_maps[2])

#         #jeśli nie ma
        c1 = c1.reshape((1,int( c1.size()[0]) ))
        c2 = c2.reshape((1,int( c2.size()[0]) ))
        g = attended_maps[2].reshape((1,int( attended_maps[2].size()[0]) ))
        print("c1 after: ", c1)
        aggregation_cat = torch.cat((c1,c2,g), dim=0)
        print("aggregation_cat: ", aggregation_cat)

        aggregation_mean = torch.mean(aggregation_cat, 0)
        print("aggregation_mean: ", aggregation_mean)

        
        return aggregation_mean
    
#         result = [ clf(att) for clf, att in zip(self.classifiers, attended_maps) ]
#         return result
#         return self.classifier(torch.cat(attended_maps, dim=1))
    
    # ***************************************************** #
    def dense_layers(self,inputs, input_size):
        self.dense_input_size = input_size
        #dense layers
        
        # czy tutaj to flatten ok?
        x=self.flatten_afterConv4(inputs)
        x=self.dropout(x)
        
        #to w sumie można jakoś rozpisać, żeby tego Linear tutaj nie inicjalizować
        self.fc1 = torch.nn.Linear(self.dense_input_size,256)
        x=self.fc1(x)
        x=self.batch1(x)
        x=self.leakyReLU(x)
        
        x=self.dropout(x)
        x=self.fc2(x)
        x=self.batch2(x)
        x=self.leakyReLU(x)
        
        x=self.dropout(x)
        
        x=self.fc3(x)
        
        x = self.sigmoid(x)
        x=self.flatten(x)
        return x
        
    # ***************************************************** #
    def forward(self,inputs):
        
        #convolution layers
        conv_layer1=self.layer1(inputs)
        conv_layer2=self.layer2(conv_layer1)
        conv_layer3=self.layer3(conv_layer2)
        conv_layer4=self.layer4(conv_layer3)
        

        after_dense = self.dense_layers(conv_layer4, 7*4*16)
        
        ## Attention Mechanism
        
        #czy tutaj właściwie nie powinno być rozdzielenie conv_layer4 tak, żeby była przekazywana jako g bez maxpoolingu? albo bez flatten
        g_conv1, att1 = self.compatibility_score1(conv_layer2 ,conv_layer4)
        g_conv2, att2 = self.compatibility_score2(conv_layer3,conv_layer4)
        
        ## dense on attention filters
        # Pytanie: czy na g_conv1 i g_conv2 też dense_layers? Czy tak jak w tamtym kodzie:
#         fsizes = self.attention_filter_sizes
#         g1 = torch.sum(g_conv1.view(batch_size, fsizes[0], -1), dim=-1)
#         g2 = torch.sum(g_conv2.view(batch_size, fsizes[0], -1), dim=-1)
        
        # wersja z dense_layers na warstwach po self.compatibility_score1 (mechaniźmie AG)

        g1 = self.dense_layers(g_conv1, 76*8*16)
        g2 = self.dense_layers(g_conv2, 24*6*16)
        
        output = self.aggregate(g1,g2,after_dense)
        return output
    

    # ***************************************************** #
    #z artykułu: The network is trained on binary cross entropy loss using accuracy as a metric.
    def cross_entropy_loss(self, logits, labels):
        return F.binary_cross_entropy(logits, labels)

    def training_step(self, train_batch, batch_idx):
        x, y, f = train_batch
        logits = self.forward(x)
        loss = self.cross_entropy_loss(logits, y)
        
        self.log('train_loss', loss, on_epoch=True, sync_dist=True)
        
        y = y.int()
        accuracy = self.train_acc(logits, y)
        self.log('train_acc', self.train_acc, on_epoch=True, sync_dist=True)
        
        return {'loss': loss, 'accuracy': accuracy}

    def validation_step(self, val_batch, batch_idx):
        x, y, f = val_batch
        logits = self.forward(x)
        loss = self.cross_entropy_loss(logits, y)

        y = y.int()
        accuracy = self.valid_acc(logits, y)
        
        list_file_names = []
        #trochę na wprost tworzenie listy tych nagrań, które zostały źle zaklasyfikowane
        for id in range(len(f)):
            if round(float(logits[id])) != y[id]:
                self.validation_wrong_classified_epoch.append(f[id])
                
        return {'val_loss': loss, 'val_accuracy': accuracy}

    def test_step(self, test_batch, batch_idx):
        x, y, f = test_batch
        logits = self.forward(x)
        loss = self.cross_entropy_loss(logits, y)
        y = y.int()
        accuracy = self.test_acc(logits, y)
        
        return {'test_loss': loss, 'test_accuracy': accuracy}
    
    
    def training_epoch_end(self, outputs):
        
        avg_loss = torch.stack([x['loss'] for x in outputs]).mean()
        avg_accuracy = torch.stack([x['accuracy'] for x in outputs]).mean()

        self.log('training_epoch_end_accuracy', avg_accuracy, sync_dist=True)
        self.log('training_epoch_end_loss', avg_loss, sync_dist=True)
        self.log('lr', self.optimizers().param_groups[0]['lr'], sync_dist=True)
        
        self.prediction_changed_by_AG_training.append(self.prediction_changed_by_AG)
        self.prediction_changed_by_AG = 0
    
    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        avg_accuracy = torch.stack([x['val_accuracy'] for x in outputs]).mean()
            
        self.log('validation_epoch_end_accuracy', avg_accuracy, sync_dist=True)
        self.log('validation_epoch_end_loss', avg_loss, sync_dist=True)
        self.validation_wrong_classified.append(self.validation_wrong_classified_epoch.copy())
        self.validation_wrong_classified_epoch.clear()
        
        self.prediction_changed_by_AG_validation.append(self.prediction_changed_by_AG)
        self.prediction_changed_by_AG = 0

    def test_epoch_end(self, outputs):
        avg_loss = torch.stack([x['test_loss'] for x in outputs]).mean()
        avg_accuracy = torch.stack([x['test_accuracy'] for x in outputs]).mean()
        
        self.log('test_epoch_end_accuracy', avg_accuracy, sync_dist=True)
        self.log('test_epoch_end_loss', avg_loss, sync_dist=True)
        
        self.prediction_changed_by_AG_testing.append(self.prediction_changed_by_AG)
        self.prediction_changed_by_AG = 0

    #według artykułu: For training,ADAM optimizer is used with an initial learning rate of 0.001. 
    # ! The learning rate was reduced by a factor of 0.2 if there was no improvement in validation accuracy 
    #over five consecutive epochs.
    
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode = 'min', factor = 0.2, patience = 5)
        return {
            'optimizer': optimizer,
            'lr_scheduler': lr_scheduler,
            'monitor': 'validation_epoch_end_loss'
        }

In [62]:

wandb_logger = WandbLogger(project="birdVox-NeuralNetwork_withAG")

# przykładowe ścieżki:
csv_path= './BirdVox/BirdVoxDCASE20k.csv'
file_path='./BirdVox/data/wav'

#batch_size ~ 32, 64 [32-128] to standard
batch_size = 32

#num_workers = 24 if cpu
num_workers = 0


# z ograniczeniem epok:
trainer = pl.Trainer(
    logger = wandb_logger,  #W&B integration
    log_every_n_steps = 50, #set the logging frequency
    max_epochs=1,           #number of epochs  
    gpus =0,
    progress_bar_refresh_rate=50
)

birdvox_dm = BirdVoxDataModule(csv_path, file_path, batch_size, num_workers)
model = CNN_Audio_Model()

trainer.fit(model, birdvox_dm)
trainer.save_checkpoint("model_50e_AG.ckpt")

result = trainer.test(model)
print(result)

print("prediction_changed_by_AG_training: ", model.prediction_changed_by_AG_training )
print("prediction_changed_by_AG_validation: ", model.prediction_changed_by_AG_validation )
print("prediction_changed_by_AG_testing : ", model.prediction_changed_by_AG_testing )
wandb.finish()


GPU available: True, used: False
TPU available: None, using: 0 TPU cores
  torch.nn.init.kaiming_normal(m.weight.data, a=0, mode='fan_in')
  torch.nn.init.normal(m.weight.data, 1.0, 0.02)
  torch.nn.init.constant(m.bias.data, 0.0)

   | Name                 | Type                      | Params
--------------------------------------------------------------------
0  | layer1               | Sequential                | 192   
1  | layer2               | Sequential                | 2.4 K 
2  | layer3               | Sequential                | 2.4 K 
3  | layer4               | Sequential                | 2.4 K 
4  | flatten_afterConv4   | Flatten                   | 0     
5  | dropout              | Dropout                   | 0     
6  | fc1                  | Linear                    | 114 K 
7  | batch1               | BatchNorm1d               | 512   
8  | leakyReLU            | LeakyReLU                 | 0     
9  | fc2                  | Linear                    | 8.2 K 
10 | b

Validation sanity check: 0it [00:00, ?it/s]

  value = torch.tensor(value, device=device, dtype=torch.float)


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]



Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_epoch_end_accuracy': 0.58984375,
 'test_epoch_end_loss': 0.6790212392807007}
--------------------------------------------------------------------------------
[{'test_epoch_end_accuracy': 0.58984375, 'test_epoch_end_loss': 0.6790212392807007}]
prediction_changed_by_AG_training:  [4409]
prediction_changed_by_AG_validation:  [0, 403]
prediction_changed_by_AG_testing :  [126]


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss_step,0.53111
train_acc_step,0.8125
epoch,0.0
_runtime,3661.0
_timestamp,1618483907.0
_step,999.0
train_loss_epoch,0.67116
train_acc_epoch,0.56606
training_epoch_end_accuracy,0.56606
training_epoch_end_loss,0.67116


0,1
train_loss_step,▆▆██▇▇▅▅▅▁
train_acc_step,▅▄▂▁▁▃▅▅▅█
epoch,▁▁▁▁▁▁▁▁▁▁▁
_runtime,▁▂▂▃▄▅▅▆▇██
_timestamp,▁▂▂▃▄▅▅▆▇██
_step,▁▁▂▂▂▃▃▄▄▄█
train_loss_epoch,▁
train_acc_epoch,▁
training_epoch_end_accuracy,▁
training_epoch_end_loss,▁
