In [1]:
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, random_split, Dataset, IterableDataset
from torchvision import transforms, datasets
import pytorch_lightning as pl
import pandas as pd
import numpy as np

import torchaudio

import wandb
from pytorch_lightning.loggers import WandbLogger



In [2]:
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mulatwo[0m (use `wandb login --relogin` to force relogin)


True

# Dataset Module

In [3]:

class BirdVoxDataset(Dataset):

    # Argument list
    # path to the BirdVox-20k csv file
    # path to the BirdVox-20k audio files
    
    def __init__(self, csv_path_B,file_path_B,csv_path_F,file_path_F):
        
        csvDataB = pd.read_csv(csv_path_B,dtype = {'hasbird':np.float32})
        csvDataF = pd.read_csv(csv_path_F,dtype = {'itemid': 'string','hasbird':np.float32})
        csvData= csvDataB + csvDataF
        csvData = pd.concat([csvDataB,csvDataF])
        print(csvDataB)
        print(csvDataF)
        print(csvData)
        self.file_names = []
        self.labels = []
        
        for i in range( 0,len(csvData) ):
            self.file_names.append(csvData.iloc[i,0])
            self.labels.append(csvData.iloc[i,2])
            
        #tutaj label na float    
        self.file_path_B = file_path_B
        self.file_path_F = file_path_F
        self.lenghtB = csvDataB.size/3
        
        print("Uwaga! Sprawdzenie rozmiaru BirdVoxa: ", self.lenghtB)
        self.mel_spectogram = torchaudio.transforms.MelSpectrogram(sample_rate=44100,n_fft=1261, n_mels=80, 
                                                                   window_fn=torch.hamming_window,
                                                                   f_min=50, f_max = 12000)
        self.amplitude_to_db = torchaudio.transforms.AmplitudeToDB()
        
        self.resize = transforms.Resize((80,700))
        self.cropp = transforms.CenterCrop((80,700))
        
    
    def __len__(self):
        
        return len(self.file_names)
    
    def __getitem__(self, index):
        
        if index < self.lenghtB :
            path = self.file_path_B+"/"+self.file_names[index]+".wav"
        else:
            path = self.file_path_F+"/"+self.file_names[index]+".wav"
        
#         print(path)
        #Load audio file into torch.Tensor object. 
        waveform, sample_rate = torchaudio.load(path)
        # utworzenie Mal Spektogramu
        specgram = self.mel_spectogram(waveform)
        
        #uwaga! To nie jest za ładne miejsce - możnaby rozwiązać ten problem inaczej chyba
        if specgram.size()[2]<700:
            specgram  = self.resize(specgram)
        else:
            specgram  = self.cropp(specgram)
            
        # transformacja za skali amplitud do decybeli
        transformedAmpToDB = self.amplitude_to_db(specgram)

        # normalizacja
        tensor_minusmean = transformedAmpToDB - transformedAmpToDB.mean()
        soundFormatted = tensor_minusmean/tensor_minusmean.abs().max()

        return soundFormatted,self.labels[index], self.file_names[index]

In [4]:
class BirdVoxDataModule(pl.LightningDataModule):
    
    def __init__(self, csv_path_B, file_path_B,csv_path_F, file_path_F, batch_size, num_workers):
        super().__init__()
        self.batch_size = batch_size
        self.csv_path_B = csv_path_B
        self.file_path_B = file_path_B
        self.csv_path_F = csv_path_F
        self.file_path_F = file_path_F
        
        self.num_workers = num_workers
    
    def setup(self, stage=None):
        birdvox_dataset = BirdVoxDataset(self.csv_path_B, self.file_path_B,self.csv_path_F, self.file_path_F)
        self.train_set, self.val_set, self.test_set = torch.utils.data.random_split(birdvox_dataset, [22152,4154,1384], generator=torch.Generator().manual_seed(42))
        print(self.val_set[0])

    def train_dataloader(self):
        return DataLoader(self.train_set, batch_size = self.batch_size, num_workers= self.num_workers)
    
    def val_dataloader(self):
        return DataLoader(self.val_set, batch_size = self.batch_size, num_workers= self.num_workers)

    def test_dataloader(self):
        return DataLoader(self.test_set, batch_size = self.batch_size, num_workers= self.num_workers) 
    

# opis sieci
##### Input -700x80x1
##### Convolution (3x3) -698x78x16
##### Pool (3x3) -232x26x16
        
##### Convolution (3x3) -230x24x16
##### Pool (3x3) -76x8x16
        
##### Convolution (3x3) -74x6x16
##### Pool (3x1) -24x6x16
        
##### Convolution (3x3) -22x4x16
##### Pool (3x1)-7x4x16
        
##### Dense (256) -256
##### Dense (32) -32
##### Dense (1) -1

In [5]:
class CNN_Audio_Model(pl.LightningModule):
    def __init__(self):
        super().__init__()
        
        #convolution layers
        self.layer1 = torch.nn.Sequential(torch.nn.Conv2d(1,16,kernel_size=3),
                                          torch.nn.BatchNorm2d(16),
                                          torch.nn.LeakyReLU(0.001),
                                          torch.nn.MaxPool2d((3,3)) )
        
        self.layer2 = torch.nn.Sequential(torch.nn.Conv2d(16,16,kernel_size=3),
                                          torch.nn.BatchNorm2d(16),
                                          torch.nn.LeakyReLU(0.001),
                                          torch.nn.MaxPool2d((3,3)) )
        
        self.layer3 = torch.nn.Sequential(torch.nn.Conv2d(16,16,kernel_size=3),
                                          torch.nn.BatchNorm2d(16),
                                          torch.nn.LeakyReLU(0.001),
                                          torch.nn.MaxPool2d((1,3)))

        self.layer4 = torch.nn.Sequential(torch.nn.Conv2d(16,16,kernel_size=3),
                                          torch.nn.BatchNorm2d(16),
                                          torch.nn.LeakyReLU(0.001),
                                          torch.nn.MaxPool2d((1,3)),
                                          torch.nn.Flatten())
        
        #dense layers
        self.dropout = torch.nn.Dropout()
        self.fc1 = torch.nn.Linear(7*4*16,256)
        self.batch1 = torch.nn.BatchNorm1d(256) 
        self.leakyReLU = torch.nn.LeakyReLU(0.001)
        
        self.fc2 = torch.nn.Linear(256,32)
        self.batch2 = torch.nn.BatchNorm1d(32) #i na tym leakyRelu
        
        self.fc3 = torch.nn.Linear(32,1) #i na tym sigmoid
        self.sigmoid = torch.nn.Sigmoid()
        
        self.flatten = torch.nn.Flatten(start_dim=0)
        
        # compute the accuracy -- no need to roll your own!
        self.train_acc = pl.metrics.Accuracy()
        self.valid_acc = pl.metrics.Accuracy()
        self.test_acc = pl.metrics.Accuracy()
        
        self.validation_wrong_classified = []
        self.validation_wrong_classified_epoch = []
        
        
    def forward(self,x):
        
        #convolution layers
        x=self.layer1(x)
        x=self.layer2(x)
        x=self.layer3(x)
        x=self.layer4(x)

        #dense layers
        x=self.dropout(x)
        x=self.fc1(x)
        x=self.batch1(x)
        x=self.leakyReLU(x)
        
        x=self.dropout(x)
        x=self.fc2(x)
        x=self.batch2(x)
        x=self.leakyReLU(x)
        
        x=self.dropout(x)
        
        x=self.fc3(x)
        
        x = self.sigmoid(x)
        
        #PYTANIE: czy to tutaj może być, czy jest to problem jednak?
        #problematyczny shape tensora dla cross_entropy, dlatego reshape
        #był: tensor([[0.4876], ... , [0.4875]]) po: tensor([0.4876, ... ,0.4875])
        # reshape na flatten
        x=self.flatten(x)
        return x
    
    #z artykułu: The network is trained on binary cross entropy loss using accuracy as a metric.
    def cross_entropy_loss(self, logits, labels):
        return F.binary_cross_entropy(logits, labels)

    def training_step(self, train_batch, batch_idx):
        x, y, f = train_batch
        logits = self.forward(x)
        loss = self.cross_entropy_loss(logits, y)
        
        self.log('train_loss', loss, on_epoch=True, sync_dist=True)
        
        y = y.int()
        accuracy = self.train_acc(logits, y)
        self.log('train_acc', self.train_acc, on_epoch=True, sync_dist=True)
        
        return {'loss': loss, 'accuracy': accuracy}

    def validation_step(self, val_batch, batch_idx):
        x, y, f = val_batch
        logits = self.forward(x)
        loss = self.cross_entropy_loss(logits, y)
#         print('names: ', f)
#         print('logits: ', logits)
#         print('y: ',y)
#         print('loss: ', loss)
        y = y.int()
        accuracy = self.valid_acc(logits, y)
#         print('accuracy: ',accuracy)
        
        list_file_names = []
        #trochę na wprost tworzenie listy tych nagrań, które zostały źle zaklasyfikowane
        for id in range(len(f)):
            if round(float(logits[id])) != y[id]:
                self.validation_wrong_classified_epoch.append(f[id])
                
        return {'val_loss': loss, 'val_accuracy': accuracy}

    def test_step(self, test_batch, batch_idx):
        x, y, f = test_batch
        logits = self.forward(x)
        loss = self.cross_entropy_loss(logits, y)
        y = y.int()
        accuracy = self.test_acc(logits, y)
        
        return {'test_loss': loss, 'test_accuracy': accuracy}
    
    
    def training_epoch_end(self, outputs):
        
        avg_loss = torch.stack([x['loss'] for x in outputs]).mean()
        avg_accuracy = torch.stack([x['accuracy'] for x in outputs]).mean()

        self.log('training_epoch_end_accuracy', avg_accuracy, sync_dist=True)
        self.log('training_epoch_end_loss', avg_loss, sync_dist=True)
        self.log('lr', self.optimizers().param_groups[0]['lr'], sync_dist=True)
    
    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        avg_accuracy = torch.stack([x['val_accuracy'] for x in outputs]).mean()
            
        self.log('validation_epoch_end_accuracy', avg_accuracy, sync_dist=True)
        self.log('validation_epoch_end_loss', avg_loss, sync_dist=True)
        self.validation_wrong_classified.append(self.validation_wrong_classified_epoch.copy())
        self.validation_wrong_classified_epoch.clear()

    def test_epoch_end(self, outputs):
        avg_loss = torch.stack([x['test_loss'] for x in outputs]).mean()
        avg_accuracy = torch.stack([x['test_accuracy'] for x in outputs]).mean()
        
        self.log('test_epoch_end_accuracy', avg_accuracy, sync_dist=True)
        self.log('test_epoch_end_loss', avg_loss, sync_dist=True)

    #według artykułu: For training,ADAM optimizer is used with an initial learning rate of 0.001. 
    # ! The learning rate was reduced by a factor of 0.2 if there was no improvement in validation accuracy 
    #over five consecutive epochs.
    
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode = 'min', factor = 0.2, patience = 5)
        return {
            'optimizer': optimizer,
            'lr_scheduler': lr_scheduler,
            'monitor': 'validation_epoch_end_loss'
        }

In [7]:

wandb_logger = WandbLogger(project="birdVox+Freefield-NeuralNetwork")

# przykładowe ścieżki:
csv_path_B= './BirdVox/BirdVoxDCASE20k.csv'
file_path_B='./BirdVox/data/wav'

csv_path_F= './freefield1010/ff1010bird_metadata_2018.csv'
file_path_F='./freefield1010/wav'


#batch_size ~ 32, 64 [32-128] to standard
batch_size = 32

#num_workers = 24 if cpu
num_workers = 24


# z ograniczeniem epok:
trainer = pl.Trainer(
    logger = wandb_logger,  #W&B integration
    log_every_n_steps = 50, #set the logging frequency
    max_epochs=25,           #number of epochs  
    gpus =0,
    progress_bar_refresh_rate=50
)

birdvox_dm = BirdVoxDataModule(csv_path_B, file_path_B,csv_path_F, file_path_F , batch_size, num_workers)
model = CNN_Audio_Model()

trainer.fit(model, birdvox_dm)
trainer.save_checkpoint("B+F_model_25e_vallist.ckpt")

result = trainer.test(model)
#UWAGA! Jeszcze nie jest ok, bo epoki!
# print('validation_wrong_classified',model.validation_wrong_classified)
# print(' ')
# print('\n\n przedlast_epoch_of_validation: ', model.validation_wrong_classified[-2])
# print(' ')
# print('\n\n last_epoch_of_validation: ', model.validation_wrong_classified[-1])
print(result)
wandb.finish()

# zwykły sposób na zapis tego i odczytanie z pliku
with open('B+F_wrong_classified_validation_file_names_25e.txt', 'w') as filehandle:
    for listitem in model.validation_wrong_classified[-1]:
        filehandle.write('%s\n' % listitem)

GPU available: True, used: False
TPU available: None, using: 0 TPU cores


                                     itemid          datasetid  hasbird
0      00053d90-e4b9-4045-a2f1-f39efc90cfa9  BirdVox-DCASE-20k      1.0
1      000db435-a40f-4ad9-a74e-d1af284d2c44  BirdVox-DCASE-20k      0.0
2      001059c0-e04f-42fc-a8e2-11aad24dc6fb  BirdVox-DCASE-20k      1.0
3      00106202-f61e-467d-a80f-070d90421952  BirdVox-DCASE-20k      0.0
4      00129593-77ca-40b2-a512-75d178071250  BirdVox-DCASE-20k      0.0
...                                     ...                ...      ...
19995  fff12db0-9cbe-4155-ac4a-b0b88d84c1d7  BirdVox-DCASE-20k      0.0
19996  fff78736-b90a-498e-a18d-a27cd3b83578  BirdVox-DCASE-20k      0.0
19997  fff80e7a-7913-4a58-ab9b-4facffe04e56  BirdVox-DCASE-20k      0.0
19998  fff847f3-fcfe-43b6-a7c2-85cd84a05cee  BirdVox-DCASE-20k      0.0
19999  fffda998-2df9-4055-ab82-2e16b95338a7  BirdVox-DCASE-20k      0.0

[20000 rows x 3 columns]
      itemid   datasetid  hasbird
0      64486  ff1010bird      0.0
1       2525  ff1010bird      0.0
2      4

[34m[1mwandb[0m: wandb version 0.10.22 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade



   | Name      | Type        | Params
-------------------------------------------
0  | layer1    | Sequential  | 192   
1  | layer2    | Sequential  | 2.4 K 
2  | layer3    | Sequential  | 2.4 K 
3  | layer4    | Sequential  | 2.4 K 
4  | dropout   | Dropout     | 0     
5  | fc1       | Linear      | 114 K 
6  | batch1    | BatchNorm1d | 512   
7  | leakyReLU | LeakyReLU   | 0     
8  | fc2       | Linear      | 8.2 K 
9  | batch2    | BatchNorm1d | 64    
10 | fc3       | Linear      | 33    
11 | sigmoid   | Sigmoid     | 0     
12 | flatten   | Flatten     | 0     
13 | train_acc | Accuracy    | 0     
14 | valid_acc | Accuracy    | 0     
15 | test_acc  | Accuracy    | 0     
-------------------------------------------
131 K     Trainable params
0         Non-trainable params
131 K     Total params
0.524     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  value = torch.tensor(value, device=device, dtype=torch.float)


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

                                     itemid          datasetid  hasbird
0      00053d90-e4b9-4045-a2f1-f39efc90cfa9  BirdVox-DCASE-20k      1.0
1      000db435-a40f-4ad9-a74e-d1af284d2c44  BirdVox-DCASE-20k      0.0
2      001059c0-e04f-42fc-a8e2-11aad24dc6fb  BirdVox-DCASE-20k      1.0
3      00106202-f61e-467d-a80f-070d90421952  BirdVox-DCASE-20k      0.0
4      00129593-77ca-40b2-a512-75d178071250  BirdVox-DCASE-20k      0.0
...                                     ...                ...      ...
19995  fff12db0-9cbe-4155-ac4a-b0b88d84c1d7  BirdVox-DCASE-20k      0.0
19996  fff78736-b90a-498e-a18d-a27cd3b83578  BirdVox-DCASE-20k      0.0
19997  fff80e7a-7913-4a58-ab9b-4facffe04e56  BirdVox-DCASE-20k      0.0
19998  fff847f3-fcfe-43b6-a7c2-85cd84a05cee  BirdVox-DCASE-20k      0.0
19999  fffda998-2df9-4055-ab82-2e16b95338a7  BirdVox-DCASE-20k      0.0

[20000 rows x 3 columns]
      itemid   datasetid  hasbird
0      64486  ff1010bird      0.0
1       2525  ff1010bird      0.0
2      4

Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_epoch_end_accuracy': 0.9098011255264282,
 'test_epoch_end_loss': 0.26148900389671326}
--------------------------------------------------------------------------------
[{'test_epoch_end_accuracy': 0.9098011255264282, 'test_epoch_end_loss': 0.26148900389671326}]


  value = torch.tensor(value, device=device, dtype=torch.float)


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss_step,0.48725
train_acc_step,0.875
epoch,24.0
_runtime,5000.0
_timestamp,1615938539.0
_step,34649.0
train_loss_epoch,0.21874
train_acc_epoch,0.91897
training_epoch_end_accuracy,0.91906
training_epoch_end_loss,0.21863


0,1
train_loss_step,▆█▅▃▅▃▄▆▃▆▃▃▃▃▄▅▃▂█▃▃▂▃▄▁▄▃▂▂▃▄▃▃▁▁▁▁▂▂▂
train_acc_step,▅▁▅▇▃▆▅▅▇▅▇▇▇▇▇▄▆█▂▇▅▇▇▆█▅▇▇██▇▇▇████▇▇▇
epoch,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇███
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_step,▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃▃▃▃▃▄▄▄▄▄▄▄▄▄▄█
train_loss_epoch,█▅▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁
train_acc_epoch,▁▄▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████
training_epoch_end_accuracy,▁▄▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████
training_epoch_end_loss,█▅▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁


In [26]:
places = []
with open('wrong_classified_validation_file_names.txt', 'r') as filehandle:
    for line in filehandle:
        # remove linebreak which is the last character of the string
        currentPlace = line[:-1]

        # add item to the list
        places.append(currentPlace)