In [None]:
%pip install pytorch_lightning
%pip install torchmetrics

In [1]:
import random
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import sklearn
import torchaudio
import torchmetrics
import os
import torch.optim as optim
import re
import pytorch_lightning as pl
import torchaudio.transforms as T
import torch.nn.functional as F
from tqdm import tqdm
from sklearn import preprocessing
from sklearn import model_selection
from torch.utils.data import DataLoader
from glob import glob

random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)
torch.backends.cudnn.deterministic = True

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
train_set = pd.read_csv('/home/shemetov@ad.speechpro.com/Desktop/DetectAudio/train.csv')
test_set = pd.read_csv('/home/shemetov@ad.speechpro.com/Desktop/DetectAudio/sample_submission.csv')
train_set.head(2)


Unnamed: 0,fname,label
0,8bcbcc394ba64fe85ed4.wav,Finger_snapping
1,00d77b917e241afa06f1.wav,Squeak


In [3]:
le = sklearn.preprocessing.LabelEncoder()
le = le.fit(train_set['label'].values)
le.transform(train_set['label'].values)
train_set['label_encoded'] = le.transform(train_set['label'].values)
train_set.head(2)

Unnamed: 0,fname,label,label_encoded
0,8bcbcc394ba64fe85ed4.wav,Finger_snapping,16
1,00d77b917e241afa06f1.wav,Squeak,34


In [4]:
def padding_to_wav(wav,pad_size,mode = 'None'):
  p = (pad_size - len(wav.flatten())) // 2 + 1
  if p>0:
    if mode == 'constant':
      wav = torch.nn.functional.pad(wav, (p, p), value=0.0)
    else:
      wav = torch.nn.functional.pad(wav, (p, p), mode='reflect')
  wav = wav[:, :pad_size]
  return wav
  

In [4]:
n_fft = 256
win_length = None #Window size. (Default: n_fft)
hop_length = 100 #Length of hop between STFT windows. (Default: win_length // 2)
n_mels = 128
n_mfcc = 40


mfcc_transform = T.MFCC(
    n_mfcc = n_mfcc
)



In [13]:
class AudioDataset(torch.utils.data.Dataset):
  def __init__(self, dir_path, X, y = None):
    self.X = X
    self.y = y
    self.average_len_wav = 108929
    self.dir_path = dir_path

  def __getitem__(self, index):
    path_to_wav = os.path.join(self.dir_path, self.X[index])
    wav_pad,sr = torchaudio.load(path_to_wav)
    #feature = torch.transpose(mfcc_transform(wav_pad),0,1)
    #print(feature.shape)
    wav_pad = padding_to_wav(wav,self.average_len_wav,'constant')
    if self.y is not None:
      return wav_pad, self.y[index]
    return wav_pad

  def __len__(self):
    return len(self.X)

In [14]:
DIR_TRAIN_PATH = '/home/shemetov@ad.speechpro.com/Desktop/DetectAudio/audio_train'
DIR_TEST_PATH = '/home/shemetov@ad.speechpro.com/Desktop/DetectAudio/audio_test'

BATCH_SIZE = 41

X_train, X_val, y_train, y_val = sklearn.model_selection.train_test_split(train_set.fname.values, train_set.label_encoded.values, 
                                                  test_size=0.2, random_state=42)

train_loader = DataLoader(AudioDataset(os.path.join(DIR_TRAIN_PATH, 'train'), X_train, y_train),batch_size=BATCH_SIZE, num_workers=12)

val_loader = DataLoader(AudioDataset(os.path.join(DIR_TRAIN_PATH, 'train'), X_val, y_val),batch_size=BATCH_SIZE, num_workers=12)

test_loader = DataLoader(AudioDataset(os.path.join(DIR_TEST_PATH, 'test'), test_set.fname.values, None),batch_size=BATCH_SIZE, shuffle=False, num_workers=12)

In [16]:
print(next(iter(train_loader)).shape) 


RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/shemetov@ad.speechpro.com/.local/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 287, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/shemetov@ad.speechpro.com/.local/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
    return self.collate_fn(data)
  File "/home/shemetov@ad.speechpro.com/.local/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 172, in default_collate
    return [default_collate(samples) for samples in transposed]  # Backwards compatibility.
  File "/home/shemetov@ad.speechpro.com/.local/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 172, in <listcomp>
    return [default_collate(samples) for samples in transposed]  # Backwards compatibility.
  File "/home/shemetov@ad.speechpro.com/.local/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 137, in default_collate
    out = elem.new(storage).resize_(len(batch), *list(elem.size()))
RuntimeError: Trying to resize storage that is not resizable


In [None]:
class CNNet(pl.LightningModule):
    def __init__(self,le,in_channels = 1,out_channels = 24,kernel_size = 3,num_output = 41):
        super().__init__()
        #self.model = model
        self.le = le
        self.mfcc = mfcc_transform
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size,padding=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.pool1 = nn.MaxPool2d(2,2)
        #self.drop1 = nn.Dropout(0.2)
        self.conv2 = nn.Conv2d(out_channels, 32, 3,padding=1)
        self.bn2 = nn.BatchNorm2d(32)
        self.pool2 = nn.MaxPool2d(2,2)
        #self.drop2 = nn.Dropout(0.2)
        self.conv3 = nn.Conv2d(32, 64, 3,padding=1)
        self.bn3 = nn.BatchNorm2d(64)
        #self.drop3 = nn.Dropout(0.2)
        self.pool3 = nn.MaxPool2d(2,2)
        self.conv4 = nn.Conv2d(64, 128, 3,padding=1)
        self.bn4 = nn.BatchNorm2d(128)
        #self.drop4 = nn.Dropout(0.2)
        self.pool4 = nn.MaxPool2d(2,2)
        #self.conv5 = nn.Conv2d(2 * out_channels, 2 * out_channels, 3)
        #self.bn5 = nn.BatchNorm2d(2 * out_channels)
        #self.drop5 = nn.Dropout(0.2)
        #self.pool5 = nn.MaxPool2d(2,2)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(8704, 128)
        self.drop1 = nn.Dropout(0.2)
        #self.fc2 = nn.Linear(128, 128)
        self.fc2 = nn.Linear(128, 41)
        #self.drop3 = nn.Dropout(0.5)
        #self.fc4 = nn.Linear(64, 41)
        self.f1_train = torchmetrics.F1Score(num_classes=num_output)
        self.f1_val = torchmetrics.F1Score(num_classes=num_output)
        self.acc_train = torchmetrics.Accuracy(num_classes=num_output)
        self.acc_val = torchmetrics.Accuracy(num_classes=num_output)

    def forward(self, x):
        x = self.mfcc(x)
        x = self.pool1(self.bn1(self.conv1(x)))
        x = self.pool2(self.bn2(self.conv2(x)))
        x = self.pool3(self.bn3(self.conv3(x)))
        x = self.pool4(self.bn4(self.conv4(x)))
        x = self.flatten(x)
        x = F.relu(self.fc1(x))
        x = self.drop1(x)
        x = self.fc2(x)
        return F.log_softmax(x,dim=1) 

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_pred = self(x)
        loss = F.cross_entropy(y_pred, y)
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        self.f1_train(y_pred, y)
        self.acc_train(y_pred, y)
        return loss

    def training_epoch_end(self, outs):
        f1 = self.f1_train.compute()
        acc = self.acc_train.compute()
        self.log('train_f1_epoch', f1)
        self.log('train_acc_epoch', acc)
        print('train_f1', f1, 'train_acc', acc)
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_pred = self(x)
        loss = F.cross_entropy(y_pred, y)
        self.f1_val(y_pred, y)
        self.acc_val(y_pred, y)
        self.log("val_loss", loss)

    def validation_epoch_end(self, validation_step_outputs):
        f1 = self.f1_val.compute()
        acc = self.acc_val.compute()
        self.log('val_f1_epoch', f1)
        self.log('val_acc_epoch', acc)
        print('val_f1', f1, 'val_acc', acc)
        
    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        x = batch
        y_pred = self(x)
        y_pred = torch.argmax(y_pred, dim=1)
        y_pred = self.le.inverse_transform(y_pred.cpu().detach().numpy())
        return y_pred

    def configure_optimizers(self):
        return optim.Adam(model.parameters(), lr=0.0001, weight_decay=0.1)

In [24]:
class CNNet(pl.LightningModule):
    def __init__(self,le,in_channels = 1,out_channels = 24,kernel_size = 3,num_output = 41):
        super().__init__()
        #self.model = model
        self.le = le
        self.mfcc = mfcc_transform
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size,padding=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.pool1 = nn.MaxPool2d(2,2)
        #self.drop1 = nn.Dropout(0.2)
        self.conv2 = nn.Conv2d(out_channels, 32, 3,padding=1)
        self.bn2 = nn.BatchNorm2d(32)
        self.pool2 = nn.MaxPool2d(2,2)
        #self.drop2 = nn.Dropout(0.2)
        self.conv3 = nn.Conv2d(32, 64, 3,padding=1)
        self.bn3 = nn.BatchNorm2d(64)
        #self.drop3 = nn.Dropout(0.2)
        self.pool3 = nn.MaxPool2d(2,2)
        self.conv4 = nn.Conv2d(64, 128, 3,padding=1)
        self.bn4 = nn.BatchNorm2d(128)
        #self.drop4 = nn.Dropout(0.2)
        self.pool4 = nn.MaxPool2d(2,2)
        #self.conv5 = nn.Conv2d(2 * out_channels, 2 * out_channels, 3)
        #self.bn5 = nn.BatchNorm2d(2 * out_channels)
        #self.drop5 = nn.Dropout(0.2)
        #self.pool5 = nn.MaxPool2d(2,2)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(8704, 128)
        self.drop1 = nn.Dropout(0.2)
        #self.fc2 = nn.Linear(128, 128)
        self.fc2 = nn.Linear(128, 41)
        #self.drop3 = nn.Dropout(0.5)
        #self.fc4 = nn.Linear(64, 41)
        self.f1_train = torchmetrics.F1Score(num_classes=num_output)
        self.f1_val = torchmetrics.F1Score(num_classes=num_output)
        self.acc_train = torchmetrics.Accuracy(num_classes=num_output)
        self.acc_val = torchmetrics.Accuracy(num_classes=num_output)

    def forward(self, x):
        x = self.mfcc(x)
        x = self.pool1(self.bn1(self.conv1(x)))
        x = self.pool2(self.bn2(self.conv2(x)))
        x = self.pool3(self.bn3(self.conv3(x)))
        x = self.pool4(self.bn4(self.conv4(x)))
        x = self.flatten(x)
        x = F.relu(self.fc1(x))
        x = self.drop1(x)
        x = self.fc2(x)
        return F.log_softmax(x,dim=1) 

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_pred = self(x)
        loss = F.cross_entropy(y_pred, y)
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        self.f1_train(y_pred, y)
        self.acc_train(y_pred, y)
        return loss

    def training_epoch_end(self, outs):
        f1 = self.f1_train.compute()
        acc = self.acc_train.compute()
        self.log('train_f1_epoch', f1)
        self.log('train_acc_epoch', acc)
        print('train_f1', f1, 'train_acc', acc)
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_pred = self(x)
        loss = F.cross_entropy(y_pred, y)
        self.f1_val(y_pred, y)
        self.acc_val(y_pred, y)
        self.log("val_loss", loss)

    def validation_epoch_end(self, validation_step_outputs):
        f1 = self.f1_val.compute()
        acc = self.acc_val.compute()
        self.log('val_f1_epoch', f1)
        self.log('val_acc_epoch', acc)
        print('val_f1', f1, 'val_acc', acc)
        
    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        x = batch
        y_pred = self(x)
        y_pred = torch.argmax(y_pred, dim=1)
        y_pred = self.le.inverse_transform(y_pred.cpu().detach().numpy())
        return y_pred

    def configure_optimizers(self):
        return optim.Adam(model.parameters(), lr=0.0001, weight_decay=0.1)

In [None]:
trainer = pl.Trainer(max_epochs=100,accelerator="gpu")
model = CNNet(le = le)
trainer.fit(model, train_dataloaders=train_loader,val_dataloaders=val_loader)

In [26]:
pred = trainer.predict(model, test_loader)
pred = np.concatenate(pred)
print(pred)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 111it [00:00, ?it/s]

['Knock' 'Acoustic_guitar' 'Knock' ... 'Shatter' 'Double_bass' 'Oboe']


In [58]:
print(predictions[0])

tensor([[ -4.6247, -10.0852,  -5.0876,  ...,  -5.4911,  -4.8031,  -6.9315],
        [ -0.0927, -12.2059,  -8.6241,  ...,  -9.4892,  -7.1551,  -9.3535],
        [ -4.0457,  -9.4591,  -6.7104,  ...,  -5.6077,  -4.8786,  -4.3759],
        ...,
        [ -5.1363,  -7.2333,  -6.3280,  ...,  -3.2359,  -4.2287,  -7.8392],
        [ -5.9941,  -4.8185,  -4.1072,  ...,  -3.0035,  -6.4710,  -3.0381],
        [ -5.4579,  -3.5796,  -4.4503,  ...,  -3.0990,  -4.8004,  -2.2101]])


In [27]:
df = pd.DataFrame({'fname':test_set.fname.values, 'label':pred})
df = df.reset_index(drop=True)
df.to_csv('response_result.csv', index=False)
df

Unnamed: 0,fname,label
0,973a8b381d1875ebe120.wav,Knock
1,052e1cbeb4391d6af234.wav,Acoustic_guitar
2,16eb98d96319450e0949.wav,Knock
3,ff13c136c04f274229a5.wav,Cowbell
4,b458211304f7a14b29b2.wav,Cough
...,...,...
3785,959206e695c2ceed3d3b.wav,Snare_drum
3786,3624e1c6d306945d7ba9.wav,Bass_drum
3787,4a22b7dd3f519e2a9111.wav,Shatter
3788,a2c95e48acc6a3ec2696.wav,Double_bass


###Результат экспериментов

####Набор данных:
* Длительность каждой записи: 6 сек (Была посчитана средняя продолжительность всех аудиозаписей и использован метод padding, где по краям были добавлены нулевые значения)
* Признаки: MFCC Features
* Размерность: [5683, 1, 256, 213]


####**Эксперимент №1**:
*   Оптимизация: Adam( lr=0.0001, weight_decay=0.1)
*   Эпох: 25
*   Batch-size: 32
*   Коэф. Dropout: 0.2
*   MaxPool размер: 2
*   Количество сверточных/полносвязных слоев: 4x Conv2d, 2x Full Layer

MFCC:
* n_fft = 2048
* win_length = None
* hop_length = 512
* n_mels = 256
* n_mfcc = 40

Результат:

* train f1-score: 0.5761
* val f1-score: 0.506

####**Эксперимент №2**:
*   Оптимизация: Adam( lr=0.0001, weight_decay=0.1)
*   Эпох: 57
*   Batch-size: 32
*   Коэф. Dropout: 0.2
*   MaxPool размер: 2
*   Количество сверточных/полносвязных слоев: 4x Conv2d, 2x Full Layer

MFCC:
* n_fft = 2048
* win_length = None
* hop_length = 512
* n_mels = 256
* n_mfcc = 40

Результат:

* train f1-score: 0.6524
* val f1-score: 0.5414

####**Эксперимент №3**:
*   Оптимизация: Adam( lr=0.0001, weight_decay=0.1)
*   Эпох: 75
*   Batch-size: 41
*   Коэф. Dropout: 0.2
*   MaxPool размер: 2
*   Количество сверточных/полносвязных слоев: 4x Conv2d, 2x Full Layer

MFCC:
* n_fft = 2048
* win_length = None 
* hop_length = 512
* n_mels = 256
* n_mfcc = 40

Результат:

* train f1-score: 0.6910
* val f1-score: 0.5570

####**Эксперимент №4**:
*   Оптимизация: Adam( lr=0.0001, weight_decay=0.1)
*   Эпох: 100
*   Batch-size: 41
*   Коэф. Dropout: 0.2
*   MaxPool размер: 2
*   Количество сверточных/полносвязных слоев: 4x Conv2d, 2x Full Layer

MFCC:
* n_fft = 2048
* win_length = None 
* hop_length = 512
* n_mels = 256
* n_mfcc = 40

Результат:

* train f1-score: 0.7233
* val f1-score: 0.5627

####**Эксперимент №5**:
*   Оптимизация: Adam( lr=0.0001, weight_decay=0.1)
*   Эпох: 100
*   Batch-size: 41
*   Коэф. Dropout: 0.2
*   MaxPool размер: 2
*   Количество сверточных/полносвязных слоев: 4x Conv2d, 2x Full Layer

MFCC:
* n_fft = 256
* win_length = None #Window size.
* hop_length = 128 #Length of hop between STFT windows.
* n_mels = 128
* n_mfcc = 40

Результат:

* train f1-score: 0.6993
* val f1-score: 0.5581
* kaggle: 0.55057

####**Эксперимент №6**:
*   Оптимизация: Adam( lr=0.0001, weight_decay=0.1)
*   Эпох: 100
*   Batch-size: 41
*   Коэф. Dropout: 0.2
*   MaxPool размер: 2
*   Количество сверточных/полносвязных слоев: 4x Conv2d, 2x Full Layer

MFCC:
* n_mfcc = 40
* Остальные параметры по-умолчанию метода

Результат:

* train f1-score: 0.6927
* val f1-score: 0.5528
* loss: 1.31
* kaggle: 0.55672