## ESC-50: Dataset for Environmental Sound Classification
The ESC-50 dataset is a labeled collection of 2000 environmental audio recordings suitable for benchmarking methods of environmental sound classification.

The dataset consists of 5-second-long recordings organized into 50 semantical classes (with 40 examples per class) loosely arranged into 5 major categories:

The dataset has been prearranged into 5 folds for comparable cross-validation, making sure that fragments from the same original source file are contained in a single fold.


## Content
**audio/*.wav**

    2000 audio recordings in WAV format (5 seconds, 44.1 kHz, mono) with the following naming convention:

    {FOLD}-{CLIP_ID}-{TAKE}-{TARGET}.wav

    {FOLD} - index of the cross-validation fold,
    {CLIP_ID} - ID of the original Freesound clip,
    {TAKE} - letter disambiguating between different fragments from the same Freesound clip,
    {TARGET} - class in numeric format [0, 49].
**esc50.csv**
   
##### ESC-50 Data Fields
    - audio: tensor containing audio data.
    - labels: tensor containing labels for the audio.
    - esc10: The esc10 column indicates if a given file belongs to the ESC-10 subset (10 selected classes, CC BY license)
    - take: tensor containing the revision number of the audio.
    - fold: tensor containing fold number of the audio.
    - target: tensor containing id of the audio.
    - src_file: tensor containing file name of the audio.

In [1]:
import os
import pandas as pd
import torchaudio,torch
from torch.utils.data import Dataset


In [2]:
df = pd.read_csv('./meta/esc50.csv')
df.head()

Unnamed: 0,filename,fold,target,category,esc10,src_file,take
0,1-100032-A-0.wav,1,0,dog,True,100032,A
1,1-100038-A-14.wav,1,14,chirping_birds,False,100038,A
2,1-100210-A-36.wav,1,36,vacuum_cleaner,False,100210,A
3,1-100210-B-36.wav,1,36,vacuum_cleaner,False,100210,B
4,1-101296-A-19.wav,1,19,thunderstorm,False,101296,A


In [3]:
df.shape

(2000, 7)

In [4]:
class ESCDataset(Dataset):
    
    def __init__(self, 
                 annotations_file, 
                 audio_dir,
                 transformation,
                 target_sample_rate,
                 num_samples,
                 device
                ):
        self.annotations = pd.read_csv(annotations_file)
        self.audio_dir = audio_dir
        self.device = device
        self.transformation = transformation.to(self.device)
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples
        
        
    def __len__(self):
        return len(self.annotations)
    
    def get_audio_sample_path(self, index):
        path = os.path.join(self.audio_dir, self.annotations.iloc[index,0])
        return path
    
    def get_audio_sample_label(self, index):
        return self.annotations.iloc[index,2]
    
    def update_sample_rate(self,signal,sr):
        if sr!=self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sr,self.target_sample_rate)
            signal = resampler(signal)
        return signal
            
    def convert_channel_to_mono(self, signal):
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim= True)
        return signal
    
    def cut_signal(self,signal):
        if signal.shape[1] > self.num_samples:
            signal = signal[:,:self.num_samples]
        return signal
    
    def right_padding(self,signal):
        signal_len = signal.shape[1]
        if signal_len < self.num_samples:
            num_missing_samples = self.num_samples - signal_len
            last_dim_padding = (0,num_missing_samples)#1,3
            #[1,1,1] -> [0,1,1,1,0,0,0]
            signal = torch.nn.functional.pad(signal, last_dim_padding)
        return signal
    
    def __getitem__(self,index):
        '''
        loading the waveform of audio sample associated with index.
        Also return label associated with it
        '''
        audio_sample_path = self.get_audio_sample_path(index)
        label = self.get_audio_sample_label(index)
        signal, sr = torchaudio.load(audio_sample_path)
        signal = signal.to(self.device)
        print(signal)
        # Update all sample rates to a constant target sample rate
        signal = self.update_sample_rate(signal,sr)
        print(signal)
        #converting channels to mono
        signal = self.convert_channel_to_mono(signal)
        signal = self.cut_signal(signal)
        signal = self.right_padding(signal)
        signal = self.transformation(signal)
        return signal, label

In [5]:
ANNOTATION_FILE = './meta/esc50.csv'
AUDIO_DIR = './audio/audio/'
SAMPLE_RATE = 22050
NUM_SAMPLES = 22050

if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

#Callable Objects
mel_spectrogram = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_fft=1024,
    hop_length=512,
    n_mels=64
)

esc = ESCDataset(
                    ANNOTATION_FILE,
                 AUDIO_DIR,
                 mel_spectrogram,
                 SAMPLE_RATE,
                 NUM_SAMPLES,
                 device
                )
print(f"There are {len(esc)} samples in dataset")

There are 2000 samples in dataset


In [6]:
from torchsummary import summary
from torch import nn

class CNNNetwork(nn.Module):

    def __init__(self):
        super().__init__()
        # 4 conv blocks / flatten / linear / softmax
        self.conv1 = nn.Sequential(
            nn.Conv2d(
                in_channels=1,
                out_channels=16,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(
                in_channels=16,
                out_channels=32,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(
                in_channels=32,
                out_channels=64,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.conv4 = nn.Sequential(
            nn.Conv2d(
                in_channels=64,
                out_channels=128,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.flatten = nn.Flatten()
        self.linear = nn.Linear(128 * 5 * 4, 50)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_data):
        x = self.conv1(input_data)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.flatten(x)
        logits = self.linear(x)
        predictions = self.softmax(logits)
        return predictions


if __name__ == "__main__":
    cnn = CNNNetwork()
    summary(cnn.cuda(), (1, 64, 44))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 16, 66, 46]             160
              ReLU-2           [-1, 16, 66, 46]               0
         MaxPool2d-3           [-1, 16, 33, 23]               0
            Conv2d-4           [-1, 32, 35, 25]           4,640
              ReLU-5           [-1, 32, 35, 25]               0
         MaxPool2d-6           [-1, 32, 17, 12]               0
            Conv2d-7           [-1, 64, 19, 14]          18,496
              ReLU-8           [-1, 64, 19, 14]               0
         MaxPool2d-9             [-1, 64, 9, 7]               0
           Conv2d-10           [-1, 128, 11, 9]          73,856
             ReLU-11           [-1, 128, 11, 9]               0
        MaxPool2d-12            [-1, 128, 5, 4]               0
          Flatten-13                 [-1, 2560]               0
           Linear-14                   

In [7]:
BATCH_SIZE = 128
EPOCHS = 10
LEARNING_RATE = 0.001

In [8]:
def train_single_epoch(model, data_loader, loss_fn, optimiser, device):
    for input, target in data_loader:
        input, target = input.to(device), target.to(device)

        # calculate loss
        prediction = model(input)
        loss = loss_fn(prediction, target)

        # backpropagate error and update weights
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()

    print(f"loss: {loss.item()}")


def train(model, data_loader, loss_fn, optimiser, device, epochs):
    for i in range(epochs):
        print(f"Epoch {i+1}")
        train_single_epoch(model, data_loader, loss_fn, optimiser, device)
        print("---------------------------")
    print("Finished training")

In [9]:
from torch.utils.data import DataLoader
def create_data_loader(train_data, batch_size):
    train_dataloader = DataLoader(train_data, batch_size=batch_size)
    return train_dataloader
train_dataloader = create_data_loader(esc, BATCH_SIZE)

In [10]:
    # construct model and assign it to device
    cnn = CNNNetwork().to(device)
    print(cnn)

    # initialise loss funtion + optimiser
    loss_fn = nn.CrossEntropyLoss()
    optimiser = torch.optim.Adam(cnn.parameters(),
                                 lr=LEARNING_RATE)

    # train model
    train(cnn, train_dataloader, loss_fn, optimiser, device, EPOCHS)


CNNNetwork(
  (conv1): Sequential(
    (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv3): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv4): Sequential(
    (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear): Linear(in_features=2560, out_features=50, bias=True)
  (softmax): Softmax(dim=1)
)
Epoch 1
tensor([[0., 0., 0.,  ..., 0., 

RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same

In [None]:

# save model
torch.save(cnn.state_dict(), "esc.pth")
print("Trained feed forward net saved at feedforwardnet.pth")

In [11]:
!pip install -r https://raw.githubusercontent.com/MicrosoftDocs/pytorchfundamentals/main/audio-pytorch/install-packages.txt

Collecting sox
  Downloading sox-1.4.1-py2.py3-none-any.whl (39 kB)
Installing collected packages: sox
Successfully installed sox-1.4.1
