In [1]:
%matplotlib inline
import os
import random
import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn import model_selection
from sklearn import preprocessing
import IPython.display as ipd
import cv2



In [2]:
#!ls

In [2]:
train_file = "train.csv"
audio_dir = "audio_files/"

meta_data = pd.read_csv(train_file)
meta_data.head()

Unnamed: 0,file_name,target
0,c2c4efc4-0c7c-4696-8743-dd9d08209d71.wav,six
1,a0513500-ab86-4720-87ac-4916f06988ff.wav,five
2,7a9e9857-6a9f-4306-aab0-d2de315fae24.wav,wow
3,efc1d37d-bca9-4a84-b1cd-242396f52c73.wav,cat
4,21c3143d-3a8f-4e84-b48a-a8e25b2757c3.wav,forward


In [3]:
data_size = meta_data.shape
data_size

(74080, 2)

In [6]:
x = list(meta_data.loc[:,"file_name"])
y = list(meta_data.loc[:, "target"])

print(len(x),len(y))
target_set = set(y)
class_dic = dict()

for i,j in enumerate(target_set):
    class_dic[j] = i

print(class_dic)
x_dir = []
y_dir = []
ind = 0

while ind < len(x):
    file_path = audio_dir + x[ind]
    class_num = class_dic[y[ind]]
    x_dir.append(file_path)
    y_dir.append(class_num)

    ind = ind + 1  
    #if ind%(len(x)/10) == 0:
    #    print('process = {}%'.format(ind*10/(len(x)/10)))
    #    print("x_dir = {}, y_dir = {}".format(len(x_dir),len(y_dir)))    

x_train, x_test, y_train, y_test = model_selection.train_test_split(x_dir, y_dir, test_size=0.2, stratify=y_dir)
print("x train:{0}\ny train:{1}\nx test:{2}\ny test:{3}".format(len(x_train),
                                                                len(y_train), 
                                                                len(x_test), 
                                                                len(y_test)))
print("x_train = \n", x_train[0:10])
print("y_train = \n", y_train[0:10])                                                    

74080 74080
{'bird': 0, 'zero': 1, 'eight': 2, 'left': 3, 'backward': 4, 'four': 5, 'cat': 6, 'off': 7, 'forward': 8, 'one': 9, 'house': 10, 'dog': 11, 'tree': 12, 'go': 13, 'happy': 14, 'marvin': 15, 'visual': 16, 'no': 17, 'down': 18, 'bed': 19, 'up': 20, 'yes': 21, 'sheila': 22, 'learn': 23, 'follow': 24, 'right': 25, 'stop': 26, 'three': 27, 'six': 28, 'on': 29, 'five': 30, 'seven': 31, 'nine': 32, 'wow': 33, 'two': 34}
x train:59264
y train:59264
x test:14816
y test:14816
x_train = 
 ['audio_files/47c38c0c-6d2b-4c55-8acc-dfa3b20bc0c6.wav', 'audio_files/dd2e3aa1-b3b5-4006-ae8d-ed15be7378b7.wav', 'audio_files/29ce847b-742b-47fa-a96d-4730f1d86d08.wav', 'audio_files/0b18a604-9097-4c48-9fc9-c1273ae7e1e6.wav', 'audio_files/59cc42f7-fd45-4a7c-a892-d162407ace67.wav', 'audio_files/30847870-eb87-4e88-8f0f-cee678250d6a.wav', 'audio_files/91c3b48b-a65d-4323-b0d6-545c2dfd904e.wav', 'audio_files/5c590ac0-1578-48da-8479-9554545c0dbc.wav', 'audio_files/ca6b84f0-de91-4bef-bd17-4d4d3f05f41b.wav', '

In [7]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torchaudio
import torch.nn.functional as F
import torchvision
from torch.autograd import Variable

from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

In [8]:
class SpeechDataset(Dataset):
    def __init__(self, data_path, target=None, is_test=False, augmentation=None):
        super().__init__()
        self.data_path = data_path
        self.target = target
        self.is_test = is_test
        self.augmentation = augmentation
        self.duration = 1000
        self.sr = 16000
        self.n_fft = 1024
        self.hop_length = None
        self.n_mels = 64
        self.top_db = 80

    def __len__(self):
        return len(self.data_path)

    # def __getitem__(self, item):
    #     wav, sr = load_wave_data(self.data_path[item])
    #     #melsp_x = librosa.feature.melspectrogram(y=wav, sr=sr, hop_length=512, n_mels=32, n_fft=2048)
    #     #melsp_x = librosa.feature.melspectrogram(y=norm_wav, sr=16000, hop_length=512, n_mels=32, n_fft=2048)
    #     #melsp_x = calculate_melsp(wav,n_mels=64,n_fft=1024,hop_length=256)
    #     melsp_x = calculate_melsp(wav,n_mels=64,n_fft=1024,hop_length=None)
    #     norm_melsp = (melsp_x - sample_mean) / (sample_std + 1e-6)

    #     norm_melsp = norm_melsp[np.newaxis, :, :]
    #     norm_melsp = norm_melsp.astype('float32')
    #     norm_melsp = torch.from_numpy(norm_melsp)
    #     return norm_melsp, self.target[item]

    def __getitem__(self,idx):
        file_path = self.data_path[idx]
        class_id = self.target[idx]

        samples,sr = torchaudio.load(file_path)
        samples = self._pad_trunc(samples,self.sr)

        spect = torchaudio.transforms.MelSpectrogram(
            self.sr,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            n_mels=self.n_mels
        )(samples)

        spect = torchaudio.transforms.AmplitudeToDB(top_db=self.top_db)(spect)
        spect = self.rechannel(spect,self.sr,3)

        return spect, self.target[idx]
        

    def _pad_trunc(self, samples, sr):
        num_rows, signal_len = samples.shape
        max_len = sr // 1000 * self.duration

        if (signal_len > max_len):
            # Truncate the signal to the given length
            samples = samples[:, max_len]

        elif (signal_len < max_len):
            # Length of padding to add at the beginning and end of the signal
            pad_begin_len = random.randint(0, max_len - signal_len)
            pad_end_len = max_len - signal_len - pad_begin_len

            # Pad with 0s
            pad_begin = torch.zeros((1, pad_begin_len))
            pad_end = torch.zeros((1, pad_end_len))

            samples = torch.cat((pad_begin, samples, pad_end), 1)

        return samples

    def rechannel(self, spect, sr, num_channel):
        if (spect.shape[0] == num_channel):
            # Nothing to do
            return spect

        if (num_channel == 1):
            # Convert from stereo to mono by selecting only the first channel
            spect = spect[:1, :]
        else:
            # Convert from mono to stereo by duplicating the first channel
            spect = torch.cat([spect, spect, spect])

        return spect

    def _time_shift(self, samples, sr, shift_limit):
        _, sig_len = samples.shape
        shift_amt = int(random.random() * shift_limit * sig_len)
        return samples.roll(shift_amt)
    


In [9]:
config = {
    'batch_size': 64,
    'num_workers': 0,
    'epochs': 10,
    'device': 'cpu'
}


In [10]:
train_dataset = SpeechDataset(
    data_path = x_train,
    target = y_train,
    is_test = False
)
valid_dataset = SpeechDataset(
    data_path = x_test,
    target = y_test,
    is_test = False
)
train_loader = DataLoader(
    train_dataset,
    batch_size = config['batch_size'],
    shuffle = True,
    num_workers = config['num_workers'],
    drop_last = True
)
valid_loader = DataLoader(
    valid_dataset,
    batch_size = config['batch_size'],
    shuffle = False,
    num_workers = config['num_workers'],
    drop_last = False
)

In [12]:
model = torchvision.models.resnet18(num_classes=35)
#model = torchvision.models.mobilenet_v3_small(num_classes=35)
#model.features[0][0] = nn.Conv2d(1, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
model.to(config['device'])

# 获取优化方法
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4, betas=(0.9,0.99), eps=1e-6, weight_decay=5e-4)
#optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9, weight_decay=1e-2)
# 获取学习率衰减函数
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)
#scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[1, 2, 3, 4], gamma=0.3)
# 获取损失函数
loss = torch.nn.CrossEntropyLoss()
device = 'cpu'

In [21]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")


In [18]:
def validation(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
            
    test_loss /= num_batches
    correct /= size
    print(f"Val Error: \n Accuracy: {(100 * correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    

In [22]:
epochs = 30
for t in tqdm(range(epochs)):
    print(f"Epoch {t + 1}\n-------------------------------")
    train(train_loader, model, loss, optimizer)
    validation(valid_loader, model, loss)

print("Done!")


  0%|          | 0/30 [00:00<?, ?it/s]

Epoch 1
-------------------------------
loss: 2.481280  [    0/59264]
loss: 1.703591  [ 6400/59264]
loss: 1.234425  [12800/59264]
loss: 1.049305  [19200/59264]
loss: 0.965315  [25600/59264]
loss: 0.907588  [32000/59264]
loss: 0.742032  [38400/59264]
loss: 0.751323  [44800/59264]
loss: 0.496773  [51200/59264]
loss: 0.571004  [57600/59264]


  3%|▎         | 1/30 [07:39<3:42:02, 459.38s/it]

Val Error: 
 Accuracy: 81.0%, Avg loss: 0.625335 

Epoch 2
-------------------------------
loss: 0.727611  [    0/59264]
loss: 0.672994  [ 6400/59264]
loss: 0.525397  [12800/59264]
loss: 0.631269  [19200/59264]
loss: 0.359109  [25600/59264]
loss: 0.356472  [32000/59264]
loss: 0.567875  [38400/59264]
loss: 0.430809  [44800/59264]
loss: 0.392609  [51200/59264]
loss: 0.377854  [57600/59264]


  7%|▋         | 2/30 [15:16<3:33:46, 458.10s/it]

Val Error: 
 Accuracy: 85.6%, Avg loss: 0.478784 

Epoch 3
-------------------------------
loss: 0.341169  [    0/59264]
loss: 0.272391  [ 6400/59264]
loss: 0.261559  [12800/59264]
loss: 0.180442  [19200/59264]
loss: 0.353578  [25600/59264]
loss: 0.425014  [32000/59264]
loss: 0.374524  [38400/59264]
loss: 0.432237  [44800/59264]
loss: 0.385376  [51200/59264]
loss: 0.262756  [57600/59264]


 10%|█         | 3/30 [22:52<3:25:39, 457.02s/it]

Val Error: 
 Accuracy: 87.7%, Avg loss: 0.405206 

Epoch 4
-------------------------------
loss: 0.287221  [    0/59264]
loss: 0.456866  [ 6400/59264]
loss: 0.189802  [12800/59264]
loss: 0.225572  [19200/59264]
loss: 0.281038  [25600/59264]
loss: 0.258909  [32000/59264]
loss: 0.137221  [38400/59264]
loss: 0.172309  [44800/59264]
loss: 0.278949  [51200/59264]
loss: 0.293372  [57600/59264]


 13%|█▎        | 4/30 [30:26<3:17:30, 455.80s/it]

Val Error: 
 Accuracy: 88.0%, Avg loss: 0.398976 

Epoch 5
-------------------------------
loss: 0.297384  [    0/59264]
loss: 0.171815  [ 6400/59264]
loss: 0.169197  [12800/59264]
loss: 0.178189  [19200/59264]
loss: 0.228392  [25600/59264]
loss: 0.162928  [32000/59264]
loss: 0.296638  [38400/59264]
loss: 0.238760  [44800/59264]
loss: 0.218751  [51200/59264]
loss: 0.346815  [57600/59264]


 17%|█▋        | 5/30 [38:42<3:15:55, 470.22s/it]

Val Error: 
 Accuracy: 88.5%, Avg loss: 0.391536 

Epoch 6
-------------------------------
loss: 0.162231  [    0/59264]
loss: 0.167179  [ 6400/59264]
loss: 0.163926  [12800/59264]
loss: 0.108232  [19200/59264]
loss: 0.168646  [25600/59264]
loss: 0.028973  [32000/59264]
loss: 0.131156  [38400/59264]
loss: 0.185285  [44800/59264]
loss: 0.078865  [51200/59264]
loss: 0.149404  [57600/59264]


 20%|██        | 6/30 [46:23<3:06:54, 467.29s/it]

Val Error: 
 Accuracy: 89.0%, Avg loss: 0.381273 

Epoch 7
-------------------------------
loss: 0.094476  [    0/59264]
loss: 0.050540  [ 6400/59264]
loss: 0.227236  [12800/59264]
loss: 0.168350  [19200/59264]
loss: 0.153283  [25600/59264]
loss: 0.183500  [32000/59264]
loss: 0.221535  [38400/59264]
loss: 0.169383  [44800/59264]
loss: 0.108187  [51200/59264]
loss: 0.141752  [57600/59264]


 23%|██▎       | 7/30 [54:00<2:57:51, 464.00s/it]

Val Error: 
 Accuracy: 89.8%, Avg loss: 0.360992 

Epoch 8
-------------------------------
loss: 0.182353  [    0/59264]
loss: 0.079080  [ 6400/59264]
loss: 0.106048  [12800/59264]
loss: 0.225905  [19200/59264]
loss: 0.152329  [25600/59264]
loss: 0.221972  [32000/59264]
loss: 0.078138  [38400/59264]
loss: 0.166395  [44800/59264]
loss: 0.123466  [51200/59264]
loss: 0.164292  [57600/59264]


 27%|██▋       | 8/30 [1:01:37<2:49:13, 461.51s/it]

Val Error: 
 Accuracy: 89.6%, Avg loss: 0.367865 

Epoch 9
-------------------------------
loss: 0.092914  [    0/59264]
loss: 0.126417  [ 6400/59264]
loss: 0.204462  [12800/59264]
loss: 0.048378  [19200/59264]
loss: 0.242186  [25600/59264]
loss: 0.240746  [32000/59264]
loss: 0.095853  [38400/59264]
loss: 0.134812  [44800/59264]
loss: 0.122115  [51200/59264]
loss: 0.032261  [57600/59264]


 30%|███       | 9/30 [1:09:13<2:40:58, 459.94s/it]

Val Error: 
 Accuracy: 90.0%, Avg loss: 0.360124 

Epoch 10
-------------------------------
loss: 0.204911  [    0/59264]
loss: 0.043918  [ 6400/59264]
loss: 0.176894  [12800/59264]
loss: 0.153172  [19200/59264]
loss: 0.090665  [25600/59264]
loss: 0.029441  [32000/59264]
loss: 0.013002  [38400/59264]
loss: 0.137095  [44800/59264]
loss: 0.088534  [51200/59264]
loss: 0.144370  [57600/59264]


 33%|███▎      | 10/30 [1:16:49<2:32:57, 458.86s/it]

Val Error: 
 Accuracy: 90.1%, Avg loss: 0.355942 

Epoch 11
-------------------------------
loss: 0.046307  [    0/59264]
loss: 0.065432  [ 6400/59264]
loss: 0.062928  [12800/59264]
loss: 0.046242  [19200/59264]
loss: 0.125323  [25600/59264]
loss: 0.157498  [32000/59264]
loss: 0.221623  [38400/59264]
loss: 0.076559  [44800/59264]
loss: 0.062225  [51200/59264]
loss: 0.086346  [57600/59264]


 37%|███▋      | 11/30 [1:24:26<2:25:03, 458.09s/it]

Val Error: 
 Accuracy: 89.3%, Avg loss: 0.391933 

Epoch 12
-------------------------------
loss: 0.179102  [    0/59264]
loss: 0.230071  [ 6400/59264]
loss: 0.052478  [12800/59264]
loss: 0.022511  [19200/59264]
loss: 0.130658  [25600/59264]
loss: 0.133813  [32000/59264]
loss: 0.068243  [38400/59264]
loss: 0.141631  [44800/59264]
loss: 0.105669  [51200/59264]
loss: 0.158861  [57600/59264]


 40%|████      | 12/30 [1:32:02<2:17:14, 457.50s/it]

Val Error: 
 Accuracy: 90.5%, Avg loss: 0.349351 

Epoch 13
-------------------------------
loss: 0.097403  [    0/59264]
loss: 0.038319  [ 6400/59264]
loss: 0.074442  [12800/59264]
loss: 0.148877  [19200/59264]
loss: 0.163629  [25600/59264]
loss: 0.157423  [32000/59264]
loss: 0.016992  [38400/59264]
loss: 0.248562  [44800/59264]
loss: 0.049423  [51200/59264]
loss: 0.076536  [57600/59264]


 43%|████▎     | 13/30 [1:39:39<2:09:34, 457.30s/it]

Val Error: 
 Accuracy: 89.6%, Avg loss: 0.383860 

Epoch 14
-------------------------------
loss: 0.067690  [    0/59264]
loss: 0.035314  [ 6400/59264]
loss: 0.045585  [12800/59264]
loss: 0.090224  [19200/59264]
loss: 0.064982  [25600/59264]
loss: 0.027026  [32000/59264]
loss: 0.032408  [38400/59264]
loss: 0.069573  [44800/59264]
loss: 0.040553  [51200/59264]
loss: 0.131552  [57600/59264]


 47%|████▋     | 14/30 [1:47:15<2:01:50, 456.91s/it]

Val Error: 
 Accuracy: 90.4%, Avg loss: 0.359323 

Epoch 15
-------------------------------
loss: 0.018678  [    0/59264]
loss: 0.099424  [ 6400/59264]
loss: 0.028182  [12800/59264]
loss: 0.058189  [19200/59264]
loss: 0.078993  [25600/59264]
loss: 0.074377  [32000/59264]
loss: 0.012255  [38400/59264]
loss: 0.046494  [44800/59264]
loss: 0.101967  [51200/59264]
loss: 0.014202  [57600/59264]


 50%|█████     | 15/30 [1:54:50<1:54:07, 456.48s/it]

Val Error: 
 Accuracy: 90.1%, Avg loss: 0.366688 

Epoch 16
-------------------------------
loss: 0.065124  [    0/59264]
loss: 0.197607  [ 6400/59264]
loss: 0.048792  [12800/59264]
loss: 0.018334  [19200/59264]
loss: 0.024021  [25600/59264]
loss: 0.161030  [32000/59264]
loss: 0.135320  [38400/59264]
loss: 0.093639  [44800/59264]
loss: 0.062098  [51200/59264]
loss: 0.030565  [57600/59264]


 53%|█████▎    | 16/30 [2:02:26<1:46:27, 456.26s/it]

Val Error: 
 Accuracy: 91.3%, Avg loss: 0.335023 

Epoch 17
-------------------------------
loss: 0.051810  [    0/59264]
loss: 0.023190  [ 6400/59264]
loss: 0.024438  [12800/59264]
loss: 0.124784  [19200/59264]
loss: 0.035647  [25600/59264]
loss: 0.036831  [32000/59264]
loss: 0.026537  [38400/59264]
loss: 0.080696  [44800/59264]
loss: 0.124071  [51200/59264]
loss: 0.126743  [57600/59264]


 57%|█████▋    | 17/30 [2:10:02<1:38:48, 456.05s/it]

Val Error: 
 Accuracy: 90.2%, Avg loss: 0.380136 

Epoch 18
-------------------------------
loss: 0.206036  [    0/59264]
loss: 0.108448  [ 6400/59264]
loss: 0.074660  [12800/59264]
loss: 0.154914  [19200/59264]
loss: 0.142447  [25600/59264]
loss: 0.048690  [32000/59264]
loss: 0.109977  [38400/59264]
loss: 0.024796  [44800/59264]
loss: 0.044026  [51200/59264]
loss: 0.048411  [57600/59264]


 60%|██████    | 18/30 [2:17:38<1:31:15, 456.27s/it]

Val Error: 
 Accuracy: 90.6%, Avg loss: 0.368159 

Epoch 19
-------------------------------
loss: 0.019072  [    0/59264]
loss: 0.158551  [ 6400/59264]
loss: 0.027287  [12800/59264]
loss: 0.085952  [19200/59264]
loss: 0.088495  [25600/59264]
loss: 0.060069  [32000/59264]
loss: 0.018539  [38400/59264]
loss: 0.099724  [44800/59264]
loss: 0.016585  [51200/59264]
loss: 0.034788  [57600/59264]


 63%|██████▎   | 19/30 [2:25:16<1:23:42, 456.64s/it]

Val Error: 
 Accuracy: 90.5%, Avg loss: 0.354998 

Epoch 20
-------------------------------
loss: 0.129995  [    0/59264]
loss: 0.034785  [ 6400/59264]
loss: 0.066376  [12800/59264]
loss: 0.042215  [19200/59264]
loss: 0.078081  [25600/59264]
loss: 0.045370  [32000/59264]
loss: 0.196507  [38400/59264]
loss: 0.012283  [44800/59264]
loss: 0.137484  [51200/59264]
loss: 0.092905  [57600/59264]


 67%|██████▋   | 20/30 [2:32:51<1:16:02, 456.26s/it]

Val Error: 
 Accuracy: 90.3%, Avg loss: 0.368510 

Epoch 21
-------------------------------
loss: 0.039604  [    0/59264]
loss: 0.041188  [ 6400/59264]
loss: 0.054463  [12800/59264]
loss: 0.138704  [19200/59264]
loss: 0.009970  [25600/59264]
loss: 0.022348  [32000/59264]
loss: 0.068610  [38400/59264]
loss: 0.111193  [44800/59264]
loss: 0.045509  [51200/59264]
loss: 0.110812  [57600/59264]


 70%|███████   | 21/30 [2:40:25<1:08:20, 455.56s/it]

Val Error: 
 Accuracy: 90.6%, Avg loss: 0.359956 

Epoch 22
-------------------------------
loss: 0.152253  [    0/59264]
loss: 0.042979  [ 6400/59264]
loss: 0.025487  [12800/59264]
loss: 0.046286  [19200/59264]
loss: 0.021817  [25600/59264]
loss: 0.081538  [32000/59264]
loss: 0.049092  [38400/59264]
loss: 0.025666  [44800/59264]
loss: 0.071321  [51200/59264]
loss: 0.044811  [57600/59264]


 73%|███████▎  | 22/30 [2:47:58<1:00:38, 454.85s/it]

Val Error: 
 Accuracy: 90.7%, Avg loss: 0.365083 

Epoch 23
-------------------------------
loss: 0.015372  [    0/59264]
loss: 0.017734  [ 6400/59264]
loss: 0.062024  [12800/59264]
loss: 0.016421  [19200/59264]
loss: 0.148045  [25600/59264]
loss: 0.136597  [32000/59264]
loss: 0.092718  [38400/59264]
loss: 0.067598  [44800/59264]
loss: 0.044527  [51200/59264]
loss: 0.049367  [57600/59264]


 77%|███████▋  | 23/30 [2:55:33<53:02, 454.64s/it]  

Val Error: 
 Accuracy: 90.7%, Avg loss: 0.357472 

Epoch 24
-------------------------------
loss: 0.028847  [    0/59264]
loss: 0.095372  [ 6400/59264]
loss: 0.020485  [12800/59264]
loss: 0.005762  [19200/59264]
loss: 0.035319  [25600/59264]
loss: 0.144566  [32000/59264]
loss: 0.004689  [38400/59264]
loss: 0.082771  [44800/59264]
loss: 0.100948  [51200/59264]
loss: 0.006666  [57600/59264]


 80%|████████  | 24/30 [3:03:08<45:29, 454.98s/it]

Val Error: 
 Accuracy: 90.5%, Avg loss: 0.375626 

Epoch 25
-------------------------------
loss: 0.022804  [    0/59264]
loss: 0.008842  [ 6400/59264]
loss: 0.057302  [12800/59264]
loss: 0.067881  [19200/59264]
loss: 0.073436  [25600/59264]
loss: 0.258781  [32000/59264]
loss: 0.151298  [38400/59264]
loss: 0.022033  [44800/59264]
loss: 0.008191  [51200/59264]
loss: 0.066541  [57600/59264]


 83%|████████▎ | 25/30 [3:10:44<37:56, 455.27s/it]

Val Error: 
 Accuracy: 91.1%, Avg loss: 0.351441 

Epoch 26
-------------------------------
loss: 0.093213  [    0/59264]
loss: 0.030957  [ 6400/59264]
loss: 0.048241  [12800/59264]
loss: 0.010443  [19200/59264]
loss: 0.042558  [25600/59264]
loss: 0.150710  [32000/59264]
loss: 0.057052  [38400/59264]
loss: 0.138341  [44800/59264]
loss: 0.029883  [51200/59264]
loss: 0.146252  [57600/59264]


 87%|████████▋ | 26/30 [3:18:20<30:22, 455.56s/it]

Val Error: 
 Accuracy: 91.1%, Avg loss: 0.349251 

Epoch 27
-------------------------------
loss: 0.057592  [    0/59264]
loss: 0.035693  [ 6400/59264]
loss: 0.008665  [12800/59264]
loss: 0.109614  [19200/59264]
loss: 0.028424  [25600/59264]
loss: 0.016911  [32000/59264]
loss: 0.027305  [38400/59264]
loss: 0.063807  [44800/59264]
loss: 0.136076  [51200/59264]
loss: 0.057775  [57600/59264]


 90%|█████████ | 27/30 [3:25:56<22:46, 455.56s/it]

Val Error: 
 Accuracy: 91.2%, Avg loss: 0.349135 

Epoch 28
-------------------------------
loss: 0.006554  [    0/59264]
loss: 0.021598  [ 6400/59264]
loss: 0.095106  [12800/59264]
loss: 0.022719  [19200/59264]
loss: 0.024740  [25600/59264]
loss: 0.064131  [32000/59264]
loss: 0.058004  [38400/59264]
loss: 0.082712  [44800/59264]
loss: 0.123295  [51200/59264]
loss: 0.105499  [57600/59264]


 93%|█████████▎| 28/30 [3:33:33<15:11, 455.84s/it]

Val Error: 
 Accuracy: 91.3%, Avg loss: 0.347277 

Epoch 29
-------------------------------
loss: 0.009500  [    0/59264]
loss: 0.008876  [ 6400/59264]
loss: 0.051380  [12800/59264]
loss: 0.037584  [19200/59264]
loss: 0.111212  [25600/59264]
loss: 0.023766  [32000/59264]
loss: 0.122791  [38400/59264]
loss: 0.078754  [44800/59264]
loss: 0.062445  [51200/59264]
loss: 0.069912  [57600/59264]


 97%|█████████▋| 29/30 [3:41:08<07:35, 455.75s/it]

Val Error: 
 Accuracy: 90.5%, Avg loss: 0.377131 

Epoch 30
-------------------------------
loss: 0.012395  [    0/59264]
loss: 0.016510  [ 6400/59264]
loss: 0.022604  [12800/59264]
loss: 0.115674  [19200/59264]
loss: 0.018079  [25600/59264]
loss: 0.086660  [32000/59264]
loss: 0.064706  [38400/59264]
loss: 0.036662  [44800/59264]
loss: 0.031165  [51200/59264]
loss: 0.022986  [57600/59264]


100%|██████████| 30/30 [3:48:43<00:00, 457.45s/it]

Val Error: 
 Accuracy: 91.2%, Avg loss: 0.356323 

Done!





In [23]:
submission = pd.read_csv("sample_submission.csv")
test_filename = list(submission.loc[:,"file_name"])
test_dir = []
for item in test_filename:
  ans = audio_dir + item
  test_dir.append(ans)

test_dir[0:10]

['audio_files/6f757c30-32f8-4857-a4e6-53a8b154b838.wav',
 'audio_files/36c4e1b3-9427-4905-8653-1e43bbddd938.wav',
 'audio_files/59599150-4db7-494b-8b8a-d35eee129b70.wav',
 'audio_files/a71314a4-afcf-465e-9f17-0d5768b69dce.wav',
 'audio_files/cc61243b-a830-4af7-94f2-962787e5268f.wav',
 'audio_files/0cc0aaa2-8a2e-4fff-9f2a-48c3b8736d48.wav',
 'audio_files/fce534da-f644-435c-855a-77512872dcd1.wav',
 'audio_files/0c8d2f3e-8538-4118-a096-7ddcdeddc331.wav',
 'audio_files/b0049719-427a-409d-848a-8c10b88a9799.wav',
 'audio_files/cb6be4cc-f462-44bd-ac99-784a9ae4c811.wav']

In [27]:
def pad_trunc(samples, sr):
    num_rows, signal_len = samples.shape
    max_len = sr // 1000 * 1000

    if (signal_len > max_len):
        # Truncate the signal to the given length
        samples = samples[:, max_len]

    elif (signal_len < max_len):
        # Length of padding to add at the beginning and end of the signal
        pad_begin_len = random.randint(0, max_len - signal_len)
        pad_end_len = max_len - signal_len - pad_begin_len

        # Pad with 0s
        pad_begin = torch.zeros((1, pad_begin_len))
        pad_end = torch.zeros((1, pad_end_len))

        samples = torch.cat((pad_begin, samples, pad_end), 1)

    return samples

def rechannel(spect, sr, num_channel):
        if (spect.shape[0] == num_channel):
            # Nothing to do
            return spect

        if (num_channel == 1):
            # Convert from stereo to mono by selecting only the first channel
            spect = spect[:1, :]
        else:
            # Convert from mono to stereo by duplicating the first channel
            spect = torch.cat([spect, spect, spect])

        return spect

def time_shift(samples, sr, shift_limit):
        _, sig_len = samples.shape
        shift_amt = int(random.random() * shift_limit * sig_len)
        return samples.roll(shift_amt)

def get_keys(d, value):
    return [k for k,v in d.items() if v == value]

result = []
for fpath in tqdm(test_dir):   
    samples, sr = torchaudio.load(fpath)
    samples = pad_trunc(samples,16000)

    spect = torchaudio.transforms.MelSpectrogram(
        sample_rate = 16000,
        n_fft=1024,
        hop_length=None,
        n_mels=64
    )(samples)

    spect = torchaudio.transforms.AmplitudeToDB(top_db=80)(spect)
    spect = rechannel(spect,16000,3)
    spect = spect[np.newaxis,:]

    
    output = model(spect)
    ans = torch.nn.functional.softmax(output)
    ans = ans.data.cpu().numpy()
    lab = np.argsort(ans)[0][-1]
    result.append(lab)
    
res1 = []
for elem in result:
    tg = get_keys(class_dic, elem)
    res1.append(tg[0])    

submission["target"] = res1
submission.to_csv("new_submission.csv", index=None)
submission.head()

  ans = torch.nn.functional.softmax(output)
100%|██████████| 31749/31749 [03:23<00:00, 156.32it/s]


Unnamed: 0,file_name,target
0,6f757c30-32f8-4857-a4e6-53a8b154b838.wav,follow
1,36c4e1b3-9427-4905-8653-1e43bbddd938.wav,four
2,59599150-4db7-494b-8b8a-d35eee129b70.wav,bird
3,a71314a4-afcf-465e-9f17-0d5768b69dce.wav,no
4,cc61243b-a830-4af7-94f2-962787e5268f.wav,left
