In [1]:
import random
import pickle
import torch
import torchaudio
from torchaudio import transforms
from torch.utils.data import random_split
from torch.utils.data import  Dataset, random_split
from tqdm import tqdm
import numpy as np
import torch.nn.functional as F
from torch.nn import init
from torch import nn 

In [None]:
### The AudioUtil and SoundDS classes are copied from prepare_dataset so that pickle.load can reconstruct the saved dataset

In [3]:
class AudioUtil():
  """Stateless helpers for loading audio and turning it into (augmented) mel spectrograms.

  Throughout the class an "aud" is a ``(sig, sr)`` tuple: ``sig`` is a 2-D tensor
  indexed as ``[channel, time]`` and ``sr`` is the sample rate in Hz.
  """

  # ----------------------------
  # Load an audio file. Return the signal as a tensor and the sample rate
  # ----------------------------
  @staticmethod
  def open(audio_file):
    """Load ``audio_file`` with torchaudio and return ``(sig, sr)``."""
    sig, sr = torchaudio.load(audio_file)
    return (sig, sr)

  @staticmethod
  def rechannel(aud, new_channel):
    """Convert the signal to ``new_channel`` channels.

    Returns ``aud`` unchanged when the channel count already matches. For
    ``new_channel == 1`` only the first channel is kept; otherwise the signal
    is stacked on top of itself along the channel axis (mono -> stereo).
    """
    sig, sr = aud

    if sig.shape[0] == new_channel:
      # Nothing to do
      return aud

    if new_channel == 1:
      # Convert from stereo to mono by selecting only the first channel
      resig = sig[:1, :]
    else:
      # Convert from mono to stereo by duplicating the first channel
      resig = torch.cat([sig, sig])

    return (resig, sr)

  @staticmethod
  def resample(aud, newsr):
    """Resample the signal to ``newsr`` Hz; returns ``aud`` unchanged if already there."""
    sig, sr = aud

    if sr == newsr:
      # Nothing to do
      return aud

    # Resample works along the last (time) dimension, so every channel can be
    # converted in a single call instead of building one transform per channel.
    resig = torchaudio.transforms.Resample(sr, newsr)(sig)
    return (resig, newsr)

  @staticmethod
  def pad_trunc(aud, max_ms):
    """Force the signal to last exactly ``max_ms`` milliseconds.

    Longer signals are cut at the end; shorter ones are zero-padded, with the
    padding split at a random point between the beginning and the end.
    """
    sig, sr = aud
    num_rows, sig_len = sig.shape
    # Multiply before dividing: `sr // 1000 * max_ms` would first truncate the
    # sample rate to whole kHz (22050 -> 22) and yield a too-short target length.
    max_len = sr * max_ms // 1000

    if sig_len > max_len:
      # Truncate the signal to the given length
      sig = sig[:, :max_len]

    elif sig_len < max_len:
      # Length of padding to add at the beginning and end of the signal
      pad_begin_len = random.randint(0, max_len - sig_len)
      pad_end_len = max_len - sig_len - pad_begin_len

      # Pad with 0s
      pad_begin = torch.zeros((num_rows, pad_begin_len))
      pad_end = torch.zeros((num_rows, pad_end_len))

      sig = torch.cat((pad_begin, sig, pad_end), 1)

    return (sig, sr)

  @staticmethod
  def time_shift(aud, shift_limit):
    """Circularly shift the signal in time by a random fraction (< ``shift_limit``) of its length."""
    sig, sr = aud
    _, sig_len = sig.shape
    shift_amt = int(random.random() * shift_limit * sig_len)
    # Roll along the time axis only. Without `dims` the tensor is flattened
    # first, so samples of a multi-channel signal would wrap into other channels.
    return (sig.roll(shift_amt, dims=1), sr)

  @staticmethod
  def spectro_gram(aud, n_mels=64, n_fft=1024, hop_len=None):
    """Return the mel spectrogram of the signal, converted to decibels."""
    sig, sr = aud
    top_db = 80

    # spec has shape [channel, n_mels, time], where channel is mono, stereo etc
    spec = transforms.MelSpectrogram(sr, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)

    # Convert to decibels
    spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
    return spec

  @staticmethod
  def spectro_augment(spec, max_mask_pct=0.1, n_freq_masks=1, n_time_masks=1):
    """Apply frequency and time masking to a spectrogram.

    ``n_freq_masks`` frequency masks and ``n_time_masks`` time masks are applied
    in turn; each mask parameter is ``max_mask_pct`` of the corresponding axis
    length, and masked cells are filled with the mean of the input spectrogram.
    """
    _, n_mels, n_steps = spec.shape
    mask_value = spec.mean()
    aug_spec = spec

    freq_mask_param = max_mask_pct * n_mels
    for _ in range(n_freq_masks):
      aug_spec = transforms.FrequencyMasking(freq_mask_param)(aug_spec, mask_value)

    time_mask_param = max_mask_pct * n_steps
    for _ in range(n_time_masks):
      aug_spec = transforms.TimeMasking(time_mask_param)(aug_spec, mask_value)

    return aug_spec

In [2]:
class SoundDS(Dataset):
  """In-memory dataset of ``(augmented mel spectrogram, class id)`` pairs.

  Every row of ``df`` is processed once, at construction time, and the result
  is cached in ``self.dataset``. The random time shift and masking are
  therefore applied a single time per item and stay fixed afterwards.

  Args:
    df: table with a 'path' column (audio file path relative to ``data_path``)
      and a 'gender' column (the class id).
    data_path: prefix that is string-concatenated in front of each 'path' value.
  """

  def __init__(self, df, data_path):
    self.df = df
    self.data_path = str(data_path)
    self.duration = 12000   # passed to AudioUtil.pad_trunc as max_ms (milliseconds)
    self.sr = 22050         # target sample rate passed to AudioUtil.resample
    self.channel = 1        # target channel count passed to AudioUtil.rechannel
    self.shift_pct = 0.4    # shift_limit passed to AudioUtil.time_shift
    self.dataset = []
    self.load()

  def load(self):
    """Run the full preprocessing pipeline on every row and cache the results."""
    paths = self.df['path']
    genders = self.df['gender']

    for idx in tqdm(range(len(self))):
        # idx is a position (0..len-1), so index positionally: label-based
        # .loc would fail or pick the wrong row on a non-default index.
        audio_file = self.data_path + paths.iloc[idx]
        class_id = genders.iloc[idx]

        aud = AudioUtil.open(audio_file)
        reaud = AudioUtil.resample(aud, self.sr)
        rechan = AudioUtil.rechannel(reaud, self.channel)
        dur_aud = AudioUtil.pad_trunc(rechan, self.duration)
        shift_aud = AudioUtil.time_shift(dur_aud, self.shift_pct)
        sgram = AudioUtil.spectro_gram(shift_aud, n_mels=64, n_fft=1024, hop_len=None)
        aug_sgram = AudioUtil.spectro_augment(sgram, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2)

        self.dataset.append((aug_sgram, class_id))

  def __len__(self):
    """Number of rows in the source table."""
    return len(self.df)

  def __getitem__(self, idx):
    """Return the cached ``(spectrogram, class_id)`` pair at position ``idx``."""
    return self.dataset[idx]

In [4]:
# Load the pre-built dataset (dataset_10000_8). The context manager closes the
# file handle once unpickling is done.
# NOTE: pickle.load can execute arbitrary code - only load files you created or trust.
with open("dataset.bin", "rb") as dataset_file:
    myds = pickle.load(dataset_file)

In [5]:
# Random split of 80:20 between training and validation.
# The split is done exactly once, with a seeded generator, so the validation
# set is the same on every run instead of being re-drawn.
SPLIT_SEED = 42
num_items = len(myds)
num_train = round(num_items * 0.8)
num_val = num_items - num_train
train_ds, val_ds = random_split(
    myds,
    [num_train, num_val],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Create training and validation data loaders
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=16, shuffle=True)
val_dl = torch.utils.data.DataLoader(val_ds, batch_size=16, shuffle=False)

In [7]:
def _evaluate(model, data_loader, device):
  """Return the fraction of items in ``data_loader`` that ``model`` classifies correctly."""
  model.eval()
  correct_prediction = 0
  total_prediction = 0

  with torch.no_grad():
    for data in tqdm(data_loader):
      # Get the input features and target labels, and move them to the model's device
      inputs, labels = data[0].to(device), data[1].to(device)

      # Normalize the inputs with the statistics of the current batch
      inputs_m, inputs_s = inputs.mean(), inputs.std()
      inputs = (inputs - inputs_m) / inputs_s

      # Forward pass only
      outputs = model(inputs)

      # Get the predicted class with the highest score
      _, prediction = torch.max(outputs, 1)
      correct_prediction += (prediction == labels).sum().item()
      total_prediction += prediction.shape[0]

  return correct_prediction / total_prediction


def training(model, train_dl, num_epochs, val_loader=None, device=None):
  """Train ``model`` for ``num_epochs`` epochs, printing stats after each one.

  After every epoch the training loss/accuracy line is printed, followed by
  the accuracy on the validation loader.

  Args:
    model: the network to train; it is optimized in place.
    train_dl: data loader yielding ``(inputs, labels)`` training batches.
    num_epochs: number of passes over ``train_dl``.
    val_loader: data loader used for the per-epoch validation pass. When
      omitted, the notebook-level ``val_dl`` is used (the original behaviour).
    device: device the batches are moved to. When omitted, the device of the
      model's parameters is used, so no global ``device`` is required.
  """
  if val_loader is None:
    # Backward-compatible fallback to the loader defined at notebook level
    val_loader = val_dl

  # Loss Function, Optimizer and Scheduler
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
  scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.001,
                                                steps_per_epoch=int(len(train_dl)),
                                                epochs=num_epochs,
                                                anneal_strategy='linear')

  if device is None:
    # Batches must live where the weights live
    device = next(model.parameters()).device

  # Repeat for each epoch
  for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct_prediction = 0
    total_prediction = 0

    # Repeat for each batch in the training set
    for data in tqdm(train_dl):
      # Get the input features and target labels, and move them to the device
      inputs, labels = data[0].to(device), data[1].to(device)

      # Normalize the inputs with the statistics of the current batch
      inputs_m, inputs_s = inputs.mean(), inputs.std()
      inputs = (inputs - inputs_m) / inputs_s

      # Zero the parameter gradients
      optimizer.zero_grad()

      # forward + backward + optimize
      outputs = model(inputs)
      loss = criterion(outputs, labels)
      loss.backward()
      optimizer.step()
      # OneCycleLR is stepped once per batch, not once per epoch
      scheduler.step()

      # Keep stats for Loss and Accuracy
      running_loss += loss.item()

      # Get the predicted class with the highest score
      _, prediction = torch.max(outputs, 1)
      # Count of predictions that matched the target label
      correct_prediction += (prediction == labels).sum().item()
      total_prediction += prediction.shape[0]

    # Print stats at the end of the epoch
    num_batches = len(train_dl)
    avg_loss = running_loss / num_batches
    acc = correct_prediction/total_prediction
    print(f'Epoch: {epoch}, Loss: {avg_loss:.2f}, Accuracy: {acc:.2f}')

    # Validation accuracy for this epoch (leaves the model in eval mode)
    print(_evaluate(model, val_loader, device))

  print('Finished Training')

In [8]:
# NOTE(review): the wildcard import is presumably what provides `myModel` (no other
# definition is visible in this notebook) - confirm against cnn_model and prefer
# importing the needed names explicitly.
from cnn_model import *
num_epochs=15  # Just for demo, adjust this higher.
# Train on train_dl; a validation pass runs after every epoch.
training(myModel, train_dl, num_epochs)
### Validation accuracy (not loss) is printed after each epoch

  return torch._C._cuda_getDeviceCount() > 0
100%|██████████| 2000/2000 [00:58<00:00, 34.09it/s]


Epoch: 0, Loss: 0.48, Accuracy: 0.78


100%|██████████| 500/500 [00:04<00:00, 113.40it/s]


0.867


100%|██████████| 2000/2000 [01:00<00:00, 32.80it/s]


Epoch: 1, Loss: 0.32, Accuracy: 0.88


100%|██████████| 500/500 [00:03<00:00, 125.75it/s]


0.905125


100%|██████████| 2000/2000 [00:59<00:00, 33.71it/s]


Epoch: 2, Loss: 0.27, Accuracy: 0.90


100%|██████████| 500/500 [00:03<00:00, 125.07it/s]


0.917125


100%|██████████| 2000/2000 [01:01<00:00, 32.76it/s]


Epoch: 3, Loss: 0.24, Accuracy: 0.91


100%|██████████| 500/500 [00:03<00:00, 131.09it/s]


0.927125


100%|██████████| 2000/2000 [00:59<00:00, 33.70it/s]


Epoch: 4, Loss: 0.22, Accuracy: 0.92


100%|██████████| 500/500 [00:03<00:00, 130.20it/s]


0.936625


100%|██████████| 2000/2000 [00:58<00:00, 34.38it/s]


Epoch: 5, Loss: 0.20, Accuracy: 0.93


100%|██████████| 500/500 [00:03<00:00, 138.02it/s]


0.940875


100%|██████████| 2000/2000 [01:05<00:00, 30.65it/s]


Epoch: 6, Loss: 0.18, Accuracy: 0.94


100%|██████████| 500/500 [00:04<00:00, 122.24it/s]


0.943


100%|██████████| 2000/2000 [02:25<00:00, 13.74it/s]  


Epoch: 7, Loss: 0.17, Accuracy: 0.94


100%|██████████| 500/500 [00:04<00:00, 121.51it/s]


0.943375


100%|██████████| 2000/2000 [00:56<00:00, 35.32it/s]


Epoch: 8, Loss: 0.16, Accuracy: 0.95


100%|██████████| 500/500 [00:03<00:00, 138.14it/s]


0.94275


100%|██████████| 2000/2000 [00:57<00:00, 34.76it/s]


Epoch: 9, Loss: 0.14, Accuracy: 0.95


100%|██████████| 500/500 [00:04<00:00, 121.56it/s]


0.945375


100%|██████████| 2000/2000 [01:00<00:00, 32.91it/s]


Epoch: 10, Loss: 0.13, Accuracy: 0.96


100%|██████████| 500/500 [00:04<00:00, 121.04it/s]


0.94525


100%|██████████| 2000/2000 [01:01<00:00, 32.74it/s]


Epoch: 11, Loss: 0.12, Accuracy: 0.96


100%|██████████| 500/500 [00:03<00:00, 139.18it/s]


0.942875


100%|██████████| 2000/2000 [00:56<00:00, 35.38it/s]


Epoch: 12, Loss: 0.11, Accuracy: 0.96


100%|██████████| 500/500 [00:03<00:00, 139.52it/s]


0.945625


100%|██████████| 2000/2000 [00:54<00:00, 36.46it/s]


Epoch: 13, Loss: 0.10, Accuracy: 0.97


100%|██████████| 500/500 [00:03<00:00, 135.53it/s]


0.946125


100%|██████████| 2000/2000 [00:36<00:00, 55.18it/s]


Epoch: 14, Loss: 0.09, Accuracy: 0.97


100%|██████████| 500/500 [00:02<00:00, 193.90it/s]


0.946625
Finished Training


100%|██████████| 500/500 [00:02<00:00, 186.10it/s]

0.946625





In [None]:
### Final evaluation: accuracy of the trained model on the validation set
# Use the device the model's weights live on instead of relying on a global
# `device` that is not defined anywhere in this notebook.
eval_device = next(myModel.parameters()).device

myModel.eval()
with torch.no_grad():
    correct_prediction = 0
    total_prediction = 0
    for data in tqdm(val_dl):
        # Get the input features and target labels, and move them to the model's device
        inputs, labels = data[0].to(eval_device), data[1].to(eval_device)

        # Normalize the inputs with the statistics of the current batch
        inputs_m, inputs_s = inputs.mean(), inputs.std()
        inputs = (inputs - inputs_m) / inputs_s

        # Forward pass only (no gradients are computed here)
        outputs = myModel(inputs)

        # Get the predicted class with the highest score
        _, prediction = torch.max(outputs, 1)
        correct_prediction += (prediction == labels).sum().item()
        total_prediction += prediction.shape[0]
    print( correct_prediction/total_prediction )

In [9]:
# Persist only the model's state_dict (not the whole model object) to disk
torch.save(obj=myModel.state_dict(), f='model_dict.pt')