In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import librosa
import librosa.display
from glob import glob
import os
import time
import random
from google.colab import drive
from PIL import Image

from sklearn.metrics import classification_report

import torch
import torch.nn as nn
import torch.nn.functional as F
from fastprogress import master_bar, progress_bar
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import transforms

In [None]:
# Mount Google Drive into the Colab filesystem; the CSV and audio paths used
# below all live under '/content/drive'.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the dataset index from Drive. Later cells read its 'fname', 'labels'
# and 'split' columns and add 'path', 'duration' and 'wavfiles' to it.
df_original = pd.read_csv('/content/drive/My Drive/Samsung-audio-classification/finaldataset.csv')

In [None]:
# Index every .wav clip in the dataset folder by its file stem (the clip id).
# The directory is listed once and the listing is reused for the lookup table,
# instead of globbing the (slow, network-backed) Drive folder a second time.
base_dir = "/content/drive/My Drive/Samsung-audio-classification/filteredDataSet"
all_audio_path = glob(os.path.join(base_dir, '*.wav'))
audioid_path_dict = {
    os.path.splitext(os.path.basename(wav_path))[0]: wav_path
    for wav_path in all_audio_path
}

In [None]:
# Attach the on-disk path of every clip to its row.
# The lookup table is keyed by the file stem as a *string*, so the clip id in
# 'fname' is converted with str() before the lookup. Rows whose file is not
# present get None. A single vectorised assignment replaces the previous
# row-by-row chained assignment (`df['path'][i] = ...`), which triggered
# SettingWithCopyWarning, was quadratic because of `list.index`, and only
# filled the first row of any duplicated clip id.
df_original['path'] = df_original['fname'].map(
    lambda clip_id: audioid_path_dict.get(str(clip_id))
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [None]:
import wave
!pip install soundfile
import soundfile as sf
!pip install pydub
from pydub import AudioSegment
def load_audio_file(file_path, input_length=90000):
    """Load an audio file and force it to exactly ``input_length`` samples.

    Args:
        file_path: Path of the audio file, passed to ``librosa.load`` with its
            default arguments.
        input_length: Number of samples in the returned signal. Defaults to
            90000, the value that used to be hard-coded here.

    Returns:
        A 1-D array of ``input_length`` samples: longer signals are truncated
        at the end, shorter ones are zero-padded at the end.
    """
    data = librosa.load(file_path)[0]  # [0] = samples, [1] = sampling rate
    if len(data) > input_length:
        return data[:input_length]
    # Pad only on the right so the clip keeps its original start.
    return np.pad(data, (0, input_length - len(data)), "constant")
# Clip length in seconds (as reported by pydub) for every row.
# `ll` is kept as a module-level list of the paths because a later cell reads it.
# The column is assigned in one go rather than element-by-element through
# chained indexing (`df['duration'][i] = ...`), which raised
# SettingWithCopyWarning and needed a quadratic `list.index` lookup.
ll = list(df_original['path'])
df_original['duration'] = [
    AudioSegment.from_file(file_path).duration_seconds for file_path in ll
]
Collecting soundfile
  Downloading https://files.pythonhosted.org/packages/eb/f2/3cbbbf3b96fb9fa91582c438b574cff3f45b29c772f94c400e2c99ef5db9/SoundFile-0.10.3.post1-py2.py3-none-any.whl
Installing collected packages: soundfile
Successfully installed soundfile-0.10.3.post1
Collecting pydub
  Downloading https://files.pythonhosted.org/packages/7b/d1/fbfa79371a8cd9bb15c2e3c480d7e6e340ed5cc55005174e16f48418333a/pydub-0.24.1-py2.py3-none-any.whl
Installing collected packages: pydub
Successfully installed pydub-0.24.1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# True when a CUDA device is available; later cells use this flag to decide
# whether to call .cuda() on the model, the loss and the batches.
USE_GPU = torch.cuda.is_available()

In [None]:
# Mel-spectrogram settings read by create_melspec / display_melspec.
# NOTE(review): 'duration' (4 * 22050 = 88200 samples) is not read by any
# function visible in this notebook, and load_audio_file pads/truncates to
# 90000 samples instead -- confirm which length is intended.
melspec_params = dict(
    n_mels=128,
    duration=4 * 22050,
    hop_length=512,
    n_fft=2048,
    fmin=20,
)

In [None]:
def create_melspec(params, audio_data, sampling_rate):
    """Compute a log-scaled (dB) mel spectrogram of an audio signal.

    Args:
        params: Mapping with the keys 'n_mels', 'hop_length', 'n_fft' and 'fmin'.
        audio_data: Audio samples handed to ``librosa.feature.melspectrogram``.
        sampling_rate: Sampling rate of ``audio_data``; half of it (integer
            division) is used as the upper frequency bound.

    Returns:
        The spectrogram converted with ``librosa.power_to_db`` relative to its
        maximum, as a float32 array.
    """
    # The signal is passed by keyword: recent librosa releases accept `y` only
    # as a keyword argument, and older releases accept the keyword form too.
    S = librosa.feature.melspectrogram(y=audio_data,
                                       sr=sampling_rate,
                                       n_mels=params['n_mels'],
                                       hop_length=params['hop_length'],
                                       n_fft=params['n_fft'],
                                       fmin=params['fmin'],
                                       fmax=(sampling_rate // 2))
    Sb = librosa.power_to_db(S, ref=np.max)
    return Sb.astype(np.float32)
        
def mono_to_color(X, mean=None, std=None, norm_max=None, norm_min=None, eps=1e-6):
    """Turn a single-channel array into a 3-channel uint8 image.

    The input is replicated along a new last axis, standardised, clamped to
    ``[norm_min, norm_max]`` and rescaled to the 0..255 range.

    Args:
        X: Single-channel array (e.g. a mel spectrogram).
        mean: Value subtracted before scaling; defaults to the mean of the data.
        std: Value the centred data is divided by (plus ``eps``); defaults to
            the standard deviation of the centred data.
        norm_max: Upper clamp applied after standardisation; defaults to the
            maximum of the standardised data.
        norm_min: Lower clamp applied after standardisation; defaults to the
            minimum of the standardised data.
        eps: Added to ``std`` to avoid division by zero, and used as the
            threshold below which the data is treated as constant.

    Returns:
        A uint8 array with the shape of ``X`` plus a trailing axis of size 3.
        All zeros when the standardised data is (almost) constant.
    """
    X = np.stack([X, X, X], axis=-1)

    # Explicit `is None` checks: the previous `value or default` idiom silently
    # discarded legitimate falsy arguments such as mean=0 or norm_min=0.
    if mean is None:
        mean = X.mean()
    X = X - mean
    if std is None:
        std = X.std()
    Xstd = X / (std + eps)

    _min, _max = Xstd.min(), Xstd.max()
    if norm_max is None:
        norm_max = _max
    if norm_min is None:
        norm_min = _min

    if (_max - _min) > eps:
        V = np.clip(Xstd, norm_min, norm_max)
        V = 255 * (V - norm_min) / (norm_max - norm_min)
        V = V.astype(np.uint8)
    else:
        # Constant input: there is no contrast to stretch.
        V = np.zeros_like(Xstd, dtype=np.uint8)

    return V

def display_melspec(params, mels, sampling_rate):
    """Plot a mel spectrogram with a colour bar and show the figure.

    Args:
        params: Mapping providing 'hop_length' and 'fmin' for the axes.
        mels: Spectrogram array handed to ``librosa.display.specshow``.
        sampling_rate: Sampling rate of the underlying audio; half of it
            (integer division) is used as the upper frequency bound.
    """
    upper_freq = sampling_rate // 2
    librosa.display.specshow(
        mels,
        x_axis='time',
        y_axis='mel',
        sr=sampling_rate,
        hop_length=params['hop_length'],
        fmin=params['fmin'],
        fmax=upper_freq,
    )
    plt.colorbar()
    plt.show()

In [None]:
def load_audio(params, file_path):
    """Load an audio file with librosa's default settings.

    Args:
        params: Accepted for signature compatibility; not read by this function.
        file_path: Path of the audio file to load.

    Returns:
        A ``(samples, sampling_rate)`` tuple as produced by ``librosa.load``.
    """
    samples, rate = librosa.load(file_path)
    return samples, rate

In [None]:
class TrainDataset(Dataset):
    """Dataset that pairs image-like spectrogram arrays with their labels.

    Args:
        melspecs: Indexable collection of arrays convertible to RGB PIL images.
        labels: Indexable collection of labels, aligned with ``melspecs``.
        transforms: Callable applied to each PIL image; its result must
            support in-place ``div_``.
    """

    def __init__(self, melspecs, labels, transforms):
        super().__init__()
        self.melspecs, self.labels, self.transforms = melspecs, labels, transforms

    def __len__(self):
        """Number of samples (one per spectrogram)."""
        return len(self.melspecs)

    def __getitem__(self, idx):
        """Return the transformed image of sample ``idx`` and its label."""
        picture = Image.fromarray(self.melspecs[idx], mode='RGB')
        tensor = self.transforms(picture)
        # NOTE(review): if the transform pipeline ends with torchvision's
        # ToTensor, the pixels are presumably already scaled to [0, 1] before
        # this extra division by 255 -- confirm this is intended.
        tensor.div_(255)
        return tensor, self.labels[idx]

In [None]:
# Image pipelines keyed by split name; only a 'train' pipeline is defined.
# It randomly mirrors the image left-right with probability 0.5 and then
# converts it to a tensor.
transforms_dict = {
    'train': transforms.Compose([
        transforms.RandomHorizontalFlip(0.5),
        transforms.ToTensor(),
    ])
}

In [None]:
class ConvBlock(nn.Module):
    """Two 3x3 convolutions followed by 2x2 average pooling.

    The first convolution is followed by batch norm and ReLU, the second by
    ReLU and dropout (p=0.5). Both keep the spatial size (stride 1, padding 1);
    the pooling in ``forward`` halves it.

    Args:
        in_channels: Number of channels of the input feature map.
        out_channels: Number of channels produced by both convolutions.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()

        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Dropout(0.5),
        )

        self._init_weights()

    def _init_weights(self):
        """Kaiming-normal conv weights, zero biases, unit batch-norm scale."""
        for layer in self.modules():
            if isinstance(layer, nn.BatchNorm2d):
                nn.init.constant_(layer.weight, 1)
                nn.init.zeros_(layer.bias)
                continue
            if not isinstance(layer, nn.Conv2d):
                continue
            nn.init.kaiming_normal_(layer.weight)
            if layer.bias is not None:
                nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Apply both conv stages, then average-pool with a 2x2 window."""
        return F.avg_pool2d(self.conv2(self.conv1(x)), 2)

In [None]:
class Classifier(nn.Module):
    """CNN classifier: four ConvBlocks, global pooling, and a small MLP head.

    Args:
        num_classes: Size of the output layer (number of logits per sample).
    """

    def __init__(self, num_classes=10):
        super().__init__()

        # Channel widths of the feature extractor: 3 -> 64 -> 128 -> 256 -> 512.
        widths = [3, 64, 128, 256, 512]
        self.conv = nn.Sequential(*[
            ConvBlock(in_channels=n_in, out_channels=n_out)
            for n_in, n_out in zip(widths[:-1], widths[1:])
        ])

        self.fc = nn.Sequential(
            nn.Dropout(0.4),
            nn.Linear(512, 128),
            nn.PReLU(),
            nn.Dropout(0.2),
            nn.BatchNorm1d(128),
            nn.Linear(128, num_classes),
        )

    def forward(self, x):
        """Return one row of ``num_classes`` logits per input sample."""
        features = self.conv(x)
        # Collapse the spatial axes: mean over the last one, max over the other.
        pooled = features.mean(dim=3).max(dim=2)[0]
        return self.fc(pooled)

In [None]:
# Training hyper-parameters. The key 'num_clases' (sic) is kept exactly as
# spelled, because it is a runtime string other code may look up.
model_params = dict(
    num_epochs=1,
    batch_size=64,
    learning_rate=0.001,
    num_clases=10,
    eta_min=1e-5,
    t_max=10,
)

In [None]:
def set_seeds(seed):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (plus the current CUDA device
    when ``USE_GPU`` is set), exports ``PYTHONHASHSEED`` and switches cuDNN to
    deterministic mode.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if USE_GPU:
        torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

# Fix every random number generator once, before any data or model is built.
SEED = 73
set_seeds(SEED)

In [None]:
# Load every clip as a fixed-length waveform (see load_audio_file) and store
# the arrays in an object column.
# One vectorised assignment replaces the previous element-wise chained
# assignment (`df['wavfiles'][i] = ...`), which raised SettingWithCopyWarning
# for each row, depended on the list `ll` created in another cell, and used a
# quadratic `list.index` lookup.
df_original['wavfiles'] = df_original['path'].map(load_audio_file)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/us

In [None]:
# Sanity check: every row should now hold an array of audio samples.
df_original['wavfiles']

0      [6.56151e-08, -1.3972792e-07, 2.2742304e-07, -...
1      [0.004087707, 0.0060616545, 0.005211632, 0.005...
2      [4.714174e-07, -5.7810753e-06, -3.4197885e-05,...
3      [2.4037927e-05, -4.6620462e-05, -6.38087e-06, ...
4      [-6.9467183e-09, -2.2305112e-07, 9.725055e-07,...
                             ...                        
838    [-0.0029378538, -0.0022915544, -0.0055930084, ...
839    [0.0022265476, -0.0004194972, -0.0030296305, 0...
840    [0.006186856, 0.0031995305, -0.00027330106, 0....
841    [-0.007664759, -0.00026177574, 0.0014854934, 0...
842    [0.0039003717, -0.0060389885, -0.008751982, 0....
Name: wavfiles, Length: 843, dtype: object

In [None]:
# Preview the frame with the added 'path', 'duration' and 'wavfiles' columns.
df_original.head()

Unnamed: 0,fname,labels,mids,split,Unnamed: 4,path,duration,wavfiles
0,429037,"Male_speech_and_man_speaking,Speech,Human_voice","/m/05zppz,/m/09x0r,/m/09l8g",train,,/content/drive/My Drive/Samsung-audio-classifi...,1.12617,"[6.56151e-08, -1.3972792e-07, 2.2742304e-07, -..."
1,40964,"Male_speech_and_man_speaking,Yell,Shout,Speech...","/m/05zppz,/m/07sr1lc,/m/07p6fty,/m/09x0r,/m/09l8g",train,,/content/drive/My Drive/Samsung-audio-classifi...,2.37376,"[0.004087707, 0.0060616545, 0.005211632, 0.005..."
2,236553,"Male_speech_and_man_speaking,Speech,Human_voice","/m/05zppz,/m/09x0r,/m/09l8g",train,,/content/drive/My Drive/Samsung-audio-classifi...,4.14766,"[4.714174e-07, -5.7810753e-06, -3.4197885e-05,..."
3,236555,"Male_speech_and_man_speaking,Speech,Human_voice","/m/05zppz,/m/09x0r,/m/09l8g",train,,/content/drive/My Drive/Samsung-audio-classifi...,3.8458,"[2.4037927e-05, -4.6620462e-05, -6.38087e-06, ..."
4,236556,"Male_speech_and_man_speaking,Speech,Human_voice","/m/05zppz,/m/09x0r,/m/09l8g",train,,/content/drive/My Drive/Samsung-audio-classifi...,3.13179,"[-6.9467183e-09, -2.2305112e-07, 9.725055e-07,..."


In [None]:
def _paths_with_labels(frame, *label_strings):
    """Return the paths of the rows whose 'labels' equals any of ``label_strings``.

    Rows are returned in frame order; rows without a path are skipped.
    """
    selected = frame.loc[frame['labels'].isin(label_strings), 'path']
    return selected.dropna().tolist()


# Clip ids added to the traffic fold by hand, in addition to the rows labelled
# as traffic noise. NOTE(review): the reason for these additions is not
# recorded in the notebook -- confirm they belong here.
EXTRA_TRAFFIC_CLIP_IDS = ['50869', '3183', '325246', '336651', '334903', '75825']

# fold1: children speaking.
kidspeech = _paths_with_labels(
    df_original, 'Child_speech_and_kid_speaking,Speech,Human_voice')
fold1 = list(kidspeech)

# fold2: traffic noise plus the hand-picked clips above.
traffic = _paths_with_labels(
    df_original, 'Traffic_noise_and_roadway_noise,Motor_vehicle_(road),Vehicle')
fold2 = traffic + [
    os.path.join(base_dir, clip_id + '.wav') for clip_id in EXTRA_TRAFFIC_CLIP_IDS
]

# fold3: adult conversation = male speech followed by female speech.
# (This fold used to be seeded from the *traffic* list by mistake, so the male
# speech clips selected for it were never used.)
conversation = _paths_with_labels(
    df_original, 'Male_speech_and_man_speaking,Speech,Human_voice')
cleanedconvo = _paths_with_labels(
    df_original, 'Female_speech_and_woman_speaking,Speech,Human_voice')
fold3 = conversation + cleanedconvo

# fold4: domestic background noise (coins, dishes, squeaks/footsteps/doors).
bgdnoise = _paths_with_labels(
    df_original,
    'Coin_(dropping),Domestic_sounds_and_home_sounds',
    'Dishes_and_pots_and_pans,Glass,Domestic_sounds_and_home_sounds',
    'Squeak,Walk_and_footsteps,Door,Domestic_sounds_and_home_sounds',
)
fold4 = list(bgdnoise)

In [None]:
# Names of the four folds: 'fold1' .. 'fold4'.
sub_dirs = [f'fold{number}' for number in range(1, 5)]

In [None]:
# Display the fold names.
sub_dirs

['fold1', 'fold2', 'fold3', 'fold4']

In [None]:
# Training begins here: build the model inputs and targets for both splits.

# librosa.load resamples to 22050 Hz by default, which is how the waveforms in
# 'wavfiles' were loaded.
SAMPLING_RATE = 22050


def _to_melspec_image(waveform):
    """Convert one waveform into the RGB uint8 mel-spectrogram image the dataset expects."""
    return mono_to_color(create_melspec(melspec_params, waveform, SAMPLING_RATE))


# Class targets must be contiguous integer ids for CrossEntropyLoss. The file
# id ('fname') was used as the target before, which can never be a valid class
# index. The mapping is built on the whole frame so both splits share it;
# `label_names[i]` is the label string of class i.
# NOTE(review): the number of distinct label strings must not exceed the
# classifier's output size (10 by default), and rows with a missing label get
# code -1 -- confirm against the data.
label_codes, label_names = pd.factorize(df_original['labels'])

# Boolean masks instead of np.where positions fed to .loc, which only worked
# by accident with a default RangeIndex.
is_train = (df_original['split'] == 'train').to_numpy()
is_valid = (df_original['split'] == 'val').to_numpy()

# TrainDataset builds RGB images from its inputs, so the raw waveforms are
# converted to mel-spectrogram images here.
X_train = [_to_melspec_image(wave_data) for wave_data in df_original.loc[is_train, 'wavfiles']]
X_valid = [_to_melspec_image(wave_data) for wave_data in df_original.loc[is_valid, 'wavfiles']]

train_labels = [int(code) for code in label_codes[is_train]]
# The following cells read `valid_labels`; it was previously only defined
# under the name `X_labels`, which is kept as an alias.
valid_labels = [int(code) for code in label_codes[is_valid]]
X_labels = valid_labels

In [None]:
# Training data gets the augmenting pipeline (random horizontal flip).
train_dataset = TrainDataset(X_train, train_labels, transforms_dict['train'])

# Validation data must be evaluated deterministically, so it only gets the
# tensor conversion. Reusing the training pipeline applied the random flip to
# validation images and made the validation metrics vary from run to run.
valid_transforms = transforms.Compose([transforms.ToTensor()])
valid_dataset = TrainDataset(X_valid, valid_labels, valid_transforms)

# Shuffle only the training batches; validation order must stay aligned with
# valid_labels.
train_loader = DataLoader(train_dataset, batch_size=model_params['batch_size'], shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=model_params['batch_size'], shuffle=False)

In [None]:
# Build the network and the loss, moving both to the GPU when one is available.
model = Classifier()
criterion = nn.CrossEntropyLoss()
if USE_GPU:
    model = model.cuda()
    criterion = criterion.cuda()

# Adam with a cosine-annealed learning rate between 'learning_rate' and 'eta_min'.
optimizer = Adam(model.parameters(), lr=model_params['learning_rate'], amsgrad=False)
scheduler = CosineAnnealingLR(
    optimizer,
    T_max=model_params['t_max'],
    eta_min=model_params['eta_min'],
)

# Progress bar over the epochs; the training loop iterates over it.
mb = master_bar(range(model_params['num_epochs']))

In [None]:
# Where the best checkpoint is written. `home_directory` and `sub_dir` were
# referenced here but never defined in this notebook, so the checkpoint goes
# to a 'models' folder next to the audio folder under a fixed run name.
model_dir = os.path.join(os.path.dirname(base_dir), 'models')
os.makedirs(model_dir, exist_ok=True)
run_name = 'train_val_split'
best_model_path = os.path.join(model_dir, run_name + '_best_model.pt')

if len(valid_loader.dataset) == 0:
    raise ValueError('The validation set is empty; cannot evaluate the model.')

# Out-of-fold bookkeeping (a single train/val run contributes one entry each).
oof_labels = []
oof_preds = []

best_accuracy = -1.0
best_preds = []
valid_targets = []
epoch_preds = []

for epoch in mb:
    start_time = time.time()

    # ---- training pass ----
    model.train()
    avg_loss = 0.
    for x_batch, y_batch in progress_bar(train_loader, parent=mb):
        if USE_GPU:
            x_batch, y_batch = x_batch.cuda(), y_batch.cuda()
        preds = model(x_batch)
        loss = criterion(preds, y_batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        avg_loss += loss.item() / len(train_loader)

    # ---- validation pass (every epoch, without building autograd graphs) ----
    model.eval()
    batch_scores = []
    valid_targets = []  # true labels in loader order (valid_loader is not shuffled)
    avg_val_loss = 0.
    with torch.no_grad():
        for x_batch, y_batch in valid_loader:
            valid_targets.extend(y_batch.tolist())
            if USE_GPU:
                x_batch, y_batch = x_batch.cuda(), y_batch.cuda()
            preds = model(x_batch)
            loss = criterion(preds, y_batch)

            batch_scores.append(torch.sigmoid(preds).cpu().numpy())
            avg_val_loss += loss.item() / len(valid_loader)

    valid_preds = np.concatenate(batch_scores, axis=0)
    epoch_preds = valid_preds.argmax(axis=1).tolist()
    accuracy = sum(1 for x, y in zip(valid_targets, epoch_preds) if x == y) / len(valid_targets)

    # One scheduler step per epoch.
    scheduler.step()

    elapsed = time.time() - start_time
    mb.write(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  accuracy: {accuracy:.4f}  time: {elapsed:.0f}s')

    # Keep the checkpoint and predictions of the best epoch so far.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        torch.save(model.state_dict(), best_model_path)
        best_preds = epoch_preds

print(run_name + ' summary')
print('------------------------')
# classification_report expects (y_true, y_pred), in that order.
print(classification_report(valid_targets, epoch_preds))
print('------------------------')
print('best accuracy: ' + str(best_accuracy))
print('------------------------')
print('\n')

oof_labels.append(valid_targets)
oof_preds.append(best_preds)


oof_labels_flat = [item for sublist in oof_labels for item in sublist]
oof_preds_flat = [item for sublist in oof_preds for item in sublist]

# Guard the division so a run with zero epochs reports 0 instead of crashing.
oof_accuracy = sum(1 for x, y in zip(oof_labels_flat, oof_preds_flat) if x == y) / max(len(oof_labels_flat), 1)

print('------------------------')
print('out-of-fold prediction accuracy: ' + str(oof_accuracy))

IndexError: ignored