## Importing Libraries

In [1]:
import os
import numpy as np
np.random.seed(1969)
import tensorflow as tf
tf.set_random_seed(1969)


from scipy import signal
from glob import glob
import re
import pandas as pd
import gc
from scipy.io import wavfile

from keras import optimizers, losses, activations, models
from keras.layers import GRU, Convolution2D, Dense, Input, Flatten, Dropout, MaxPooling2D, BatchNormalization, Conv3D, ConvLSTM2D,Conv1D,Activation,LSTM
from keras.callbacks import TensorBoard
from keras.models import Sequential
from tqdm import tqdm
from sklearn.model_selection import GroupKFold
from python_speech_features import mfcc
from python_speech_features import delta
from python_speech_features import logfbank
import random
import os
import pandas as pd
import librosa
import glob
import torch
from torch import *
from torch.autograd import Variable
import random
import torch.nn as nn
import torch.nn.functional as F

Using TensorFlow backend.


## Defining Cyclic Learning Rate

In [5]:
from keras.callbacks import *
class CyclicLR(object):
    """Cyclical learning-rate policy for a PyTorch optimizer, advanced once per batch.

    The learning rate of every parameter group oscillates between ``base_lr``
    and ``max_lr`` along a triangular wave whose half-period is ``step_size``
    batches.  The amplitude of the wave is scaled by ``scale_fn``:

    * ``'triangular'``  - constant amplitude (scale is always 1).
    * ``'triangular2'`` - amplitude halves every cycle (``1 / 2**(cycle - 1)``).
    * ``'exp_range'``   - amplitude decays as ``gamma**iteration``.

    Args:
        optimizer: a ``torch.optim.Optimizer`` whose ``param_groups`` are updated.
        base_lr: lower bound; a scalar, or a list/tuple with one value per group.
        max_lr: upper bound; a scalar, or a list/tuple with one value per group.
        step_size: number of batches in half a cycle.
        mode: one of ``'triangular'``, ``'triangular2'``, ``'exp_range'``;
            ignored when ``scale_fn`` is given.
        gamma: decay base used by ``'exp_range'``.
        scale_fn: optional custom scaling function of one argument.
        scale_mode: ``'cycle'`` to feed the cycle number to a custom
            ``scale_fn``; any other value feeds the batch iteration.
        last_batch_iteration: index of the last batch already processed
            (``-1`` to start from scratch).

    Raises:
        TypeError: if ``optimizer`` is not a ``torch.optim.Optimizer``.
        ValueError: if a per-group list has the wrong length, or if ``mode`` is
            unknown and no ``scale_fn`` is supplied.
    """

    def __init__(self, optimizer, base_lr=1e-3, max_lr=6e-3,
                 step_size=2000, mode='triangular', gamma=1.,
                 scale_fn=None, scale_mode='cycle', last_batch_iteration=-1):

        # Resolve the base class through the `torch` module imported at the top
        # of the notebook, so the class does not depend on a bare `Optimizer`
        # name that is only imported in a later cell.
        if not isinstance(optimizer, torch.optim.Optimizer):
            raise TypeError('{} is not an Optimizer'.format(
                type(optimizer).__name__))
        self.optimizer = optimizer

        self.base_lrs = self._expand_per_group('base_lr', base_lr)
        self.max_lrs = self._expand_per_group('max_lr', max_lr)

        self.step_size = step_size

        if mode not in ['triangular', 'triangular2', 'exp_range'] \
                and scale_fn is None:
            raise ValueError('mode is invalid and scale_fn is None')

        self.mode = mode
        self.gamma = gamma

        if scale_fn is None:
            if self.mode == 'triangular':
                self.scale_fn = self._triangular_scale_fn
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = self._triangular2_scale_fn
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = self._exp_range_scale_fn
                self.scale_mode = 'iterations'
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode

        # Write the learning rate for the upcoming batch into the optimizer,
        # then rewind the counter so the first explicit batch_step() lands on
        # that same iteration.
        self.batch_step(last_batch_iteration + 1)
        self.last_batch_iteration = last_batch_iteration

    def _expand_per_group(self, name, value):
        """Return ``value`` as a list holding one entry per optimizer param group."""
        n_groups = len(self.optimizer.param_groups)
        if isinstance(value, (list, tuple)):
            if len(value) != n_groups:
                raise ValueError("expected {} {}, got {}".format(
                    n_groups, name, len(value)))
            return list(value)
        return [value] * n_groups

    def batch_step(self, batch_iteration=None):
        """Advance to ``batch_iteration`` (default: next batch) and update the optimizer's learning rates."""
        if batch_iteration is None:
            batch_iteration = self.last_batch_iteration + 1
        self.last_batch_iteration = batch_iteration
        for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
            param_group['lr'] = lr

    def _triangular_scale_fn(self, x):
        """Constant amplitude."""
        return 1.

    def _triangular2_scale_fn(self, x):
        """Amplitude halves with each cycle number ``x``."""
        return 1 / (2. ** (x - 1))

    def _exp_range_scale_fn(self, x):
        """Amplitude decays exponentially with the iteration count ``x``."""
        return self.gamma**(x)

    def get_lr(self):
        """Return the learning rate of every param group for ``last_batch_iteration``."""
        step_size = float(self.step_size)
        # `cycle` is the 1-based index of the current triangle; `x` is the
        # distance from the triangle's peak (0 at the peak, 1 at either base).
        cycle = np.floor(1 + self.last_batch_iteration / (2 * step_size))
        x = np.abs(self.last_batch_iteration / step_size - 2 * cycle + 1)

        lrs = []
        param_lrs = zip(self.optimizer.param_groups, self.base_lrs, self.max_lrs)
        for _group, base_lr, max_lr in param_lrs:
            base_height = (max_lr - base_lr) * np.maximum(0, (1 - x))
            if self.scale_mode == 'cycle':
                lr = base_lr + base_height * self.scale_fn(cycle)
            else:
                lr = base_lr + base_height * self.scale_fn(self.last_batch_iteration)
            lrs.append(lr)
        return lrs

## Utility Functions

In [6]:
# Target clip length (in samples) and sampling rate shared by the helpers below.
L = 16000
AUDIO_LENGTH = 16000
AUDIO_SR = 16000

# Bank of waveforms that tf_random_add_noise_transform mixes into a clip.
# The clips are a handful of utterances taken from the training folders.
AUDIO_NOISES = []
for file in (
    'meld/train/disgust/MEL_dia1005_utt13_negative_DIS.wav',
    'meld/train/fear/MEL_dia133_utt15_negative_FEA.wav',
    'meld/train/happy/MEL_dia95_utt14_positive_HAP.wav',
    'meld/train/neutral/MEL_dia96_utt19_neutral_NEU.wav',
    'meld/train/sad/MEL_dia124_utt12_negative_SAD.wav',
    'meld/train/sad/MEL_dia148_utt0_negative_SAD.wav',
):
    # librosa returns (waveform, sample_rate); only the waveform is kept.
    wave = librosa.core.load(file, sr=AUDIO_SR)[0]
    AUDIO_NOISES.append(wave)

def custom_fft(y, fs):
    """Return the single-sided amplitude spectrum of a 1-D signal.

    Args:
        y: 1-D array of samples.
        fs: sampling rate of ``y`` in Hz.

    Returns:
        ``(xf, vals)`` where ``xf`` holds ``N // 2`` evenly spaced values from
        0 to ``fs / 2`` and ``vals`` holds the magnitude of the first
        ``N // 2`` FFT bins scaled by ``2 / N``.
    """
    T = 1.0 / fs
    N = y.shape[0]
    # Use NumPy's FFT explicitly: no `fft` name is imported by the notebook,
    # so the bare call depended on whatever a star import happened to export.
    yf = np.fft.fft(y)
    # NOTE(review): linspace includes the endpoint fs/2, so these values are
    # slightly stretched relative to the true bin centres k*fs/N - confirm
    # whether exact bin frequencies are needed before relying on `xf`.
    xf = np.linspace(0.0, 1.0 / (2.0 * T), N // 2)
    # The FFT of a real signal is symmetric, so only the first half is kept,
    # and it is complex, so the magnitude is taken.
    vals = 2.0 / N * np.abs(yf[0:N // 2])
    return xf, vals

# Log-spectrogram
def log_specgram(audio, sample_rate, window_size=20,
                 step_size=10, eps=1e-10):
    """Compute a log-scaled spectrogram with a Hann window.

    Args:
        audio: 1-D array of samples.
        sample_rate: sampling rate of ``audio`` in Hz.
        window_size: analysis window length in milliseconds.
        step_size: hop between consecutive windows in milliseconds.
        eps: constant added before the logarithm to avoid ``log(0)``.

    Returns:
        ``(freqs, times, log_spec)`` where ``log_spec`` is the float32 natural
        log of the spectrogram, transposed so that time is the first axis.

    Raises:
        ValueError: if the hop is not positive or is longer than the window.
    """
    nperseg = int(round(window_size * sample_rate / 1e3))
    hop = int(round(step_size * sample_rate / 1e3))
    if not 0 < hop <= nperseg:
        raise ValueError(
            "step_size must be positive and no longer than window_size "
            "(got window_size={}, step_size={})".format(window_size, step_size))
    # scipy expects the number of *overlapping* samples, not the hop.  The two
    # coincide only when the hop is exactly half the window (the defaults),
    # so derive the overlap from the hop to keep `step_size` meaningful.
    noverlap = nperseg - hop
    freqs, times, spec = signal.spectrogram(audio,
                                            fs=sample_rate,
                                            window='hann',
                                            nperseg=nperseg,
                                            noverlap=noverlap,
                                            detrend=False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)

# Padding helper
def pad_audio(samples):
    """Left-pad ``samples`` with zeros up to the module-level length ``L``.

    Inputs that already hold at least ``L`` samples are returned unchanged
    (the very same object, not a copy).
    """
    missing = L - len(samples)
    if missing <= 0:
        return samples
    # All of the padding goes in front of the signal; nothing is appended.
    return np.pad(samples, pad_width=(missing, 0), mode='constant', constant_values=(0, 0))
    
# Chopping helper
def chop_audio(samples, L=16000, num=20):
    """Yield ``num`` randomly positioned windows of ``L`` samples.

    Args:
        samples: 1-D sequence to cut windows from.
        L: window length in samples.
        num: number of windows to yield.

    Yields:
        Slices ``samples[beg:beg + L]`` with ``beg`` drawn uniformly from every
        valid start position.

    Raises:
        ValueError: on first iteration, if ``samples`` is shorter than ``L``.
    """
    last_start = len(samples) - L
    if last_start < 0:
        raise ValueError(
            "samples has {} elements, fewer than the window length {}".format(
                len(samples), L))
    for _ in range(num):
        # randint's upper bound is exclusive, so add 1: the last window stays
        # reachable and an input of exactly L samples no longer raises.
        beg = np.random.randint(0, last_start + 1)
        yield samples[beg: beg + L]
        
# Magnitude spectrogram of a waveform
def get_spectrogram(wav):
    """Return the magnitude of the short-time Fourier transform of ``wav``.

    The STFT uses a 480-point FFT, a 480-sample Hamming window and a hop of
    160 samples; the phase component is discarded.
    """
    stft_matrix = librosa.stft(wav, n_fft=480, hop_length=160,
                               win_length=480, window='hamming')
    magnitude, _phase = librosa.magphase(stft_matrix)
    return magnitude

# Random additive-noise augmentation
def tf_random_add_noise_transform(wave, noise_limit=0.2, u=0.5):
    """With probability ``u``, mix a random excerpt of a noise clip into ``wave``.

    Args:
        wave: 1-D waveform.
        noise_limit: upper bound (exclusive) of the random gain applied to the
            noise before mixing.
        u: probability of applying the transform.

    Returns:
        ``wave`` unchanged, or ``alpha * noise + wave`` clipped to ``[-1, 1]``,
        where the noise clip is drawn from the module-level ``AUDIO_NOISES``.
    """
    if random.random() < u:
        noise = AUDIO_NOISES[np.random.choice(len(AUDIO_NOISES))]

        wave_length = len(wave)
        if len(noise) < wave_length:
            # The noise clip is shorter than the signal: repeat it until it
            # covers the signal instead of failing on a negative random range.
            repeats = -(-wave_length // len(noise))  # ceiling division
            noise = np.tile(noise, repeats)

        # Any offset that leaves a full-length excerpt is valid; the +1 makes
        # the (exclusive) upper bound include the last one.
        t = np.random.randint(0, len(noise) - wave_length + 1)
        noise = noise[t:t + wave_length]

        alpha = np.random.random() * noise_limit
        wave = np.clip(alpha * noise + wave, -1, 1)

    return wave

# Random time-shift augmentation
def tf_random_time_shift_transform(wave, shift_limit=0.2, u=0.5):
    """With probability ``u``, shift ``wave`` in time, filling the gap with zeros.

    Args:
        wave: 1-D waveform.
        shift_limit: maximum shift as a fraction of the waveform length.
        u: probability of applying the transform.

    Returns:
        A waveform of the same length.  It is the input itself when the
        transform is skipped or when the clip is too short for the limit to
        allow a shift of at least one sample.
    """
    if random.random() < u:
        wave_length = len(wave)
        # randint needs integer bounds; truncate explicitly rather than hand
        # it floats.
        max_shift = int(shift_limit * wave_length)
        if max_shift < 1:
            # Nothing to shift by; an empty random range would raise.
            return wave
        shift = np.random.randint(-max_shift, max_shift)
        t0 = -min(0, shift)  # zeros prepended when the shift is negative
        t1 = max(0, shift)   # zeros appended when the shift is positive
        wave = np.pad(wave, (t0, t1), 'constant')
        # Trim the opposite end so the length is preserved.
        wave = wave[:-t0] if t0 else wave[t1:]

    return wave

# Random pad / crop to a fixed length
def tf_random_pad_transform(wave, length=AUDIO_LENGTH):
    """Bring ``wave`` to exactly ``length`` samples at a random offset.

    Shorter inputs are zero-padded with the padding split randomly between the
    two ends; longer inputs are cropped to a randomly positioned window.
    Inputs that already have ``length`` samples are returned unchanged.

    Args:
        wave: 1-D waveform.
        length: target number of samples (defaults to ``AUDIO_LENGTH``).
    """
    # The `length` argument is honoured here; previously the body always used
    # the module constant, silently ignoring the caller's value.
    excess = len(wave) - length

    if excess < 0:
        gap = -excess
        start = np.random.choice(gap)
        wave = np.pad(wave, (start, gap - start), 'constant')

    elif excess > 0:
        start = np.random.choice(excess)
        wave = wave[start: start + length]

    return wave

# Deterministic pad / crop to a fixed length (used for validation/test clips)
def tf_fix_pad_transform(wave, length=AUDIO_LENGTH):
    """Bring ``wave`` to exactly ``length`` samples, centred.

    Shorter inputs are zero-padded on both sides (the extra sample, if any,
    goes at the end); longer inputs are cropped to the central window.
    Inputs that already have ``length`` samples are returned unchanged.

    Args:
        wave: 1-D waveform.
        length: target number of samples (defaults to ``AUDIO_LENGTH``).
    """
    # The `length` argument is honoured here; previously the body always used
    # the module constant, silently ignoring the caller's value.
    excess = len(wave) - length

    if excess < 0:
        gap = -excess
        start = gap // 2
        wave = np.pad(wave, (start, gap - start), 'constant')

    elif excess > 0:
        start = excess // 2
        wave = wave[start: start + length]

    return wave


def tf_random_scale_amplitude_transform(wave, scale_limit=0.1, u=0.5):
    """With probability ``u``, rescale the amplitude of ``wave`` by a random factor.

    Args:
        wave: waveform array.
        scale_limit: maximum relative change; the factor is drawn uniformly
            from ``[1 - scale_limit, 1 + scale_limit)``.
        u: probability of applying the transform.

    Returns:
        ``wave`` unchanged, or ``wave`` multiplied by the random factor.
    """
    if random.random() < u:
        # The previous integer draw `randint(-scale_limit, scale_limit)`
        # raises for any limit below 1 (both bounds truncate to 0), and for
        # larger limits it would multiply the signal by 0 or a negative integer.
        # NOTE(review): a small perturbation around 1 is assumed to be the
        # intended augmentation - confirm.
        scale = 1.0 + np.random.uniform(-scale_limit, scale_limit)
        wave = scale * wave
    return wave

# Waveform -> MFCC
def tf_wave_to_mfcc(wave):
    """Convert a waveform into 40 MFCC-style coefficients per frame.

    A 40-band mel power spectrogram (480-point FFT, hop 160, 20-4000 Hz) is
    log-compressed wherever it is positive and then projected onto a 40x40
    DCT basis.

    Returns:
        float32 array with one column per spectrogram frame.
    """
    # `y=` is passed by keyword: recent librosa releases reject a positional
    # audio argument.
    spectrogram = librosa.feature.melspectrogram(y=wave, sr=AUDIO_SR, n_mels=40, hop_length=160, n_fft=480, fmin=20, fmax=4000)

    # Plain boolean mask.  Wrapping the mask in a list (`[spectrogram > 0]`)
    # is interpreted as an extra leading axis by current NumPy and fails.
    positive = spectrogram > 0
    spectrogram[positive] = np.log(spectrogram[positive])  # zeros stay at 0

    # NOTE(review): librosa.filters.dct appears to be missing from newer
    # librosa releases - confirm against the pinned version.
    dct_filters = librosa.filters.dct(n_filters=40, n_input=40)
    # One matrix product handles every frame at once; it replaces the
    # split / per-column matmul / hstack round trip.
    mfcc = np.matmul(dct_filters, spectrogram).astype(np.float32)

    return mfcc

# Waveform -> mel spectrogram (dB)
def tf_wave_to_melspectrogram(wave):
    """Convert a waveform into a 40-band mel spectrogram expressed in decibels.

    Uses a 480-point FFT, a hop of 160 samples and mel bands spanning
    20-4000 Hz at the module-level sampling rate ``AUDIO_SR``.

    Returns:
        float32 array produced by ``librosa.power_to_db``.
    """
    # `y=` is passed by keyword: recent librosa releases reject a positional
    # audio argument, while older ones accept the keyword as well.
    mel_power = librosa.feature.melspectrogram(y=wave, sr=AUDIO_SR, n_mels=40, hop_length=160, n_fft=480, fmin=20, fmax=4000)
    return librosa.power_to_db(mel_power).astype(np.float32)


# Waveform -> stacked mel spectrogram (dB) and MFCC
def tf_wave_to_melspectrogram_mfcc(wave):
    """Convert a waveform into a two-channel feature: dB mel spectrogram + MFCC.

    Both channels are derived from the same 40-band mel power spectrogram
    (480-point FFT, hop 160, 5-4500 Hz).

    Returns:
        float32 array whose first axis has length 2: index 0 is the
        spectrogram in decibels, index 1 the DCT of the log spectrogram.
    """
    # `y=` is passed by keyword: recent librosa releases reject a positional
    # audio argument.
    power_spec = librosa.feature.melspectrogram(y=wave, sr=AUDIO_SR, n_mels=40, hop_length=160, n_fft=480, fmin=5, fmax=4500)

    # Work on a copy so the power spectrogram survives for the dB conversion.
    # Previously the log was written in place and power_to_db was then applied
    # to already log-compressed values.
    # NOTE(review): this changes the first output channel - confirm no saved
    # model depends on the old values.
    log_spec = power_spec.copy()
    positive = log_spec > 0  # plain boolean mask; a list-wrapped mask fails on current NumPy
    log_spec[positive] = np.log(log_spec[positive])

    # NOTE(review): librosa.filters.dct appears to be missing from newer
    # librosa releases - confirm against the pinned version.
    dct_filters = librosa.filters.dct(n_filters=40, n_input=40)
    mfcc = np.matmul(dct_filters, log_spec).astype(np.float32)

    spectrogram = librosa.power_to_db(power_spec).astype(np.float32)

    # Avoid naming the result `all`, which would shadow the builtin.
    stacked = np.concatenate((spectrogram[np.newaxis, :], mfcc[np.newaxis, :]))
    return stacked

# Waveform -> natural-log mel spectrogram
def tf_wave_to_melspectrogram1(wave):
    """Convert a waveform into a 40-band mel spectrogram on a natural-log scale.

    Uses a 480-point FFT, a hop of 160 samples and mel bands spanning
    20-4000 Hz.  Entries that are exactly zero are left at zero rather than
    mapped to ``-inf``.

    Returns:
        float32 array.
    """
    # `y=` is passed by keyword: recent librosa releases reject a positional
    # audio argument.
    spectrogram = librosa.feature.melspectrogram(y=wave, sr=AUDIO_SR, n_mels=40, hop_length=160, n_fft=480, fmin=20, fmax=4000)
    # Plain boolean mask.  Wrapping the mask in a list (`[spectrogram > 0]`)
    # is interpreted as an extra leading axis by current NumPy and fails.
    positive = spectrogram > 0
    spectrogram[positive] = np.log(spectrogram[positive])
    return spectrogram.astype(np.float32)


In [10]:
# Training-time pipeline: random augmentation followed by feature extraction
def train_augment(wave):
    """Turn a raw training waveform into a mel-spectrogram with a leading axis.

    Steps: random time shift (applied half of the time, up to 20% of the clip
    length), random pad/crop to the fixed clip length, dB mel spectrogram.
    The additive-noise transform is not applied here.
    """
    shifted = tf_random_time_shift_transform(wave, shift_limit=0.2, u=0.5)
    fixed_length = tf_random_pad_transform(shifted)
    mel = tf_wave_to_melspectrogram(fixed_length)
    # A new leading axis is added in front of the spectrogram.
    return mel[np.newaxis, :]

# Validation/test-time pipeline: deterministic, no random augmentation
def valid_augment(wave):
    """Turn a raw validation waveform into a mel-spectrogram with a leading axis.

    The clip is centre-padded or centre-cropped to the fixed clip length and
    converted to a dB mel spectrogram.
    """
    centred = tf_fix_pad_transform(wave)
    mel = tf_wave_to_melspectrogram(centred)
    # A new leading axis is added in front of the spectrogram.
    return mel[np.newaxis, :]

## Creating training and validation files

In [65]:
# Manual Label Encoding
mappings={"disgust":int(0),"fear":int(1),"happy":int(2),"neutral":int(3),"sad":int(4)}

In [88]:
# Load every training clip, run the training-time pipeline on it, and encode
# its label from the name of the folder it lives in.
new_sample_rate = 16000
train_root = 'meld/train/'

y_train = []
x_train = []
# Sorted listings keep the sample order identical from run to run;
# os.listdir makes no ordering guarantee.
for label in sorted(os.listdir(train_root)):
    if label.startswith('.'):
        # Hidden entries (e.g. .ipynb_checkpoints) are not emotion folders.
        continue
    if label not in mappings:
        # Fail here with a clear message instead of later, during the int64 cast.
        raise ValueError("Unknown label folder {!r} in {}; expected one of {}".format(
            label, train_root, sorted(mappings)))
    label_dir = os.path.join(train_root, label)
    for fname in sorted(os.listdir(label_dir)):
        samples, sample_rate = librosa.core.load(os.path.join(label_dir, fname),
                                                 mono=True, sr=new_sample_rate)
        # Augmentation is applied once, at load time.
        x_train.append(train_augment(samples))
        y_train.append(mappings[label])

# Features become a torch tensor; labels stay a NumPy int64 array for now.
x_train = torch.from_numpy(np.array(x_train))
y_train = np.array(y_train, dtype="int64")

In [96]:
# Load every validation clip, run the deterministic pipeline on it, and encode
# its label from the name of the folder it lives in.
new_sample_rate = 16000
valid_root = 'meld/val/'

y_test = []
x_test = []
# Sorted listings keep the sample order identical from run to run;
# os.listdir makes no ordering guarantee.
for label in sorted(os.listdir(valid_root)):
    if label.startswith('.'):
        # Hidden entries (e.g. .ipynb_checkpoints) are not emotion folders.
        continue
    if label not in mappings:
        # Fail here with a clear message instead of later, during the int64 cast.
        raise ValueError("Unknown label folder {!r} in {}; expected one of {}".format(
            label, valid_root, sorted(mappings)))
    label_dir = os.path.join(valid_root, label)
    for fname in sorted(os.listdir(label_dir)):
        samples, sample_rate = librosa.core.load(os.path.join(label_dir, fname),
                                                 mono=True, sr=new_sample_rate)
        # No random augmentation for validation data.
        x_test.append(valid_augment(samples))
        y_test.append(mappings[label])

# Both stay NumPy arrays here; they are converted to tensors further down.
x_test = np.array(x_test)
y_test = np.array(y_test, dtype="int64")

## Defining Basic Convnet

In [196]:
class Cnn_Trad_Pool2_Net(nn.Module):
    """Baseline two-layer convolutional classifier.

    Architecture: conv(8x20, 64) -> ReLU -> 2x2 max-pool -> conv(4x10, 64)
    -> ReLU -> flatten -> dropout(0.5) -> linear.

    Args:
        in_shape: ``(channels, height, width)`` of one input sample.  The
            width of the final linear layer is derived from it; with the
            default shape that width is 26624.
        num_classes: number of output logits.
    """

    def __init__(self, in_shape=(1,40,101), num_classes=5 ):

        super(Cnn_Trad_Pool2_Net, self).__init__()
        self.num_classes = num_classes
        in_channels = in_shape[0]

        self.conv1 = nn.Conv2d(in_channels, 64, kernel_size=(8, 20), stride=(1, 1))
        self.conv2 = nn.Conv2d(64, 64, kernel_size=(4, 10), stride=(1, 1))

        # Push a dummy sample through the convolutional stack to measure the
        # flattened feature size rather than hard-coding it for one input shape.
        with torch.no_grad():
            n_features = self._extract_features(torch.zeros(1, *in_shape)).size(1)
        self.fc = nn.Linear(n_features, num_classes)

    def _extract_features(self, x):
        """Run the convolutional stack and flatten everything except the batch axis."""
        x = F.relu(self.conv1(x), inplace=True)
        x = F.max_pool2d(x, kernel_size=(2, 2), stride=(2, 2))
        x = F.relu(self.conv2(x), inplace=True)
        return x.view(x.size(0), -1)

    def forward(self, x):
        """Return unnormalised class scores (logits) for a batch ``x``."""
        x = self._extract_features(x)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.fc(x)

        return x  # logits

## Final Convnet with residual blocks

In [80]:
class ConvBn2d(nn.Module):
    """Bias-free 2-D convolution optionally followed by batch normalisation.

    Passing ``is_bn=False`` (the literal ``False``) leaves the ``bn`` slot
    registered but empty, so ``forward`` returns the raw convolution output.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, padding=1, dilation=1, stride=1, groups=1, is_bn=True):
        super(ConvBn2d, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, padding=padding, stride=stride, dilation=dilation, groups=groups, bias=False)
        # Register the slot either way so `bn` is always a known child name.
        # Only the identity `False` disables it, exactly as before.
        self.add_module('bn', None if is_bn is False else nn.BatchNorm2d(out_channels))

    def forward(self, x):
        out = self.conv(x)
        return out if self.bn is None else self.bn(out)

class SeScale(nn.Module):
    """Per-channel gate computed from globally pooled features.

    The input is average-pooled to 1x1, passed through two 1x1 convolutions
    (``channel -> reduction -> channel``) with a ReLU in between, and squashed
    with a sigmoid.

    Args:
        channel: number of input (and output) channels.
        reduction: number of channels in the bottleneck between the two
            convolutions (used directly as a width, not as a divisor).
    """

    def __init__(self, channel, reduction=16):
        super(SeScale, self).__init__()
        self.fc1 = nn.Conv2d(channel, reduction, kernel_size=1, padding=0)
        self.fc2 = nn.Conv2d(reduction, channel, kernel_size=1, padding=0)

    def forward(self, x):
        """Return gates in (0, 1) with spatial size 1x1 and ``channel`` channels."""
        x = F.adaptive_avg_pool2d(x,1)
        x = self.fc1(x)
        x = F.relu(x, inplace=True)
        x = self.fc2(x)
        # torch.sigmoid replaces F.sigmoid, which is deprecated in the torch
        # 1.x line and warns on every call there.
        x = torch.sigmoid(x)
        return x


class ResBlock(nn.Module):
    """Residual block: two 3x3 conv+BN layers, SeScale channel gating, identity skip.

    The skip connection adds the input directly, so the block requires
    ``in_planes == out_planes``.
    """

    def __init__(self, in_planes, out_planes, reduction=16):
        super(ResBlock, self).__init__()
        assert in_planes == out_planes

        self.conv_bn1 = ConvBn2d(in_planes, out_planes, kernel_size=3, padding=1, stride=1)
        self.conv_bn2 = ConvBn2d(out_planes, out_planes, kernel_size=3, padding=1, stride=1)
        self.scale = SeScale(out_planes, reduction)

    def forward(self, x):
        # conv -> ReLU -> conv
        residual = self.conv_bn2(F.relu(self.conv_bn1(x), inplace=True))
        # Re-weight the channels of the residual branch, then add the skip path.
        gated = self.scale(residual) * residual + x
        return F.relu(gated, inplace=True)



## net ##-------

class SeResNet3(nn.Module):
    """Five-stage convolutional classifier built from ConvBn2d and ResBlock layers.

    Stages 1-4 widen the channels (16, 32, 64, 128) with a 3x3 conv+BN and then
    apply one (stage 1) or two residual blocks; stages 1-3 end with a 2x2
    max-pool.  Stage 5 widens to 256 channels, pools globally, and feeds a
    256-unit linear layer followed by the classification layer.

    Args:
        in_shape: ``(channels, height, width)``; only the channel count is used.
        num_classes: number of output logits.
    """

    def __init__(self, in_shape=(1,40,101), num_classes=5 ):
        super(SeResNet3, self).__init__()
        in_channels = in_shape[0]
        conv3x3 = dict(kernel_size=(3, 3), stride=(1, 1))

        # Stage 1
        self.layer1a = ConvBn2d(in_channels, 16, **conv3x3)
        self.layer1b = ResBlock(16, 16)

        # Stage 2
        self.layer2a = ConvBn2d(16, 32, **conv3x3)
        self.layer2b = ResBlock(32, 32)
        self.layer2c = ResBlock(32, 32)

        # Stage 3
        self.layer3a = ConvBn2d(32, 64, **conv3x3)
        self.layer3b = ResBlock(64, 64)
        self.layer3c = ResBlock(64, 64)

        # Stage 4
        self.layer4a = ConvBn2d(64, 128, **conv3x3)
        self.layer4b = ResBlock(128, 128)
        self.layer4c = ResBlock(128, 128)

        # Stage 5 and classifier head
        self.layer5a = ConvBn2d(128, 256, **conv3x3)
        self.layer5b = nn.Linear(256, 256)

        self.fc = nn.Linear(256, num_classes)

    def forward(self, x):
        """Return unnormalised class scores (logits) for a batch ``x``."""

        def halve(t):
            # 2x2 max-pool with stride 2.
            return F.max_pool2d(t, kernel_size=(2, 2), stride=(2, 2))

        def drop(t, p):
            # Dropout that is active only in training mode.
            return F.dropout(t, p=p, training=self.training)

        x = F.relu(self.layer1a(x), inplace=True)
        x = halve(self.layer1b(x))

        x = F.relu(self.layer2a(drop(x, 0.1)), inplace=True)
        x = halve(self.layer2c(self.layer2b(x)))

        x = F.relu(self.layer3a(drop(x, 0.2)), inplace=True)
        x = halve(self.layer3c(self.layer3b(x)))

        # Stage 4 has no pooling.
        x = F.relu(self.layer4a(drop(x, 0.2)), inplace=True)
        x = self.layer4c(self.layer4b(x))

        x = F.relu(self.layer5a(drop(x, 0.2)), inplace=True)
        x = F.adaptive_avg_pool2d(x, 1)
        x = x.view(x.size(0), -1)
        x = F.relu(self.layer5b(x))

        return self.fc(drop(x, 0.2))  # logits

In [None]:
# Build the network on the GPU.  H and W are the height and width passed as
# the model's input shape.
H, W = 40, 101

# The baseline Cnn_Trad_Pool2_Net(in_shape=(1, H, W), num_classes=5) can be
# swapped in here instead.
model = SeResNet3(in_shape=(1, H, W), num_classes=5)
model = model.cuda()
model  # last expression of the cell: shows the architecture

## Training

In [82]:
from torch.utils.data import Dataset, DataLoader
class MyDataset(Dataset):
    """Wrap a dataset of ``(data, target)`` pairs so every item also reports its index."""

    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        sample, label = self.dataset[index]
        return sample, label, index

In [83]:
# Move features and labels to the GPU.  torch.as_tensor replaces the legacy
# type constructors (torch.Tensor / torch.cuda.LongTensor); it accepts NumPy
# arrays and existing tensors alike, so re-running the cell is intended to be
# harmless.
x_train = torch.as_tensor(x_train, dtype=torch.float32).cuda()
x_test = torch.as_tensor(x_test, dtype=torch.float32).cuda()
y_train = torch.as_tensor(y_train, dtype=torch.int64).cuda()
y_test = torch.as_tensor(y_test, dtype=torch.int64).cuda()

# Pair features with labels, then wrap so each item also carries its index.
train = MyDataset(torch.utils.data.TensorDataset(x_train, y_train))
valid = MyDataset(torch.utils.data.TensorDataset(x_test, y_test))

# Loaders: training batches are reshuffled; validation order stays fixed so
# predictions line up with y_test.
train_loader = torch.utils.data.DataLoader(train, batch_size=128, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid, batch_size=128, shuffle=False)

In [84]:
# Optimizer and cyclic learning-rate schedule
from torch.optim.optimizer import Optimizer

step_size = 300      # batches per half cycle
base_lr = 0.001      # lower bound of the cycle
max_lr = 0.005       # upper bound of the cycle

# Only parameters that require gradients are handed to SGD.  The `lr` given
# here is overwritten by the scheduler as soon as it is constructed.
optimizer = torch.optim.SGD(
    (p for p in model.parameters() if p.requires_grad),
    lr=max_lr,
    momentum=0.9,
    weight_decay=0.0001,
)

scheduler = CyclicLR(
    optimizer,
    base_lr=base_lr,
    max_lr=max_lr,
    step_size=step_size,
    mode='exp_range',
    gamma=0.99994,
)

In [85]:
# Loss function and gradient-clipping helper
from torch.nn import *
from torch.nn.utils import clip_grad_norm_

# Multi-class cross-entropy over the logits returned by the network.
criterion = nn.CrossEntropyLoss()

In [None]:
# Training
import time  # explicit import: `time` was otherwise only reachable through a star import

n_epochs = 20
for epoch in range(n_epochs):
    start_time = time.time()

    # ---- training pass ----
    model.train()
    avg_loss = 0.
    for x_batch, y_batch, index in train_loader:
        y_pred = model(x_batch)

        # Set this batch's learning rate before the optimizer step.
        if scheduler:
            scheduler.batch_step()

        optimizer.zero_grad()

        # The loss was never computed before `loss.backward()`, so the loop
        # stopped with a NameError on the first batch.
        loss = criterion(y_pred, y_batch)
        loss.backward()
        # Cap the global gradient norm at 1.0.
        clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        avg_loss += loss.item() / len(train_loader)

    # ---- validation pass ----
    model.eval()
    avg_val_loss = 0.
    # No gradients are needed for evaluation; skipping the autograd graph
    # saves memory and time.
    with torch.no_grad():
        for x_batch, y_batch, index in valid_loader:
            y_pred = model(x_batch)
            avg_val_loss += criterion(y_pred, y_batch).item() / len(valid_loader)

    elapsed_time = time.time() - start_time
    print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f} \t time={:.2f}s'.format(
        epoch + 1, n_epochs, avg_loss, avg_val_loss, elapsed_time))

## Saving the model and loading it for predictions

In [52]:
# Persist the trained weights to disk.
torch.save(model.state_dict(), "model.pth")

# Rebuild the same architecture, restore the weights, and move it to the GPU.
modeltest = SeResNet3(in_shape=(1, H, W), num_classes=5)
modeltest.load_state_dict(torch.load("model.pth"))
modeltest = modeltest.cuda()
modeltest.eval()  # inference mode; as the last expression it also shows the architecture

In [54]:
# Predict a class for every validation clip with the reloaded model.
predictions = []  # one list of predicted class ids per batch
# Inference needs no gradients; disabling autograd avoids building a graph
# that was previously created and then thrown away by .detach().
with torch.no_grad():
    for x_batch, y_batch, index in valid_loader:
        y_pred = modeltest(x_batch)
        prediction = torch.argmax(y_pred, dim=1)
        predictions.append(prediction.tolist())

# Flatten the per-batch lists into a single list, in validation-loader order.
flattened_list = [label for batch in predictions for label in batch]