In [2]:
import tensorflow
import matplotlib.pyplot as plt

In [4]:
#%tensorflow_version 1.x
#For training we are taking meeting ES2015 - ES2015a,ES2015b,ES2015c,ES2015d
import librosa
import pandas as pd
import numpy as np
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides library deprecation warnings — consider narrowing it.
warnings.simplefilter("ignore")

# Recordings of meeting ES2015 that are turned into the training set.
file_list = [
    'ES2015a',
    'ES2015b',
    'ES2015c',
    'ES2015d',
]



def extract_feature(file_name):
    """Build MFCC sub-sequences and per-frame change-point labels for one recording.

    Loads ``SPD/audio/<file_name>.wav`` with librosa, computes 12 MFCCs plus
    their first- and second-order deltas, z-normalises every coefficient track
    over time and stacks them into one feature matrix. The first MFCC row is
    dropped before stacking while both delta blocks keep all 12 rows, giving
    11 + 12 + 12 = 35 feature rows. The matrix is cut into overlapping windows
    of ``int(3.2 * sr / frame_shift)`` frames taken every
    ``int(0.8 * sr / frame_shift)`` frames. A frame is labelled 1 when its first
    or last sample falls strictly inside a +/-75 ms window around a segment end
    (``Offset + Duration``) read from ``SPD/annotations_1.csv``.

    Parameters
    ----------
    file_name : str
        Recording name without directory or extension.

    Returns
    -------
    sub_train_x : list of numpy.ndarray
        One array of shape ``(1, sub_seq_len, n_features)`` per window.
    sub_train_y : list of numpy.ndarray
        One integer 0/1 array of shape ``(1, sub_seq_len)`` per window.
    """
    file = "SPD/audio/" + file_name + ".wav"
    frame_size = 2048
    frame_shift = 512
    y, sr = librosa.load(file)
    print("Sampling Rate: ",sr)

    # MFCC extraction. The signal and sample rate are passed by keyword:
    # librosa >= 0.10 rejects them as positional arguments, and the keyword
    # form is accepted by older releases as well.
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=12, hop_length=frame_shift, n_fft=frame_size)
    mfcc_delta = librosa.feature.delta(mfccs)
    mfcc_delta2 = librosa.feature.delta(mfccs, order=2)

    # Drop the first coefficient from the static features only; the deltas
    # above were computed from all 12 coefficients and keep all 12 rows.
    mfcc = mfccs[1:, ]

    # Per-coefficient z-normalisation over time.
    # NOTE(review): a constant coefficient track has zero std and would yield
    # NaN/inf here — confirm this cannot happen for the input recordings.
    norm_mfcc = (mfcc - np.mean(mfcc, axis=1, keepdims=True)) / np.std(mfcc, axis=1, keepdims=True)
    norm_mfcc_delta = (mfcc_delta - np.mean(mfcc_delta, axis=1, keepdims=True)) / np.std(mfcc_delta, axis=1, keepdims=True)
    norm_mfcc_delta2= (mfcc_delta2 - np.mean(mfcc_delta2, axis=1, keepdims=True)) / np.std(mfcc_delta2, axis=1, keepdims=True)

    ac_feature = np.vstack((norm_mfcc, norm_mfcc_delta, norm_mfcc_delta2))

    # Loading annotation file.
    # NOTE(review): the same CSV is read for every file_name; if it holds
    # segments of several recordings they are all applied to this one —
    # confirm whether the rows should be filtered per recording.
    ann = pd.read_csv('SPD/annotations_1.csv')
    ann['End_point'] = ann['Duration'] + ann['Offset']

    # (start, end) sample positions of a +/-75 ms window around each segment end.
    change_point = []
    for end_point in ann['End_point']:
        dur_1 = int((end_point-0.075)*sr)  # 75 ms to the left
        dur_2 = int((end_point+0.075)*sr)  # 75 ms to the right
        change_point.append((dur_1, dur_2))

    sub_seq_len = int(3.2*sr/frame_shift)
    sub_seq_step= int(0.8*sr/frame_shift)

    feature_len = ac_feature.shape[1]

    # Label every frame once instead of re-testing it for each overlapping
    # window. A frame is positive when its first sample or its last sample
    # (first + frame_size - 1) lies strictly inside any change-point window.
    frame_first = np.arange(feature_len, dtype=np.int64) * frame_shift
    frame_last = frame_first + frame_size - 1
    frame_labels = np.zeros(feature_len, dtype=int)
    for lo, hi in change_point:
        first_inside = (frame_first > lo) & (frame_first < hi)
        last_inside = (frame_last > lo) & (frame_last < hi)
        frame_labels[first_inside | last_inside] = 1

    sub_train_x = []
    sub_train_y = []
    for i in range(0, feature_len-sub_seq_len, sub_seq_step):
        # Features are stored (coefficients, frames); each window is (frames, coefficients).
        sub_seq_x = np.transpose(ac_feature[:, i: i+sub_seq_len])
        sub_train_x.append(sub_seq_x[np.newaxis, :, :])
        # Copy so the returned label arrays do not alias frame_labels.
        lab_y = frame_labels[i: i+sub_seq_len].copy()
        sub_train_y.append(np.reshape(lab_y, (1, sub_seq_len)))
    return sub_train_x, sub_train_y


def load_dataset():
    """Extract features for every recording in ``file_list`` and stack them.

    Each recording is passed to ``extract_feature``; its windows and labels are
    stacked along the first axis, the per-recording shapes are printed, and
    finally all recordings are stacked together.

    Returns
    -------
    tuple of numpy.ndarray
        ``(features, labels)`` — the vertically stacked windows and their
        per-frame labels for all recordings, in ``file_list`` order.
    """
    per_file_x, per_file_y = [], []
    for recording in file_list:
        windows, labels = extract_feature(recording)
        # Collapse the list of (1, ...) arrays into a single array per recording.
        windows = np.vstack(windows)
        labels = np.vstack(labels)
        print(windows.shape)
        print(labels.shape)

        per_file_x.append(windows)
        per_file_y.append(labels)

    print(len(per_file_x))
    print(len(per_file_y))

    stacked_x = np.vstack(per_file_x)
    stacked_y = np.vstack(per_file_y)
    print(stacked_x.shape, stacked_y.shape)
    print('over')
    return stacked_x, stacked_y

In [5]:
from tensorflow import keras
from keras import layers
from keras.layers.core import Dense
from keras.models import Sequential
from keras.layers import Bidirectional, TimeDistributed, Dropout, LayerNormalization
from keras.layers import LSTM
import numpy as np
import keras
from sklearn.preprocessing import LabelEncoder

In [10]:
# Build the full dataset, drop windows with too few positive frames, then
# shuffle and split it into training and validation sets.
all_x, all_y = load_dataset()
print(all_y.shape, np.sum(all_y))

# Keep only windows that contain more than 5 positive (change-point) frames.
# NOTE(review): in the recorded output below, shape and label sum are identical
# before and after this filter, i.e. nothing was dropped — confirm the threshold.
keep_mask = all_y.sum(axis=1) > 5
if not keep_mask.any():
    raise ValueError("no window has more than 5 positive frames; nothing to train on")
all_x = all_x[keep_mask]
all_y = all_y[keep_mask]
print(all_y.shape, np.sum(all_y))

# Add a trailing axis so the labels are (windows, frames, 1).
all_y = all_y[:, :, np.newaxis]

# Seeded shuffle so the train/validation split (and the files saved from it)
# is reproducible across kernel restarts.
# NOTE(review): extract_feature produces overlapping windows (3.2 s long, 0.8 s
# apart), so a random split puts near-duplicate windows on both sides —
# confirm this is acceptable for validation.
SPLIT_SEED = 42
rng = np.random.default_rng(SPLIT_SEED)
indices = rng.permutation(all_x.shape[0])
all_x_random = all_x[indices]
all_y_random = all_y[indices]

# 97 % of the shuffled windows for training, the remainder for validation.
datasize = all_x_random.shape[0]
train_size = int(datasize*0.97)
train_x = all_x_random[0:train_size]
valid_x = all_x_random[train_size:]

train_y = all_y_random[0:train_size]
valid_y = all_y_random[train_size:]

Sampling Rate:  22050
(1449, 137, 35)
(1449, 137)
Sampling Rate:  22050
(2903, 137, 35)
(2903, 137)
Sampling Rate:  22050
(2443, 137, 35)
(2443, 137)
Sampling Rate:  22050
(2443, 137, 35)
(2443, 137)
4
4
(9238, 137, 35) (9238, 137)
over
(9238, 137) 1146113
(9238, 137) 1146113


In [11]:
from tensorflow import keras
from keras import layers
from keras.layers.core import Dense
from keras.models import Sequential
from keras.layers import Bidirectional, TimeDistributed, Dropout, LayerNormalization
from keras.layers import LSTM
import numpy as np
import keras
from sklearn.preprocessing import LabelEncoder

In [12]:
# Per-sample input shape: every axis of train_x except the leading one.
input_shape = train_x.shape[1:]
print(input_shape)

(137, 35)


In [13]:
# Sanity check: shape of the training labels after the trailing axis was added.
print(train_y.shape)

(8960, 137, 1)


In [14]:
from numpy import save

In [15]:
# Persist the train/validation arrays as .npy files under SPD/.
for target_path, split_array in (
    ('SPD/train_x_15.npy', train_x),
    ('SPD/train_y_15.npy', train_y),
    ('SPD/valid_x_15.npy', valid_x),
    ('SPD/valid_y_15.npy', valid_y),
):
    save(target_path, split_array)