In [137]:
#imports
from pydub import AudioSegment
from pydub.utils import make_chunks
import os
from glob import glob

#audio feature extraction library
import librosa
from librosa import feature

import numpy as np
import matplotlib.pyplot as plt

#model
import tensorflow as tf
from spela.spectrogram import Spectrogram 
from spela.melspectrogram import Melspectrogram

from sklearn.model_selection import train_test_split
import soundfile as sf

In [138]:
def findAllAudioFilePaths():
    audioFilesPaths = [y for x in os.walk("Dataset/Youtube Speech Dataset/Dataset") for y in glob(os.path.join(x[0], '*.wav'))]
    return audioFilesPaths

In [139]:
def speakerToLabel(speakerName):
    if speakerName == "Obama":
        return 0
    elif speakerName == "Hillary":
        return 1
    else:
        print("error")
        return -1

In [140]:
def getSpeakerAndAudio(audioPaths):
    audio_Paths = []
    labels = []
    uniqueSpeakers = set()

    for audioPath in audioPaths:
        speakerName = audioPath.split("/")[3]

        audioLength = librosa.get_duration(filename=audioPath)
        
        if audioLength == 1.0:
            audio_Paths.append(audioPath)
            labels.append(speakerToLabel(speakerName))
            uniqueSpeakers.add(speakerName)
        else:
            print("Audio less than 1 second, actual length = {}".format(audioLength))
    
    return audio_Paths, labels, uniqueSpeakers

In [141]:
def getFeatures(audio_Paths):
    
    data_X = []
    
    for path in audio_Paths:
        
        audioFeatureArray = []        
        y, sr = librosa.load(path)

        #mfcc
        mfccArray = librosa.feature.mfcc(y=y, sr=sr)
        
        data_X.append(mfccArray.flatten())
        

    
    return data_X

In [142]:
audioPaths = findAllAudioFilePaths()

audio_Paths, labels, uniqueSpeakers = getSpeakerAndAudio(audioPaths)
 
print("Speakers: {}".format(uniqueSpeakers))
print("Total Dataset size: {}".format(len(audio_Paths)))

data_X = getFeatures(audio_Paths)

print("X data: {}".format(len(data_X)))
print("Y data: {}".format(len(labels)))




Audio less than 1 second, actual length = 0.4819954648526077
Audio less than 1 second, actual length = 0.12598639455782312
Speakers: {'Obama', 'Hillary'}
Total Dataset size: 4620
X data: 4620
Y data: 4620


In [143]:
train_X, test_X, train_Y, test_Y = train_test_split(data_X, labels, test_size=0.2)

train_x = np.array(train_X)
train_y = np.array(train_Y)
test_x = np.array(test_X)
test_y = np.array(test_Y)

print(train_x.shape)

(3696, 880)


In [155]:
# create a model
#from tf.keras.layers import Dense

def create_model(speech_feature):
    model = tf.keras.Sequential()
    
    model.add(tf.keras.layers.Dense(12,input_shape= train_x.shape, activation='relu'))
    model.add(tf.keras.layers.Dense(8, activation='relu'))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    
    model.compile(optimizer=tf.keras.optimizers.Adam(lr=3e-4)
            , loss = "BinaryCrossentropy"
            , metrics = ["accuracy"])
    return model

In [156]:
model = create_model("spectrogram")
model.summary()

Model: "sequential_19"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_41 (Dense)             (None, 3696, 12)          10572     
_________________________________________________________________
dense_42 (Dense)             (None, 3696, 8)           104       
_________________________________________________________________
dense_43 (Dense)             (None, 3696, 1)           9         
Total params: 10,685
Trainable params: 10,685
Non-trainable params: 0
_________________________________________________________________


In [159]:
model.fit(x=train_x, y=train_y, epochs=1, validation_data=(test_x, test_y))




<tensorflow.python.keras.callbacks.History at 0x7fba5e377350>