In [41]:
#imports
from pydub import AudioSegment
from pydub.utils import make_chunks
import os
from glob import glob

#audio feature extraction library
import librosa
from librosa import feature

import numpy as np
import matplotlib.pyplot as plt

#model
import tensorflow as tf
from spela.spectrogram import Spectrogram 
from spela.melspectrogram import Melspectrogram

from sklearn.model_selection import train_test_split
import soundfile as sf

In [42]:
def findAllAudioFilePaths():
    audioFilesPaths = [y for x in os.walk("Dataset/Youtube Speech Dataset/Dataset") for y in glob(os.path.join(x[0], '*.wav'))]
    return audioFilesPaths

In [43]:
speakers = ["Obama", "Hillary", "Ivanka", "Trump", "No Speaker"]

def speakerToLabel(speakerName):
    index = speakers.index(speakerName)
    
    if(index == -1):
        print("Speaker not found: {}".format(speakerName))
    
    return index

In [44]:
def getSpeakerAndAudio(audioPaths):
    audio_Paths = []
    labels = []
    uniqueSpeakers = {}

    for audioPath in audioPaths:
        speakerName = audioPath.split("/")[3]

        audioLength = librosa.get_duration(filename=audioPath)
        
        if audioLength == 1.0:
            audio_Paths.append(audioPath)
            labels.append(speakerToLabel(speakerName))
            uniqueSpeakers[speakerName] = uniqueSpeakers.get(speakerName, 0) + 1
        else:
            print("Audio clip discarded, actual length = {}".format(audioLength))
    
    return audio_Paths, labels, uniqueSpeakers

In [45]:
def getFeatures(audio_Paths):
    
    data_X = []
    
    for path in audio_Paths:
        
        audioFeatureArray = []        
        y, sr = librosa.load(path)

        #mfcc
        mfccArray = librosa.feature.mfcc(y=y, sr=sr)
        
        data_X.append(mfccArray.flatten())
        

    
    return data_X

In [46]:
audioPaths = findAllAudioFilePaths()
audio_Paths, labels, uniqueSpeakers = getSpeakerAndAudio(audioPaths)
 
    
print("Speakers: {}".format(uniqueSpeakers))
print("Total Dataset size: {}".format(len(audio_Paths)))


data_X = getFeatures(audio_Paths)

print("X data: {}".format(len(data_X)))
print("Y data: {}".format(len(labels)))




Audio clip discarded, actual length = 0.221
Audio clip discarded, actual length = 0.4819954648526077
Audio clip discarded, actual length = 0.12598639455782312
Audio clip discarded, actual length = 0.7489795918367347
Speakers: {'No Speaker': 579, 'Hillary': 3452, 'Ivanka': 1075, 'Obama': 1168, 'Trump': 2496}
Total Dataset size: 8770
X data: 8770
Y data: 8770


In [47]:
train_X, test_X, train_Y, test_Y = train_test_split(data_X, labels, test_size=0.2)

train_x = np.array(train_X)
train_y = np.array(train_Y)
test_x = np.array(test_X)
test_y = np.array(test_Y)

train_y = tf.keras.utils.to_categorical(train_y)
test_y = tf.keras.utils.to_categorical(test_y)


In [48]:
# create a model
#from tf.keras.layers import Dense

def create_model():
    model = tf.keras.Sequential()
    
    model.add(tf.keras.layers.Dense(12,input_shape= train_x.shape, activation='relu'))
    model.add(tf.keras.layers.Dense(8, activation='relu'))
    model.add(tf.keras.layers.Dense(len(speakers), activation='sigmoid'))
    
    model.compile(optimizer=tf.keras.optimizers.Adam(lr=3e-4)
            , loss = "categorical_crossentropy"
            , metrics = ["accuracy"])
    return model

In [49]:
model = create_model()
model.summary()


Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_18 (Dense)             (None, 7016, 12)          10572     
_________________________________________________________________
dense_19 (Dense)             (None, 7016, 8)           104       
_________________________________________________________________
dense_20 (Dense)             (None, 7016, 5)           45        
Total params: 10,721
Trainable params: 10,721
Non-trainable params: 0
_________________________________________________________________


In [51]:
model.fit(x=train_x, y=train_y, epochs=50, validation_data=(test_x, test_y))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f98e62a2f10>