In [52]:
#imports
from pydub import AudioSegment
from pydub.utils import make_chunks
import os
from glob import glob

#audio feature extraction library
import librosa
from librosa import feature

import numpy as np
import matplotlib.pyplot as plt

#model
import tensorflow as tf
from spela.spectrogram import Spectrogram 
from spela.melspectrogram import Melspectrogram

from sklearn.model_selection import train_test_split
import soundfile as sf

In [53]:
def findAllAudioFilePaths(datasetRoot="Dataset/Youtube Speech Dataset/Dataset"):
    """Recursively collect the paths of all ``*.wav`` files under ``datasetRoot``.

    Args:
        datasetRoot: directory to walk. Defaults to the dataset location this
            notebook has always used. Note that ``getSpeakerAndAudio`` reads the
            speaker name from a fixed path-component index, so a root with a
            different depth needs that index adjusted as well.

    Returns:
        A sorted list of path strings (empty if the root does not exist or
        contains no ``.wav`` files). Sorting makes the order independent of
        the filesystem's directory-listing order, so later steps see the same
        sequence on every run.
    """
    audioFilesPaths = []
    for dirPath, _dirNames, _fileNames in os.walk(datasetRoot):
        audioFilesPaths.extend(glob(os.path.join(dirPath, '*.wav')))
    return sorted(audioFilesPaths)

In [54]:
# Class names; a speaker's position in this list is its integer label.
speakers = ["Obama", "Hillary", "Ivanka", "Trump", "No Speaker", "Modi", "Xi-Jinping", "Chadwick-Boseman"]

def speakerToLabel(speakerName):
    """Return the integer class label for ``speakerName``.

    The label is the position of the name in the module-level ``speakers`` list.

    Args:
        speakerName: speaker name exactly as it appears in ``speakers``.

    Returns:
        The zero-based index of ``speakerName`` in ``speakers``.

    Raises:
        ValueError: if ``speakerName`` is not in ``speakers``.
    """
    try:
        return speakers.index(speakerName)
    except ValueError:
        # list.index never returns -1; it raises. Re-raise with the message the
        # original (unreachable) "index == -1" branch was meant to report.
        raise ValueError("Speaker not found: {}".format(speakerName)) from None

In [55]:
def getSpeakerAndAudio(audioPaths):
    """Filter clips by duration and derive a speaker label for each from its path.

    Args:
        audioPaths: iterable of audio file paths. The speaker name is taken
            from the fourth path component (index 3), which is the directory
            directly below the three-component root
            "Dataset/Youtube Speech Dataset/Dataset" used by
            ``findAllAudioFilePaths``.

    Returns:
        A tuple ``(audio_Paths, labels, uniqueSpeakers)``:
        the kept paths, their integer labels (same order), and a dict mapping
        each speaker name to the number of kept clips.

    Raises:
        IndexError: if a path has fewer than four components.
        ValueError: propagated from ``speakerToLabel`` for an unknown speaker.
    """
    audio_Paths = []
    labels = []
    uniqueSpeakers = {}

    for audioPath in audioPaths:
        # os.walk / os.path.join emit the platform separator, so normalise it
        # before splitting; on POSIX this replace is a no-op.
        pathParts = audioPath.replace(os.sep, "/").split("/")
        speakerName = pathParts[3]

        audioLength = librosa.get_duration(filename=audioPath)

        # Only clips whose reported duration is exactly 1.0 s are kept
        # (presumably so every clip yields equally sized features -- confirm).
        if audioLength != 1.0:
            print("Audio clip discarded, actual length = {}".format(audioLength))
            continue

        audio_Paths.append(audioPath)
        labels.append(speakerToLabel(speakerName))
        uniqueSpeakers[speakerName] = uniqueSpeakers.get(speakerName, 0) + 1

    return audio_Paths, labels, uniqueSpeakers

In [56]:
def getFeatures(audio_Paths):
    """Compute one flattened MFCC feature vector per audio file.

    Args:
        audio_Paths: iterable of audio file paths readable by ``librosa.load``.

    Returns:
        A list with one entry per path, in input order. Each entry is the
        MFCC matrix of that file (``librosa.feature.mfcc`` with its default
        settings) flattened to one dimension.
    """
    def _mfccVector(path):
        # Load with librosa's defaults, then flatten the MFCC matrix.
        signal, sampleRate = librosa.load(path)
        return librosa.feature.mfcc(y=signal, sr=sampleRate).flatten()

    return [_mfccVector(path) for path in audio_Paths]

In [None]:
# Discover every .wav file, then keep only the clips whose duration is exactly
# 1.0 s together with their integer speaker labels and per-speaker counts.
audioPaths = findAllAudioFilePaths()
audio_Paths, labels, uniqueSpeakers = getSpeakerAndAudio(audioPaths)
 
    
print("Speakers: {}".format(uniqueSpeakers))
print("Total Dataset size: {}".format(len(audio_Paths)))


# One flattened MFCC vector per kept clip, in the same order as `labels`.
data_X = getFeatures(audio_Paths)

# Sanity check: the feature list and the label list must have equal length.
print("X data: {}".format(len(data_X)))
print("Y data: {}".format(len(labels)))




Audio clip discarded, actual length = 0.221
Audio clip discarded, actual length = 0.4819954648526077
Audio clip discarded, actual length = 0.66
Audio clip discarded, actual length = 0.3
Audio clip discarded, actual length = 0.6059863945578231
Audio clip discarded, actual length = 0.12598639455782312
Audio clip discarded, actual length = 0.7489795918367347
Speakers: {'No Speaker': 579, 'Hillary': 3452, 'Ivanka': 1075, 'Xi-Jinping': 671, 'Modi': 1944, 'Chadwick-Boseman': 1625, 'Obama': 1168, 'Trump': 2496}
Total Dataset size: 13010


In [None]:
# Hold out 20% of the clips. The fixed seed makes the split reproducible across
# kernel restarts, and stratifying on the labels keeps the (imbalanced) speaker
# proportions the same in both partitions.
train_X, test_X, train_Y, test_Y = train_test_split(
    data_X, labels, test_size=0.2, random_state=42, stratify=labels
)

train_x = np.array(train_X)
train_y = np.array(train_Y)
test_x = np.array(test_X)
test_y = np.array(test_Y)

# Pin the one-hot width to the number of known speakers. Without num_classes,
# to_categorical infers it from the largest label present, so the target width
# could disagree with the model's len(speakers) outputs.
train_y = tf.keras.utils.to_categorical(train_y, num_classes=len(speakers))
test_y = tf.keras.utils.to_categorical(test_y, num_classes=len(speakers))


In [None]:
# create a model
#from tf.keras.layers import Dense

def create_model(inputShape=None, numClasses=None):
    """Build and compile a small fully connected speaker classifier.

    Architecture: Dense(12, relu) -> Dense(8, relu) -> Dense(numClasses, softmax),
    compiled with Adam (learning rate 3e-4), categorical cross-entropy loss and
    accuracy as the metric.

    Args:
        inputShape: shape of a single sample, without the batch dimension.
            Defaults to ``train_x.shape[1:]`` from the notebook namespace.
        numClasses: number of output classes. Defaults to ``len(speakers)``.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    if inputShape is None:
        # input_shape must exclude the sample axis; passing train_x.shape in
        # full would declare each sample to be the whole training matrix.
        inputShape = train_x.shape[1:]
    if numClasses is None:
        numClasses = len(speakers)

    model = tf.keras.Sequential()

    model.add(tf.keras.layers.Dense(12, input_shape=inputShape, activation='relu'))
    model.add(tf.keras.layers.Dense(8, activation='relu'))
    # Softmax yields a probability distribution over mutually exclusive
    # speakers, which is what categorical cross-entropy expects.
    model.add(tf.keras.layers.Dense(numClasses, activation='softmax'))

    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
                  loss="categorical_crossentropy",
                  metrics=["accuracy"])
    return model

In [None]:
# Build the compiled network and print its layer/parameter summary.
model = create_model()
model.summary()


In [None]:
# Train for 50 epochs. The held-out test split is also passed as validation data,
# so the reported validation metrics are not an independent test estimate.
model.fit(x=train_x, y=train_y, epochs=50, validation_data=(test_x, test_y))