# Read Train and Test Datasets

In [6]:
import pandas as pd

# Training metadata: the LINK column holds each clip's relative path,
# the WORD column holds the label spoken in that clip.
df = pd.read_csv('en_train.csv')
trainAudio, y_train = df['LINK'], df['WORD']

# Test metadata, read from its own CSV with the same two columns.
df2 = pd.read_csv('en_test.csv')
testAudio, y_test = df2['LINK'], df2['WORD']

In [7]:
# Preview the first rows of the training metadata to check the columns.
df.head()

Unnamed: 0,LINK,WORD,VALID,SPEAKER,GENDER
0,aachen/common_voice_en_19779773.wav,aachen,True,0aedd0e24f1db5b1ce965f107fd19dd40e0c50f7ead449...,MALE
1,aachen/common_voice_en_19798768.wav,aachen,True,ed0e4d79c6c2889459e88e11724dbd7f2cb2417e6a4320...,FEMALE
2,aachen/common_voice_en_19852665.wav,aachen,True,cd185c1ab8659ae6d21d7b63dc4a5e54a3f65f98e29ff1...,MALE
3,aachen/common_voice_en_20127845.wav,aachen,True,29b8505586cd43382cd695da6b943f401104be710a5b60...,FEMALE
4,aachen/common_voice_en_20449666.wav,aachen,True,372293e65cdab88771e028a4351651ab2eff64438ddafc...,MALE


In [17]:
import librosa
import os

# Buffers that the conversion cells fill with spectrograms.
X_train = []
X_test = []

#number of classes
# Clips are stored as '<word>/<clip>.wav' under `path`, so every class is a
# sub-directory. Count directories only: a stray file in `path` (for example
# a hidden OS file) must not widen the softmax layer.
path = 'en/clips/'
folders = sum(1 for entry in os.scandir(path) if entry.is_dir())
folders

38173

# Conversion

In [None]:
# Disabled one-off helper: gives every '.opus' file found one level below
# en/clips/ a '.wav' suffix.
# NOTE(review): rename() only changes the file name, it does not transcode
# the audio -- confirm the loader can still decode the renamed files.
#
# from pathlib import Path

# paths = Path('en/clips/')

# for f in paths.iterdir():
#     for f2 in f.iterdir():
#         if f2.is_file() and f2.suffix in ['.opus']:
#             f2.rename(f2.with_suffix('.wav'))

In [36]:
def AudioToSpectrogram(filename):
    """Load an audio file and return its mel spectrogram in decibels.

    Parameters
    ----------
    filename : str
        Path of the audio file. It is decoded and resampled to 16 kHz.

    Returns
    -------
    numpy.ndarray
        float32 array of dB values computed over 128 mel bands.
    """
    # Local import: numpy is otherwise only imported by a later cell, so the
    # function would fail with NameError on a fresh kernel.
    import numpy as np

    y, sr = librosa.load(filename, sr=16000) #load the file
    # librosa names the band limits `fmin`/`fmax`. The upper limit is the
    # Nyquist frequency (sr / 2); nothing above it exists in the signal.
    spectrogram = librosa.feature.melspectrogram(
        y=y, sr=sr, n_mels=128, fmin=20, fmax=sr / 2)
    spectrogram = librosa.power_to_db(spectrogram)
    return spectrogram.astype(np.float32)

In [None]:
#convert the training audios to spectrograms and place them in the X_train variable
# The list is rebuilt rather than appended to, so re-running this cell does
# not duplicate entries. Iterating the column directly also avoids looking
# rows up by label with a positional counter.
X_train = [AudioToSpectrogram(path + str(link)) for link in trainAudio]


In [None]:

#convert the testing audios to spectrograms and place them in the X_test variable
# The list is rebuilt rather than appended to, so re-running this cell does
# not duplicate entries. Iterating the column directly also avoids looking
# rows up by label with a positional counter.
X_test = [AudioToSpectrogram(path + str(link)) for link in testAudio]

In [None]:
from keras.preprocessing.image import ImageDataGenerator as IDG

# Both generators read the same directory. Giving them the same
# validation_split and opposite subsets keeps the two sets disjoint, so the
# model is not validated on the images it was trained on.
# NOTE(review): en_train.csv / en_test.csv already describe a split --
# confirm whether that split should drive the generators instead.
validationSplit = 0.2
trainDatagen = IDG(rescale = 1/255.0, validation_split = validationSplit)
testDatagen = IDG(rescale = 1/255.0, validation_split = validationSplit)

# NOTE(review): flow_from_directory only indexes image files; the clips
# directory appears to hold .wav audio -- confirm that spectrogram images
# are exported there before training.
directory = "en/clips"
# Size and colour mode must match the model's input_shape of (250, 250, 1).
height = 250
width = 250
batchSize = 32

trainGen = trainDatagen.flow_from_directory(
        directory,
        target_size=(height, width),
        color_mode='grayscale',
        batch_size=batchSize,
        class_mode='categorical',
        subset='training')

testGen = testDatagen.flow_from_directory(
        directory,
        target_size=(height, width),
        color_mode='grayscale',
        batch_size=batchSize,
        class_mode='categorical',
        subset='validation')


In [None]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Stop when val_loss has not improved for 10 epochs. `patience` and
# `verbose` must be integers: a string patience fails when Keras compares it
# with its epoch counter.
callbacks = EarlyStopping(monitor = 'val_loss', patience = 10, verbose = 1, mode = 'auto')

# Keep only the weights with the best validation accuracy seen so far.
modelPath = 'model.h5'
h5Model = ModelCheckpoint(modelPath, monitor = 'val_accuracy', verbose = 1, save_best_only = True)

# Build Model

In [None]:
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, BatchNormalization
from keras.models import Sequential, load_model

# Three convolution / batch-norm / pooling stages followed by a dense
# classifier over single-channel 250x250 inputs.
model = Sequential([
    #first set of convolution and pooling layer
    Conv2D(32,(3,3), activation = 'relu', input_shape = (250, 250, 1), padding='same'),
    BatchNormalization(),
    MaxPooling2D(2,2),
    #second set of convolution and pooling layer
    Conv2D(32,(3,3), activation = 'relu', padding='same'),
    BatchNormalization(),
    MaxPooling2D(2,2),
    #third set of convolution and pooling layer
    Conv2D(32,(3,3), activation = 'relu', padding='same'),
    BatchNormalization(),
    MaxPooling2D(2,2),
    #FC layer
    Flatten(),
    Dense(256, activation = 'relu'),
    #final layer, one output unit per class
    Dense(folders, activation = 'softmax')
])

#model compilation
# The Keras identifier is 'categorical_crossentropy'; the spelling with an
# extra underscore is not a registered loss.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics = ['accuracy'])
model.summary()

# Train Model

In [None]:
#training the model
# validation_data must be the batch iterator (testGen), not the
# ImageDataGenerator object that created it. batch_size is left out because
# the generator already fixes it and Keras rejects the argument for
# generator input.
history = model.fit(trainGen,
                    validation_data = testGen,
                    epochs = 10,
                    verbose = 1,
                    callbacks =[h5Model])

# Plot Graph

In [None]:
import numpy as np
import matplotlib.pyplot as plt

#Initialising basic values
# Keras stores the per-epoch metrics in the `history` dict of the History
# object returned by fit().
accuracy = history.history['accuracy']
val_accuracy = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs=range(len(accuracy))

In [None]:
#Accuracy curves for the training and test sets
fig, accuracy_ax = plt.subplots(figsize=(14,7))
accuracy_ax.plot(epochs, accuracy, 'r', label="Training Accuracy")
accuracy_ax.plot(epochs, val_accuracy, 'b', label="Test Accuracy")
accuracy_ax.set_xlabel('Epoch')
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.set_title('Accuracy')
accuracy_ax.legend(loc='lower left')
plt.show()

#Loss curves for the training and test sets
fig2, loss_ax = plt.subplots(figsize=(14,7))
loss_ax.plot(epochs, loss, 'r', label="Training Loss")
loss_ax.plot(epochs, val_loss, 'b', label="Test Loss")
loss_ax.legend(loc='upper right')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Training and Test loss')

# Prediction

In [None]:
# Run the trained model on one recording.
# NOTE(review): '.vlc' is not a usual audio extension -- confirm this file is
# an audio clip that librosa can decode.
filename = 'predict/prediction1.vlc'
SpectrogramImageForPrediction = AudioToSpectrogram(filename)
# NOTE(review): AudioToSpectrogram returns the raw mel-spectrogram array,
# while the model is declared with input_shape (250, 250, 1) and is trained
# on generator images rescaled by 1/255. The input presumably needs the same
# preprocessing plus a batch axis before predict() -- confirm.
model.predict(SpectrogramImageForPrediction)

In [None]:
import os
import tempfile

from pydub import AudioSegment
from pydub.silence import split_on_silence

# Split the recording into chunks separated by silence.
sound_file = AudioSegment.from_wav("predict/prediction2.wav")
audio_chunks = split_on_silence(sound_file, 
    # must be silent for at least half a second
    min_silence_len=500,

    # consider it silent if quieter than -16 dBFS
    silence_thresh=-16
)

# AudioToSpectrogram takes a file path, so each chunk is exported to its own
# temporary wav file and converted from there. This way every spectrogram
# comes from its own chunk and not from an unrelated file.
SpectrogramImagesForPrediction = []
with tempfile.TemporaryDirectory() as chunkDir:
    for index, chunk in enumerate(audio_chunks):
        chunkPath = os.path.join(chunkDir, 'chunk_{}.wav'.format(index))
        # export() returns the open output handle; close it before reading.
        chunk.export(chunkPath, format='wav').close()
        SpectrogramImagesForPrediction.append(AudioToSpectrogram(chunkPath))

# Keep the predictions instead of discarding them inside the loop.
# NOTE(review): the model is declared with input_shape (250, 250, 1); the raw
# spectrograms presumably need the training-time preprocessing and a batch
# axis before predict() -- confirm.
predictions = [model.predict(spectrogram)
               for spectrogram in SpectrogramImagesForPrediction]