**Settings**

The output of this section is the CSV files with the data to be handled by the model.

```
trainData     : audio/train 
testData      : audio/test
```


In [1]:
# Requirements; warnings are suppressed to keep the notebook output readable
import warnings
warnings.simplefilter('ignore')

import fnmatch
import collections
import keras
import librosa
import csv
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from keras import models, layers
from keras import backend as K
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.preprocessing.image import ImageDataGenerator
from matplotlib import pyplot, cm
from tqdm import tqdm
from sklearn import preprocessing
from sklearn.metrics import classification_report, confusion_matrix


2022-11-19 12:14:00.257764: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-19 12:14:00.375353: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-11-19 12:14:00.375373: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-11-19 12:14:00.399690: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-19 12:14:00.963903: W tensorflow/stream_executor/platform/de

In [2]:
def extractWavFeatures(soundFilesFolder, csvFileName):
    """Extract summary audio features from every file in a folder into a CSV.

    Each entry of ``soundFilesFolder`` is loaded with librosa (mono, at most
    the first 30 seconds), leading/trailing silence is trimmed, and the mean
    of every feature is written as one CSV row.

    Args:
        soundFilesFolder: Directory whose entries are loaded as audio files.
        csvFileName: Path of the CSV file to create (overwritten if present).

    Note:
        The header ends with a ``label`` column, but no label value is written
        per row, so data rows have one field fewer than the header.
    """
    print("The features of the files in the folder "+soundFilesFolder+" will be saved to "+csvFileName)
    header = ['filename', 'chroma_stft', 'rmse', 'spectral_centroid',
              'spectral_bandwidth', 'rolloff', 'zero_crossing_rate']
    header += [f'mfcc{i}' for i in range(1, 21)]
    header.append('label')
    print('CSV Header: ', header)
    # The context manager closes the CSV even if loading/feature extraction fails.
    with open(csvFileName, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(header)
        # os.listdir order is arbitrary; sorting makes the CSV row order (and
        # therefore any seeded split done on it later) reproducible.
        for filename in tqdm(sorted(os.listdir(soundFilesFolder))):
            number = f'{soundFilesFolder}/{filename}'
            y, sr = librosa.load(number, mono=True, duration=30)
            # remove leading and trailing silence
            y, index = librosa.effects.trim(y)
            chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
            rmse = librosa.feature.rms(y=y)
            spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
            spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
            rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
            zcr = librosa.feature.zero_crossing_rate(y)
            mfcc = librosa.feature.mfcc(y=y, sr=sr)
            # Build the row as a list instead of joining and re-splitting on
            # whitespace, so a filename containing spaces stays in one field.
            row = [filename]
            row.extend(
                str(np.mean(feature))
                for feature in (chroma_stft, rmse, spec_cent, spec_bw, rolloff, zcr)
            )
            # One mean per MFCC coefficient (one row of the mfcc matrix each).
            row.extend(str(np.mean(e)) for e in mfcc)
            writer.writerow(row)
    print("End of extractWavFeatures")

In [3]:
def preProcessData(csvFileName):
    """Load a feature CSV and return it as a model-ready DataFrame.

    The speaker id is parsed from every ``filename`` value: the text before
    the first underscore is split on "r" and the second piece is converted to
    an int (e.g. "Speaker0055_477.wav" -> 55). The ids are stored in a new
    trailing ``number`` column, and the ``filename``, ``label`` and
    ``chroma_stft`` columns are dropped.

    Args:
        csvFileName: Path of the CSV file to read.

    Returns:
        The resulting pandas DataFrame, with ``number`` as its last column.
    """
    print(csvFileName+ " will be preprocessed")
    frame = pd.read_csv(csvFileName)

    def speaker_id(name):
        # "Speaker0055_477.wav" -> "Speaker0055" -> "0055" -> 55
        prefix = name.split("_")[0]
        return int(prefix.split("r")[1])

    frame['number'] = [speaker_id(name) for name in frame['filename']]
    # Drop the columns that are not used as model inputs
    frame = frame.drop(['filename', 'label', 'chroma_stft'], axis=1)

    print("Preprocessing is finished")
    print(frame.head())
    return frame

In [4]:
def getSpeaker(speaker):
    """Return the display name for a speaker id, e.g. 7 -> "Speaker007".

    The id is converted with ``str`` and left-padded with zeros to at least
    three characters.
    """
    return "Speaker{}".format(str(speaker).zfill(3))
    
        
def printPrediction(X_data, y_data, printDigit, model):
    """Print the expected and the predicted speaker for each sample.

    Args:
        X_data: Feature rows; row ``i`` is paired with ``y_data[i]``.
        y_data: Expected speaker ids, one per row of ``X_data``.
        printDigit: When truthy, each line is prefixed with the sample index.
        model: Object whose ``predict(X)`` returns one score vector per row;
            the index of the highest score is taken as the predicted id.
    """
    print('\n# Generate predictions')
    sample_count = len(y_data)
    if sample_count == 0:
        # Nothing to predict; avoid calling the model on an empty batch.
        return

    # Predict all rows in one call instead of one model.predict per sample,
    # which is slow and emits a progress line for every single row.
    scores = model.predict(X_data[:sample_count])
    predicted_classes = np.argmax(scores, axis=1)

    for i in range(sample_count):
        prediction = getSpeaker(predicted_classes[i])
        speaker = getSpeaker(y_data[i])
        if printDigit:
            print("Number={0:d}, y={1:10s}- prediction={2:10s}- match={3}".format(i, speaker, prediction, speaker==prediction))
        else:
            print("y={0:10s}- prediction={1:10s}- match={2}".format(speaker, prediction, speaker==prediction))

In [5]:
def report(X_data, y_data, model):
    """Print and plot a confusion matrix and print a classification report.

    Args:
        X_data: Feature rows passed to ``model.predict``.
        y_data: Expected class ids, one per row of ``X_data``.
        model: Object whose ``predict(X)`` returns one score vector per row;
            the index of the highest score is taken as the predicted class.
    """
    # Confusion Matrix and Classification Report
    predict_y = model.predict(X_data)
    Y_pred = np.argmax(predict_y, axis=1)
    y_test_num = np.asarray(y_data).astype(np.int64)
    conf_mt = confusion_matrix(y_test_num, Y_pred)

    # Check the value: matrix size, its last row, and the column indices of
    # the non-zero entries of that last row.
    size = len(conf_mt)
    print(size)
    print(conf_mt[size-1])
    for key, val in enumerate(conf_mt[size-1]):
        if val != 0:
            print(key)

    # Row-normalise so each row (true class) sums to 1. keepdims=True keeps the
    # totals as a column vector; without it the division broadcasts the row
    # totals across columns and normalises the wrong axis. The builtin float
    # replaces np.float, which no longer exists in current NumPy releases.
    row_totals = conf_mt.sum(axis=1, keepdims=True).astype(float)
    conf_mt = np.divide(
        conf_mt,
        row_totals,
        out=np.zeros(conf_mt.shape, dtype=float),
        where=row_totals != 0,  # rows with no true samples stay all-zero
    )
    plt.matshow(conf_mt)
    plt.show()
    print('\nClassification Report')
    print(classification_report(y_test_num, Y_pred))


In [6]:
def main(test_folder, train_folder):
    """Run the whole speaker-recognition pipeline and return the trained model.

    Steps: extract audio features of both folders into ``train.csv`` and
    ``test.csv``, preprocess them, split the training data into train and
    validation sets, standardise the features, build per-speaker class
    weights, train a dense classifier with early stopping, then evaluate and
    report on the test data.

    Args:
        test_folder: Sub-folder of ``audios/audios/`` holding the test audio.
        train_folder: Sub-folder of ``audios/audios/`` holding the training audio.

    Returns:
        The trained Keras ``Sequential`` model.
    """
    # Defines the names of the CSV files
    TRAIN_CSV_FILE = "train.csv"
    TEST_CSV_FILE = "test.csv"
    AUDIO_ROOT = "audios/audios/"

    extractWavFeatures(AUDIO_ROOT+train_folder, TRAIN_CSV_FILE)
    extractWavFeatures(AUDIO_ROOT+test_folder, TEST_CSV_FILE)
    print("CSV files are created")

    trainData = preProcessData(TRAIN_CSV_FILE)
    testData = preProcessData(TEST_CSV_FILE)

    # Splitting the dataset into training, validation and testing dataset.
    # The last column is the speaker id; every other column is a feature.
    X = np.array(trainData.iloc[:, :-1], dtype = float)
    y = trainData.iloc[:, -1]
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=50)

    X_test = np.array(testData.iloc[:, :-1], dtype = float)
    y_test = testData.iloc[:, -1]

    print("Y from training data:", y_train.shape)
    print("Y from validation data:", y_val.shape)
    print("Y from test data:", y_test.shape)

    # Normalizing the dataset: the scaler is fitted on the training split only
    # and then applied unchanged to the validation and test features.
    scaler = StandardScaler()
    X_train = scaler.fit_transform( X_train )
    X_val = scaler.transform( X_val )
    X_test = scaler.transform( X_test )

    print("X from training data", X_train.shape)
    print("X from validation data", X_val.shape)
    print("X from test data", X_test.shape)

    # Class weights: 1 - (share of the speaker's files among all counted files).
    # NOTE(review): the counts always come from this fixed directory, whatever
    # train_folder is, and are scaled by 0.8 and reduced by 1 -- presumably to
    # approximate a per-speaker training share; confirm the intent.
    weight={}
    path='audios/audios/56_speakers_audio_data'
    for speaker in os.listdir(path):
        # Entries whose name contains ".sh" are skipped
        if speaker.find(".sh")==-1:
            dir_path=path+'/'+speaker
            speaker = int(speaker.split("_")[0].split("r")[1])
            count = int(len(fnmatch.filter(os.listdir(dir_path), '*.*'))*0.8)-1
            weight.update({speaker: count})
    max_value=sum(weight.values())
    print("Sum of values: "+str(max_value))
    weight = {key: value for key, value in sorted(weight.items())}
    weight = {k: 1-(v/max_value) for k, v in weight.items()}
    print(weight.keys())
    # Speaker ids in 0..56 that do not occur in the training split
    print(set(range(57)) - set(y_train))

    #Creating a Model
    # model 1
    model = models.Sequential()
    model.add(layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(128, activation='relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dropout(0.5))
    # 100 output units: with sparse integer labels the speaker ids are used as
    # class indices, so they are expected to be below 100 -- TODO confirm.
    model.add(layers.Dense(100, activation='softmax'))

    # Learning Process of a model
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    # simple early stopping; restore_best_weights makes the returned model the
    # one with the lowest validation loss instead of the one from the last
    # (already degrading) epoch.
    es = EarlyStopping(monitor='val_loss', patience=3, verbose=1, restore_best_weights=True)
    print(X_train)
    #Train with early stopping to avoid overfitting
    y_train=np.array(y_train, dtype=int)
    y_val=np.array(y_val, dtype=int)
    history = model.fit(X_train,y_train,validation_data=(X_val, y_val),epochs=100,batch_size=128,class_weight=weight, callbacks=[es])

    # plot training history; val_loss is computed on the validation split, so
    # it is labelled as such rather than as "test".
    pyplot.plot(history.history['loss'], label='train')
    pyplot.plot(history.history['val_loss'], label='validation')
    pyplot.legend()
    pyplot.show()

    print('\n# TEST DATA #\n')
    y_test=np.array(y_test, dtype=int)
    score = model.evaluate(X_test, y_test)
    print("%s: %.2f%%" % (model.metrics_names[1], score[1]*100))
    # Prediction on the first 20 test samples
    printPrediction(X_test[0:20], y_test[0:20], False, model)

    print("Classification Report for Test Data\n")
    report(X_test, y_test, model)
    return model

### Main 50 users all data

In [7]:
# Run the full pipeline on the "train" / "test" folders and keep the returned
# model in `model`.
model=main("test", "train")

The features of the files in the folder audios/audios/train will be saved to train.csv
CSV Header:  ['filename', 'chroma_stft', 'rmse', 'spectral_centroid', 'spectral_bandwidth', 'rolloff', 'zero_crossing_rate', 'mfcc1', 'mfcc2', 'mfcc3', 'mfcc4', 'mfcc5', 'mfcc6', 'mfcc7', 'mfcc8', 'mfcc9', 'mfcc10', 'mfcc11', 'mfcc12', 'mfcc13', 'mfcc14', 'mfcc15', 'mfcc16', 'mfcc17', 'mfcc18', 'mfcc19', 'mfcc20', 'label']


  3%|█                                        | 86/3255 [00:24<14:52,  3.55it/s]


KeyboardInterrupt: 

### Main 5 users all data

In [None]:
# Same pipeline on the "train_5_users" / "test_5_users" folders. The returned
# model is not assigned, so `model` is left untouched; train.csv and test.csv
# are overwritten by this run.
main("test_5_users", "train_5_users")

### Main 5 users with 5 train registers

In [None]:
# Same pipeline on the "train_5_users_5_train" / "test_5_users_5_train"
# folders. The returned model is not assigned; train.csv and test.csv are
# overwritten by this run.
main("test_5_users_5_train", "train_5_users_5_train")

### Save model

In [None]:
# Save the object currently bound to `model` (assigned by the
# model=main("test", "train") cell) to 'speaker-recognition.h5'.
model.save('speaker-recognition.h5')


Transfer learning

https://keras.io/guides/transfer_learning/


### Load model

In [None]:
# Load the model back from 'speaker-recognition.h5' into `new_model`.
new_model = keras.models.load_model('speaker-recognition.h5')


In [None]:
# Extract the features of one test recording into test.csv, using the same
# feature set and column layout as extractWavFeatures.
header = 'filename chroma_stft rmse spectral_centroid spectral_bandwidth rolloff zero_crossing_rate'
filename='Speaker0055_477.wav'

for i in range(1, 21):
    header += f' mfcc{i}'
header += ' label'
header = header.split()
print('CSV Header: ', header)

# Compute the features before opening the CSV, so a failure while loading the
# audio does not leave test.csv truncated.
number = f'audios/audios/test/{filename}'
y, sr = librosa.load(number, mono=True, duration=30)
# remove leading and trailing silence
y, index = librosa.effects.trim(y)
chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
rmse = librosa.feature.rms(y=y)
spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
zcr = librosa.feature.zero_crossing_rate(y)
mfcc = librosa.feature.mfcc(y=y, sr=sr)

# Build the row as a list (no join/re-split on whitespace). As in
# extractWavFeatures, no value is written for the trailing 'label' column.
to_append = [filename]
to_append += [str(np.mean(feature)) for feature in (chroma_stft, rmse, spec_cent, spec_bw, rolloff, zcr)]
to_append += [str(np.mean(e)) for e in mfcc]

# The context manager closes the file even if writing fails.
with open('test.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header)
    writer.writerow(to_append)
print("End of extractWavFeatures")


In [None]:
# Preprocess the freshly written test.csv: adds the `number` (speaker id)
# column and drops `filename`, `label` and `chroma_stft`.
testData = preProcessData('test.csv')

In [None]:
# Features are every column but the last; the last column is the speaker id.
# NOTE(review): main() standardises the features with a StandardScaler fitted
# on its training split before training, but no scaler is applied here, so the
# loaded model receives unscaled inputs -- confirm and reuse the same scaling.
X_test = np.array(testData.iloc[:, :-1], dtype = float)
y_test = testData.iloc[:, -1]


In [None]:
# Evaluate the reloaded model on the single-recording test data.
print('\n# TEST DATA #\n')
# Labels are converted to an integer array before evaluation
y_test=np.array(y_test, dtype=int)
score = new_model.evaluate(X_test, y_test)
# Prints the name and value of the second reported metric as a percentage
print("%s: %.2f%%" % (new_model.metrics_names[1], score[1]*100))
# Prediction for the first sample only
printPrediction(X_test[0:1], y_test[0:1], False, new_model)

print("Classification Report for Test Data\n")
# Confusion matrix plot and classification report for the same data
report(X_test, y_test, new_model)