In [24]:
import numpy as np
from scipy.signal import lfilter
from scipy.signal.windows import hann
from scipy.linalg import toeplitz, solve_toeplitz
from scipy.signal import convolve
import os
import librosa
from scipy.spatial.distance import pdist, squareform
import seaborn as sns
import matplotlib.pyplot as plt
import math
import gc
from IPython.display import clear_output
import tensorflow.keras.layers as tfl
import tensorflow as tf
from keras.callbacks import ReduceLROnPlateau

In [25]:
def lpc(x, order, regularization=1e-12):
    R = np.correlate(x, x, mode='full')[len(x)-1:]
    R = R[:order+1]
    R[0] += regularization  # Apply regularization
    coeff = solve_toeplitz((R[:order], R[:order]), -R[1:])
    
    if coeff[0] == 0:  # Ensure the first coefficient is not zero
        coeff[0] = 1e-10  # Add a small value to avoid issues
    
    return coeff

In [26]:
def gfmiaif_pulse(s_gvl, iterations=1, nv=12, ng=3, enc_size=16, d=0.99, win=None):
    if win is None:
        win = hann(len(s_gvl))
    
    Lpf = nv + 1
    x_gvl = np.concatenate((np.linspace(-s_gvl[0], s_gvl[0], Lpf), s_gvl))
    idx_pf = slice(Lpf, len(x_gvl))
    
    al = [1, -d]
    s_gv = lfilter([1], al, s_gvl)
    x_gv = lfilter([1], al, x_gvl)
    
    ag1 = lpc(s_gv * win, 1)
    
    for _ in range(ng - 1):
        x_v1x = lfilter(ag1, [1], x_gv)
        s_v1x = x_v1x[idx_pf]
        ag1x = lpc(s_v1x * win, 1)
        ag1 = np.convolve(ag1, ag1x)
    
    x_v1 = lfilter(ag1, [1], x_gv)
    s_v1 = x_v1[idx_pf]
    av1 = lpc(s_v1 * win, nv)
    
    for _ in range(iterations):
        x_g1 = lfilter(av1, [1], x_gv)
        s_g1 = x_g1[idx_pf]
        ag = lpc(s_g1 * win, ng)
    
        x_v = lfilter(ag, [1], x_gv)
        s_v = x_v[idx_pf]
        av1 = lpc(s_v * win, nv)
        
    glottal_pulse = lfilter(av1, [1], s_gvl)
    vt_pulse = lfilter(ag, [1], s_gvl)
    
    return vt_pulse,glottal_pulse

In [27]:
def generate_spec(data_folder,create_folder,num_speakers=5,num_frames=100,frame_length_msec=30,sr=44100,verbose=True):
    
    ori_folder = os.path.join(create_folder, "original")
    vt_folder = os.path.join(create_folder, "vocal_tract")
    glot_folder = os.path.join(create_folder, "glottal")
    
    total_speaker = 0
    for speaker in os.listdir(data_folder):
        ori_folder_speaker = os.path.join(ori_folder, speaker)
        vt_folder_speaker = os.path.join(vt_folder, speaker)
        glot_folder_speaker = os.path.join(glot_folder, speaker)
        os.makedirs(ori_folder_speaker, exist_ok=True)
        os.makedirs(vt_folder_speaker, exist_ok=True)
        os.makedirs(glot_folder_speaker, exist_ok=True)
        
        total_time_data=num_frames*frame_length_msec
        curr_time=0
        
        y=[]
        wav_files=os.listdir(os.path.join(data_folder, speaker))
        idx=0
        while curr_time<total_time_data:
            if wav_files[idx].endswith(".wav"):
                wav_file_path = os.path.join(data_folder, speaker, wav_files[idx])

                y_curr, sr = librosa.load(wav_file_path, sr=sr)
                y.extend(y_curr)
                
                curr_time+=60000
                idx+=1
                
        del wav_files

        frame_length_samples=frame_length_msec*sr//1000
        frames = librosa.util.frame(y, frame_length=frame_length_samples, hop_length=frame_length_samples).T
        frames=frames[:num_frames]
#         print(frames)
        
        del y

        for num,frame in enumerate(frames):
            vt,glot=gfmiaif_pulse(frame)
            window=hann(len(frame))
            frame_windowed = frame * window
            glot_windowed = glot * window
            vt_windowed = vt * window

            del vt,glot
            
            plt.figure(figsize=(5, 5))
            ax = plt.axes()
            ax.set_axis_off()
            plt.set_cmap('hot')
            amp=np.abs(librosa.stft(frame_windowed))
            D = librosa.amplitude_to_db(amp, ref=np.max)
            librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='log')
            output_path = os.path.join(ori_folder_speaker, "ori"+str(num+1) + ".png")
            plt.savefig(output_path, bbox_inches='tight', transparent=True, pad_inches=0.0)
            plt.close('all')
        
            plt.figure(figsize=(5, 5))
            ax = plt.axes()
            ax.set_axis_off()
            plt.set_cmap('hot')
            amp=np.abs(librosa.stft(vt_windowed))
            D = librosa.amplitude_to_db(amp, ref=np.max)
            librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='log')
            output_path = os.path.join(vt_folder_speaker, "vt"+str(num+1) + ".png")
            plt.savefig(output_path, bbox_inches='tight', transparent=True, pad_inches=0.0)
            plt.close('all')
            
            plt.figure(figsize=(5, 5))
            ax = plt.axes()
            ax.set_axis_off()
            plt.set_cmap('hot')
            amp=np.abs(librosa.stft(glot_windowed))
            D = librosa.amplitude_to_db(amp, ref=np.max)
            librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='log')
            output_path = os.path.join(glot_folder_speaker, "glot"+str(num+1) + ".png")
            plt.savefig(output_path, bbox_inches='tight', transparent=True, pad_inches=0.0)
            plt.close('all')
            del D,ax,output_path
            gc.collect()

        del frames
                        
        total_speaker += 1
        if total_speaker != 0 and total_speaker % 1 == 0 and verbose:
            print(f"{total_speaker} speakers completed.\n")

        if total_speaker == num_speakers:
            break

In [28]:
num_speakers=5
frame_length_msec=50
num_frames=100
data_folder="/kaggle/input/speaker-recognition-audio-dataset/50_speakers_audio_data"
create_folder="/kaggle/working/"

In [29]:
generate_spec(data_folder,create_folder,num_speakers,num_frames,frame_length_msec)

[src/libmpg123/layer3.c:INT123_do_layer3():1771] error: part2_3_length (1600) too large for available bit count (1568)


1 speakers completed.

2 speakers completed.

3 speakers completed.

4 speakers completed.

5 speakers completed.



In [30]:
from PIL import Image

dataset_path_ori="/kaggle/working/original"
dataset_path_vt="/kaggle/working/vocal_tract"
dataset_path_glot="/kaggle/working/glottal"
folders=os.listdir(dataset_path_ori)

# Initializing training and test dataset
X_train=[[],[],[]]
y_train=[[],[],[]]
X_test=[[],[],[]]
y_test=[[],[],[]]

# Split the dataset into training and test set.
num=np.random.rand(num_frames*num_speakers)
mask=num<0.2
split=mask.astype(int)

for dirs in folders:
    i=0
    for img in os.listdir(os.path.join(dataset_path_ori,dirs)):
        image=Image.open(os.path.join(dataset_path_ori,dirs,img))
        new_img=image.resize((200,200))
        tmp_array=np.array(new_img)/255.
        if split[i]==0:
            X_train[0].append(tmp_array)
            y_train[0].append(str(dirs))
        else:
            X_test[0].append(tmp_array)
            y_test[0].append(str(dirs))
            
        i+=1
        
    i=0    
    for img in os.listdir(os.path.join(dataset_path_vt,dirs)):
        image=Image.open(os.path.join(dataset_path_vt,dirs,img))
        new_img=image.resize((200,200))
        tmp_array=np.array(new_img)/255.
        if split[i]==0:
            X_train[1].append(tmp_array)
            y_train[1].append(str(dirs))
        else:
            X_test[1].append(tmp_array)
            y_test[1].append(str(dirs))
            
        i+=1
            
    i=0
    for img in os.listdir(os.path.join(dataset_path_glot,dirs)):
        image=Image.open(os.path.join(dataset_path_glot,dirs,img))
        new_img=image.resize((200,200))
        tmp_array=np.array(new_img)/255.
        if split[i]==0:
            X_train[2].append(tmp_array)
            y_train[2].append(str(dirs))
        else:
            X_test[2].append(tmp_array)
            y_test[2].append(str(dirs))
        
        i+=1

In [31]:
dict={}
i=0
for val in folders:
    dict[val]=i
    i=i+1

dict

{'Speaker0043': 0,
 'Speaker_0014': 1,
 'Speaker0047': 2,
 'Speaker_0009': 3,
 'Speaker0029': 4}

In [32]:
i=0
for _ in y_train[0]:
    y_train[0][i]=dict[y_train[0][i]]
    i=i+1

i=0
for _ in y_test[0]:
    y_test[0][i]=dict[y_test[0][i]]
    i=i+1
    
i=0
for _ in y_train[1]:
    y_train[1][i]=dict[y_train[1][i]]
    i=i+1

i=0
for _ in y_test[1]:
    y_test[1][i]=dict[y_test[1][i]]
    i=i+1
    
i=0
for _ in y_train[2]:
    y_train[2][i]=dict[y_train[2][i]]
    i=i+1

i=0
for _ in y_test[2]:
    y_test[2][i]=dict[y_test[2][i]]
    i=i+1

In [33]:
def convolutional_model(input_shape,output_shape):
    input_img = tf.keras.Input(shape=input_shape)
    Z1=tfl.Conv2D(filters=32,kernel_size=(3,3),strides=(1,1),padding='same')(input_img)
    A1=tfl.ReLU()(Z1)
    P1=tfl.MaxPool2D(pool_size=(4,4),padding='same')(A1)
    Z2=tfl.Conv2D(filters=64,kernel_size=(2,2),strides=(1,1),padding='same')(P1)
    A2=tfl.ReLU()(Z2)
    P2=tfl.MaxPool2D(pool_size=(4,4),padding='same')(A2)
    Z3=tfl.Conv2D(filters=128,kernel_size=(2,2),strides=(1,1),padding='same')(P2)
    A3=tfl.ReLU()(Z3)
    P3=tfl.MaxPool2D(pool_size=(4,4),padding='same')(A3)
    F=tfl.Flatten()(P3)
    D1=tfl.Dense(128)(F)
    DP1=tfl.Dropout(0.5)(D1)
    
    outputs=tfl.Dense(output_shape,activation='softmax')(DP1)
    
    model = tf.keras.Model(inputs=input_img, outputs=outputs)
    return model

In [34]:
conv_model_ori = convolutional_model((200, 200, 4),num_speakers)
conv_model_ori.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

In [35]:
reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.5, patience=10, min_lr=0.00001)
history = conv_model_ori.fit(np.array(X_train[0]),np.array(y_train[0]), batch_size=32, epochs=100, validation_data=(np.array(X_test[0]),np.array(y_test[0])),callbacks=(reduce_lr))

Epoch 1/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 634ms/step - accuracy: 0.1862 - loss: 1.6571 - val_accuracy: 0.5091 - val_loss: 1.4219 - learning_rate: 0.0010
Epoch 2/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 556ms/step - accuracy: 0.5759 - loss: 1.2804 - val_accuracy: 0.5182 - val_loss: 0.9914 - learning_rate: 0.0010
Epoch 3/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 567ms/step - accuracy: 0.6009 - loss: 0.9251 - val_accuracy: 0.6455 - val_loss: 0.8690 - learning_rate: 0.0010
Epoch 4/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 560ms/step - accuracy: 0.7277 - loss: 0.7035 - val_accuracy: 0.8091 - val_loss: 0.5790 - learning_rate: 0.0010
Epoch 5/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 623ms/step - accuracy: 0.8545 - loss: 0.4395 - val_accuracy: 0.8273 - val_loss: 0.4689 - learning_rate: 0.0010
Epoch 6/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

In [36]:
conv_model_glot = convolutional_model((200, 200, 4),num_speakers)
conv_model_glot.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

In [37]:
reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.5, patience=10, min_lr=0.00001)
history_glot = conv_model_glot.fit(np.array(X_train[2]),np.array(y_train[2]), batch_size=32, epochs=75, validation_data=(np.array(X_test[2]),np.array(y_test[2])),callbacks=(reduce_lr))

Epoch 1/75
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 583ms/step - accuracy: 0.3024 - loss: 1.5684 - val_accuracy: 0.6364 - val_loss: 1.1732 - learning_rate: 0.0010
Epoch 2/75
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 549ms/step - accuracy: 0.6013 - loss: 1.0738 - val_accuracy: 0.7182 - val_loss: 0.8435 - learning_rate: 0.0010
Epoch 3/75
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 551ms/step - accuracy: 0.6635 - loss: 0.8436 - val_accuracy: 0.7182 - val_loss: 0.6368 - learning_rate: 0.0010
Epoch 4/75
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 556ms/step - accuracy: 0.7800 - loss: 0.6410 - val_accuracy: 0.7818 - val_loss: 0.4824 - learning_rate: 0.0010
Epoch 5/75
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 552ms/step - accuracy: 0.8053 - loss: 0.5872 - val_accuracy: 0.9273 - val_loss: 0.3062 - learning_rate: 0.0010
Epoch 6/75
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

In [38]:
conv_model_vt = convolutional_model((200, 200, 4),num_speakers)
conv_model_vt.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

In [39]:
reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.5, patience=10, min_lr=0.00001)
history_vt = conv_model_vt.fit(np.array(X_train[1]),np.array(y_train[1]), batch_size=32, epochs=75, validation_data=(np.array(X_test[1]),np.array(y_test[1])),callbacks=(reduce_lr))

Epoch 1/75
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 584ms/step - accuracy: 0.3180 - loss: 1.5787 - val_accuracy: 0.7091 - val_loss: 1.1821 - learning_rate: 0.0010
Epoch 2/75
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 573ms/step - accuracy: 0.5447 - loss: 1.1134 - val_accuracy: 0.7000 - val_loss: 0.8292 - learning_rate: 0.0010
Epoch 3/75
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 542ms/step - accuracy: 0.7037 - loss: 0.7400 - val_accuracy: 0.7636 - val_loss: 0.6987 - learning_rate: 0.0010
Epoch 4/75
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 555ms/step - accuracy: 0.7593 - loss: 0.6812 - val_accuracy: 0.7818 - val_loss: 0.5878 - learning_rate: 0.0010
Epoch 5/75
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 591ms/step - accuracy: 0.8112 - loss: 0.5624 - val_accuracy: 0.8727 - val_loss: 0.4036 - learning_rate: 0.0010
Epoch 6/75
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m