In [4]:
import numpy as np
from scipy.signal import lfilter
from scipy.signal.windows import hann
from scipy.linalg import toeplitz, solve_toeplitz
import os
import librosa
from scipy.spatial.distance import pdist, squareform
import seaborn as sns
import matplotlib.pyplot as plt
import math
import tensorflow.keras.layers as tfl
import tensorflow as tf
from keras.callbacks import ReduceLROnPlateau

In [5]:
def lpc(x, order):
    R = np.correlate(x, x, mode='full')[len(x)-1:]
    R = R[:order+1]
    coeff=solve_toeplitz((R[:order], R[:order]), -R[1:])
    del R
    
    return coeff

In [6]:
def gfmiaif(s_gvl, iterations=1, nv=12, ng=3, d=0.99, win=None):
    if win is None:
        win = hann(len(s_gvl))
    
    Lpf = nv + 1
    x_gvl = np.concatenate((np.linspace(-s_gvl[0], s_gvl[0], Lpf), s_gvl))
    idx_pf = slice(Lpf, len(x_gvl))
    
    al = [1, -d]
    s_gv = lfilter([1], al, s_gvl)
    x_gv = lfilter([1], al, x_gvl)
    
    ag1 = lpc(s_gv * win, 1)
    
    for _ in range(ng - 1):
        x_v1x = lfilter(ag1, [1], x_gv)
        s_v1x = x_v1x[idx_pf]
        ag1x = lpc(s_v1x * win, 1)
        ag1 = np.convolve(ag1, ag1x)
    
    x_v1 = lfilter(ag1, [1], x_gv)
    s_v1 = x_v1[idx_pf]
    av1 = lpc(s_v1 * win, nv)
    
    for _ in range(iterations):
        x_g1 = lfilter(av1, [1], x_gv)
        s_g1 = x_g1[idx_pf]
        ag = lpc(s_g1 * win, ng)
    
        x_v = lfilter(ag, [1], x_gv)
        s_v = x_v[idx_pf]
        av1 = lpc(s_v * win, nv)
    
    return av1, ag, al

In [7]:
def generate_encoding(data_folder,num_speakers=5,num_frames=100,frame_length_msec=30,sr=44100,verbose=True,iteration=1):
    x_glot=[]
    x_vt=[]
    y_glot=[]
    y_vt=[]
    
    total_speaker = 0
    for speaker in os.listdir(data_folder):
        total_time_data=num_frames*frame_length_msec
        curr_time=0
        
        y=[]
        wav_files=os.listdir(os.path.join(data_folder, speaker))
        idx=0
        while curr_time<total_time_data:
            if wav_files[idx].endswith(".wav"):
                wav_file_path = os.path.join(data_folder, speaker, wav_files[idx])

                y_curr, sr = librosa.load(wav_file_path, sr=sr)
                y.append(y_curr)
                
                curr_time+=60000
                idx+=1
                
        del wav_files

        frame_length_samples=frame_length_msec*sr//1000
        frames = librosa.util.frame(list(y[0]), frame_length=frame_length_samples, hop_length=frame_length_samples).T
        frames=frames[:num_frames]
#         print(frames)
        
        del y

        for frame in frames:
            vt,glot,_=gfmiaif(frame,iteration)
            x_vt.append(vt)
            x_glot.append(glot)
            y_glot.append(speaker)
            y_vt.append(speaker)

            del vt,glot

        del frames
                        
        total_speaker += 1
        if total_speaker != 0 and total_speaker % 1 == 0 and verbose:
            print(f"{total_speaker} speakers completed.\n")

        if total_speaker == num_speakers:
            break
    
    return x_vt,x_glot,y_vt,y_glot

In [32]:
num_speakers=5
frame_length_msec=3
num_frames=1000
data_folder="/kaggle/input/speaker-recognition-audio-dataset/50_speakers_audio_data"

In [33]:
x_vt,x_glot,y_vt,y_glot=generate_encoding(data_folder,num_speakers,num_frames,frame_length_msec)

[src/libmpg123/layer3.c:INT123_do_layer3():1771] error: part2_3_length (1600) too large for available bit count (1568)


1 speakers completed.

2 speakers completed.

3 speakers completed.

4 speakers completed.

5 speakers completed.



In [34]:
dict={}
i=0

unique=[]
for val in y_vt:
    if val not in unique:
        unique.append(val)

for val in unique:
    dict[val]=i
    i+=1

dict

{'Speaker0043': 0,
 'Speaker_0014': 1,
 'Speaker0029': 2,
 'Speaker0047': 3,
 'Speaker_0009': 4}

In [35]:
from sklearn.model_selection import train_test_split

X_train_vt, X_test_vt, y_train_vt, y_test_vt = train_test_split(x_vt,y_vt , 
                                   random_state=104,  
                                   test_size=0.2,  
                                   shuffle=True)

X_train_glot, X_test_glot, y_train_glot, y_test_glot = train_test_split(x_glot,y_glot , 
                                   random_state=104,  
                                   test_size=0.2,  
                                   shuffle=True)

In [36]:
i=0
for _ in y_train_vt:
    y_train_vt[i]=dict[y_train_vt[i]]
    i=i+1

i=0
for _ in y_test_vt:
    y_test_vt[i]=dict[y_test_vt[i]]
    i=i+1
    
i=0
for _ in y_train_glot:
    y_train_glot[i]=dict[y_train_glot[i]]
    i=i+1

i=0
for _ in y_test_glot:
    y_test_glot[i]=dict[y_test_glot[i]]
    i=i+1

In [37]:
def dnn_model(input_shape,output_shape):
    input_data = tf.keras.Input(shape=input_shape)
    
    D1 = tfl.Dense(512, activation='relu')(input_data)
    DP1 = tfl.Dropout(0.5)(D1)
    
    D2 = tfl.Dense(256, activation='relu')(DP1)
    DP2 = tfl.Dropout(0.5)(D2)
    
    D3 = tfl.Dense(128, activation='relu')(DP2)
    DP3 = tfl.Dropout(0.5)(D3)
    
    outputs = tfl.Dense(output_shape, activation='softmax')(DP3)
    
    model = tf.keras.Model(inputs=input_data, outputs=outputs)
    return model

In [38]:
model_vt = dnn_model((len(X_train_vt[0]),),num_speakers)
model_vt.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

In [39]:
reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.5, patience=10, min_lr=0.00001)
history = model_vt.fit(np.array(X_train_vt),np.array(y_train_vt), batch_size=32, epochs=100, validation_data=(np.array(X_test_vt),np.array(y_test_vt)),callbacks=(reduce_lr))

Epoch 1/100
[1m 87/125[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m0s[0m 2ms/step - accuracy: 0.2070 - loss: 3.3916

W0000 00:00:1724827735.794349     323 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m116/125[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 2ms/step - accuracy: 0.2093 - loss: 3.1474

W0000 00:00:1724827736.366224     321 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6ms/step - accuracy: 0.2103 - loss: 3.0800 - val_accuracy: 0.3020 - val_loss: 1.5559 - learning_rate: 0.0010
Epoch 2/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.2500 - loss: 1.6593 - val_accuracy: 0.2890 - val_loss: 1.5367 - learning_rate: 0.0010
Epoch 3/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.2591 - loss: 1.6121 - val_accuracy: 0.3470 - val_loss: 1.5089 - learning_rate: 0.0010
Epoch 4/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.2966 - loss: 1.5511 - val_accuracy: 0.3590 - val_loss: 1.4802 - learning_rate: 0.0010
Epoch 5/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3073 - loss: 1.5119 - val_accuracy: 0.3770 - val_loss: 1.4522 - learning_rate: 0.0010
Epoch 6/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m

In [40]:
model_glot = dnn_model((len((X_train_glot[0]),),),num_speakers)
model_glot.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

In [41]:
reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.5, patience=10, min_lr=0.00001)
history = model_glot.fit(np.array(X_train_glot),np.array(y_train_glot), batch_size=32, epochs=100, validation_data=(np.array(X_test_glot),np.array(y_test_glot)),callbacks=(reduce_lr))

Epoch 1/100
[1m 76/125[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 2ms/step - accuracy: 0.2049 - loss: 1.6653

W0000 00:00:1724827771.668796     323 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m102/125[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 2ms/step - accuracy: 0.2026 - loss: 1.6608

W0000 00:00:1724827772.716985     321 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.2016 - loss: 1.6574 - val_accuracy: 0.1910 - val_loss: 1.6101 - learning_rate: 0.0010
Epoch 2/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.2088 - loss: 1.6140 - val_accuracy: 0.2450 - val_loss: 1.6064 - learning_rate: 0.0010
Epoch 3/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.2164 - loss: 1.6138 - val_accuracy: 0.2380 - val_loss: 1.6076 - learning_rate: 0.0010
Epoch 4/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.2139 - loss: 1.6118 - val_accuracy: 0.2060 - val_loss: 1.6023 - learning_rate: 0.0010
Epoch 5/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.2120 - loss: 1.6045 - val_accuracy: 0.2440 - val_loss: 1.5979 - learning_rate: 0.0010
Epoch 6/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0

In [42]:
def create_train_triplets(X_train, y_train, num_triplets):
    triplets = []
    labels = np.unique(y_train)
    
    for _ in range(num_triplets):
        anchor_label = np.random.choice(labels)
        negative_label = np.random.choice(labels[labels != anchor_label])
        
        anchor_indices = np.where(y_train == anchor_label)[0]
        positive_indices = np.where(y_train == anchor_label)[0]
        negative_indices = np.where(y_train == negative_label)[0]
        
        anchor = X_train[np.random.choice(anchor_indices)]
        positive = X_train[np.random.choice(positive_indices)]
        negative = X_train[np.random.choice(negative_indices)]
        
        triplets.append((anchor, positive, negative))
    
    return np.array(triplets)

In [43]:
def create_test_triplets(X_test, y_test, num_triplets):
    triplets = []
    range_test = len(y_test)
    idx = np.arange(range_test)
    
    for _ in range(num_triplets):
        label_1 = np.random.choice(idx)
        label_2 = np.random.choice(idx)
        
        anchor = X_test[label_1]
        inp = X_test[label_2]
        out = 1 if y_test[label_1] == y_test[label_2] else 0
        
        triplets.append((anchor, inp, out))
        
    return np.array(triplets, dtype=object) 

In [49]:
def dnn_model_triplet_loss(input_shape):
    input_data = tf.keras.Input(shape=input_shape)
    
    D1 = tfl.Dense(512, activation='relu')(input_data)
    DP1 = tfl.Dropout(0.5)(D1)
    
    D2 = tfl.Dense(256, activation='relu')(DP1)
    DP2 = tfl.Dropout(0.5)(D2)
    
    outputs = tfl.Dense(128, activation='relu')(DP2)
        
    model = tf.keras.Model(inputs=input_data, outputs=outputs)
    return model

input_shape = (3,)
base_model = dnn_model_triplet_loss(input_shape)

In [50]:
def triplet_loss(y_true, y_pred, margin=0.2):
    anchor, positive, negative = y_pred[0], y_pred[1], y_pred[2]
    pos_dist = tf.reduce_sum(tf.square(anchor - positive), axis=-1)
    neg_dist = tf.reduce_sum(tf.square(anchor - negative), axis=-1)
    loss = tf.maximum(pos_dist - neg_dist + margin, 0.0)
    return tf.reduce_mean(loss)

In [51]:
anchor_input = tfl.Input(shape=input_shape, name='anchor_input')
positive_input = tfl.Input(shape=input_shape, name='positive_input')
negative_input = tfl.Input(shape=input_shape, name='negative_input')

encoded_anchor = base_model(anchor_input)
encoded_positive = base_model(positive_input)
encoded_negative = base_model(negative_input)

triplet_model_vt = tf.keras.Model(inputs=[anchor_input, positive_input, negative_input],
                      outputs=[encoded_anchor, encoded_positive, encoded_negative])

triplet_model_vt.compile(optimizer='adam', loss=triplet_loss)

triplet_model_glot = tf.keras.Model(inputs=[anchor_input, positive_input, negative_input],
                      outputs=[encoded_anchor, encoded_positive, encoded_negative])

triplet_model_glot.compile(optimizer='adam', loss=triplet_loss)

In [52]:
train_num=5000
test_num=100

In [53]:
triplets_train=create_train_triplets(X_train_glot,y_train_glot,train_num)

anchor_images = triplets_train[:, 0]
positive_images = triplets_train[:, 1]
negative_images = triplets_train[:, 2]

triplet_model_glot.fit([anchor_images, positive_images, negative_images], 
                  np.zeros((train_num, )),
                  batch_size=32, 
                  epochs=100)

Epoch 1/100
[1m 94/157[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m0s[0m 2ms/step - loss: 0.5979   

W0000 00:00:1724827966.055651     321 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 20ms/step - loss: 0.6665
Epoch 2/100
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.6431 
Epoch 3/100
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.3250
Epoch 4/100
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.2312
Epoch 5/100
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.2040 
Epoch 6/100
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.2277
Epoch 7/100
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.2075
Epoch 8/100
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.2314
Epoch 9/100
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.2123
Epoch 10/100
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - l

<keras.src.callbacks.history.History at 0x78bdd02eafb0>

In [57]:
triplets_test=create_test_triplets(X_test_glot,y_test_glot,test_num)

true_pos = 0
false_pos = 0

anchor_input_test = np.stack(triplets_test[:, 0])
positive_input_test = np.stack(triplets_test[:, 1])
labels_test = triplets_test[:, 2]

encoded_anchor = base_model.predict(anchor_input_test)
encoded_positive = base_model.predict(positive_input_test)

distances = np.linalg.norm(encoded_anchor - encoded_positive, axis=1)
threshold = 0.0005

for i in range(test_num):
    print(str(distances[i])+" : "+str(labels_test[i]))
    if distances[i] < threshold and labels_test[i] == 1:
        true_pos += 1
    if distances[i] >= threshold and labels_test[i] == 0:
        false_pos += 1

total_pos=np.sum(labels_test)

accuracy = (true_pos+false_pos) / test_num
true_pos = (true_pos) / total_pos
false_pos = (false_pos) / (test_num-total_pos)
print("True Positives: {:.2f}".format(true_pos))
print("False Positives: {:.2f}".format(false_pos))
print("Accuracy: {:.2f}".format(accuracy))

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
0.0014800157 : 0
0.0015888996 : 0
0.00033659316 : 1
0.003005301 : 0
0.0018699258 : 0
0.00010216137 : 0
0.000102092556 : 0
0.00059036555 : 1
0.00064239494 : 1
0.0025430105 : 1
6.342057e-05 : 0
0.00015717497 : 0
0.010897068 : 0
0.0012867061 : 1
0.00033170616 : 0
0.0017192049 : 0
0.0022876486 : 0
0.0009848366 : 0
0.00047735052 : 0
0.0012839041 : 0
0.0007801936 : 1
0.01104493 : 0
0.0012684395 : 0
0.0033328237 : 0
3.891487e-05 : 0
0.19754037 : 0
0.0006633836 : 0
5.0655945e-05 : 1
0.00066811906 : 0
7.591466e-05 : 0
0.0026551967 : 0
0.00025113055 : 0
0.004156731 : 0
0.0057413955 : 0
0.00028032396 : 1
0.00022699224 : 0
0.000138681 : 0
0.002045016 : 0
0.004073869 : 1
0.0011569808 : 0
0.000105682535 : 1
0.00011845376 : 1
0.00029316123 : 0
0.00034877457 : 0
0.0033937392 : 0
0.00032596663 : 0
0.0014894689 : 0
0.00012885491 : 0
0.00020088906 : 0
0.000562133

In [None]:
# triplets_train=create_train_triplets(X_train_vt,y_train_vt,train_num)

# anchor_images = triplets_train[:, 0]
# positive_images = triplets_train[:, 1]
# negative_images = triplets_train[:, 2]

# triplet_model_vt.fit([anchor_images, positive_images, negative_images], 
#                   np.zeros((train_num, )),
#                   batch_size=32, 
#                   epochs=100)

In [None]:
# triplets_test=create_test_triplets(X_test_vt,y_test_vt,test_num)

# true_pos = 0
# false_pos = 0

# anchor_input_test = np.stack(triplets_test[:, 0])
# positive_input_test = np.stack(triplets_test[:, 1])
# labels_test = triplets_test[:, 2]

# encoded_anchor = base_model.predict(anchor_input_test)
# encoded_positive = base_model.predict(positive_input_test)

# distances = np.linalg.norm(encoded_anchor - encoded_positive, axis=1)
# threshold = 0.057

# for i in range(test_num):
#     print(str(distances[i])+" : "+str(labels_test[i]))
#     if distances[i] < threshold and labels_test[i] == 1:
#         true_pos += 1
#     if distances[i] >= threshold and labels_test[i] == 0:
#         false_pos += 1

# total_pos=np.sum(labels_test)

# accuracy = (true_pos+false_pos) / test_num
# true_pos = (true_pos) / total_pos
# false_pos = (false_pos) / (test_num-total_pos)
# print("True Positives: {:.2f}".format(true_pos))
# print("False Positives: {:.2f}".format(false_pos))
# print("Accuracy: {:.2f}".format(accuracy))