In [1]:
import numpy as np
from scipy.signal import lfilter
from scipy.signal.windows import hann
from scipy.linalg import toeplitz, solve_toeplitz
import os
import librosa
import tensorflow.keras.layers as tfl
import tensorflow as tf
from keras.callbacks import ReduceLROnPlateau

2024-07-01 12:25:09.734074: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-01 12:25:09.734157: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-01 12:25:09.882767: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
def lpc(x, order):
    R = np.correlate(x, x, mode='full')[len(x)-1:]
    R = R[:order+1]
    return solve_toeplitz((R[:order], R[:order]), -R[1:])

In [3]:
def gfmiaif(s_gvl, nv=16, ng=16, d=0.99, win=None):
    if win is None:
        win = hann(len(s_gvl))
    
    Lpf = nv + 1
    x_gvl = np.concatenate((np.linspace(-s_gvl[0], s_gvl[0], Lpf), s_gvl))
    idx_pf = slice(Lpf, len(x_gvl))
    
    al = [1, -d]
    s_gv = lfilter([1], al, s_gvl)
    x_gv = lfilter([1], al, x_gvl)
    
    ag1 = lpc(s_gv * win, 1)
    
    for _ in range(ng - 1):
        x_v1x = lfilter(ag1, [1], x_gv)
        s_v1x = x_v1x[idx_pf]
        ag1x = lpc(s_v1x * win, 1)
        ag1 = np.convolve(ag1, ag1x)
    
    x_v1 = lfilter(ag1, [1], x_gv)
    s_v1 = x_v1[idx_pf]
    av1 = lpc(s_v1 * win, nv)
    
    x_g1 = lfilter(av1, [1], x_gv)
    s_g1 = x_g1[idx_pf]
    ag = lpc(s_g1 * win, ng)
    
    x_v = lfilter(ag, [1], x_gv)
    s_v = x_v[idx_pf]
    av = lpc(s_v * win, nv)
    
    return av, ag, al

In [5]:
def generate_train_test_instances(data_folder,num_speakers=5,num_utterances=20,sr=11000,hop_length=512,frame_length=2048,verbose=True):
    x=[[],[]]
    y_ret=[[],[]]
    
    total_speaker = 0
    for speaker in os.listdir(data_folder):
        total_utterances = 0
        for vidID in os.listdir(os.path.join(data_folder, speaker)):
            for file in os.listdir(os.path.join(data_folder, speaker, vidID)):
                if file.endswith(".wav"):
                    wav_file_path = os.path.join(data_folder, speaker, vidID, file)
                    
                    y, sr = librosa.load(wav_file_path, sr=sr)
#                     frames = librosa.util.frame(y, frame_length=frame_length, hop_length=hop_length).T
                    vt_coeff,glot_coeff,_=gfmiaif(y)
#                     glot_coeff=[0 for _ in range(16)]
#                     for frame in frames:
#                         _,glot,_=gfmiaif(frame)
#                         cnt=0
#                         for i in glot:
#                             glot_coeff[cnt]+=i
#                             cnt+=1
                    
#                     cnt=0
#                     for i in glot_coeff:
#                         glot_coeff[cnt]=glot_coeff[cnt]/len(frames)
#                         cnt+=1
                        
                    x[0].append(vt_coeff)
                    x[1].append(glot_coeff)
                    y_ret[0].append(speaker)
                    y_ret[1].append(speaker)
                    
                    total_utterances+=1
                    
                    if total_utterances==num_utterances:
                        break
                        
            if total_utterances==num_utterances:
                        break
        
        total_speaker += 1
        if total_speaker != 0 and total_speaker % 1 == 0 and verbose:
            print(f"{total_speaker} speakers completed.\n")
        
        if total_speaker == num_speakers:
            break
    
    return x,y_ret

In [6]:
num_speakers=5
num_utterances=20
data_folder="/kaggle/input/voxceleb1train/wav"

In [7]:
x,y=generate_train_test_instances(data_folder,num_speakers,num_utterances)

1 speakers completed.

2 speakers completed.

3 speakers completed.

4 speakers completed.

5 speakers completed.



In [8]:
dict={}
i=0

unique=[]
for val in y[0]:
    if val not in unique:
        unique.append(val)

for val in unique:
    dict[val]=i
    i+=1

dict

{'id10116': 0, 'id11079': 1, 'id11123': 2, 'id10459': 3, 'id10484': 4}

In [9]:
from sklearn.model_selection import train_test_split

X_train_vt, X_test_vt, y_train_vt, y_test_vt = train_test_split(x[0],y[0] , 
                                   random_state=104,  
                                   test_size=0.2,  
                                   shuffle=True)

X_train_glot, X_test_glot, y_train_glot, y_test_glot = train_test_split(x[1],y[1] , 
                                   random_state=104,  
                                   test_size=0.2,  
                                   shuffle=True)

In [10]:
i=0
for _ in y_train_vt:
    y_train_vt[i]=dict[y_train_vt[i]]
    i=i+1

i=0
for _ in y_test_vt:
    y_test_vt[i]=dict[y_test_vt[i]]
    i=i+1
    
i=0
for _ in y_train_glot:
    y_train_glot[i]=dict[y_train_glot[i]]
    i=i+1

i=0
for _ in y_test_glot:
    y_test_glot[i]=dict[y_test_glot[i]]
    i=i+1

In [11]:
def dnn_model(input_shape,output_shape):
    input_data = tf.keras.Input(shape=input_shape)
    
    D1 = tfl.Dense(512, activation='relu')(input_data)
    DP1 = tfl.Dropout(0.5)(D1)
    
    D2 = tfl.Dense(256, activation='relu')(DP1)
    DP2 = tfl.Dropout(0.5)(D2)
    
    D3 = tfl.Dense(128, activation='relu')(DP2)
    DP3 = tfl.Dropout(0.5)(D3)
    
    outputs = tfl.Dense(output_shape, activation='softmax')(DP3)
    
    model = tf.keras.Model(inputs=input_data, outputs=outputs)
    return model

In [12]:
model_vt = dnn_model((len(X_train_vt[0]),),num_speakers)
model_vt.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

In [13]:
reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.5, patience=10, min_lr=0.00001)
history = model_vt.fit(np.array(X_train_vt),np.array(y_train_vt), batch_size=32, epochs=100, validation_data=(np.array(X_test_vt),np.array(y_test_vt)),callbacks=(reduce_lr))

Epoch 1/100
[1m1/3[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m12s[0m 6s/step - accuracy: 0.2500 - loss: 1.6520

I0000 00:00:1719839385.060397     128 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
W0000 00:00:1719839385.076550     128 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - accuracy: 0.2229 - loss: 1.6720 

W0000 00:00:1719839390.434220     131 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3s/step - accuracy: 0.2172 - loss: 1.6751 - val_accuracy: 0.1000 - val_loss: 1.6201 - learning_rate: 0.0010
Epoch 2/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.1945 - loss: 1.6709 - val_accuracy: 0.1000 - val_loss: 1.6195 - learning_rate: 0.0010
Epoch 3/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.2375 - loss: 1.5988 - val_accuracy: 0.1000 - val_loss: 1.6274 - learning_rate: 0.0010
Epoch 4/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.2453 - loss: 1.6341 - val_accuracy: 0.1000 - val_loss: 1.6284 - learning_rate: 0.0010
Epoch 5/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.3047 - loss: 1.5733 - val_accuracy: 0.2000 - val_loss: 1.6208 - learning_rate: 0.0010
Epoch 6/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accurac

In [14]:
model_glot = dnn_model((len((X_train_glot[0]),),),num_speakers)
model_glot.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

In [15]:
reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.5, patience=10, min_lr=0.00001)
history = model_glot.fit(np.array(X_train_glot),np.array(y_train_glot), batch_size=32, epochs=100, validation_data=(np.array(X_test_glot),np.array(y_test_glot)),callbacks=(reduce_lr))

Epoch 1/100
[1m1/3[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m9s[0m 5s/step - accuracy: 0.1562 - loss: 1.6434

W0000 00:00:1719839427.604907     130 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.1667 - loss: 1.6425

W0000 00:00:1719839432.365185     131 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3s/step - accuracy: 0.1719 - loss: 1.6409 - val_accuracy: 0.1500 - val_loss: 1.6260 - learning_rate: 0.0010
Epoch 2/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.2469 - loss: 1.6288 - val_accuracy: 0.1000 - val_loss: 1.6249 - learning_rate: 0.0010
Epoch 3/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.2672 - loss: 1.5600 - val_accuracy: 0.1000 - val_loss: 1.6260 - learning_rate: 0.0010
Epoch 4/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.2164 - loss: 1.5938 - val_accuracy: 0.1000 - val_loss: 1.6261 - learning_rate: 0.0010
Epoch 5/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.2070 - loss: 1.6119 - val_accuracy: 0.1000 - val_loss: 1.6249 - learning_rate: 0.0010
Epoch 6/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accurac