In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import librosa
import librosa.display
from scipy.signal import lfilter,correlate
import tensorflow.keras.layers as tfl
import tensorflow as tf

2024-06-14 19:38:03.871659: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-14 19:38:03.871829: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-14 19:38:04.140726: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
def pre_emphasis(signal,factor=0.97):
    signal_preemphasized = np.append(signal[0], signal[1:] - factor * signal[:-1])
    return signal_preemphasized

In [3]:
def inverse_filtering(signal, lpc_coeffs):
    return lfilter(np.concatenate(([1], -lpc_coeffs[1:])), [1], signal)

In [4]:
def compute_lpc(signal, order):
    autocorr = correlate(signal, signal, mode='full')
    autocorr = autocorr[len(signal)-1:len(signal)+order]
    
    a = np.concatenate(([1], -autocorr[1:order+1]))
    b = [1]
    
    lpc_coeffs = lfilter(b, a, signal)
    
    lpc_coeffs = np.concatenate(([1], -lpc_coeffs[1:order+1]))
    
    return lpc_coeffs

In [5]:
def lpc_to_cepstrum(lpc_coeffs):
    cepstrum_coeffs = np.fft.ifft(np.log(np.abs(np.fft.fft(lpc_coeffs))))
    return cepstrum_coeffs

In [6]:
import gc
from IPython.display import clear_output

def create_spectrograms(train_folder, create_folder, verbose=False, speakers=50, utterances=10, sr=44100, frame_length=2048, hop_length=512, lpc_order=16):
    spc_folder = os.path.join(create_folder, "spectrogram")
    vt_folder = os.path.join(create_folder, "vocal_tract")
    glot_folder = os.path.join(create_folder, "glottal")
    
    total_speaker = 0
    for speaker in os.listdir(train_folder):
        total_utterances = 0
        speaker_folder_spc = os.path.join(spc_folder, speaker)
        speaker_folder_vt = os.path.join(vt_folder, speaker)
        speaker_folder_glot = os.path.join(glot_folder, speaker)
        os.makedirs(speaker_folder_spc, exist_ok=True)
        os.makedirs(speaker_folder_vt, exist_ok=True)
        os.makedirs(speaker_folder_glot, exist_ok=True)
        
        for vidID in os.listdir(os.path.join(train_folder, speaker)):
            for file in os.listdir(os.path.join(train_folder, speaker, vidID)):
                if file.endswith(".wav"):
                    wav_file_path = os.path.join(train_folder, speaker, vidID, file)
                    
                    y, sr = librosa.load(wav_file_path, sr=sr)
                    
                    y_preemphasized=pre_emphasis(y)
                    
                    frames = librosa.util.frame(y_preemphasized, frame_length=frame_length, hop_length=hop_length).T
                    del y_preemphasized
                    
                    window = np.hamming(frame_length)
                    frames_windowed = frames * window  
                    del frames
                    
                    lpc_coeffs = []
                    for frame in frames_windowed:
                        frame=np.array(frame)
                        coeff = compute_lpc(frame, lpc_order)
                        lpc_coeffs.append(coeff)
                        
                    glottal_waveforms = [inverse_filtering(frame, coeff) for frame, coeff in zip(frames_windowed, lpc_coeffs)]

                    residuals = []
                    for i, coeff in enumerate(lpc_coeffs):
                        frame = frames_windowed[i]
                        residual = lfilter(coeff, [1.0], frame)
                        residuals.append(residual)
                        
                    residual_avg = np.mean(np.abs(np.array(residuals)), axis=0)
                    
                    del frames_windowed, lpc_coeffs, residuals

                    try:
                        plt.figure(figsize=(10, 10))
                        ax = plt.axes()
                        ax.set_axis_off()
                        plt.set_cmap('hot')
                        D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
                        del y
                        librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='log')
                        output_path = os.path.join(speaker_folder_spc, "spc"+str(total_utterances+1) + ".png")
                        plt.savefig(output_path, bbox_inches='tight', transparent=True, pad_inches=0.0)
                        plt.close('all')
                        del D

                        plt.figure(figsize=(10, 10))
                        ax = plt.axes()
                        ax.set_axis_off()
                        plt.set_cmap('hot')
                        D_residual = librosa.amplitude_to_db(np.abs(librosa.stft(residual_avg)), ref=np.max)
                        librosa.display.specshow(D_residual, sr=sr, x_axis='time', y_axis='log')
                        output_path = os.path.join(speaker_folder_vt, "vt"+str(total_utterances+1) + ".png")
                        plt.savefig(output_path, bbox_inches='tight', transparent=True, pad_inches=0.0)
                        plt.close('all')
                        del D_residual

                        plt.figure(figsize=(10, 10))
                        ax = plt.axes()
                        ax.set_axis_off()
                        plt.set_cmap('hot')
                        D_glot = librosa.amplitude_to_db(np.abs(librosa.stft(glottal_waveforms[0])), ref=np.max)
                        librosa.display.specshow(D_glot, sr=sr, x_axis='time', y_axis='log')
                        output_path = os.path.join(speaker_folder_glot, "glot"+str(total_utterances+1) + ".png")
                        plt.savefig(output_path, bbox_inches='tight', transparent=True, pad_inches=0.0)
                        plt.close('all')
                        del D_glot
                    
                    except OSError as e:
                        print(f"Error saving spectrogram for {wav_file_path}: {e}")
                    
                    del glottal_waveforms, residual_avg
                    gc.collect()
    
                    total_utterances += 1
                    if total_utterances == utterances:
                        break
            
            if total_utterances == utterances:
                break
        
        total_speaker += 1
        if total_speaker != 0 and total_speaker % 1 == 0 and verbose:
            print(f"{total_speaker} speakers completed.\n")
        
        if total_speaker == speakers:
            break

In [7]:
raw_dataset_loc="/kaggle/input/voxceleb1train/wav"
save_loc="/kaggle/working/"
num_speakers=5
num_utterances=100
create_spectrograms(raw_dataset_loc,save_loc,True,num_speakers,num_utterances)

1 speakers completed.

2 speakers completed.

3 speakers completed.

4 speakers completed.

5 speakers completed.



In [8]:
from PIL import Image

dataset_path_spc="/kaggle/working/spectrogram"
dataset_path_vt="/kaggle/working/vocal_tract"
dataset_path_glot="/kaggle/working/glottal"
folders=os.listdir(dataset_path_spc)

# Initializing training and test dataset
X_train=[[],[],[]]
y_train=[[],[],[]]
X_test=[[],[],[]]
y_test=[[],[],[]]

# Split the dataset into training and test set.
num=np.random.rand(num_utterances*num_speakers)
mask=num<0.2
split=mask.astype(int)

i=0
for dirs in folders:
    for img in os.listdir(os.path.join(dataset_path_spc,dirs)):
        image=Image.open(os.path.join(dataset_path_spc,dirs,img))
        new_img=image.resize((200,200))
        tmp_array=np.array(new_img)/255.
        if split[i]==0:
            X_train[0].append(tmp_array)
            y_train[0].append(str(dirs))
        else:
            X_test[0].append(tmp_array)
            y_test[0].append(str(dirs))
            
    for img in os.listdir(os.path.join(dataset_path_vt,dirs)):
        image=Image.open(os.path.join(dataset_path_vt,dirs,img))
        new_img=image.resize((200,200))
        tmp_array=np.array(new_img)/255.
        if split[i]==0:
            X_train[1].append(tmp_array)
            y_train[1].append(str(dirs))
        else:
            X_test[1].append(tmp_array)
            y_test[1].append(str(dirs))
            
    for img in os.listdir(os.path.join(dataset_path_glot,dirs)):
        image=Image.open(os.path.join(dataset_path_glot,dirs,img))
        new_img=image.resize((200,200))
        tmp_array=np.array(new_img)/255.
        if split[i]==0:
            X_train[2].append(tmp_array)
            y_train[2].append(str(dirs))
        else:
            X_test[2].append(tmp_array)
            y_test[2].append(str(dirs))
        
        i=i+1

In [9]:
dict={}
i=0
for val in folders:
    dict[val]=i
    i=i+1

dict

{'id11123': 0, 'id10459': 1, 'id10116': 2, 'id10484': 3, 'id11079': 4}

In [10]:
i=0
for _ in y_train[0]:
    y_train[0][i]=dict[y_train[0][i]]
    i=i+1

i=0
for _ in y_test[0]:
    y_test[0][i]=dict[y_test[0][i]]
    i=i+1
    
i=0
for _ in y_train[1]:
    y_train[1][i]=dict[y_train[1][i]]
    i=i+1

i=0
for _ in y_test[1]:
    y_test[1][i]=dict[y_test[1][i]]
    i=i+1
    
i=0
for _ in y_train[2]:
    y_train[2][i]=dict[y_train[2][i]]
    i=i+1

i=0
for _ in y_test[2]:
    y_test[2][i]=dict[y_test[2][i]]
    i=i+1

In [11]:
def convolutional_model(input_shape,output_shape):
    input_img = tf.keras.Input(shape=input_shape)
    Z1=tfl.Conv2D(filters=8,kernel_size=(4,4),strides=(1,1),padding='same')(input_img)
    A1=tfl.ReLU()(Z1)
    P1=tfl.MaxPool2D(pool_size=(8,8),strides=(8,8),padding='same')(A1)
    Z2=tfl.Conv2D(filters=16,kernel_size=(2,2),strides=(1,1),padding='same')(P1)
    A2=tfl.ReLU()(Z2)
    P2=tfl.MaxPool2D(pool_size=(4,4),strides=(4,4),padding='same')(A2)
    F=tfl.Flatten()(P2)
    outputs=tfl.Dense(output_shape,activation='softmax')(F)
    
    model = tf.keras.Model(inputs=input_img, outputs=outputs)
    return model

In [12]:
conv_model = convolutional_model((200, 200, 4),num_speakers)
conv_model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

In [13]:
train_dataset = tf.data.Dataset.from_tensor_slices((X_train[0], y_train[0])).batch(64)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test[0], y_test[0])).batch(64)
history = conv_model.fit(train_dataset, epochs=100, validation_data=test_dataset)

Epoch 1/100
[1m2/4[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 64ms/step - accuracy: 0.1094 - loss: 1.7835   

I0000 00:00:1718395058.490434     124 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
W0000 00:00:1718395058.509760     124 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 896ms/step - accuracy: 0.2757 - loss: 1.6331

W0000 00:00:1718395061.198965     121 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1718395062.077280     122 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 1s/step - accuracy: 0.3015 - loss: 1.6231 - val_accuracy: 0.0000e+00 - val_loss: 2.6073
Epoch 2/100
[1m2/4[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 52ms/step - accuracy: 0.0898 - loss: 1.0643    

W0000 00:00:1718395062.955009     123 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step - accuracy: 0.1147 - loss: 1.1280 - val_accuracy: 0.0000e+00 - val_loss: 3.3780
Epoch 3/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step - accuracy: 0.6150 - loss: 0.9818 - val_accuracy: 0.0000e+00 - val_loss: 4.1027
Epoch 4/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step - accuracy: 0.6240 - loss: 0.9393 - val_accuracy: 0.0000e+00 - val_loss: 4.7270
Epoch 5/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 81ms/step - accuracy: 0.6272 - loss: 0.9461 - val_accuracy: 0.0000e+00 - val_loss: 5.2562
Epoch 6/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step - accuracy: 0.6874 - loss: 0.9744 - val_accuracy: 0.0000e+00 - val_loss: 5.6994
Epoch 7/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step - accuracy: 0.5384 - loss: 1.0018 - val_accuracy: 0.0000e+00 - val_loss: 6.0599
Epoch 8/100
[1m4/4[0m [32

In [14]:
conv_model_glot = convolutional_model((200, 200, 4),num_speakers)
conv_model_glot.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

In [15]:
train_dataset = tf.data.Dataset.from_tensor_slices((X_train[2], y_train[2])).batch(16)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test[2], y_test[2])).batch(16)
history_glot = conv_model_glot.fit(train_dataset, epochs=100, validation_data=test_dataset)

Epoch 1/100
[1m15/20[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 13ms/step - accuracy: 0.1071 - loss: 1.8209  

W0000 00:00:1718395349.542794     122 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m19/20[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - accuracy: 0.0996 - loss: 1.8332

W0000 00:00:1718395350.412305     122 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 84ms/step - accuracy: 0.0961 - loss: 1.8454 - val_accuracy: 0.2632 - val_loss: 1.5906
Epoch 2/100
[1m14/20[0m [32m━━━━━━━━━━━━━━[0m[37m━━━━━━[0m [1m0s[0m 12ms/step - accuracy: 0.7029 - loss: 1.4097

W0000 00:00:1718395351.131524     123 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.5630 - loss: 1.4525 - val_accuracy: 0.2368 - val_loss: 1.5608
Epoch 3/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.0849 - loss: 1.5289 - val_accuracy: 0.2105 - val_loss: 1.5497
Epoch 4/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.0891 - loss: 1.5335 - val_accuracy: 0.2763 - val_loss: 1.5404
Epoch 5/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.1160 - loss: 1.5329 - val_accuracy: 0.3947 - val_loss: 1.5334
Epoch 6/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.1671 - loss: 1.5283 - val_accuracy: 0.3947 - val_loss: 1.5293
Epoch 7/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.2182 - loss: 1.5199 - val_accuracy: 0.4342 - val_loss: 1.5217
Epoch 8/100
[1m20/20[0m [32m━━━━━━━━━

In [16]:
conv_model_vt = convolutional_model((200, 200, 4),num_speakers)
conv_model_vt.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

In [18]:
train_dataset = tf.data.Dataset.from_tensor_slices((X_train[1], y_train[1])).batch(16)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test[1], y_test[1])).batch(16)
history_vt = conv_model_vt.fit(train_dataset, epochs=200, validation_data=test_dataset)

Epoch 1/200
[1m13/16[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 15ms/step - accuracy: 0.4002 - loss: 1.5629 

W0000 00:00:1718395593.202547     124 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step - accuracy: 0.3791 - loss: 1.6174

W0000 00:00:1718395594.283255     121 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1718395594.865896     121 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 133ms/step - accuracy: 0.3730 - loss: 1.6327 - val_accuracy: 0.0000e+00 - val_loss: 2.7147
Epoch 2/200
[1m 9/16[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m0s[0m 15ms/step - accuracy: 0.9298 - loss: 0.8214

W0000 00:00:1718395595.194694     121 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.7223 - loss: 0.9258 - val_accuracy: 0.0000e+00 - val_loss: 3.5326
Epoch 3/200
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.2269 - loss: 1.0766 - val_accuracy: 0.0000e+00 - val_loss: 4.3369
Epoch 4/200
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.0788 - loss: 1.0932 - val_accuracy: 0.0000e+00 - val_loss: 4.7724
Epoch 5/200
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.0931 - loss: 1.0506 - val_accuracy: 0.0000e+00 - val_loss: 5.1016
Epoch 6/200
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.1686 - loss: 1.0292 - val_accuracy: 0.0000e+00 - val_loss: 5.2254
Epoch 7/200
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.1699 - loss: 1.0228 - val_accuracy: 0.0000e+00 - val_loss: 5.5710
Epoch 8/200
[1m

In [None]:
globals().clear()

In [None]:
locals().clear()