In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
from scipy.signal import lfilter,correlate

In [None]:
def pre_emphasis(signal, factor=0.97):
    """Apply a first-order pre-emphasis filter to ``signal``.

    The output keeps the first sample unchanged and, for every later sample,
    subtracts ``factor`` times the previous input sample:
    ``out[0] = signal[0]`` and ``out[n] = signal[n] - factor * signal[n - 1]``.

    Args:
        signal: Array-like sequence of samples (lists are accepted).
        factor: Weight applied to the previous sample. Defaults to 0.97.

    Returns:
        ``numpy.ndarray`` with the same number of samples as ``signal``.
        An empty input yields an empty array instead of raising.
    """
    # Coerce first so that plain Python lists support the slicing arithmetic below.
    signal = np.asarray(signal)
    if signal.size == 0:
        # Nothing to filter; indexing signal[0] would raise IndexError.
        return signal.copy()
    return np.append(signal[0], signal[1:] - factor * signal[:-1])

In [None]:
def inverse_filtering(signal, lpc_coeffs):
    """Run ``signal`` through an FIR filter built from ``lpc_coeffs``.

    The numerator taps are ``1`` followed by the negated coefficients
    ``lpc_coeffs[1:]``; the denominator is ``[1]``, so the filter has no
    feedback path. ``lpc_coeffs[0]`` itself is not used.

    Args:
        signal: Samples to filter.
        lpc_coeffs: NumPy array of coefficients (must support unary minus).

    Returns:
        The filtered signal as returned by ``scipy.signal.lfilter``.
    """
    negated_tail = -lpc_coeffs[1:]
    fir_taps = np.concatenate(([1], negated_tail))
    denominator = [1]
    filtered = lfilter(fir_taps, denominator, signal)
    return filtered

In [None]:
def compute_lpc(signal, order):
    """Estimate linear-prediction coefficients with the autocorrelation method.

    The previous implementation fed the raw autocorrelation lags into an IIR
    filter and returned a slice of the filtered signal, which is not a
    linear-prediction solution. This version solves the Toeplitz normal
    equations with the Levinson-Durbin recursion.

    Args:
        signal: 1-D array-like frame of samples.
        order: Number of prediction coefficients to estimate.

    Returns:
        ``numpy.ndarray`` of shape ``(order + 1,)`` holding the prediction-error
        polynomial ``[1, a_1, ..., a_order]``, i.e. the error signal is
        ``e[n] = x[n] + sum_k a_k * x[n - k]``. An empty or all-zero frame
        yields ``[1, 0, ..., 0]``.
    """
    signal = np.asarray(signal, dtype=float)
    n_samples = len(signal)

    coeffs = np.zeros(order + 1)
    coeffs[0] = 1.0
    if n_samples == 0:
        return coeffs

    # Lags 0..order of the autocorrelation; index n_samples - 1 of the full
    # correlation is lag zero. Frames shorter than `order` are zero-padded.
    full_corr = correlate(signal, signal, mode='full')
    available = full_corr[n_samples - 1:n_samples + order]
    autocorr = np.zeros(order + 1)
    autocorr[:len(available)] = available

    # Levinson-Durbin recursion on the symmetric Toeplitz system.
    error = autocorr[0]
    for i in range(1, order + 1):
        if error <= 0:
            # Silent frame or numerically exhausted prediction error:
            # leave the remaining coefficients at zero.
            break
        acc = autocorr[i] + np.dot(coeffs[1:i], autocorr[i - 1:0:-1])
        reflection = -acc / error
        # Right-hand side is evaluated into a new array before assignment,
        # so the reversed view does not alias the update.
        coeffs[1:i] = coeffs[1:i] + reflection * coeffs[i - 1:0:-1]
        coeffs[i] = reflection
        error *= 1.0 - reflection * reflection

    return coeffs

In [None]:
def lpc_to_cepstrum(lpc_coeffs):
    """Return the inverse FFT of the log-magnitude FFT of ``lpc_coeffs``.

    Args:
        lpc_coeffs: Sequence of coefficients to transform.

    Returns:
        Complex ``numpy.ndarray`` with the same length as ``lpc_coeffs``.
    """
    spectrum = np.fft.fft(lpc_coeffs)
    log_magnitude = np.log(np.abs(spectrum))
    return np.fft.ifft(log_magnitude)

In [None]:
def _iter_wav_paths(speaker_dir):
    """Yield every ``.wav`` path found in the sub-folders of ``speaker_dir``, in sorted order."""
    for video_id in sorted(os.listdir(speaker_dir)):
        video_dir = os.path.join(speaker_dir, video_id)
        if not os.path.isdir(video_dir):
            # Stray files next to the video folders would make os.listdir raise.
            continue
        for file_name in sorted(os.listdir(video_dir)):
            if file_name.endswith(".wav"):
                yield os.path.join(video_dir, file_name)


def _save_spectrogram(signal, sr, output_path):
    """Render the dB-scaled STFT magnitude of ``signal`` and save it as an axis-free PNG."""
    fig = plt.figure(figsize=(10, 5))
    try:
        ax = plt.axes()
        ax.set_axis_off()
        # NOTE(review): librosa's specshow appears to pick its own default
        # colormap unless ``cmap=`` is passed explicitly, so this call may not
        # influence the saved image -- confirm before relying on 'hot'.
        plt.set_cmap('hot')
        spec_db = librosa.amplitude_to_db(np.abs(librosa.stft(signal)), ref=np.max)
        librosa.display.specshow(spec_db, sr=sr, x_axis='time', y_axis='log')
        plt.savefig(output_path, bbox_inches='tight', transparent=True, pad_inches=0.0)
    finally:
        # Always release the figure; this function runs once per image in a long loop.
        plt.close(fig)


def create_spectrograms(train_folder, create_folder, verbose=False, speakers=50, utterances=10, sr=44100, frame_length=2048, hop_length=512, lpc_order=16):
    """Generate three spectrogram images per utterance for a set of speakers.

    ``train_folder`` is walked as ``<speaker>/<video>/<file>.wav``. For each
    processed utterance three PNGs are written under ``create_folder``:

    * ``spectrogram/<speaker>/spc<N>.png``  - spectrogram of the loaded audio.
    * ``vocal_tract/<speaker>/vt<N>.png``   - spectrogram of the mean absolute
      LPC residual, averaged over all windowed frames.
    * ``glottal/<speaker>/glot<N>.png``     - spectrogram of the first windowed
      frame after ``inverse_filtering``.

    Speakers, videos and files are visited in sorted order so repeated runs
    select the same utterances.

    Args:
        train_folder: Root directory containing one sub-folder per speaker.
        create_folder: Directory under which the three output trees are created.
        verbose: When true, print progress every 10 speakers and report skipped clips.
        speakers: Stop after this many speaker folders have been processed.
        utterances: Stop after this many utterances per speaker.
        sr: Sample rate passed to ``librosa.load``.
        frame_length: Number of samples per analysis frame.
        hop_length: Number of samples between successive frames.
        lpc_order: Order passed to ``compute_lpc``.

    Returns:
        None. Results are written to disk.
    """
    spc_folder = os.path.join(create_folder, "spectrogram")
    vt_folder = os.path.join(create_folder, "vocal_tract")
    glot_folder = os.path.join(create_folder, "glottal")

    # The analysis window depends only on frame_length, so build it once.
    window = np.hamming(frame_length)

    total_speaker = 0
    for speaker in sorted(os.listdir(train_folder)):
        speaker_dir = os.path.join(train_folder, speaker)
        if not os.path.isdir(speaker_dir):
            continue

        speaker_folder_spc = os.path.join(spc_folder, speaker)
        speaker_folder_vt = os.path.join(vt_folder, speaker)
        speaker_folder_glot = os.path.join(glot_folder, speaker)
        for folder in (speaker_folder_spc, speaker_folder_vt, speaker_folder_glot):
            os.makedirs(folder, exist_ok=True)

        total_utterances = 0
        for wav_file_path in _iter_wav_paths(speaker_dir):
            # Keep the loaded rate in its own name: rebinding the `sr`
            # parameter would pin every later file to the first file's rate
            # when the caller passes sr=None.
            y, file_sr = librosa.load(wav_file_path, sr=sr)

            if len(y) < frame_length:
                # librosa.util.frame cannot build a single frame from this clip.
                if verbose:
                    print(f"Skipping {wav_file_path}: shorter than one frame.")
                continue

            y_preemphasized = pre_emphasis(y)
            frames = librosa.util.frame(y_preemphasized, frame_length=frame_length, hop_length=hop_length).T
            frames_windowed = frames * window

            lpc_coeffs = [compute_lpc(frame, lpc_order) for frame in frames_windowed]

            # NOTE(review): the residual applies each coefficient vector as-is,
            # whereas inverse_filtering negates coeff[1:] first, so the two
            # derived images use opposite sign conventions -- confirm which
            # one is intended.
            residuals = [
                lfilter(coeff, [1.0], frame)
                for frame, coeff in zip(frames_windowed, lpc_coeffs)
            ]
            residual_avg = np.mean(np.abs(np.array(residuals)), axis=0)

            # Only the first frame's inverse-filtered output is plotted, so
            # only that frame is computed.
            glottal_first = inverse_filtering(frames_windowed[0], lpc_coeffs[0])

            index = total_utterances + 1
            _save_spectrogram(y, file_sr, os.path.join(speaker_folder_spc, f"spc{index}.png"))
            _save_spectrogram(residual_avg, file_sr, os.path.join(speaker_folder_vt, f"vt{index}.png"))
            _save_spectrogram(glottal_first, file_sr, os.path.join(speaker_folder_glot, f"glot{index}.png"))

            total_utterances += 1
            if total_utterances == utterances:
                break

        total_speaker += 1
        if verbose and total_speaker % 10 == 0:
            print(f"{total_speaker} speakers completed.\n")

        if total_speaker == speakers:
            break


In [None]:
# Build the spectrogram, vocal-tract and glottal images from the VoxCeleb1
# training audio, printing progress as speakers are completed.
create_spectrograms(
    train_folder="/kaggle/input/voxceleb1train/wav",
    create_folder="/kaggle/working/",
    verbose=True,
)