In [1]:
import librosa
import numpy as np
import scipy
from scipy.io import wavfile
import scipy.fftpack as fft
from scipy.signal import get_window
import IPython.display as ipd
import matplotlib.pyplot as plt
import noisereduce as nr
from scipy.spatial.distance import cosine
from fastdtw import fastdtw


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def normalize_audio(audio):
    """Peak-normalize a signal so that its largest absolute sample is 1.

    Parameters
    ----------
    audio : array_like
        Input samples.

    Returns
    -------
    numpy.ndarray
        ``audio`` divided by ``max(abs(audio))``. An empty or all-zero
        (silent) input is returned as a floating-point copy with its values
        unchanged, rather than raising or producing NaNs from a division
        by zero.
    """
    samples = np.asarray(audio)

    # np.max raises on an empty array, so there is nothing to scale.
    if samples.size == 0:
        return samples / 1.0

    peak = np.max(np.abs(samples))

    # A silent signal has peak 0; dividing would fill the result with NaN.
    if peak == 0:
        return samples / 1.0

    return samples / peak

In [3]:
def frame_audio(audio, FFT_size=1024, hop_size=15, sample_rate=44100):
    """Cut a signal into overlapping frames of ``FFT_size`` samples.

    The signal is reflect-padded by ``FFT_size / 2`` samples on each side
    before framing.

    Parameters
    ----------
    audio : array_like
        Input samples.
    FFT_size : int
        Number of samples per frame.
    hop_size : float
        Distance between the starts of consecutive frames, in milliseconds.
    sample_rate : int
        Sampling rate used to convert ``hop_size`` into a sample count.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(frame_num, FFT_size)``.
    """
    padded = np.pad(audio, int(FFT_size / 2), mode='reflect')

    # Hop between frame starts, converted from milliseconds to samples.
    step = np.round(sample_rate * hop_size / 1000).astype(int)
    n_frames = int((len(padded) - FFT_size) / step) + 1

    # Row r of the index matrix lists the samples r*step .. r*step+FFT_size-1.
    starts = step * np.arange(n_frames)
    sample_idx = starts[:, np.newaxis] + np.arange(FFT_size)

    frames = np.zeros((n_frames, FFT_size))
    frames[:] = padded[sample_idx]
    return frames

In [4]:
def freq_to_mel(freq):
    """Map a frequency (scalar or array) to the mel scale.

    Uses ``mel = 2595 * log10(1 + freq / 700)``.
    """
    ratio = freq / 700.0
    return 2595.0 * np.log10(1.0 + ratio)

def met_to_freq(mels):
    """Map mel values (scalar or array) back to frequency.

    Uses ``freq = 700 * (10 ** (mels / 2595) - 1)``.
    """
    exponent = mels / 2595.0
    return 700.0 * (10.0**exponent - 1.0)

In [5]:
def get_filter_points(fmin, fmax, mel_filter_num, FFT_size, sample_rate=44100):
    """Compute the edge positions of a bank of mel-spaced filters.

    ``mel_filter_num + 2`` points are spaced evenly on the mel scale between
    ``fmin`` and ``fmax`` and converted back to frequency.

    Parameters
    ----------
    fmin, fmax : float
        Lowest and highest frequency covered by the bank.
    mel_filter_num : int
        Number of filters; two extra points are generated for the outer edges.
    FFT_size : int
        FFT length used to scale frequencies to bin indices.
    sample_rate : int
        Sampling rate of the analysed signal.

    Returns
    -------
    tuple
        ``(bin_indices, freqs)``: the integer (floored) FFT bin index of each
        point, and the corresponding frequencies.
    """
    mel_lo, mel_hi = freq_to_mel(fmin), freq_to_mel(fmax)

    mel_grid = np.linspace(mel_lo, mel_hi, num=mel_filter_num + 2)
    freqs = met_to_freq(mel_grid)

    bin_positions = (FFT_size + 0.5) / sample_rate * freqs
    return np.floor(bin_positions).astype(int), freqs

In [6]:
def get_filters(filter_points, FFT_size):
    """Build triangular filters from a sequence of FFT bin indices.

    Filter ``k`` ramps linearly from 0 to 1 over bins
    ``filter_points[k] : filter_points[k + 1]`` and from 1 back to 0 over
    bins ``filter_points[k + 1] : filter_points[k + 2]``.

    Parameters
    ----------
    filter_points : sequence of int
        Bin indices of the filter edges and peaks.
    FFT_size : int
        FFT length; each filter has ``FFT_size / 2 + 1`` bins.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(filter_points) - 2, FFT_size / 2 + 1)``.
    """
    n_filters = len(filter_points) - 2
    bank = np.zeros((n_filters, int(FFT_size / 2 + 1)))

    # Walk over consecutive (left edge, peak, right edge) triples.
    triples = zip(filter_points, filter_points[1:], filter_points[2:])
    for row, (left, peak, right) in enumerate(triples):
        bank[row, left:peak] = np.linspace(0, 1, peak - left)
        bank[row, peak:right] = np.linspace(1, 0, right - peak)

    return bank

In [7]:
def dct(dct_filter_num, filter_len):
    """Return the first ``dct_filter_num`` rows of a DCT basis.

    Row 0 is the constant ``1 / sqrt(filter_len)``; row ``i`` (``i >= 1``) is
    ``sqrt(2 / filter_len) * cos(i * (2k + 1) * pi / (2 * filter_len))`` for
    ``k = 0 .. filter_len - 1``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(dct_filter_num, filter_len)``.
    """
    basis = np.empty((dct_filter_num, filter_len))
    basis[0, :] = 1.0 / np.sqrt(filter_len)

    # Angles (2k + 1) * pi / (2 * filter_len) at which the cosines are sampled.
    samples = np.arange(1, 2 * filter_len, 2) * np.pi / (2.0 * filter_len)

    # Fill every remaining row at once: one cosine order per row.
    orders = np.arange(1, dct_filter_num)
    basis[1:, :] = np.cos(np.outer(orders, samples)) * np.sqrt(2.0 / filter_len)

    return basis

In [32]:
def extraction(audio, FFT_size=1024, hop_size=12, mel_filter_num=128, dct_filter_num=13):
    """Compute normalized cepstral coefficients for one audio file.

    Pipeline: load at 44.1 kHz, trim with ``top_db=20``, noise reduction,
    peak normalization, pre-emphasis, framing, Hamming window, FFT power
    spectrum, mel filter bank, log scaling, DCT, and a final
    ``librosa.util.normalize`` with its default settings.

    Parameters
    ----------
    audio : path accepted by ``librosa.load``
        Audio file to analyse.
    FFT_size : int
        Frame length in samples.
    hop_size : float
        Hop between frames in milliseconds.
    mel_filter_num : int
        Number of mel filters.
    dct_filter_num : int
        Number of cepstral coefficients kept per frame.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(number_of_frames, dct_filter_num)``.
    """
    # --- Load and clean the waveform -------------------------------------
    signal, sr = librosa.load(audio, sr=44100)
    signal, _ = librosa.effects.trim(signal, top_db=20)
    signal = nr.reduce_noise(signal, sr)
    signal = normalize_audio(signal)
    signal = librosa.effects.preemphasis(signal)

    # --- Frame and window ------------------------------------------------
    frames = frame_audio(signal, FFT_size=FFT_size, hop_size=hop_size, sample_rate=sr)
    hamming = get_window("hamming", FFT_size, fftbins=True)
    windowed = frames * hamming

    # --- Power spectrum: keep the first FFT_size // 2 + 1 bins per frame --
    n_bins = int(1 + FFT_size // 2)
    spectrum = np.empty((windowed.shape[0], n_bins), dtype=np.complex64)
    for idx, frame in enumerate(windowed):
        spectrum[idx] = fft.fft(frame, axis=0)[:n_bins]

    audio_power = np.square(np.abs(spectrum))

    # --- Mel filter bank covering 0 .. sr / 2 -----------------------------
    freq_min = 0
    freq_high = sr / 2
    filter_points, mel_freqs = get_filter_points(
        freq_min, freq_high, mel_filter_num, FFT_size, sample_rate=sr
    )
    filters = get_filters(filter_points, FFT_size)

    # Scale each filter by 2 / (distance between its outer edge frequencies).
    enorm = 2.0 / (mel_freqs[2:mel_filter_num + 2] - mel_freqs[:mel_filter_num])
    filters *= enorm[:, np.newaxis]

    audio_filtered = np.dot(filters, np.transpose(audio_power))

    # Clamp to a small epsilon so the logarithm never sees zero.
    epsilon = 1e-10
    audio_log = 10.0 * np.log10(np.maximum(audio_filtered, epsilon))

    # --- Cepstral coefficients --------------------------------------------
    dct_filters = dct(dct_filter_num, mel_filter_num)
    cepstral_coefficients = np.dot(dct_filters, audio_log)

    # Transpose so that frames are rows, then normalize.
    return librosa.util.normalize(np.transpose(cepstral_coefficients))

In [33]:
# Recording used for the single-file extraction walkthrough in the following cells.
sample_audio_path = "../Dataset/01.Ha\'/Ha\'_M1_11.wav"

In [34]:
# Run the extraction pipeline on the sample recording with every setting
# spelled out explicitly.
extraction_settings = dict(
    FFT_size=1024,
    hop_size=12,
    mel_filter_num=128,
    dct_filter_num=13,
)
mfccs = extraction(sample_audio_path, **extraction_settings)

In [35]:
# Row 0 of the result: all coefficients of the first frame.
mfccs[0,:]

array([-0.69957868, -0.14991169, -0.14008468, -0.08799939, -0.00409409,
       -0.10424374,  0.01903684,  0.16773153,  0.13112112,  0.06109751,
        0.17561627,  0.16472779,  0.05526393])

In [36]:
# Column 0 of the result: the first coefficient across every frame.
mfccs[:,0]

array([-0.69957868, -0.76194583, -0.73248631, -0.66085228, -0.67897637,
       -0.64269489, -0.65447055, -0.66919861, -0.66182007, -0.61340081,
       -0.57296572, -0.4975703 , -0.40009155, -0.36776721, -0.34521963,
       -0.33918602, -0.33086992, -0.32561157, -0.31926236, -0.32738538,
       -0.33201693, -0.32297786, -0.32689731, -0.33932806, -0.36187652,
       -0.348603  , -0.35071911, -0.3613127 , -0.36694349, -0.35640813,
       -0.34469964, -0.3358282 , -0.32526104, -0.30019189, -0.29747392,
       -0.29760907, -0.29376438, -0.29712146, -0.30232451, -0.32240952,
       -0.35483447, -0.37404005, -0.40024509, -0.40090771, -0.37802126,
       -0.34399501, -0.33018805, -0.31427718, -0.30706466, -0.30548869,
       -0.32985743, -0.43051071, -0.53772316, -0.5656179 , -0.61076049,
       -0.65870479, -0.64989361, -0.62174158, -0.57822712, -0.56297712,
       -0.51544469, -0.47772424, -0.43753462, -0.40024181, -0.33654449,
       -0.34569343, -0.34192799, -0.38422749, -0.42254404, -0.45

In [37]:
# (number of frames, number of coefficients per frame)
mfccs.shape

(842, 13)

In [38]:
# Recordings compared in the next cell. Paths 1, 2 and 5 come from the
# "01.Ha'" folder, paths 3 and 4 from the "09.Qaf" folder.
# NOTE(review): presumably "M1"/"M2" identify the speaker and the trailing
# number the take — confirm against the dataset description.
audio_path1 = "../Dataset/01.Ha\'/Ha\'_M1_11.wav"
audio_path2 = "../Dataset/01.Ha\'/Ha\'_M1_12.wav"
audio_path3 = '../Dataset/09.Qaf/Qaf_M1_11.wav'
audio_path4 = '../Dataset/09.Qaf/Qaf_M2_11.wav'
audio_path5 = "../Dataset/01.Ha\'/Ha\'_M2_11.wav"

In [39]:
# Extract a coefficient sequence for each of the five recordings.
mfccs1, mfccs2, mfccs3, mfccs4, mfccs5 = (
    extraction(wav_path)
    for wav_path in (audio_path1, audio_path2, audio_path3, audio_path4, audio_path5)
)

# Align recording 1 against each other recording with FastDTW, using the
# cosine distance between frames as the local cost.
distance, path = fastdtw(mfccs1, mfccs2, dist=cosine)
distance1, path1 = fastdtw(mfccs1, mfccs5, dist=cosine)
distance2, path2 = fastdtw(mfccs1, mfccs3, dist=cosine)
distance3, path3 = fastdtw(mfccs1, mfccs4, dist=cosine)

# Report the accumulated alignment cost of each comparison, in the order above.
for dtw_cost in (distance, distance1, distance2, distance3):
    print(f'Distance = {dtw_cost}')

Distance = 115.35547263151408
Distance = 244.89598549190163
Distance = 293.25020151648124
Distance = 330.13183265624536
