In [19]:

import matplotlib.pyplot as plt
import librosa as dsp
import librosa.display as dsp_plot
import numpy as np
import math
import IPython.display

# Shared audio / spectrogram settings read by the cells below.
hparams = dict(
    min_level_db=-100,       # dB floor used when (de)normalizing spectrograms to [0, 1]
    ref_level_db=20,         # dB offset subtracted / added back when use_ref_db is on
    normalize=False,         # rescale dB spectrograms into [0, 1]
    use_ref_db=False,        # apply the ref_level_db offset
    win_length=0.05,         # analysis window, in seconds (converted to samples later)
    hop_length=0.0125,       # frame shift, in seconds (converted to samples later)
    n_fft=2048,              # FFT size handed to the STFT
    r=3,                     # not read by the code visible here - TODO confirm its use
    n_mels=80,               # number of mel bands
    griffin_lim_iters=50,    # phase-reconstruction iterations in spect2wav
    pre_emphasis=0.97,       # pre-emphasis filter coefficient
)

Signal-processing settings used below: pre-emphasis 0.97; frame length 50 ms (`win_length`);

frame shift 12.5 ms (`hop_length`); window type: Hann (librosa's default STFT window).

In [36]:
def load_wave(audiofilepath):
    """Load an audio file as a mono signal through librosa.

    Args:
        audiofilepath: path of the audio file to read.

    Returns:
        The ``(samples, sample_rate)`` pair produced by ``librosa.load``.
    """
    samples, sample_rate = dsp.load(audiofilepath, mono=True)
    return samples, sample_rate
    
def do_preemphasis(wave, pre_emphasis=0.97):
    """Apply a first-order pre-emphasis filter to a waveform.

    The first sample is kept unchanged; every later sample becomes
    ``wave[n] - pre_emphasis * wave[n - 1]``.

    Args:
        wave: NumPy array of samples.
        pre_emphasis: filter coefficient.

    Returns:
        A new flattened array with the filtered signal.
    """
    first_sample = np.ravel(wave[0])
    filtered_rest = np.ravel(wave[1:] - pre_emphasis * wave[:-1])
    return np.concatenate((first_sample, filtered_rest))

def do_spectrogram(y=None, sr=16000,win_length=0.05,hop_length=0.0125, n_fft=2048,n_mels=80):
    """Compute linear and mel spectrograms in dB (keithito-style).

    Args:
        y: waveform handed to ``librosa.stft``.
        sr: sample rate in Hz; used to convert the lengths to samples and to
            build the mel filter bank.
        win_length: analysis window length, in seconds.
        hop_length: frame shift, in seconds.
        n_fft: FFT size.
        n_mels: number of mel bands.

    Returns:
        ``(S_lin, S_mel)``: the linear-frequency and mel spectrograms, both as
        ``20 * log10(magnitude)``. Depending on the module-level ``hparams``,
        they are additionally shifted by ``ref_level_db`` (``use_ref_db``)
        and/or rescaled and clipped to [0, 1] (``normalize``).
    """
    # lengths: seconds -> samples
    hop_length = int(hop_length * sr)
    win_length = int(win_length * sr)

    # STFT is computed explicitly (rather than via melspectrogram) so that
    # win_length can be passed.
    stft = dsp.stft(y=y, n_fft=n_fft, win_length=win_length, hop_length=hop_length)
    magnitude = np.abs(stft)

    # Keyword arguments: recent librosa releases no longer accept sr / n_fft
    # positionally, while older releases accept them by name as well.
    mel_basis = dsp.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
    mel_magnitude = np.dot(mel_basis, magnitude)

    # 20 * log10(|x|) == 10 * log10(|x|^2); the 1e-8 floor avoids log(0).
    S_mel = 20 * np.log10(np.maximum(1e-8, mel_magnitude))
    S_lin = 20 * np.log10(np.maximum(1e-8, magnitude))

    if hparams["use_ref_db"]:
        print("Using reference db..")
        S_mel = S_mel - hparams["ref_level_db"]
        S_lin = S_lin - hparams["ref_level_db"]

    # normalize between 0-1 (https://github.com/keithito/tacotron/issues/38)
    if hparams["normalize"]:
        print("Normalizing..")
        min_db = hparams["min_level_db"]
        S_mel = np.clip((S_mel - min_db) / -min_db, 0, 1)
        S_lin = np.clip((S_lin - min_db) / -min_db, 0, 1)

    return (S_lin, S_mel)
    
def spect2wav(S,denormalize=True, use_ref_db=True,win_length=0.05,hop_length=0.0125, n_fft=2048, sr=None):
    '''Converts a dB spectrogram to a waveform using librosa (Griffin-Lim).

    Args:
        S: spectrogram in dB (or normalized to [0, 1] when ``denormalize`` is
            true), laid out as expected by ``librosa.istft``.
        denormalize: undo the [0, 1] normalization using
            ``hparams["min_level_db"]``.
        use_ref_db: add ``hparams["ref_level_db"]`` back.
        win_length: analysis window length, in seconds.
        hop_length: frame shift, in seconds.
        n_fft: FFT size used for the STFT inside the iteration loop.
        sr: sample rate in Hz used to convert the lengths to samples. When
            omitted, the module-level ``sr`` variable is used, which is what
            this function implicitly relied on before the parameter existed.

    Returns:
        The reconstructed waveform after ``hparams["griffin_lim_iters"]``
        phase-refinement iterations.

    Raises:
        NameError: if ``sr`` is not given and no module-level ``sr`` exists.
    '''
    if sr is None:
        # Backward-compatible fallback to the notebook global.
        if "sr" not in globals():
            raise NameError(
                "spect2wav needs a sample rate: pass sr=... "
                "(no module-level 'sr' is defined)"
            )
        sr = globals()["sr"]

    # Convert back to linear
    if denormalize:
        print("Denormalizing..")
        S = (np.clip(S, 0, 1) * -hparams["min_level_db"]) + hparams["min_level_db"]
    print(S.shape)
    if use_ref_db:
        print("Adding ref db..")
        S = S + hparams["ref_level_db"]

    # db -> amp
    S = np.power(10.0, S * 0.05)
    # amplitude raised to 1.5 as keithito does (the paper uses 1.2)
    S = S ** 1.5

    # lengths: seconds -> samples
    hop_length = int(hop_length * sr)
    win_length = int(win_length * sr)

    # Based on https://github.com/librosa/librosa/issues/434
    # Start from random phases, then alternate istft / stft, keeping the
    # target magnitudes and only the phase of each re-analysis.
    angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
    # Builtin complex: the np.complex alias was removed from NumPy.
    S_complex = np.abs(S).astype(complex)
    y = dsp.istft(S_complex * angles, hop_length=hop_length, win_length=win_length)
    for _ in range(hparams["griffin_lim_iters"]):
        ft = dsp.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
        angles = np.exp(1j * np.angle(ft))
        y = dsp.istft(S_complex * angles, hop_length=hop_length, win_length=win_length)
    return y



In [37]:
def _show_spectrogram(position, data, title, y_axis, sr, hop_length):
    """Draw one spectrogram panel (plus colorbar and title) in the 6x1 grid."""
    plt.subplot(6, 1, position)
    # sr and hop_length are passed explicitly so the time axis reflects the
    # framing actually used, not librosa's display defaults.
    # NOTE(review): fmax=2048 (Hz) is kept from the original code; the mel
    # spectrograms here are built with librosa's default fmax, so the mel-axis
    # labels may not match the data - confirm the intended value.
    dsp_plot.specshow(data, sr=sr, hop_length=hop_length, fmax=2048,
                      y_axis=y_axis, x_axis='time')
    plt.colorbar()
    plt.title(title)


def test_spectrogram(y,sr):
    """Plot three spectrogram recipes side by side for visual comparison.

    For each of the barronalex, keithito and kyubyong variants, a linear and a
    mel spectrogram are computed from the same signal and drawn as six stacked
    panels in a new matplotlib figure. Framing and mel settings come from the
    module-level ``hparams``.

    Args:
        y: waveform.
        sr: sample rate in Hz.

    Returns:
        None; the function prints the frame sizes and draws the figure.
    """
    hop_length = int(hparams["hop_length"] * sr)
    win_length = int(hparams["win_length"] * sr)
    print("hop size:", hop_length)
    print("win size:", win_length)
    # Read from hparams so every variant uses the same settings as the
    # do_spectrogram call below.
    n_fft = hparams["n_fft"]
    n_mels = hparams["n_mels"]

    ft = dsp.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)

    #####################
    # barronalex version: natural log of the magnitudes
    # (the complex STFT is passed to the mel filter bank, abs taken afterwards)
    S_bar_amp = dsp.feature.melspectrogram(S=ft, sr=sr, n_mels=n_mels)

    stft_bar_log = np.log(np.abs(ft) + 1e-8)
    S_bar_log = np.log(np.abs(S_bar_amp) + 1e-8)
    S_bar = (stft_bar_log, S_bar_log)

    print("barron: ", len(S_bar))

    #####################
    # keithito version
    S_kei = do_spectrogram(y=y,
                           sr=sr,
                           win_length=hparams["win_length"],
                           hop_length=hparams["hop_length"],
                           n_fft=n_fft,
                           n_mels=n_mels
                          )
    print("keithito: ", len(S_kei))

    #####################
    # kyubyong version
    # magnitude spectrogram -> linear one
    magnitude = np.abs(ft)  # (1+n_fft/2, T)
    # power spectrogram
    power = magnitude ** 2  # (1+n_fft/2, T)
    # mel spectrogram
    S_kyu = dsp.feature.melspectrogram(S=power, sr=sr, n_mels=n_mels)  # (n_mels, T)

    plt.figure(figsize=(15, 20))

    panels = [
        (stft_bar_log, 'Barron spectrogram linear', 'log'),
        (S_bar_log, 'Barron spectrogram mel', 'mel'),
        (S_kei[0], 'Keithito spectrogram linear', 'log'),
        (S_kei[1], 'Keithito spectrogram mel', 'mel'),
        (magnitude, 'kyubyong spectrogram linear', 'log'),
        (S_kyu, 'kyubyong spectrogram mel', 'mel'),
    ]
    for position, (data, title, y_axis) in enumerate(panels, start=1):
        _show_spectrogram(position, data, title, y_axis, sr, hop_length)

    plt.tight_layout()


In [40]:
%matplotlib inline
y, sr = load_wave("cumulo.wav")
#test_spectrogram(y,sr)

hop_length = int(hparams["hop_length"]*sr)
win_length = int(hparams["win_length"]*sr)

hparams["use_ref_db"]=False
hparams["normalize"]=False
hparams["griffin_lim_iters"]=5
print(hparams)
S_lin, S_mel = do_spectrogram(y=y,
                           sr=sr,
                           win_length=hparams["win_length"],
                           hop_length=hparams["hop_length"],
                           n_fft=hparams["n_fft"],
                           n_mels=hparams["n_mels"]
                          )

print(S_lin.shape)
print(S_mel.shape)
# S= S_lin

# # Convert back to linear

# if hparams["normalize"]:
#     print("Denormalizing..")
#     S = (np.clip(S, 0, 1) * -hparams["min_level_db"]) + hparams["min_level_db"]
# if hparams["use_ref_db"]:
#     print("Adding ref db..")
#     S=S + hparams["ref_level_db"]

# #db -> amp
# print("db -> amp")
# S= np.power(10.0, S * 0.05)
# #amplitude raises of 1.5 as keithito (paper put 1.2)
# print("raise up..")
# S = S**(1.5) #(1.2)
# print("Generating random phases..")
# angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
# print("Restore original complex spectrum..")
# S_complex = np.abs(S).astype(np.complex)
# print("Start looping..")
# y = dsp.istft(S_complex * angles, hop_length=hop_length, win_length=win_length)
# for i in range(hparams["griffin_lim_iters"]):
#     ft = dsp.stft(y=y, n_fft=hparams["n_fft"], hop_length=hop_length, win_length=win_length)
#     angles = np.exp(1j * np.angle(ft))
#     y = dsp.istft(S_complex * angles, hop_length=hop_length, win_length=win_length)

# # Original
# IPython.display.Audio(data=y, rate=sr)




{'min_level_db': -100, 'ref_level_db': 20, 'normalize': False, 'use_ref_db': False, 'win_length': 0.05, 'hop_length': 0.0125, 'n_fft': 2048, 'r': 3, 'n_mels': 80, 'griffin_lim_iters': 5, 'pre_emphasis': 0.97}
(1025, 186)
(80, 186)
