# Audio Signal Processing to Obtain an Audio Spectrogram Using the VQT

In [30]:
# Importing General Packages
import numpy as np
import scipy

## Importing Visualization Packages
import seaborn
import matplotlib.pyplot as plt
import IPython.display as ipd

## Importing Audio Processing Packages
import librosa, librosa.display 

In [31]:
# Variable-Q Transform (VQT) parameters used for the audio signal processing below.

# --- Sampling / framing ---
fs = 22050            # Sampling frequency
hop_length = 512      # Number of samples between successive VQT columns

# --- Frequency axis ---
fmin = None           # Minimum frequency; None defaults to C1 ~= 32.70 Hz
n_bins = 84           # Number of frequency bins
bins_per_octave = 12  # Number of frequency bins per octave
tuning = 0.0          # Tuning offset in fractions of a bin (None => estimated automatically from the signal)

# --- Filter design ---
gamma = 20            # Bandwidth offset for determining filter lengths (gamma = 0 => CQT computation)
filter_scale = 1      # Filter scale factor; small values (< 1) use shorter windows for improved time resolution
norm = 1              # Type of norm to use for basis function normalization
sparsity = 0.01       # Sparsify the VQT basis by discarding up to this fraction of the energy
window = 'hann'       # Hann window
scale = True          # Scale the VQT response by the square root of the length of each channel's filter

# --- Padding / resampling / output ---
pad_mode = 'reflect'  # Padding mode for centered frame analysis
res_type = None       # Resampling mode for recursive downsampling
dtype = None          # Output array dtype; by default inferred to match the numerical precision of the input signal

In [32]:
# Load the audio clip

# Matplotlib: default figure size for the plots in this notebook
plt.rc("figure", figsize=(16, 8))

# Folder that holds the audio material
path = './maestro-v1.0.0/2004/'

# Alternative input from the same folder (kept for reference):
# filename = '%sMIDI-Unprocessed_SMF_02_R1_2004_01-05_ORIG_MID--AUDIO_02_R1_2004_05_Track05_wav.midi'%path
filename = '%sGuns N Roses-Sweet Child O Mine Intro.wav' % path

# Read the first 12 seconds as mono. sr=None keeps the file's own sample rate,
# so `fs` is overwritten here with the rate reported by librosa.
x, fs = librosa.load(filename, sr=None, mono=True, duration=12)

# Summary of the loaded signal
n_samples = x.shape[0]
print("x Shape=", x.shape)
print("Sample rate fs=", fs)
print("Audio Length in seconds=%d [s]" % (n_samples / fs))

# Audio player widget (last expression, so the notebook renders it)
ipd.Audio(x, rate=fs)

x Shape= (529200,)
Sample rate fs= 44100
Audio Length in seconds=12 [s]


In [37]:
# Define a helper that computes the VQT of a signal and plots its spectrogram
def calc_vqt(x, fs=fs, hop_length=hop_length, fmin=fmin, n_bins=n_bins, gamma=20,
             bins_per_octave=bins_per_octave, tuning=tuning, filter_scale=filter_scale,
             norm=norm, sparsity=0.01, window='hann', scale=scale, pad_mode=pad_mode,
             res_type=res_type, dtype=dtype, ax=None):
    """Compute the magnitude Variable-Q Transform of ``x`` and plot it in dB.

    The signal is passed to ``librosa.vqt``; the absolute value of the result
    is converted with ``librosa.amplitude_to_db`` (referenced to its maximum)
    and drawn with ``librosa.display.specshow`` together with a colorbar.

    Parameters
    ----------
    x : audio signal handed to ``librosa.vqt``
        (presumably a 1-D mono float array as returned by ``librosa.load`` -- confirm).
    fs : sampling frequency of ``x``.
    hop_length : number of samples between successive VQT columns.
    fmin : minimum frequency; ``None`` lets librosa pick its default.
    n_bins : number of frequency bins.
    gamma : bandwidth offset for determining filter lengths (0 => CQT).
    bins_per_octave : number of frequency bins per octave.
    tuning : tuning offset in fractions of a bin.
    filter_scale : filter scale factor.
    norm : type of norm used for basis function normalization.
    sparsity : fraction of basis energy that may be discarded.
    window : window specification for the basis filters.
    scale : whether to scale the response by the square root of each filter length.
    pad_mode : padding mode for centered frame analysis.
    res_type : resampling mode for recursive downsampling.
    dtype : dtype of the VQT output array.
    ax : optional Matplotlib axes to draw on. When omitted, the current axes
        (``plt.gca()``) are used, which creates a figure if none is open.

    Returns
    -------
    None
        The function only draws the spectrogram.

    Notes
    -----
    Default values that refer to notebook variables (``fs``, ``hop_length``, ...)
    are captured once, when this ``def`` is executed.
    """
    # Forward the caller's gamma / sparsity / window instead of fixed literals,
    # so overriding these arguments actually changes the transform.
    vqt = np.abs(librosa.vqt(
        x,
        sr=fs,
        hop_length=hop_length,
        fmin=fmin,
        n_bins=n_bins,
        gamma=gamma,
        bins_per_octave=bins_per_octave,
        tuning=tuning,
        filter_scale=filter_scale,
        norm=norm,
        sparsity=sparsity,
        window=window,
        scale=scale,
        pad_mode=pad_mode,
        res_type=res_type,
        dtype=dtype,
    ))

    # Resolve the drawing target locally rather than relying on notebook-level
    # `fig` / `ax` variables that may not exist on a fresh kernel.
    if ax is None:
        ax = plt.gca()
    fig = ax.figure

    # Give specshow the same framing/frequency layout as the transform so the
    # time and note axes stay correct for non-default settings.
    img = librosa.display.specshow(
        librosa.amplitude_to_db(vqt, ref=np.max),
        sr=fs,
        hop_length=hop_length,
        fmin=fmin,
        bins_per_octave=bins_per_octave,
        x_axis='time',
        y_axis='cqt_note',
        ax=ax,
    )
    ax.set_title('Variable-Q power spectrum')
    fig.colorbar(img, ax=ax, format="%+2.0f dB")