In [1]:
# importing packages
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import IPython.display as ipd
from scipy.ndimage.filters import maximum_filter, minimum_filter, uniform_filter
import math


In [2]:
# Input audio files: the full mixture and its isolated vocal stem.
vocal_filename = "vocals5.wav"
signal_filename = "mixture5.wav"

In [3]:
# Load the mixture and the vocal stem, both requested at sr=44100.
original_signal, sr = librosa.load(signal_filename, sr=44100)
original_vocal_signal, sr = librosa.load(vocal_filename, sr=44100)

# The reference accompaniment is what remains of the mixture once the
# vocal stem is subtracted sample by sample.
original_music_signal = np.subtract(original_signal, original_vocal_signal)

In [4]:
def plot_waveform(x,sr,display_title):
  """Plot the time-domain waveform of an audio signal on a new figure.

  Args:
    x: audio samples to draw.
    sr: sampling rate of ``x``, forwarded to the librosa plotting routine.
    display_title: title shown above the plot.
  """
  plt.figure(figsize=(14, 5))
  plt.title(display_title)
  # librosa.display.waveplot was removed in librosa 0.10; waveshow is its
  # replacement. Fall back to waveplot only on older librosa releases.
  draw_waveform = getattr(librosa.display, "waveshow", None)
  if draw_waveform is None:
    draw_waveform = librosa.display.waveplot
  draw_waveform(x, sr=sr)
    
def convert_audio_to_spectogram(x,sr,display_title):
  """Compute the STFT of ``x``, plot its magnitude in dB, and return the STFT.

  Args:
    x: audio samples.
    sr: sampling rate, used to scale the plot axes.
    display_title: title shown above the spectrogram.

  Returns:
    The STFT matrix from ``librosa.stft`` (window length 2048, hop 512);
    the plotted dB magnitude is not returned.
  """
  stft_matrix = librosa.stft(x, win_length=2048, hop_length=512)
  magnitude_db = librosa.amplitude_to_db(np.abs(stft_matrix))

  plt.figure(figsize=(14, 5))
  plt.title(display_title)
  librosa.display.specshow(magnitude_db, sr=sr, x_axis='time', y_axis='log')
  plt.colorbar()
  return stft_matrix

def convert_spectrogram_2DFT(spectrogram,sr,display_title):
  """Take the 2D Fourier transform of a magnitude spectrogram and plot it.

  Args:
    spectrogram: spectrogram matrix; only its magnitude is transformed.
    sr: sampling rate forwarded to ``specshow`` for axis scaling.
    display_title: title shown above the plot.

  Returns:
    The result of ``np.fft.fft2`` on the magnitude spectrogram.
  """
  magnitude = np.abs(spectrogram)
  twoDFT_spectrogram = np.fft.fft2(magnitude)
  twoDFT_db = librosa.amplitude_to_db(np.abs(twoDFT_spectrogram))

  plt.figure(figsize=(14, 5))
  librosa.display.specshow(twoDFT_db, sr=sr, x_axis='linear', y_axis='log')
  # Labels are set after specshow so they replace the ones it puts on the axes.
  plt.title(display_title)
  plt.xlabel("Rate")
  plt.ylabel("scale")
  plt.colorbar()
  return twoDFT_spectrogram

In [8]:
# STFT of the mixture; the call also plots its dB magnitude on a log-frequency axis.
original_signal_spectrogram = convert_audio_to_spectogram(
    original_signal,
    sr,
    display_title='Original Signal > Spectrogram',
)

(1025, 18897)


In [10]:
# 2D Fourier transform of the mixture's magnitude spectrogram (also plotted).
original_signal_spectrogram_2dFT = convert_spectrogram_2DFT(
    original_signal_spectrogram,
    sr,
    display_title='Original Signal > Spectrogram > 2dFT',
)

(1025, 18897)


In [11]:
   def filter_local_maxima(neighborhood_size, spectrogram_2dFT):
       """Build complementary peak / non-peak masks over a 2D Fourier transform.

       A bin is marked in the background mask when its magnitude equals the
       maximum of its neighborhood AND the neighborhood's max-min range is
       larger than the standard deviation of the whole magnitude array.

       Args:
           neighborhood_size: window size passed to scipy's maximum/minimum filters.
           spectrogram_2dFT: 2D Fourier transform; only its magnitude is used.

       Returns:
           (M_bg, M_fg): boolean peak mask and its integer complement ``1 - M_bg``.
       """
       magnitude = np.abs(spectrogram_2dFT)
       local_max = maximum_filter(magnitude, neighborhood_size)
       local_range = local_max - minimum_filter(magnitude, neighborhood_size)

       is_peak = magnitude == local_max
       is_prominent = local_range > np.std(magnitude)

       M_bg = np.logical_and(is_peak, is_prominent)
       M_fg = 1 - M_bg
       return M_bg, M_fg

In [12]:
# Split the 2D FT into peak (background) and non-peak (foreground) parts,
# using a 1 x 15 neighborhood, and bring each part back to the spectrogram domain.
M_bg, M_fg = filter_local_maxima((1,15),original_signal_spectrogram_2dFT)
i2dFT_Bg = np.fft.ifft2(original_signal_spectrogram_2dFT * M_bg)
i2dFT_Fg = np.fft.ifft2(original_signal_spectrogram_2dFT * M_fg)

# Binary mask: a bin belongs to the background where the background
# reconstruction has the larger magnitude.
mask_Bg = np.abs(i2dFT_Bg) > np.abs(i2dFT_Fg)

# Apply the mask and its complement to the mixture STFT once, then reuse the
# masked STFTs for both the magnitude spectrograms and the time-domain signals.
background_stft = original_signal_spectrogram * mask_Bg
vocal_stft = original_signal_spectrogram * (1-mask_Bg)

background_signal_spectrogram = np.abs(background_stft)
vocal_signal_spectrogram = np.abs(vocal_stft)
background_signal = librosa.core.istft(background_stft)
vocal_signal = librosa.core.istft(vocal_stft)

In [13]:
def signaltonoise(signal,noise):
    """Return the signal-to-noise ratio in decibels.

    Computed as ``10 * log10(sum(signal**2) / sum(noise**2))``.

    Args:
        signal: reference samples (array-like).
        noise: error/residual samples (array-like).

    Returns:
        The SNR in dB. Degenerate cases: ``inf`` when the noise has zero
        energy, ``-inf`` when the signal has zero energy (e.g. a silent
        reference), and ``nan`` when both are zero.
    """
    signal_energy = np.sum(np.square(signal))
    noise_energy = np.sum(np.square(noise))

    if noise_energy == 0:
        # Perfect reconstruction (or nothing to compare when both are silent).
        return math.nan if signal_energy == 0 else math.inf
    if signal_energy == 0:
        # math.log10(0) would raise "math domain error"; the limit is -inf.
        return -math.inf

    SNR = 10*math.log10(signal_energy/noise_energy)
    return SNR

In [14]:
# Trim the reference accompaniment to the length of the reconstructed
# background so the two can be compared sample by sample.
music_signal_back = original_music_signal[:len(background_signal)]
music_residual = background_signal - music_signal_back
SNR_music = signaltonoise(music_signal_back, music_residual)
print(SNR_music)

9.635219443483184


In [15]:
# Trim the reference vocal stem to the length of the reconstructed vocals
# so the two can be compared sample by sample.
vocal_signal_back = original_vocal_signal[:len(vocal_signal)]
vocal_residual = vocal_signal - vocal_signal_back
SNR_vocal = signaltonoise(vocal_signal_back, vocal_residual)
print(SNR_vocal)

1.3323208956435555


In [16]:
def cosine_similarity(x,y):
    """Return the cosine of the angle between ``x`` and ``y``.

    Computed as the sum of elementwise products divided by the square root
    of the product of the two sums of squares.
    """
    dot_product = np.multiply(x, y).sum()
    energy_x = np.square(x).sum()
    energy_y = np.square(y).sum()
    norm_product = math.sqrt(energy_x * energy_y)
    return dot_product / norm_product

In [18]:
# Cosine similarity between the reconstructed background and the
# length-trimmed reference accompaniment.
similarity_music = cosine_similarity(
    background_signal,
    music_signal_back,
)
print(similarity_music)

0.9440542560147327


In [551]:
# Cosine similarity between the reconstructed vocals and the
# length-trimmed reference vocal stem.
similarity_vocal = cosine_similarity(
    vocal_signal,
    vocal_signal_back,
)
print(similarity_vocal)

0.6544116149782884
