In [None]:
import multiprocessing
import os

import IPython.display
import librosa
import musdb
import museval
import numpy as np
import torch
import youtube_dl

from model.tasnet import MultiTasNet
# from evaluate import separate_sample

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Read the checkpoint onto the CPU first; the model itself is moved to `device` afterwards.
# NOTE(review): torch.load unpickles the file, so only open checkpoints from a trusted source.
state = torch.load("best_model.pt", map_location=torch.device("cpu"))

# Build the model from the arguments stored in the checkpoint, then place it on the target device.
network = MultiTasNet(state["args"])
network = network.to(device)

# Restore the pretrained weights (kept as the last expression so the result is displayed).
network.load_state_dict(state['state_dict'])

In [None]:
def separate_sample(track, rate=None):
    """Separate one recording into drums, bass, other and vocals using ``network``.

    Parameters
    ----------
    track
        Either a track-like object exposing ``.audio`` (indexed
        ``[samples, channels]``; it is transposed before use) and ``.rate``,
        or, when ``rate`` is given, a raw array indexed ``[channels, samples]``
        (a 1-D array is treated as a single channel).
    rate : int, optional
        Sampling rate of ``track`` when it is a raw array. Leave as ``None``
        to read both the audio and the rate from the track object.

    Returns
    -------
    dict
        ``'drums'``, ``'bass'``, ``'other'`` and ``'vocals'`` mapped to arrays
        of shape ``[T', 1]``, scaled so that their sum approximates the
        normalised input mix. The last entry of the model output is used,
        presumably the 32 kHz stage (callers play the result at 32000 Hz).

    Notes
    -----
    Only the first channel of the input is separated.
    """
    if rate is None:
        # Track-like input: audio arrives as [samples, channels].
        audio = track.audio.astype('float32').transpose(1, 0)
        rate = track.rate
    else:
        # Raw-array input: already [channels, samples] (or a bare 1-D signal).
        audio = np.asarray(track, dtype='float32')
        if audio.ndim == 1:
            audio = audio[np.newaxis, :]

    # One copy of the mix per model stage, resampled from the *actual* source rate.
    stage_rates = [8000, 16000, 32000]
    mix = [
        librosa.core.resample(audio, orig_sr=rate, target_sr=stage_rate, res_type='kaiser_best', fix=False)
        for stage_rate in stage_rates
    ]
    # Pad/trim so that each stage is exactly twice as long as the previous one.
    base_length = mix[0].shape[-1] + 1
    mix = [librosa.util.fix_length(m, size=base_length * (2 ** i)) for i, m in enumerate(mix)]
    # Tensors of shape [channels, 1, T'], each channel normalised by its standard deviation.
    mix = [torch.from_numpy(m).float().to(device).unsqueeze_(1) for m in mix]
    mix = [m / m.std(dim=-1, keepdim=True) for m in mix]

    # Keep only the first channel: shape [1, 1, T'] per stage.
    mix = [m[0:1, :, :] for m in mix]

    network.eval()
    with torch.no_grad():
        # Last element of the model output, expected shape [1, 4, 1, T'].
        separation = network.inference(mix, n_chunks=2)[-1]

    # Normalize the amplitudes with least squares: find one gain per stem so that
    # the scaled stems sum as closely as possible to the input mix.
    stems = separation[0, :, 0, :].cpu().numpy().T  # [T', 4]
    reference = mix[-1][0, 0, :].cpu().numpy()  # [T']
    gains = np.linalg.lstsq(stems, reference, rcond=None)[0]
    separation = stems * gains

    estimates = {
        'drums': separation[:, 0:1],
        'bass': separation[:, 1:2],
        'other': separation[:, 2:3],
        'vocals': separation[:, 3:4],
    }

    return estimates

In [None]:
# Only for one file

# Excerpt to listen to, in whole seconds from the start of the file (edit as needed).
start, stop = 0, 30

audio, rate = librosa.load('Traffic Experiment - Sirens-Copy1.stem.mp4', sr=None)

# Keep up to 4 s of context on each side of the excerpt, clamped to the file.
# Everything stays an integer number of seconds so it can be used in slice indices.
last_second = int(audio.shape[-1] // rate) - 1
start_pad = max(0, start - 4)
stop_pad = max(start_pad, min(last_second, stop + 4))

# Position of the excerpt inside the padded clip, both measured from the clip start
# (an end offset of 0 would otherwise produce an empty slice at the end of the file).
start_cut, stop_cut = start - start_pad, stop - start_pad

audio = audio[start_pad*rate:stop_pad*rate].copy()
audio = np.expand_dims(audio, 0)  # [1, T]

song = 'Traffic Experiment - Sirens'
print(f"{song}")
IPython.display.display(IPython.display.Audio(audio[:, start_cut*rate:stop_cut*rate].copy(), rate=rate))

In [None]:
print("separating... ", end='')
estimates = separate_sample(audio, rate)

# Trim every stem to the requested excerpt (mainly to reduce the latency of the players below).
first_sample, last_sample = start_cut*32000, stop_cut*32000
estimates = {name: stem[first_sample:last_sample, :] for name, stem in estimates.items()}
print("done")
print("downloading audio files to the client side...")

for instrument in ('vocals', 'drums', 'bass', 'other'):
    stem = estimates[instrument]
    # Hacky silence filter: skip stems whose largest sample stays below 0.25.
    if stem.max() < 0.25:
        continue

    print(f"\n{instrument}")
    IPython.display.display(IPython.display.Audio(stem.T.copy(), rate=32000))

In [None]:
# Open the sample dataset and run the separator on every track of its "train" subset.
mus_test = musdb.DB(root="data/Sample", subsets="train")

n_tracks = len(mus_test.tracks)
track_estimates_pairs = []
for n_done, track in enumerate(mus_test.tracks, start=1):
    estimates = separate_sample(track)
    track_estimates_pairs.append((track, estimates))

    # Progress as a whole-number percentage of tracks processed so far.
    print(f"{int(n_done / n_tracks * 100)} %")

In [None]:
# Show a compact summary (track name and the shape of each estimated stem)
# instead of echoing the track objects and their full waveform arrays.
[(track.name, {stem: est.shape for stem, est in estimates.items()}) for track, estimates in track_estimates_pairs]

In [None]:
def evaluate(track_estimates):
    """Score one ``(track, estimates)`` pair with museval and return the result.

    Parameters
    ----------
    track_estimates : tuple
        ``(track, estimates)`` as collected in ``track_estimates_pairs``.

    Returns
    -------
    Whatever ``museval.eval_mus_track`` returns for the track, so the pool
    below collects real scores instead of ``None``.
    """
    track, estimates = track_estimates
    # NOTE(review): output_dir="/" points at the filesystem root - confirm this is intended.
    # NOTE(review): the estimates produced by separate_sample are single-channel and played
    # at 32000 Hz elsewhere in this notebook; museval presumably expects the reference
    # track's rate and channel layout - confirm before trusting the scores.
    return museval.eval_mus_track(track, estimates, output_dir="/")


# The context manager guarantees the workers are released even if an evaluation raises.
with multiprocessing.Pool(4) as pool:
    scores_list = list(
        pool.imap_unordered(
            func=evaluate,
            iterable=track_estimates_pairs,
            chunksize=1
        )
    )
    pool.close()
    pool.join()

print("Everything is evaluated")