In [1]:
import torch
import torch.nn as nn
import numpy as np
import scipy
import librosa
import youtube_dl
import os
import soundfile as sf
from IPython.display import Audio, display

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(device)

cpu


### Changing directory to use the model package

In [2]:
%cd ../models/Conditioned-Source-Separation-LaSAFT

/Users/arron/code/PlexedLive/mixs/models/Conditioned-Source-Separation-LaSAFT


In [3]:
!pwd

/Users/arron/code/PlexedLive/mixs/models/Conditioned-Source-Separation-LaSAFT


## Initialize model and load it's pretrained 'checkpoint'

In [4]:
from lasaft.source_separation.conditioned.cunet.models.dcun_tfc_gpocm_lasaft import DCUN_TFC_GPoCM_LaSAFT_Framework

args = {}

# FFT params
args['n_fft'] = 2048
args['hop_length'] = 1024
args['num_frame'] = 128

# SVS Framework
args['spec_type'] = 'complex'
args['spec_est_mode'] = 'mapping'

# Other Hyperparams
args['optimizer'] = 'adam'
args['lr'] = 0.001
args['dev_mode'] = False
args['train_loss'] = 'spec_mse'
args['val_loss'] = 'raw_l1'

# DenseNet Hyperparams

args ['n_blocks'] = 7
args ['input_channels'] = 4
args ['internal_channels'] = 24
args ['first_conv_activation'] = 'relu'
args ['last_activation'] = 'identity'
args ['t_down_layers'] = None
args ['f_down_layers'] = None
args ['tif_init_mode'] = None

# TFC_TDF Block's Hyperparams
args['n_internal_layers'] =5
args['kernel_size_t'] = 3
args['kernel_size_f'] = 3
args['tfc_tdf_activation'] = 'relu'
args['bn_factor'] = 16
args['min_bn_units'] = 16
args['tfc_tdf_bias'] = True
args['num_tdfs'] = 6
args['dk'] = 32

args['control_vector_type'] = 'embedding'
args['control_input_dim'] = 4
args['embedding_dim'] = 32
args['condition_to'] = 'decoder'

args['control_n_layer'] = 4
args['control_type'] = 'dense'
args['pocm_type'] = 'matmul'
args['pocm_norm'] = 'batch_norm'


model = DCUN_TFC_GPoCM_LaSAFT_Framework(**args)

In [5]:
model = model.load_from_checkpoint('pretrained/gpocm_lasaft.ckpt')

## Load track with librosa and display a player

In [None]:
track_dir = '../../raw_data/musdb18/test/'
track_name = 'Tom McKenzie - Directions.stem.mp4'

audio, rate = librosa.load(f'{track_dir}{track_name}', mono=False, sr=44100)

audio = audio[:, 40*rate:50*rate]

display(Audio(audio, rate=rate))


In [None]:
audio

# Seperation

In [None]:
separated = model.separate_track(audio.T, 'vocals') 
vocals, sr=librosa.load('temp.wav', mono=False)
display(Audio('temp.wav')) 

In [None]:
%%time
def separate_all (audio):
    '''
    you can add or remove what ever you want in this loop
    using the keys, vocals, drums, bass and other
    you can also combine them 
    '''
    
    print('vocals')
    separated = model.separate_track(audio.T, 'vocals') 
    vocals, sr=librosa.load('temp.wav', mono=False)
    display(Audio('temp.wav')) 

    print('drums')
    separated = model.separate_track(audio.T, 'drums') 
    drums, sr=librosa.load('temp.wav', mono=False)
    display(Audio('temp.wav')) 

    print('bass')
    separated = model.separate_track(audio.T, 'bass') 
    bass, sr=librosa.load('temp.wav', mono=False)
    display(Audio('temp.wav')) 

    print('other')
    separated = model.separate_track(audio.T, 'other') 
    other, sr=librosa.load('temp.wav', mono=False)
    display(Audio('temp.wav')) 
    
    print('vocal-backing')
    librosa.output.write_wav('temp.wav', drums+bass+other, sr)
    display(Audio('temp.wav')) 

    print('v+d+b+o')
    librosa.output.write_wav('temp.wav', vocals+drums+bass+other, sr)
    display(Audio('temp.wav')) 

separate_all(audio)

# Trying with songs outside the dataset

In [6]:
from IPython.display import HTML
url = "dQw4w9WgXcQ" #@param {type:"string"}
start = 42 #@param {type:"number"}
stop = 52 #@param {type:"number"}
embed_url = "https://www.youtube.com/embed/%s?rel=0&start=%d&end=%d&amp;controls=0&amp;showinfo=0" % (url, start, stop)
HTML('<iframe width="560" height="315" src=' + embed_url + 'frameborder="0" allowfullscreen></iframe>')



In [7]:
def my_hook(d):
    if d['status'] == 'finished':
        print('Done downloading...')


ydl_opts = {
    'format': 'bestaudio/best',
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'wav',
        'preferredquality': '44100',
    }],
    'outtmpl': '%(title)s.wav',
    'progress_hooks': [my_hook],
}
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    info = ydl.extract_info(url, download=False)
    status = ydl.download([url])

audio, rate = librosa.load(info.get('title', None) + '.wav', sr=44100, mono=False)
audio = audio[:, start*rate:stop*rate]
print(audio.shape)
display(Audio(audio, rate=rate))

[youtube] dQw4w9WgXcQ: Downloading webpage
[youtube] dQw4w9WgXcQ: Downloading js player 408be03a
[youtube] dQw4w9WgXcQ: Downloading js player 408be03a
[youtube] dQw4w9WgXcQ: Downloading webpage
[download] Destination: Rick Astley - Never Gonna Give You Up (Video).wav
[download] 100% of 3.28MiB in 00:0283MiB/s ETA 00:002
Done downloading...
[ffmpeg] Post-process file Rick Astley - Never Gonna Give You Up (Video).wav exists, skipping
(2, 441000)


In [None]:
def separate_all (audio):
    print('vocals')
    separated = model.separate_track(audio.T, 'vocals') 
    vocals, sr=librosa.load('temp.wav', mono=False)
    display(Audio('temp.wav')) 

    print('drums')
    separated = model.separate_track(audio.T, 'drums') 
    drums, sr=librosa.load('temp.wav', mono=False)
    display(Audio('temp.wav'))

    print('bass')
    separated = model.separate_track(audio.T, 'bass') 
    bass, sr=librosa.load('temp.wav', mono=False)
    display(Audio('temp.wav')) 

    print('other')
    separated = model.separate_track(audio.T, 'other') 
    other, sr=librosa.load('temp.wav', mono=False)
    display(Audio('temp.wav')) 
    
    print('vocal-backing')
    librosa.output.write_wav('temp.wav', drums+bass+other, sr)
    display(Audio('temp.wav')) 


    print('v+d+b+o')
    librosa.output.write_wav('temp.wav', vocals+drums+bass+other, sr)
    display(Audio('temp.wav')) 

separate_all(audio)

# Using museval to test model

In [None]:
%%time
track_dir = '../../raw_data/musdb18/test/'
track_name = 'Tom McKenzie - Directions.stem.mp4'

audio, rate = librosa.load(f'{track_dir}{track_name}', mono=False)

# display(Audio(audio, rate=rate))

# predictions = []

def separate_all (audio):
    print('vocals')
    separated = model.separate_track(audio.T, 'vocals') 
    vocals, sr=librosa.load('temp.wav', mono=False)
    display(Audio('temp.wav')) 

    print('drums')
    separated = model.separate_track(audio.T, 'drums') 
    drums, sr=librosa.load('temp.wav', mono=False)
    display(Audio('temp.wav')) 

    print('bass')
    separated = model.separate_track(audio.T, 'bass') 
    bass, sr=librosa.load('temp.wav', mono=False)
    display(Audio('temp.wav')) 

    print('other')
    separated = model.separate_track(audio.T, 'other') 
    other, sr=librosa.load('temp.wav', mono=False)
    display(Audio('temp.wav')) 
    
    print('vocal-backing')
    librosa.output.write_wav('temp.wav', drums+bass+other, sr)
    display(Audio('temp.wav')) 

    print('v+d+b+o')
    librosa.output.write_wav('temp.wav', vocals+drums+bass+other, sr)
    display(Audio('temp.wav')) 
    
#     predictions.append([vocals, drums, bass, other])
    
separate_all(audio)

In [None]:
np.array(predictions).shape

In [None]:
Audio(predictions[0][2], rate=rate)