### Demucs Replication with Limited Resources:

Original paper: https://arxiv.org/pdf/1911.13254.pdf?ref=https://githubhelp.com

Demucs: https://github.com/facebookresearch/demucs/blob/main/demucs/demucs.py


## Dataset used clipped MusDB18:

Please add the train dataset to your drive: https://drive.google.com/drive/folders/1odZkYJDZyvvbE8Y31XEnGPKO2nirUMIF?usp=sharing

Please add the test dataset to your drive: https://drive.google.com/drive/folders/1xyc0QI5tqY5Zkb9Yrky8mPuDlyFjQ12U?usp=sharing

In [143]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### PyDub is needed to play WAV in notebook

In [None]:
!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [None]:
import os 
import math
import json
import numpy as np
import tensorflow as tf
from pydub import AudioSegment 
from keras import backend as K
import scipy.io.wavfile as wavf
from tensorflow.keras.layers import Conv1D, Conv1DTranspose, Dense, LSTM, BatchNormalization, UpSampling1D, AveragePooling1D


In [None]:
train_dataset_path = "/content/drive/MyDrive/dataset_train"
test_dataset_path = "/content/drive/MyDrive/dataset_test"

dataset_test = tf.data.experimental.load(test_dataset_path)
dataset_train = tf.data.experimental.load(train_dataset_path)

To understand the shapes of the input and target. The input is a stereo 10-second waveform sampled at 441kHz. 10-seconds. Therefore, the first dimension of X is 441000: $10(sec)x44100(samplerate)) = 441000$. The second dimension is 2, because it is a stereo audio.

The target consists of four 10-second stereo waveform, which explains the first dimension of Y. The target consists of drum, others, vocals and bass in this order.

In [None]:
X_shape, y_shape = (),()

for elem in dataset_test:
  X_shape = elem[0].shape
  y_shape = elem[1].shape
  break

X_shape, y_shape

(TensorShape([441000, 2]), TensorShape([4, 441000, 2]))

#### We needed to use a generator to load the samples, otherwise we exhauste the memory

In [None]:
def genertator_train():
  for elem in dataset_train:
    yield elem[0], elem[1]

def genertator_test():
  for elem in dataset_test:
    yield elem[0], elem[1]

In [None]:
train_tfds = (tf.data.Dataset.from_generator(genertator_train, 
                                             output_signature=(tf.TensorSpec(shape=X_shape, dtype=tf.float32),
                                                              tf.TensorSpec(shape=y_shape, dtype=tf.float32))))
test_tfds = (tf.data.Dataset.from_generator(genertator_test, 
                                             output_signature=(tf.TensorSpec(shape=X_shape, dtype=tf.float32),
                                                              tf.TensorSpec(shape=y_shape, dtype=tf.float32))))

#### We can not batch more than one sample because it will exhaust the memory when the trainign starts

In [None]:
def preprocessing(ds):
    ds = ds.cache()
    ds = ds.shuffle(20)
    ds = ds.batch(1)
    ds = ds.prefetch(128)
    return ds

In [None]:
train_ds = train_tfds.apply(preprocessing)
test_ds = test_tfds.apply(preprocessing)

### Play the input:

In [None]:
for elem in train_ds:
  mixture = elem[0]
  sources = elem[1]
  print("x: ",elem[0].shape, "y: ", elem[1].shape)
  break

x:  (1, 441000, 2) y:  (1, 4, 441000, 2)


In [None]:
def play(input, filename=None):
  filename_wav = "test.wav"
  if filename:
    filename_wav = filename

  # we need to save it, then play it. Couldn't find a nicer way.
  wavf.write("test.wav", rate=44100, data = input.numpy())
  wav_file = AudioSegment.from_wav(file = "test.wav")
  return wav_file

In [None]:
drums = sources[0][0]
vocals = sources[0][1]
other = sources[0][2]
bass = sources[0][3]
mixture = mixture[0]

In [None]:
play(bass)

## Gated Linear Unit (GLU):
GLU does not exist in TensorFlow. We followed [this implementation](https://medium.com/deeplearningmadeeasy/glu-gated-linear-unit-21e71cd5208).

In [None]:
class GLU(tf.keras.layers.Layer):
    def __init__(self, input_size):
        super().__init__()
        self.linear1 = tf.keras.layers.Dense(input_size)
        self.linear2 = tf.keras.layers.Dense(input_size, activation = "sigmoid")

    def call(self, x):
        return self.linear1(x) * self.linear2(x)

### The Encoder:

In [None]:
class DemucsEncoder(tf.keras.layers.Layer):
  def __init__(self, output_channels=64, num_of_conv_blocks=5):
    """
    Creates the Demucs Encoder, which is composed of 'num_of_conv_blocks' 
    convolution blocks. Each block starts with a convolution layer,
    followed by a convolutional layer of kernel width 1 to increase the depth.
    Finally, we have GLU.


    Args:
      output_channels (int): number of input channels to the enoder
      num_ov_conv_blocks (int): number of the convolution blocks in the decoder
    """
    super().__init__()

    self.blocks = []
    for i in range(num_of_conv_blocks):
      conv_layer1 = Conv1D(output_channels, kernel_size=8, strides=4,
                      activation="relu", padding="same")
      batch_noramlization1 = BatchNormalization()
      conv_layer2 = Conv1D(output_channels, kernel_size=1)
      batch_noramlization2 = BatchNormalization()
      glu = GLU(output_channels)

      output_channels *= 2

      self.blocks.append([conv_layer1, batch_noramlization1, conv_layer2, 
                          batch_noramlization2, glu])

  def call(self, input, skip_connection = False):
      x = input
      saved = []
      for block in self.blocks:
        for layer in block:
          x = layer(x, training=True)
        if skip_connection:
          saved.append(x)     
      return x, saved

In [None]:
noise = tf.random.normal([1, 441000, 2])
encoder = DemucsEncoder()
encoded, _ = encoder(noise)
encoded.shape

TensorShape([1, 431, 1024])

### Center the trimming:
Since in the Demucs architecture there is skip connections between the enoder and the decoder, we need to add the output of the encoder to the input of the encoder, but they are of different lengths. Therefore, we need to trim the output of the encoder. 
P.S: we wanted to recover the original sample rate by using padding = "same" in the decoder, but we get a value error because of the skip connections. Thus, we decided not to add padding in the decoder.

In [None]:
def center_trim(tensor, reference):
    """
    This function was included in the Pytorch implementation for trimming tensors:

    It trims the tensor to be of the same size as reference for u-connections

    Center trim `tensor` with respect to `reference`, along the last dimension.
    `reference` can also be a number, representing the length to trim to.
    If the size difference != 0 mod 2, the extra sample is removed on the right side.
    """
    if hasattr(reference, "shape"):
        reference = reference.shape[-2]
    delta = tensor.shape[-2] - reference
    if delta < 0:
        raise ValueError("tensor must be larger than reference." f"Delta is {delta}.")
    if delta:
        tensor = tensor[:, delta // 2:-(delta - delta // 2), :]
    return tensor

# Decoder

In [None]:
class DemucsDecoder(tf.keras.layers.Layer): 
    def __init__(self, output_channels=512, sources=4, num_of_conv_blocks=5):
      """
      Creates the Demucs Decoder, which is composed of 'num_of_conv_blocks' 
      convolution blocks. Each block starts with a convolution layer,
      followed by a GLU, then a transposed convolution.
      After all blocks, there is a final linear layer without activation function.
      
      Args:
        output_channels (int): number of input channels to the decoder
        sources (int):  number of sources (in this case equals 4: drums, bass, 
                                            other, vocals)
        num_ov_conv_blocks (int): number of the convolution blocks in the decoder
      """

      super().__init__()
      self.blocks = []
      self.sources = sources
      
      for i in range(num_of_conv_blocks):
        conv = Conv1D(output_channels, kernel_size=3, strides=1)
        glu = GLU(output_channels)
        batch_normalization1 = BatchNormalization()
        transpose =  Conv1DTranspose(output_channels, kernel_size=8, 
                      strides=4, activation="relu")
        batch_normalization2 = BatchNormalization()
        self.blocks.append([conv, glu, batch_normalization1, transpose,
                            batch_normalization2])
        
        output_channels /= 2
      
      self.linear_layer = Dense(units=sources*2)

    def call(self, input, saved = None):
        x = input 
        for block in self.blocks:
          if saved:
            skip = center_trim(saved.pop(-1), x)
            x += skip
          for layer in block:
            x = layer(x,training=True)
        return self.linear_layer(x)

In [None]:
decoder = DemucsDecoder()
decoded = decoder(encoded)
decoded.shape

TensorShape([1, 1759916, 8])

##Bidirectional LSTM


We then use a bidirectional LSTM with 2 layers and a
hidden size $C_{L}$. The LSTM outputs $2C_L$ channels per time position. We use a linear layer to take that
number down to $C_L$. To reimplement it in TensorFlow, we used this [implementation](https://stackoverflow.com/questions/66626700/difference-between-tensorflows-tf-keras-layers-dense-and-pytorchs-torch-nn-lin)

In [None]:
import torch

In [None]:
def unfold(a, kernel_size, stride):
    """
    This function was included in the Pytorch implementation for implementing
    max step in BiLSTM:
    Given input of size [*OT, T], output Tensor of size [*OT, F, K]
    with K the kernel size, by extracting frames with the given stride.
    This will pad the input so that `F = ceil(T / K)`.
    see https://github.com/pytorch/pytorch/issues/60466
    """
    
    *shape, length = a.shape
    n_frames = math.ceil(length / stride)
    tgt_length = (n_frames - 1) * stride + kernel_size
    paddings = tf.constant([[0, tgt_length - length], [0, tgt_length - length]
                            ,[0, tgt_length - length]])
    a = tf.pad(a, paddings)
    a_torch = torch.from_numpy(a.numpy())
    strides = list(a_torch.stride())
    assert strides[-1] == 1, 'data should be contiguous'
    strides = strides[:-1] + [stride, 1]
    tmp = a_torch.as_strided([*shape, n_frames, kernel_size], strides)
    tmp_np = tmp.detach().numpy()
    a = tf.convert_to_tensor(tmp_np, dtype=tf.float32)
    return a

In [None]:
class BLSTM(tf.keras.layers.Layer):
  """
  Bidirectional LSTM layer, it contains also the using a max step to split the 
  input in overlapping chunks and the LSTM applied separately on each chunk.
  The implementation here is taken from the original Demucs implementation.
  We could not use it because of the padding error we kept running into it. 
  Thus, we kept uncommented for future work.

  Args:
      dim : Dimensionality
      layers: number of LSTM layers
  """

  def __init__(self, dim=1024, layers=2, max_steps=0, skip=True):
    super().__init__()
    self.skip = skip

    self.max_steps = max_steps
    self.layers_list = []
    for _ in range(layers):
      self.layers_list.append(tf.keras.layers.Bidirectional(
          tf.keras.layers.LSTM(dim,return_sequences=True), merge_mode='concat'))
      
    self.layers_list.append(tf.keras.layers.Dense(dim))

  def call(self, x):
      batch_size, samples, dim = x.shape
      skipped = x

      # if self.max_steps is not None and dim > self.max_steps:
      #   width = self.max_steps
      #   stride = width // 2
      #   frames = unfold(x, width, stride)
      #   nframes = frames.shape[2]
      #   framed = True
      #   x = tf.transpose(frames, perm=[0, 2, 1, 3])
      #   x = tf.reshape(x, [-1, samples, width])

      # x = tf.transpose(x, perm=[2,0,1])
      #
      for layer in self.layers_list:
        x = layer(x)
      #
      # x = tf.transpose(x, perm=[1,2,0])
      # #
      # if framed:
      #   out = []
      #   frames = tf.reshape(x, [batch_size, -1, samples, width])
      #   limit = stride // 2
      #   for k in range(nframes):
      #     if k == 0:
      #         out.append(frames[:, k, :, :-limit])
      #     elif k == nframes - 1:
      #         out.append(frames[:, k, :, limit:])
      #     else:
      #         out.append(frames[:, k, :, limit:-limit])

      #   out = tf.concat(out, -1)
      #   out = out[..., :dim]
      #   x = out
      
      if self.skip:
          x += skipped  
      return x

### Using Kaiming initialisation scheme:

In [None]:
def rescale_convolutional_layers(model, scale_reference=0.01):
  """
  It is taken from the original implementation.
  The authors mention that it helps but they are not sure why.
  """
  for block in model.blocks:
    for layer in block:
      if isinstance(layer, Conv1D) or isinstance(layer, Conv1DTranspose):
        weights, bias = layer.get_weights()[0], layer.get_weights()[1]
        weights_std, bias_std = np.std(weights), np.std(bias)

        scale_weights = (weights_std / scale_reference)**0.5
        scale_bias = (bias_std / scale_reference)**0.5

        updated_weights = weights / scale_weights
        updated_bias = bias / scale_bias
        layer.set_weights([updated_weights,updated_bias])

In [None]:
def valid_length(length, resample):
    """
    Also taken from the origianl implemenation, in order to recover the input
    sample rate, which is reduced due to the convolutional layer with no padding:

    Return the nearest valid length to use with the model so that
    there is no time steps left over in a convolution, e.g. for all
    layers, size of the input - kernel_size % stride = 0.
    Note that input are automatically padded if necessary to ensure that the 
    output has the same length as the input.
    """
    if resample:
        length *= 2

    for _ in range(5):
        length = math.ceil((length - 8) / 4) + 1
        length = max(1, length)

    for _ in range(5):
        length = (length - 1) * 4 + 8

    if resample:
        length = math.ceil(length / 2)
    return int(length)


In [None]:
class Demucs(tf.keras.Model):
  """
  The final model, we added all the tricks we tried. We didn't use all of them
  after doing experiments.
  """
  def __init__(self,
                 optimizer = None,
                 rescale = False,
                 resample = False,
                 demucsEncoder = DemucsEncoder(),
                 demucsDecoder = DemucsDecoder(),
                 blstm = BLSTM(dim = 1024, layers = 2)):
    super(Demucs, self).__init__()
    self.audio_channels = 2
    self.sources = 4

    self.rescale = rescale
    self.resample = resample

    # for accumulating the gradients
    self.gradients = []
    self.train_vars = []
    self.accum_vars = []
    self.accum_gradient = []

    # the components of the architecture
    self.encoder = demucsEncoder
    self.decoder = demucsDecoder
    self.lstm = blstm

    # for resampling
    self.upsample = UpSampling1D(2)
    self.downsample = AveragePooling1D()

    self.optimizer = optimizer
    self.loss_function = tf.keras.losses.MeanAbsoluteError()

  @tf.function 
  def call(self, input):
      x = input
      length = x.shape[-2]
      # expand the input to help recovering the sample rate
      delta = valid_length(length, self.resample) - length
      paddings = [[0, 0],
                  [delta // 2, delta - delta // 2],
                  [0, 0]]
      x = tf.pad(x, paddings)

      if self.resample:
        x = self.upsample(x)
        
      x, saved =  self.encoder(x, skip_connection=True)

      if self.lstm:
          x = self.lstm(x)

      x = self.decoder(x, saved = saved)

      if self.resample:
        x = self.downsample(x)
        
      x = tf.reshape(x, [x.shape[0], self.sources, 
                         x.shape[1], self.audio_channels])
      
      if self.rescale:
        rescale_convolutional_layers(self.encoder)
        rescale_convolutional_layers(self.decoder)
        self.rescale = False

      return x

  def train(self, input, target, accum_gradient, counter):
      # since we couldn't recover the sample rate of the input
      target = target[:,:,:439980,:]

      with tf.GradientTape() as tape:
          prediction = self(input)
          loss = self.loss_function(target, prediction)

      # get gradients of this tape
      gradients = tape.gradient(loss, self.trainable_variables)
      # accumulate the gradients
      accum_gradient = [(acum_grad+grad) for acum_grad, grad in 
                           zip(accum_gradient, gradients)]
      if counter == 128:
          # now, after executing all the tapes you needed, 
          # we apply the optimization step
          # (but first we take the average of the gradients)
          accum_gradient = [this_grad/counter for this_grad in accum_gradient]
          # apply optimization step
          self.optimizer.apply_gradients(zip(accum_gradient,self.trainable_variables))
          counter = 0        
      return loss, accum_gradient, counter

  def test(self, test_data):
        test_loss_agg = []
        for input, target in test_data:
            prediction = self(input)
            target = target[:,:,:439980,:]
            loss = self.loss_function(target, prediction)
            test_loss_agg.append(loss.numpy())
        test_loss = tf.reduce_mean(test_loss_agg)
        return test_loss

In [None]:
for elem in train_ds.take(1):
  input = elem[0]
  target = elem[1]

In [None]:
model = Demucs()
test = model(input)
model.summary()

Model: "demucs_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 demucs_encoder_1 (DemucsEnc  multiple                 9785600   
 oder)                                                           
                                                                 
 demucs_decoder (DemucsDecod  multiple                 5599112   
 er)                                                             
                                                                 
 blstm (BLSTM)               multiple                  44057600  
                                                                 
 up_sampling1d_8 (UpSampling  multiple                 0 (unused)
 1D)                                                             
                                                                 
 average_pooling1d_8 (Averag  multiple                 0 (unused)
 ePooling1D)                                              

In [None]:
test.shape

TensorShape([1, 4, 439980, 2])

## Save the model

In [None]:
# !rm ÷-r drive/MyDrive/saved_model/L

In [None]:
!mkdir -p drive/MyDrive/saved_model/L

In [None]:
path = "drive/MyDrive/saved_model/L/"

In [None]:
def save_model(model):
  """
  Saving model in the folder specific in the global variable
  """
  global path
  model.save_weights(path + 'model_weights',save_format='tf')

## Load the model

In [None]:
def load_model(optimizer):
  """
  Loading model out of the folder specific in the global variable
  """
  global path

  demucs = Demucs(optimizer=optimizer)
  demucs.load_weights(path+'model_weights')

  return demucs

## Visualization:
Here is the [workplace](https://wandb.ai/rfarah/demucs?workspace=user-rfarah) on W&B

In [None]:
! pip install wandb

Installing collected packages: smmap, gitdb, shortuuid, setproctitle, sentry-sdk, pathtools, GitPython, docker-pycreds, wandb
Successfully installed GitPython-3.1.27 docker-pycreds-0.4.0 gitdb-4.0.9 pathtools-0.1.2 sentry-sdk-1.5.8 setproctitle-1.2.2 shortuuid-1.0.8 smmap-5.0.0 wandb-0.12.14


In [None]:
# ! wandb login

In [None]:
import wandb
wandb.init(project="demucs", entity="rfarah")

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
# WandB – Config is a variable that holds and saves hyperparameters and inputs
config = wandb.config          # Initialize config
config.batch_size = 128        # Input batch size for training
config.learning_rate = 0.004    # Learning rate
config.num_steps = 1000      # Number of batches to train
config.num_input = 441000    #data input (audio shape: 10 * 44100 = 441000)

### Data Augmentation
Audiomentations Library is used for data augmentation

In [None]:
!pip install audiomentations

In [None]:
from audiomentations import Compose, Shift, Normalize, PitchShift

In [None]:
AUGMENTER = Compose([Shift(min_fraction=-0.5, max_fraction=0.5, p=0.5, rollover=True),
                     PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
                     Normalize()])

In [None]:
def augment_source(input, source_idx):
    input_first = input[:,source_idx,:,0]
    input_second = input[:,source_idx,:,1]

    input_first_augmented = AUGMENTER(input_first, sample_rate=441000)
    input_second_augmented = AUGMENTER(input_second, sample_rate=441000)

    augmented_input = tf.stack([input_first_augmented, 
                                input_second_augmented], axis=-1)

    return augmented_input

In [None]:
def augment(input):
  if input.shape[1] != 4:
    first_mono_audio = input[:,:,0]
    second_mono_audio = input[:,:,1]

    first_monoaudio_augmented = AUGMENTER(first_mono_audio, sample_rate=441000)
    second_monoaudio_augmented = AUGMENTER(second_mono_audio, sample_rate=441000)

    augemented_input = tf.stack([first_monoaudio_augmented, 
                                 second_monoaudio_augmented], axis=-1)
    return augemented_input
  else:
    augmented_drums = augment_source(input, 0)
    augmented_vocals = augment_source(input, 1)
    augmented_other = augment_source(input, 2)
    augmented_bass = augment_source(input, 3)

    augmented_input = tf.stack([augmented_drums, augmented_vocals,
                                augmented_other, augmented_bass], axis=1)
  return augmented_input

## The training loop

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.004)

In [None]:
# If we already have a trained model
model = load_model(optimizer)

In [None]:
model = Demucs(optimizer = optimizer)

In [None]:
num_epochs = 1000

train_losses = []
test_losses = []

test_loss = model.test(test_ds)
test_losses.append(test_loss)

accum_gradient = [tv.assign(tf.zeros_like(tv)) for tv in model.trainable_variables]

for epoch in range(num_epochs):
    counter = 1
    print(f'Epoch {epoch}. Mean Squared Error value: {test_losses[-1]}')
    epoch_loss_agg = [] 

    for mixture, sources in train_ds:
        if counter == 1:
          train_vars = model.trainable_variables
          accum_gradient = [tf.Variable(tf.zeros_like(tv.initialized_value()),
                                        trainable=False)  for tv in train_vars]

        if epoch % 10 == 0 and epoch != 0:
          mixture = augment(mixture)
          sources = augment(sources)

        loss, accum_gradient, counter = model.train(mixture, sources, 
                                                    accum_gradient, counter)
        epoch_loss_agg.append(loss)
        counter += 1

    if ((epoch % 50 == 0) or epoch == 1000) and epoch != 0:
      save_model(model)

    avg_epoch_loss_agg = tf.reduce_mean(epoch_loss_agg)

    train_losses.append(avg_epoch_loss_agg)
    print(f'Epoch {epoch}. Mean Squared Error value training: {train_losses[-1]}')

    test_loss = model.test(test_ds)
    test_losses.append(test_loss)
    
    wandb.log({'epoch_loss_avg': avg_epoch_loss_agg.numpy(),
               'test_loss': test_loss})
      
  

Epoch 0. Mean Squared Error value: 0.021214812994003296
Epoch 0. Mean Squared Error value training: 0.025803226977586746
Epoch 1. Mean Squared Error value: 0.021211406216025352
Epoch 1. Mean Squared Error value training: 0.025814171880483627
Epoch 2. Mean Squared Error value: 0.02119569666683674
Epoch 2. Mean Squared Error value training: 0.025807062163949013
Epoch 3. Mean Squared Error value: 0.021219812333583832
Epoch 3. Mean Squared Error value training: 0.025821272283792496
Epoch 4. Mean Squared Error value: 0.021196255460381508
Epoch 4. Mean Squared Error value training: 0.025805208832025528
Epoch 5. Mean Squared Error value: 0.02121369168162346
Epoch 5. Mean Squared Error value training: 0.025811029598116875
Epoch 6. Mean Squared Error value: 0.02119484730064869
Epoch 6. Mean Squared Error value training: 0.02581026963889599
Epoch 7. Mean Squared Error value: 0.02121184580028057
Epoch 7. Mean Squared Error value training: 0.025811228901147842
Epoch 8. Mean Squared Error value: 0.



Epoch 10. Mean Squared Error value training: 0.05859849974513054
Epoch 11. Mean Squared Error value: 0.021218638867139816
Epoch 11. Mean Squared Error value training: 0.025817690417170525
Epoch 12. Mean Squared Error value: 0.02120811678469181
Epoch 12. Mean Squared Error value training: 0.025811538100242615
Epoch 13. Mean Squared Error value: 0.02121213637292385
Epoch 13. Mean Squared Error value training: 0.02581270970404148
Epoch 14. Mean Squared Error value: 0.021195124834775925
Epoch 14. Mean Squared Error value training: 0.0257999449968338
Epoch 15. Mean Squared Error value: 0.021196646615862846
Epoch 15. Mean Squared Error value training: 0.02580420859158039
Epoch 16. Mean Squared Error value: 0.021198371425271034
Epoch 16. Mean Squared Error value training: 0.025803539901971817
Epoch 17. Mean Squared Error value: 0.021192504093050957
Epoch 17. Mean Squared Error value training: 0.025802571326494217
Epoch 18. Mean Squared Error value: 0.021204641088843346
Epoch 18. Mean Squared 

In [None]:
output = model(input)

In [None]:
drums = output[0][0]
vocals = output[0][1]
other = output[0][2]
bass = output[0][3]

In [None]:
play(drums)

### Get the model ouput for the API 

In [103]:
def picke_a_song(song_name):
  sample_path = "/content/drive/MyDrive/api_songs/"+song_name+".wav"
  raw_audio = tf.io.read_file(sample_path)
  decoded_sample, _ = tf.audio.decode_wav(contents=raw_audio, 
                                          desired_samples=441000, 
                                          desired_channels=2)
  decoded_sample = tf.reshape(decoded_sample, [1, 441000,2])
  return decoded_sample

In [157]:
song_names = ["run_run_run", "fire", "shore", "toynbee_suite" , "talk_about_it"]
songs = [picke_a_song(song_name) for song_name in song_names]

In [168]:
def save_results(data, song_name):
  path = "/content/drive/MyDrive/output_api_songs/"+song_name+'/'
  wavf.write(path+"drums.wav", rate=44100, data = data[0].numpy())
  wavf.write(path+"vocals.wav", rate=44100, data = data[1].numpy())
  wavf.write(path+"other.wav", rate=44100, data = data[2].numpy())
  wavf.write(path+"bass.wav", rate=44100, data = data[3].numpy())


In [158]:
results = [model(song)[0] for song in songs]

In [170]:
for song_sources, song_name in zip(results, song_names):
  save_results(song_sources, song_name)