In [2]:
! pip install librosa numpy pandas demucs

Collecting demucs
  Downloading demucs-4.0.0.tar.gz (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting dora-search
  Downloading dora_search-0.1.11.tar.gz (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.0/87.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting diffq>=0.2.1
  Downloading diffq-0.2.3-cp310-cp310-macosx_10_9_x86_64.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.1/106.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting einops
  Downloading einops-0.6.0-py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.6/41.6 kB[0m [31m1.3 

In [1]:
import librosa
import numpy as np
import os
import pandas as pd
import io
from pathlib import Path
from shutil import rmtree
import subprocess as sp


# Source Separation using Demucs

In [None]:
model = "htdemucs"
extensions = ["mp3", "wav", "ogg", "flac"]  # we will look for all those file types.
two_stems = None   # only separate one stems from the rest, for instance

# Options for the output audio.
mp3 = True
mp3_rate = 320
float32 = False  # output as float 32 wavs, unsused if 'mp3' is True.
int24 = False    # output as int24 wavs, unused if 'mp3' is True.

In [39]:
from dataclasses import dataclass

@dataclass
class DeMixedAudio():
    sample_rate: int
    bass : np.ndarray
    drums : np.ndarray
    other : np.ndarray
    vocals : np.ndarray

In [None]:
class DeMixedAudio():
    def __init__(self, sample_rate, bass, drums, other, vocals):
        self.sample_rate = sample_rate
        self.bass = bass
        self.drums = drums
        self.other = other
        self.vocals = vocals

TODO: check how this works across songs with different sample rates

- Does the model work with different sample rates?
- Does the model output have the same sample rate as the input?
- If so TODO: find a way to find the input native sample rate of the audio and use this to re-load the stems

In [40]:
def get_stems(path_to_file,model='htdemucs',):
    file_name = os.path.basename(path_to_file)
    sp.run(["python3", "-m", "demucs.separate", "-o", "../resources/tmp", "-n", model, path_to_file])
    print("Demucs done, loading files")
    stems={}
    for stem in ['bass','drums','other','vocals']:
        stems[stem] = librosa.load(os.path.join("../resources/tmp/htdemucs",file_name[:file_name.find('.')],f"{stem}.wav"), sr=44100)[0]
    rmtree(f"../resources/tmp/htdemucs/{file_name[:file_name.find('.')]}")
    return DeMixedAudio(44100, bass=stems['bass'], drums=stems['drums'], other=stems['other'], vocals=stems['vocals'])#stems

In [41]:
song = get_stems("../resources/test_file.wav")

Selected model is a bag of 1 models. You will see that many progress bars per track.
Separated tracks will be stored in /Users/vedant/Desktop/Programming/WMG/scalable-asset-generation/resources/tmp/htdemucs
Separating track ../resources/test_file.wav


dyld: Library not loaded: /usr/local/opt/jpeg-xl/lib/libjxl.0.6.dylib
  Referenced from: /usr/local/bin/ffprobe
  Reason: image not found
100%|████████████████████████████████████████████████████████████████████████| 128.7/128.7 [02:07<00:00,  1.01seconds/s]


Demucs done, loading files


In [42]:
song

DeMixedAudio(sample_rate=44100, bass=array([-3.0517578e-05, -9.1552734e-05, -9.1552734e-05, ...,
        4.5776367e-04,  4.5776367e-04,  4.5776367e-04], dtype=float32), drums=array([0.00109863, 0.0009613 , 0.00099182, ..., 0.00033569, 0.00033569,
       0.00033569], dtype=float32), other=array([3.8146973e-04, 3.0517578e-05, 5.7983398e-04, ..., 2.1362305e-04,
       1.6784668e-04, 1.9836426e-04], dtype=float32), vocals=array([0.00054932, 0.00057983, 0.00056458, ..., 0.00039673, 0.00039673,
       0.00039673], dtype=float32))

In [38]:
stems

{'bass': array([ 3.0517578e-05,  0.0000000e+00, -3.0517578e-05, ...,
         4.2724609e-04,  4.2724609e-04,  4.2724609e-04], dtype=float32),
 'drums': array([0.00088501, 0.00080872, 0.0007782 , ..., 0.00033569, 0.00033569,
        0.00033569], dtype=float32),
 'other': array([0.00053406, 0.00027466, 0.00082397, ..., 0.00027466, 0.00021362,
        0.0002594 ], dtype=float32),
 'vocals': array([0.00050354, 0.0005188 , 0.00047302, ..., 0.00039673, 0.00042725,
        0.00039673], dtype=float32)}