In [1]:
from scripts import *
from tqdm.notebook import tqdm
import h5py
import librosa

# Creating the Dataset

## Downloading the files into the right place

First we set up the Data Folder which will hold the MUSDB data set. We want to directly download it into the `data/raw` directory next to this notebook. We will make that directory if it's not there. 

In [2]:
import os

data_folder = "./data"
raw_dataset_folder = f"{data_folder}/raw"

##  Create them if they don't exist
if not os.path.exists(data_folder):
    os.mkdir(data_folder)
    if not os.path.exists(raw_dataset_folder):
        os.mkdir(raw_dataset_folder)

Now we download the data into that folder, you only need to run this once of course. This will take a whileeeeee. it's 4.7GB 

1. __Go to the following url to download the dataset: https://zenodo.org/record/1117372__

2. Then unzip the file into `data/raw` such that the file directory looks like:
```
├── Asad
|   ├── NotebookSkeleton.ipynb
│   ├── data
│   │   ├── raw
│   │   │   ├── test
│   │   │   ├── train
```

## Now we can create the dataset

We use the musdb library to get a nice API to query for audio files stored within the `mp4` files in the raw dataset.

We set the root of the dataset to be the `./data/raw` folder we created earlier

In [3]:
import musdb

mus = musdb.DB(root=raw_dataset_folder)

We get our data split into training data and test data

In [4]:
training_data = mus.load_mus_tracks("train")
testing_data = mus.load_mus_tracks("test")

Now we make a dataset. 

1. We know that our Neural Network will take in data with $2^{14} = 16384$ samples. 
2. We would like our dataset to be made using samples that are converted to mono
3. We would like to use a smaller sample rate so that 16384 samples cover a reasonable amount of time. So we will choose to downsample the music such that the sample rate is reduced by half `downsampling_ratio=2`

In [5]:
# Downsampling ratio 
downsampling_ratio = 2    

# Samples inside one segment of data
samples_per_segment = 2**14 # = 16384

# Convert to mono
convert_to_mono = True

### Create training data set as an HDF5 file

In [7]:
song_db = []
idx = 0
hdf_dir = f"{data_folder}/training_data.h5"

for track in tqdm(training_data,  desc="Creating & Saving Dataset", position=0):
    # downsample data
    stems = Track(track).get_stem_section(ds=downsampling_ratio) 
    
    # Convert to mono
    if convert_to_mono:
        stems.convert_to_mono()
        
    # Cut the samples into segements to be analysed. The neural net architecture should take this many samples 
    # as inputs
    thebrokenstems = stems.cut_into_sections_based_on_samples(samples_per_segment) # Cut it into segments of samples_per_segment each
    
    with h5py.File(hdf_dir, "a") as f:
        instruments = ["drums", "bass", "other", "vocals"]
        f.attrs["samplerate"] = thebrokenstems[0].rate
        f.attrs["channels"] = 1 if thebrokenstems[0].is_mono else 2
        f.attrs["instruments"] = instruments
        
        for example in thebrokenstems:
            source_audios = example.get_stacked_audio(instruments)
            mix_audio = torch.sum(source_audios, dim=0)
            mix_audio = mix_audio.reshape((1, *mix_audio.shape))

            source_audios = source_audios.numpy()
            mix_audio = mix_audio.numpy()

            # Add to HDF5 file
            grp = f.create_group(str(idx))
            grp.create_dataset("inputs", shape=mix_audio.shape, dtype=mix_audio.dtype, data=mix_audio)
            grp.create_dataset("targets", shape=source_audios.shape, dtype=source_audios.dtype, data=source_audios)

            grp.attrs["length"] = mix_audio.shape[1]
            grp.attrs["target_length"] = source_audios.shape[1]
            song_db.append({"idx" : idx, "name" : track.name, "artist" : track.artist, "duration" : track.duration})
            idx += 1

Creating & Saving Dataset:   0%|          | 0/100 [00:00<?, ?it/s]

### Creating testing dataset as HDF5 file

In [9]:
song_db_test = []
idx = 0
hdf_dir_test = f"{data_folder}/testing_data.h5"

for track in tqdm(testing_data, desc="Creating & Saving Dataset", position=0):
    # downsample data
    stems = Track(track).get_stem_section(ds=downsampling_ratio) 
    
    # Convert to mono
    if convert_to_mono:
        stems.convert_to_mono()
        
    # Cut the samples into segements to be analysed. The neural net architecture should take this many samples 
    # as inputs
    thebrokenstems = stems.cut_into_sections_based_on_samples(samples_per_segment) # Cut it into segments
    
    with h5py.File(hdf_dir_test, "a") as f:
        instruments = ["drums", "bass", "other", "vocals"]
        f.attrs["samplerate"] = thebrokenstems[0].rate
        f.attrs["channels"] = 1 if thebrokenstems[0].is_mono else 2
        f.attrs["instruments"] = instruments
        
        for example in thebrokenstems:
            source_audios = example.get_stacked_audio(instruments)
            mix_audio = torch.sum(source_audios, dim=0)
            mix_audio = mix_audio.reshape((1, *mix_audio.shape))

            source_audios = source_audios.numpy()
            mix_audio = mix_audio.numpy()

            # Add to HDF5 file
            grp = f.create_group(str(idx))
            grp.create_dataset("inputs", shape=mix_audio.shape, dtype=mix_audio.dtype, data=mix_audio)
            grp.create_dataset("targets", shape=source_audios.shape, dtype=source_audios.dtype, data=source_audios)

            grp.attrs["length"] = mix_audio.shape[1]
            grp.attrs["target_length"] = source_audios.shape[1]
            song_db_test.append({"idx" : idx, "name" : track.name, "artist" : track.artist, "duration" : track.duration})
            idx += 1
            
#song_db_test = pd.DataFrame(song_db_test)

Creating & Saving Dataset:   0%|          | 0/50 [00:00<?, ?it/s]

ValueError: Unable to create group (name already exists)