# In [None]:
import json
import os
import random
import subprocess
from glob import glob
from multiprocessing import Pool

import torchaudio
from tqdm import tqdm
# glob() returns paths in arbitrary, filesystem-dependent order. Sort them so
# that the seeded shuffle below assigns every file to the same split on each
# run; otherwise a rerun could move a clip from one split to another.
# NOTE(review): outputs produced before this change were split using the
# unsorted order — confirm whether existing split directories need rebuilding.
files = sorted(glob("/scratch4/users/od/MuVi/audio/*.wav"))
random.seed(1019)

indices = list(range(len(files)))
random.shuffle(indices)

# 80% train / 10% valid / 10% test, with boundaries computed once.
train_end = int(0.8 * len(indices))
valid_end = int(0.9 * len(indices))

train_indices = indices[:train_end]
valid_indices = indices[train_end:valid_end]
test_indices = indices[valid_end:]


def runner(wav_path):
    """Convert one WAV file to MP3 and write a metadata JSON beside it.

    The output directory is ``/scratch4/users/od/MuVi/v1/<split_name>``,
    where ``split_name`` is read from module scope at call time.
    NOTE(review): worker processes only see ``split_name`` if they inherit
    module globals (e.g. the ``fork`` start method) — confirm on the target
    platform.

    Args:
        wav_path: Path to the source ``.wav`` file.

    Returns:
        None. Progress is reported only through printed messages when the
        source file is missing, ffmpeg cannot be started, or no MP3 was
        produced.
    """
    # splitext instead of slicing so the stem is right for any extension.
    name = os.path.splitext(os.path.basename(wav_path))[0]

    out_dir = os.path.join("/scratch4/users/od/MuVi/v1", split_name)
    mp3_path = os.path.join(out_dir, f"{name}.mp3")
    # Built directly rather than via str.replace, which would also rewrite
    # any ".mp3" occurring earlier in the path.
    json_path = os.path.join(out_dir, f"{name}.json")

    if not os.path.exists(wav_path):
        print("missing wav file", wav_path)
        return

    # Skip only when BOTH outputs are present; a JSON whose MP3 has gone
    # missing must be regenerated.
    if os.path.exists(mp3_path) and os.path.exists(json_path):
        return

    # Several workers may race to create the same directory.
    os.makedirs(out_dir, exist_ok=True)

    # Argument list (no shell) so paths containing spaces or shell
    # metacharacters are passed to ffmpeg verbatim.
    command = [
        "ffmpeg",
        "-hide_banner",
        "-loglevel", "error",
        "-y",
        "-i", wav_path,
        mp3_path,
    ]
    try:
        subprocess.run(command, check=False)
    except OSError as exc:
        # e.g. the ffmpeg executable is not on PATH; stay best-effort.
        print("failed to run ffmpeg", exc)
        return

    if not os.path.exists(mp3_path):
        print("missing mp3 file", mp3_path)
        return

    # Re-read the encoded file so the metadata describes the MP3 itself.
    wav, sr = torchaudio.load(mp3_path)

    template = {
        "key": "",
        "artist": "",
        "sample_rate": sr,
        "file_extension": "mp3",
        "description": "",
        "keywords": [],
        # Length of the last axis divided by the sample rate
        # (presumably seconds — confirm against the consumer of this JSON).
        "duration": wav.shape[-1] / sr,
        "bpm": "",
        "genre": "",
        "title": "",
        "name": name,
        "instrument": "Mix",
        "moods": [],
    }

    with open(json_path, "w") as f:
        json.dump(template, f)


# Process the splits one after another. ``split_name`` is deliberately a
# module-level loop variable because ``runner`` reads it to choose the output
# directory; the pool is created inside the loop, after it has been assigned.
splits = {"train": train_indices, "valid": valid_indices, "test": test_indices}
for split_name, split_indices in splits.items():
    split_files = [files[idx] for idx in split_indices]
    with Pool(16) as p:
        results = p.imap_unordered(runner, split_files)
        # Drain the iterator purely to drive the progress bar.
        for _ in tqdm(results, total=len(split_indices)):
            pass