First we need a dataset. For this we're using Clotho, a labeled (captioned) audio dataset, loaded through the aac-datasets package: https://github.com/Labbeti/aac-datasets

In [1]:
from aac_datasets import Clotho

# Open the "dev" subset of Clotho from the local ./datasets/ directory.
# download=False: no download is requested here.
dataset = Clotho(
    root="./datasets/",
    subset="dev",
    download=False,
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from aac_datasets.utils.collate import BasicCollate
from torch.utils.data import DataLoader

# Shuffled loader over `dataset`, one item per batch, collated with BasicCollate.
# NOTE(review): the RuntimeError recorded further down ("stack expects each
# tensor to be equal size") suggests batches of more than one clip fail on
# variable-length audio with this collate function -- confirm before raising
# batch_size.
train_loader = DataLoader(
    dataset,
    batch_size=1,
    shuffle=True,
    num_workers=18,
    pin_memory=True,
    collate_fn=BasicCollate(),
)

In [10]:
import torch

# Fetch exactly one batch from the loader for inspection.
# next(iter(...)) raises StopIteration if the loader yields nothing, instead of
# leaving `batch` unbound (or silently stale from a previous run of this cell),
# which is what the previous `for ...: break` form did.
batch = next(iter(train_loader))

RuntimeError: stack expects each tensor to be equal size, but got [1, 904050] at entry 0 and [1, 1270080] at entry 1

In [30]:
import random

# First waveform of the batch, first row, with a leading axis added back via
# [None, :] (the recorded output shows torch.Size([1, 904050])).
audio = batch['audio'][0][0][None, :]

# One caption picked at random from the first item's captions, kept as a
# single-element list.
text = [random.choice(batch['captions'][0])]

(audio.shape, text)

(torch.Size([1, 904050]),
 ['Some random tones are played continuously and repetitively.'])

In [8]:
import torchaudio

# Index the first dataset item; the value is discarded (it is not the last
# expression of the cell), so this only checks that the lookup does not raise.
dataset[0]

# Print the audio backends torchaudio reports.
print(torchaudio.list_audio_backends())

['ffmpeg', 'soundfile']


In [7]:
import os

# NOTE(review): machine-specific absolute path into one conda environment;
# this will not exist on another machine or environment.
# NOTE(review): the variable is only visible to code that reads it after this
# cell has run -- confirm this cell executes before the audio libraries load.
LIBSNDFILE_PATH = "/var/scratch/bmt270/miniconda3/envs/musiclm/lib/python3.10/site-packages/_soundfile_data/libsndfile_x86_64.so"
os.environ["SOUNDFILE_LIBRARY"] = LIBSNDFILE_PATH

Train MuLaN

In [1]:
import torch
from models import *

# build_mulan() returns three objects, unpacked here as the MuLaN model and
# its audio and text transformers.
mulan, audio_transformer, text_transformer = build_mulan()

# Dummy inputs: one caption and one random waveform of 3 * 44100 samples
# (presumably 3 seconds at 44.1 kHz -- confirm against the model's sample rate).
# texts = torch.randint(0, 20000, (2, 256))
texts = ["test text here"]
wavs = torch.randn(1, 3 * 44100)

# The forward call returns the loss; run one backward pass as a smoke test.
loss = mulan(wavs, raw_texts=texts)
loss.backward()

2024-07-30 16:13:29 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX


spectrogram yielded shape of (65, 11026), but had to be cropped to (64, 11024) to be patchified for transformer


In [12]:
# Mean of each tensor stored on `mulan.contrast`, reported as plain floats in
# this fixed order.
contrast_stats = ("temperatures", "denominator_i", "denominator_j", "sims", "numerator")
tuple(getattr(mulan.contrast, stat).mean().item() for stat in contrast_stats)

(2.3025851249694824, 0.0, 0.0, 0.35081106424331665, 1.4202189445495605)

Load the trained MuLaN checkpoint and run a minimal training pass (a single step) of the semantic transformer

In [1]:
from models import *
from audiolm_pytorch import SemanticTransformerTrainer
import torch

# Rebuild MuLaN and restore the trained weights. map_location='cpu' lets a
# checkpoint that was saved from a GPU run load on a CPU-only machine;
# load_state_dict then copies the values into the model's own parameters.
mulan, _, _ = build_mulan()
mulan.load_state_dict(torch.load('./models/mulan/ckpt.pt', map_location='cpu'))

# Use the same builder calls as the generation cell at the end of this
# notebook (which was executed more recently than this cell): wav2vec and the
# MuLaN embed quantizer are built separately and passed to
# build_semantic_transformer, which returns just the transformer.
wav2vec = build_wav2vec()
quantizer = build_quantizer(mulan)
semantic_transformer = build_semantic_transformer(quantizer, wav2vec)

trainer = SemanticTransformerTrainer(
    transformer = semantic_transformer,
    wav2vec = wav2vec,
    audio_conditioner = quantizer,   # pass in the MulanEmbedQuantizer instance above
    folder ='./datasets/CLOTHO_v2.1/clotho_audio_files',
    results_folder='./models/SemanticTransformer',
    batch_size = 1,
    data_max_length = 320 * 32,
    num_train_steps = 1              # single step: smoke test only, not a real training run
)

trainer.train()

2024-07-29 20:32:34 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX


training with dataset of 3647 samples and validating with randomly splitted 192 samples
spectrogram yielded shape of (65, 854), but had to be cropped to (64, 848) to be patchified for transformer
0: loss: 6.407083511352539
0: valid loss 7.012447834014893
0: saving model to models/SemanticTransformer
training complete


In [1]:
from audiolm_pytorch import EncodecWrapper
# Instantiate the Encodec wrapper with its default arguments.
encodec = EncodecWrapper()
# `encodec` can now be used in the same way as the `soundstream` variables in the cells below.

2024-07-29 20:34:47 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX


In [1]:
from audiolm_pytorch import SoundStreamTrainer
from accelerate import Accelerator
from models import *
import torch

soundstream = build_sound_stream()

# Accelerator forced onto the CPU. Remove when training on GPU.
cpu_accelerator = Accelerator(cpu=True)

trainer = SoundStreamTrainer(
    soundstream,
    folder='./datasets/CLOTHO_v2.1/clotho_audio_files',
    results_folder='./models/SoundStream',
    batch_size=4,
    grad_accum_every=8,          # 4 * 8 = effective batch size of 32
    data_max_length_seconds=2,   # train on 2 second audio
    num_train_steps=1,
    accelerator=cpu_accelerator,
)

trainer.train()

2024-07-29 21:58:48 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX


training with dataset of 3647 samples and validating with randomly splitted 192 samples
0: soundstream total loss: 22.406, soundstream recon loss: 0.021 | discr (scale 1) loss: 2.000 | discr (scale 0.5) loss: 2.000 | discr (scale 0.25) loss: 2.000
0: saving to models/SoundStream
0: saving model to models/SoundStream
training complete


In [1]:
from audiolm_pytorch import SoundStream, AudioLM
from accelerate import Accelerator
from models import *
import torch
from musiclm_pytorch import MusicLM

# Set to True to move every component to CUDA before generation.
use_gpu = False


def _place(module):
    """Return `module` moved to CUDA when `use_gpu` is set, otherwise unchanged."""
    return module.cuda() if use_gpu else module


# MuLaN: rebuild and restore trained weights. map_location='cpu' lets a
# checkpoint saved from a GPU run load on a CPU-only machine (use_gpu = False);
# the model is moved to the GPU afterwards when requested.
mulan, _, _ = build_mulan()
mulan.load_state_dict(torch.load('./models/mulan/ckpt.pt', map_location='cpu'))
mulan = _place(mulan)

wav2vec = _place(build_wav2vec())
quantizer = _place(build_quantizer(mulan))

# The three transformer stages, each restored from its step-0 checkpoint
# before being placed on the target device.
semantic_transformer = build_semantic_transformer(quantizer, wav2vec)
semantic_transformer.load('./models/SemanticTransformer/semantic.transformer.0.pt')
semantic_transformer = _place(semantic_transformer)

coarse_transformer = build_coarse_transformer(wav2vec)
coarse_transformer.load('./models/CoarseTransformer/coarse.transformer.0.pt')
coarse_transformer = _place(coarse_transformer)

fine_transformer = build_fine_transformer()
fine_transformer.load('./models/FineTransformer/fine.transformer.0.pt')
fine_transformer = _place(fine_transformer)

# Codec restored from the SoundStream training checkpoint.
soundstream = _place(SoundStream.init_and_load_from('./models/SoundStream/soundstream.0.pt'))

audiolm = _place(AudioLM(
    wav2vec = wav2vec,
    codec = soundstream,
    semantic_transformer = semantic_transformer,
    coarse_transformer = coarse_transformer,
    fine_transformer = fine_transformer
))

musiclm = MusicLM(
    audio_lm = audiolm,                 # `AudioLM` from https://github.com/lucidrains/audiolm-pytorch
    mulan_embed_quantizer = quantizer    # the `MuLaNEmbedQuantizer` from above
)

music = musiclm('the crystalline sounds of the piano in a ballroom', num_samples = 4) # sample 4 and pick the top match with mulan

2024-07-30 00:17:50 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX


Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda


generating semantic:   1%|▏         | 26/2048 [00:00<00:50, 40.20it/s]
generating coarse: 100%|██████████| 512/512 [05:55<00:00,  1.44it/s]
generating fine:  97%|█████████▋| 498/512 [15:46<00:44,  3.15s/it]