In [1]:
%%capture
!pip install audiolm-pytorch

In [2]:
from pathlib import Path

import torch
from scipy.io import wavfile

from audiolm_pytorch import *

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Neural audio codec (SoundStream variant). Only hyperparameters are passed in
# this cell; no checkpoint path is given here.
soundstream = AudioLMSoundStream(
    codebook_size=4096,
    rq_num_quantizers=8,
    # multi-headed residual vector quantization, as proposed in
    # https://arxiv.org/abs/2305.02765
    rq_groups=2,
    # residual lookup-free quantization - an unpublished technique for which
    # there are now reports of successful usage
    use_lookup_free_quantizer=True,
    # residual finite scalar quantization stays switched off
    use_finite_scalar_quantizer=False,
    # local attention receptive field at the bottleneck
    attn_window_size=128,
    # 2 local attention transformer blocks, added by the library author
    # (encodec went with lstms, but attention should be better)
    attn_depth=2,
)

Neural Audio Codec

In [4]:
encodec = EncodecWrapper() # Alternative codec: the encodec variable can be used in the same way as the soundstream variable (e.g. for the `codec = ...` arguments in the cells below).

Semantic Transformer

In [5]:
# HuBERT checkpoints can be downloaded from
# https://github.com/facebookresearch/fairseq/tree/main/examples/hubert
# Both files below are read from ./hubert/ relative to the working directory.
wav2vec = HubertWithKmeans(
    checkpoint_path='./hubert/hubert_base_ls960.pt',
    kmeans_path='./hubert/hubert_base_ls960_L9_km500.bin',
)

# The semantic token vocabulary size is taken from wav2vec.codebook_size.
semantic_transformer = SemanticTransformer(
    num_semantic_tokens=wav2vec.codebook_size,
    dim=1024,
    depth=6,
    flash_attn=True,
)
# Move the transformer to the GPU (requires a CUDA device).
semantic_transformer = semantic_transformer.cuda()


# trainer = SemanticTransformerTrainer(
#     transformer = semantic_transformer,
#     wav2vec = wav2vec,
#     folder ='/path/to/audio/files',
#     batch_size = 1,
#     data_max_length = 320 * 32,
#     num_train_steps = 1
# )

# trainer.train()

Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda


Coarse Transformer

In [6]:
# wav2vec = HubertWithKmeans(
#     checkpoint_path = './hubert/hubert_base_ls960.pt',
#     kmeans_path = './hubert/hubert_base_ls960_L9_km500.bin'
# )

# Coarse-stage transformer. Reuses the `wav2vec` object created in the
# semantic-transformer cell for its semantic vocabulary size.
coarse_transformer = CoarseTransformer(
    num_semantic_tokens=wav2vec.codebook_size,
    codebook_size=1024,
    num_coarse_quantizers=3,
    dim=512,
    depth=6,
    flash_attn=True,
)

# trainer = CoarseTransformerTrainer(
#     transformer = coarse_transformer,
#     codec = soundstream,
#     wav2vec = wav2vec,
#     folder = '/path/to/audio/files',
#     batch_size = 1,
#     data_max_length = 320 * 32,
#     num_train_steps = 1_000_000
# )

Fine Transformer

In [7]:
# Fine-stage transformer: 3 coarse + 5 fine quantizers. The total (8) equals the
# rq_num_quantizers value given to the soundstream codec - presumably these are
# required to agree; confirm against the library before changing either.
fine_transformer = FineTransformer(
    num_coarse_quantizers=3,
    num_fine_quantizers=5,
    codebook_size=1024,
    dim=512,
    depth=6,
    flash_attn=True,
)

AudioLM Model

In [8]:
# Assemble the full AudioLM pipeline from the feature extractor, the codec and
# the three transformers created in the earlier cells.
audiolm = AudioLM(
    wav2vec=wav2vec,
    codec=soundstream,
    semantic_transformer=semantic_transformer,
    coarse_transformer=coarse_transformer,
    fine_transformer=fine_transformer,
)
# Move the assembled model to the GPU (requires a CUDA device).
audiolm = audiolm.cuda()

Generate Audio

In [None]:
# Prime generation with 16000 samples of Gaussian noise, declared as 16 kHz audio
# (i.e. one second). This is a placeholder prompt; substitute a real waveform of
# shape (1, num_samples) for a meaningful continuation.
prime_sample_hz = 16000
prime_wave = torch.randn(1, prime_sample_hz).cuda()

generated_wav_with_prime = audiolm(
    prime_wave = prime_wave,
    prime_wave_input_sample_hz = prime_sample_hz
)

# Write a real WAV file. `torch.save` would pickle the tensor, producing a file
# that carries a .wav extension but that no audio player can open.
output_path = Path('./audio_results/output.wav')
output_path.parent.mkdir(parents = True, exist_ok = True)  # the folder may not exist yet

# NOTE(review): assumes the codec exposes its output rate as `target_sample_hz`;
# falls back to 16 kHz (the rate used for the prime above) if it does not - confirm.
output_sample_hz = getattr(soundstream, 'target_sample_hz', 16000)

# Take the single batch item and flatten it to 1-D mono float32 samples, which
# scipy stores as an IEEE-float WAV.
# NOTE(review): assumes the model returns a tensor shaped (batch, samples) - confirm.
waveform = generated_wav_with_prime[0].detach().float().cpu().reshape(-1).numpy()
wavfile.write(str(output_path), output_sample_hz, waveform)