## Exporting the stereo model

Trick to make the model actually function

In [None]:
from transformers import AutoProcessor, MusicgenForConditionalGeneration, MusicgenModel

model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-stereo-small")
x = model.config.to_dict()
x['decoder']['num_codebooks'] = 4
model.config = model.config.from_dict(x)
model.save_pretrained("musicgen_fixed")

In [None]:
import soundfile as sf


sf.write('./test.wav', audio_values.detach().numpy()[0].T, samplerate=41000)

Try to replicate the export method

In [1]:
from transformers import AutoProcessor, MusicgenForConditionalGeneration
import torch

processor = AutoProcessor.from_pretrained("facebook/musicgen-stereo-small")
model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-stereo-small")

  from .autonotebook import tqdm as notebook_tqdm
  self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)


In [2]:
inputs = processor(
    text=["80s pop track with bassy drums and synth"],
    padding=True,
    return_tensors="pt",
)

audio_values = model.generate(**inputs, do_sample=True, guidance_scale=3, max_new_tokens=256)

{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'use_cache': True, 'guidance_scale': 3, 'encoder_outputs': BaseModelOutput(last_hidden_state=tensor([[[ 0.2691, -0.0719, -0.1556,  ...,  0.0485, -0.9299, -0.2878],
         [ 0.1664,  0.0127, -0.2210,  ..., -0.0561, -0.3862, -0.3766],
         [-0.0171, -0.2223,  0.1379,  ..., -0.3299, -0.5221, -0.3737],
         ...,
         [-0.2190,  0.0111, -0.1814,  ..., -0.0630,  0.2541, -0.1576],
         [ 0.0777, -0.0900, -0.0034,  ..., -0.2519, -0.1340, -0.3210],
         [ 0.0214,  0.0023, -0.0042,  ...,  0.0113,  0.0199, -0.0099]],

        [[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0

KeyboardInterrupt: 

Export the configs

In [None]:
import os, json, glob
folder = './musicgen-stereo'
os.makedirs(folder, exist_ok=True)
processor.tokenizer.save_pretrained(f'{folder}')
processor.save_pretrained(f'{folder}')
model.config.to_json_file(f'{folder}/config.json')
model.generation_config.to_json_file(f'{folder}/generate_config.json')

Export the text encoder

In [None]:
# Create dummy input data for ONNX export
# Adjust input_ids shape and other inputs as per the actual model's input requirements
dummy_input_ids = torch.randint(0, 10, (1, 16), dtype=torch.long)  # Example shape (batch_size=1, seq_len=16)

# Export the model to ONNX format
torch.onnx.export(
    model.text_encoder,  # The model to be exported
    dummy_input_ids,  # Example inputs for the model
    f"{folder}/text_encoder.onnx",  # The path where the ONNX model will be saved
    input_names=["input_ids", "attention_mask", "encoder_hidden_states"],  # Input names
    output_names=["logits"],  # Output name(s)
    dynamic_axes={
        "input_ids": {0: "batch_size", 1: "sequence_length"},  # Dynamic axes for varying batch_size and seq_len
        "logits": {0: "batch_size", 1: "sequence_length"}
    },
    opset_version=13  # Use the appropriate ONNX opset version
)

print("Encoder exported to ONNX successfully!")

Export the decoder

### Test the inputs

Preprocess the text

In [4]:
max_length = 256
outputs = processor.tokenizer(["80s pop track with bassy drums and synth"])
input_ids, attention_mask = torch.tensor(outputs['input_ids']), torch.tensor(outputs['attention_mask'])

Encode the text

In [5]:
# Input the data to the text encoder
encoded = model.text_encoder(input_ids)

Prepare the encoded text

In [6]:
# Apply the guidance scale or something
# When the guidance scale is > 1 then we need to apply zeros to the mask and the last hidden state
encoded.last_hidden_state = torch.concatenate([encoded.last_hidden_state, torch.zeros_like(encoded.last_hidden_state)], dim=0)
attention_mask = torch.concatenate([attention_mask, torch.zeros_like(attention_mask)], dim=0)

Prepare for the decoder

In [7]:
# Prepare for decoder inputs
num_codebooks = model.config.to_dict()['decoder']['num_codebooks']
decoder_input_ids = torch.ones((input_ids.size(0) * num_codebooks, 1), dtype=torch.long) * model.generation_config.decoder_start_token_id

# Build delay pattern
decoder_input_ids = decoder_input_ids.reshape(-1, num_codebooks, decoder_input_ids.shape[-1])
bsz, num_codebooks, seq_len = decoder_input_ids.shape
channel_codebooks = num_codebooks // 2
decoder_ids_shifted = torch.ones((bsz, num_codebooks, max_length), dtype=torch.long) * -1

# Just remember this as when the user wants a really small sample 
# Really small is max_len < (2 * num_codebooks - 1)
# decoder_input_ids.reshape(bsz * num_codebooks, -1), decoder_ids_shifted.reshape(bsz * num_codebooks, -1)

# Now fill the shifted ids with the prompt
for codebook in range(channel_codebooks):
    decoder_ids_shifted[:, 2 * codebook, codebook : seq_len + codebook] = decoder_input_ids[:, 2 * codebook]
    decoder_ids_shifted[:, 2 * codebook + 1, codebook : seq_len + codebook] = decoder_input_ids[:, 2 * codebook + 1]

delay_pattern = torch.triu(
    torch.ones((channel_codebooks, max_length), dtype=torch.bool), diagonal = max_length - channel_codebooks + 1
)
delay_pattern = delay_pattern + torch.tril(torch.ones((channel_codebooks, max_length), dtype=torch.bool))
delay_pattern = delay_pattern.repeat_interleave(2, dim=0)

mask = ~delay_pattern.to(input_ids.device)
decoder_input_ids = mask * decoder_ids_shifted + ~mask * model.generation_config.decoder_start_token_id
first_codebook_ids = decoder_input_ids[:, 0, :]
start_ids = (first_codebook_ids == -1).nonzero()[:, 1]
if len(start_ids) > 0:
    first_start_id = min(start_ids)
else:
    # we have no tokens that need to be filled - return entire matrix of input ids
    first_start_id = seq_len
pattern_mask = decoder_input_ids.reshape(bsz * num_codebooks, -1)
decoder_input_ids = decoder_input_ids[..., :first_start_id].reshape(bsz * num_codebooks, -1)

Prepare the logic Processor

In [8]:
model.generation_config.watermarking_config

Setup the generation type:

In [9]:
num_beams = model.config.to_dict()['decoder']['num_beams']
num_beam_groups = model.config.to_dict()['decoder']['num_beam_groups']
do_sample = model.config.to_dict()['decoder']['do_sample']
is_greedy_gen_mode = (
    (num_beams == 1)
    and (num_beam_groups == 1)
    and do_sample is False
)
is_sample_gen_mode = (
    (num_beams == 1)
    and (num_beam_groups == 1)
    and do_sample is True
)

For now we will just implement the sampler mode, greedy version will be researched later

Now we have to expand the input dims

In [10]:
n_samples = 1
decoder_input_ids.repeat_interleave(n_samples, dim=0)
attention_mask.repeat_interleave(n_samples, dim=0)
decoder_ids_shifted.repeat_interleave(n_samples, dim=0)

tensor([[[2048,   -1,   -1,  ...,   -1,   -1,   -1],
         [2048,   -1,   -1,  ...,   -1,   -1,   -1],
         [  -1, 2048,   -1,  ...,   -1,   -1,   -1],
         ...,
         [  -1,   -1, 2048,  ...,   -1,   -1,   -1],
         [  -1,   -1,   -1,  ...,   -1,   -1,   -1],
         [  -1,   -1,   -1,  ...,   -1,   -1,   -1]]])

Do Sample

In [12]:
# Apply mask
decoder_input_ids = torch.where(decoder_ids_shifted[..., :decoder_input_ids.shape[-1]] == -1, decoder_input_ids, decoder_ids_shifted[..., :decoder_input_ids.shape[-1]])

In [17]:
encoded

BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=tensor([[[ 0.2691, -0.0719, -0.1556,  ...,  0.0485, -0.9299, -0.2878],
         [ 0.1664,  0.0127, -0.2210,  ..., -0.0561, -0.3862, -0.3766],
         [-0.0171, -0.2223,  0.1379,  ..., -0.3299, -0.5221, -0.3737],
         ...,
         [-0.2190,  0.0111, -0.1814,  ..., -0.0630,  0.2541, -0.1576],
         [ 0.0777, -0.0900, -0.0034,  ..., -0.2519, -0.1340, -0.3210],
         [ 0.0214,  0.0023, -0.0042,  ...,  0.0113,  0.0199, -0.0099]],

        [[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]],
       grad_fn=<CatBackward0>), past_key_values=None, hi

In [20]:
model.decoder(
    # input_ids=decoder_input_ids,
    # attention_mask=decoder_attention_mask,
    # encoder_hidden_states=encoder_hidden_states,
    # encoder_attention_mask=attention_mask,
    # inputs_embeds=decoder_inputs_embeds,
    # output_attentions=output_attentions,
    # output_hidden_states=output_hidden_states,
    # use_cache=use_cache,
    # past_key_values=past_key_values,
    # return_dict=return_dict,
    # labels=labels,

    decoder_input_ids, 
    decoder_ids_shifted, 
    encoded.last_hidden_state,
    attention_mask,
    None,
    None,
    None,
    True,
    None,
    True,
    None)

ValueError: You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time

Decode the data

In [20]:
num_beams = model.config.to_dict()['decoder']['num_beams']
num_beam_groups = model.config.to_dict()['decoder']['num_beam_groups']
do_sample = model.config.to_dict()['decoder']['do_sample']
is_greedy_gen_mode = (
    (num_beams == 1)
    and (num_beam_groups == 1)
    and do_sample is False
)
is_sample_gen_mode = (
    (num_beams == 1)
    and (num_beam_groups == 1)
    and do_sample is True
)

### Following are just text code

In [18]:
model.generation_config.do_sample

True

In [9]:
decoder_input_ids

tensor([[2048],
        [2048],
        [2048],
        [2048],
        [2048],
        [2048],
        [2048],
        [2048]])

In [None]:
model.generation_config.decoder_start_token_id, model.generation_config.max_length

In [None]:
input_ids.size(), attention_mask.size()

In [None]:
model.generation_config.bos_token_id

In [None]:
input_ids.shape, attention_mask.shape, encoded[0].shape

In [None]:
decoder_ins = input_ids.unsqueeze(1).repeat((1,8,1))

In [None]:
model.decoder(decoder_ins, attention_mask)

In [None]:
encoded[0]

In [None]:
input_ids.size()

In [None]:
model.text_encoder(input_ids=input_ids)[0].size()

lets try an run it without the model

In [None]:
import torch

# Create dummy input data for ONNX export
# Adjust input_ids shape and other inputs as per the actual model's input requirements
dummy_input_ids = torch.randint(0, 10, (1, 16), dtype=torch.long)  # Example shape (batch_size=1, seq_len=16)

# Export the model to ONNX format
torch.onnx.export(
    model.text_encoder,  # The model to be exported
    dummy_input_ids,  # Example inputs for the model
    f"{folder}/text_encoder.onnx",  # The path where the ONNX model will be saved
    input_names=["input_ids", "attention_mask", "encoder_hidden_states"],  # Input names
    output_names=["logits"],  # Output name(s)
    dynamic_axes={
        "input_ids": {0: "batch_size", 1: "sequence_length"},  # Dynamic axes for varying batch_size and seq_len
        "logits": {0: "batch_size", 1: "sequence_length"}
    },
    opset_version=13  # Use the appropriate ONNX opset version
)

print("Model exported to ONNX successfully!")


In [None]:
model.text_encoder(input_ids)

In [None]:
inputs

In [None]:
model.decoder

In [None]:
model.audio_encoder

In [None]:
model.text_encoder

Then export the local model

In [None]:
from optimum.exporters.onnx import main_export
from optimum.exporters.onnx.model_configs import MusicgenOnnxConfig
from transformers import MusicgenConfig

model_id = "facebook/musicgen-small"

main_export(
    model_id,
    output="musicgen-small",
    task='text-to-audio',
)

Make it efficient

In [None]:
!optimum-cli onnxruntime quantize --avx512 --onnx_model musicgen-stereo -o quantized_musicgen

## Testing the model

Load the other configs

In [None]:
import onnxruntime as ort
import json

# Load the ORT config
with open("./quantized_musicgen/ort_config.json", "r") as f:
    ort_config = json.load(f)

# Apply ORT configuration when initializing the session
session_options = ort.SessionOptions()
if "graph_optimization_level" in ort_config:
    session_options.graph_optimization_level = ort_config["graph_optimization_level"]

# Example: Setting execution providers, thread counts, etc.
if "execution_providers" in ort_config:
    session_options.execution_mode = ort_config["execution_providers"]

Load the tokenizer

In [None]:
from transformers import PreTrainedTokenizerFast, AddedToken

# Load tokenizer configuration and special tokens map
with open("./quantized_musicgen/tokenizer_config.json", "r") as f:
    tokenizer_config = json.load(f)

with open("./quantized_musicgen/special_tokens_map.json", "r") as f:
    special_tokens_map = json.load(f)
    for key, value in special_tokens_map.items():
        if key != 'additional_special_tokens':
            special_tokens_map[key] = AddedToken(
                content = value['content'], 
                single_word = value['single_word'], 
                lstrip = value['lstrip'], 
                rstrip = value['rstrip'], 
                special = True, 
                normalized = value['normalized']
            )

# Load the model configuration (config.json)
with open("./quantized_musicgen/config.json", "r") as f:
    model_config = json.load(f)

# Load the tokenizer with configuration
tokenizer = PreTrainedTokenizerFast(tokenizer_file="./quantized_musicgen/tokenizer.json")

# Add the special tokens from the special_tokens_map.json
tokenizer.add_special_tokens(special_tokens_map)

# Configure tokenizer with settings from tokenizer_config.json
if "padding_side" in tokenizer_config:
    print('adding padding_side')
    tokenizer.padding_side = tokenizer_config["padding_side"]
if "truncation_side" in tokenizer_config:
    print('adding truncation_side')
    tokenizer.truncation_side = tokenizer_config["truncation_side"]

Load the model slices

In [None]:
text_encoder_session = ort.InferenceSession('./quantized_musicgen/text_encoder_quantized.onnx', sess_options=session_options)
decoder_session = ort.InferenceSession('./quantized_musicgen/decoder_model_quantized.onnx', sess_options=session_options)

In [None]:
input_text = "80s pop track with bassy drums and synth"
inputs = tokenizer(input_text, return_tensors="np")

In [None]:
# Run inference for text encoding
encoded_text = text_encoder_session.run(None, {
    'input_ids': inputs['input_ids'],
    'attention_mask': inputs['attention_mask']
})

In [None]:
import numpy as np
np.repeat(inputs['input_ids'], repeats=4, axis=0)

In [None]:
model_config['decoder']

In [None]:
# Process output and run decoder (adjusted based on model config)
decoder_inputs = {
    'input_ids': np.repeat(inputs['input_ids'], repeats=4, axis=0),
    'encoder_hidden_states': encoded_text[0],
    'encoder_attention_mask': inputs['attention_mask']
}

# Generate output from the decoder
decoder_output = decoder_session.run(None, decoder_inputs)

In [None]:
import os
os.listdir('./quantized_musicgen')

In [None]:
import numpy as np

# Number of decoder layers (in your case, 24 for Musicgen)
num_layers = 24

# Assuming hidden_size is the dimension of the model (1024 for Musicgen)
hidden_size = 1024

# Batch size, number of heads, sequence length (1 for the first step), and attention head size
batch_size = 1
num_heads = 16  # This depends on your model configuration
sequence_length = 1
head_size = hidden_size // num_heads

# Create past_key_values as a list of zero tensors for each layer
past_key_values = []

for _ in range(num_layers):
    decoder_key = np.zeros((batch_size, num_heads, sequence_length, head_size), dtype=np.float32)
    decoder_value = np.zeros((batch_size, num_heads, sequence_length, head_size), dtype=np.float32)
    encoder_key = np.zeros((batch_size, num_heads, sequence_length, head_size), dtype=np.float32)
    encoder_value = np.zeros((batch_size, num_heads, sequence_length, head_size), dtype=np.float32)
    past_key_values.append({
        "decoder.key": decoder_key,
        "decoder.value": decoder_value,
        "encoder.key": encoder_key,
        "encoder.value": encoder_value,
    })

In [None]:
encoder_hidden_states[:,:3,:].shape

In [None]:
input_tokens['attention_mask']

In [None]:
# Initialize variables
generated_tokens = decoder_input_ids
use_cache_branch = np.array([False], dtype=bool)  # Use False for first step

for step in range(gen_config.max_length):
    # Prepare the input dictionary for the ONNX session
    inputs = {
        "input_ids": decoder_input_ids,
        "encoder_hidden_states": encoder_hidden_states,
        "encoder_attention_mask": input_tokens['attention_mask'],
        "use_cache_branch": np.array([False], dtype=bool),  # Set to True to use past key values
    }

    # Add past key values to the input
    for i, layer_past in enumerate(past_key_values):
        inputs[f"past_key_values.{i}.decoder.key"] = layer_past["decoder.key"]
        inputs[f"past_key_values.{i}.decoder.value"] = layer_past["decoder.value"]
        inputs[f"past_key_values.{i}.encoder.key"] = layer_past["encoder.key"]
        inputs[f"past_key_values.{i}.encoder.value"] = layer_past["encoder.value"]

    # Run the ONNX session
    decoder_outputs = decoder_session.run(None, inputs)

    
    # Get logits and past key values
    logits = decoder_outputs[0]
    # Extract past_key_values from decoder_outputs if they are present
    
    # Sample next token (using greedy search, beam search, or sampling)
    next_token_id = np.argmax(logits[:, -1, :], axis=-1).reshape(4, 1)
    
    # Append the next token to generated tokens
    generated_tokens = np.concatenate([generated_tokens, next_token_id], axis=1)
    
    # Update inputs for next step
    use_cache_branch = np.array([True], dtype=bool)
    # Update past_key_values for next step


In [None]:
# Prepare input for encodec decoder
encodec_inputs = {
    "codes": generated_tokens  # Ensure this matches the expected input shape
}

# Run the encodec decoder
audio_outputs = encodec_decoder_session.run(None, encodec_inputs)

# Get the audio waveform
audio_waveform = audio_outputs[0]  # Adjust index based on actual output

In [None]:
import soundfile as sf

sf.write('generated_audio.wav', audio_waveform.squeeze(), samplerate=gen_config.sampling_rate)

In [None]:
for input_meta in decoder_session.get_inputs():
    print(f"Input name: {input_meta.name}, shape: {input_meta.shape}, type: {input_meta.type}")

In [None]:
matrix = np.zeros((len(vecs), len(vecs[0])))
for i in range(len(vecs)):
    matrix[i, :] = vecs[i]
matrix = np.dot(matrix,matrix.T)
for row in matrix:
    print(" ".join(f"{value:10.2f}" for value in row))

In [None]:
dfmax, dfmin = matrix.max(), matrix.min()

matrix = (matrix - dfmin)/(dfmax - dfmin)
for row in matrix:
    print(" ".join(f"{value:10.2f}" for value in row))