In [4]:
#load generator model
import hydra
import torch
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
import utils.training_utils as tr_utils

config_file_rel="../conf"
config_path="/home/eloi/projects/project_mfm_eloi/src/conf"
config_name="conf_tencymastering_multitrack_simulated_stylefxenc2048AF_contentCLAP.yaml"
model_dir = "/data5/eloi/experiments/tencymastering_multitrack_simulated_stylefxenc2048AF_contentCLAP"
ckpt="1C_tencymastering_vocals-50000.pt"

overrides = [
    f"model_dir={model_dir}",
    f"tester.checkpoint={ckpt}",
    "tester.cfg_scale=1.0",
]


with initialize(version_base=None, config_path=config_file_rel):
    args = compose(config_name=config_name, overrides=overrides)


if not os.path.exists(args.model_dir):
        raise Exception(f"Model directory {args.model_dir} does not exist")


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("loading diff_params")
diff_params=hydra.utils.instantiate(args.diff_params)

print("loading network")
network=hydra.utils.instantiate(args.network)
network=network.to(device)
state_dict = torch.load(os.path.join(args.model_dir,args.tester.checkpoint), map_location=device, weights_only=False)

tr_utils.load_state_dict(state_dict, ema=network)


sampler = hydra.utils.instantiate(args.tester.sampler, network, diff_params, args, )

def generate_samples(x, input_type="dry"):
    B, N, C, L = x.shape  # B is the batch size, N is the number of tracks, C is the number of channels and L is the length of the audio
    shape=sampler.diff_params.default_shape
    shape= [B, *shape[1:]]  # B is the batch size, we want to sample B samples
    with torch.no_grad():
        is_wet= "wet" in input_type
        cond, x_preprocessed=sampler.diff_params.transform_forward(cond,  is_condition=True, is_test=True, clusters=cluster, taxonomy=taxonomy, masks=masks, is_wet=is_wet)
        preds, noise_init = sampler.predict_conditional(shape, cond=cond, cfg_scale=args.tester.cfg_scale, device=device, taxonomy=taxonomy, masks=masks)
    
    return preds
        




loading diff_params
Initializing EDM_Style with args: () {'type': 've_karras', 'AE_type': 'CLAP', 'cfg_dropout_prob': 0.2, 'sde_hp': {'sigma_data': 0.025, 'sigma_min': 0.0005, 'sigma_max': 5, 'max_sigma': 5, 'rho': 10}, 'MERT_args': {'layer': '8,', 'average': True, 'normalize': True}, 'CLAP_args': {'ckpt_path': '/data5/eloi/checkpoints/laion_clap/music_audioset_epoch_15_esc_90.14.patched.pt', 'distance_type': 'cosine', 'normalize': True, 'use_adaptor': False, 'adaptor_checkpoint': None, 'adaptor_type': None, 'add_noise': False, 'noise_sigma': 0.1}, 'fx_encoder_plusplus_args': {'distance_type': 'cosine', 'ckpt_path': '/home/eloi/projects/project_mfm_eloi/src/utils/feature_extractors/ckpt/fxenc_plusplus_default.pt'}, 'apply_fxnormaug': False, 'fxnormaug_train': <fx_model.fxnormaug_v0.FxNormAug object at 0x7f609553da90>, 'fxnormaug_inference': <fx_model.fxnormaug_v0.FxNormAug object at 0x7f614e6c8cd0>, 'default_shape': [1, 3, 64, 33]}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Load the specified checkpoint /data5/eloi/checkpoints/laion_clap/music_audioset_epoch_15_esc_90.14.patched.pt from users.
Load Checkpoint...
logit_scale_a 	 Loaded
logit_scale_t 	 Loaded
audio_branch.spectrogram_extractor.stft.conv_real.weight 	 Loaded
audio_branch.spectrogram_extractor.stft.conv_imag.weight 	 Loaded
audio_branch.logmel_extractor.melW 	 Loaded
audio_branch.bn0.weight 	 Loaded
audio_branch.bn0.bias 	 Loaded
audio_branch.patch_embed.proj.weight 	 Loaded
audio_branch.patch_embed.proj.bias 	 Loaded
audio_branch.patch_embed.norm.weight 	 Loaded
audio_branch.patch_embed.norm.bias 	 Loaded
audio_branch.layers.0.blocks.0.norm1.weight 	 Loaded
audio_branch.layers.0.blocks.0.norm1.bias 	 Loaded
audio_branch.layers.0.blocks.0.attn.relative_position_bias_table 	 Loaded
audio_branch.layers.0.blocks.0.attn.qkv.weight 	 Loaded
audio_branch.layers.0.blocks.0.attn.qkv.bias 	 Loaded
audio_branch.layers.0.blocks.0.attn.proj.weight 	 Loaded
audio_branch.layers.0.blocks.0.attn.proj.bias 	 

In [5]:
import torch
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

import torch
from utils.collators import collate_multitrack_sim
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from datasets.tency_mastering_multitrack_simulated import TencyMastering_Test
import omegaconf
from fx_model.apply_effects_multitrack_utils import simulate_effects

normalize_params=omegaconf.OmegaConf.create(
    {
    "normalize_mode": "rms_dry",
    #"loudness_dry": -18.0,  # Target loudness for dry tracks
    "rms_dry": -25.0
    }
)

dataset_val= TencyMastering_Test(
  mode= "dry-wet",
  segment_length= 525312,
  fs= 44100,
  stereo= True,
  tracks= ["vocals","bass","drums"],
  clusters= [0,1],
  num_tracks= 4,
  path_csv= "/data5/eloi/TencyMastering/PANNs_country_pop/val_split.csv",
  normalize_params=normalize_params,
  num_examples= 4, #use all examples
  RMS_threshold_dB= -40.0,
  seed= 42
)

batch_size = 1
val_loader = torch.utils.data.DataLoader(dataset=dataset_val, batch_size=1, num_workers=1, collate_fn=lambda x: x) 

from fx_model.distribution_presets.clusters_multitrack import get_distributions_Cluster0_vocals, get_distributions_Cluster1_vocals, get_distributions_Cluster0_bass, get_distributions_Cluster1_bass, get_distributions_Cluster0_drums, get_distributions_Cluster1_drums
from fx_model.fx_pipeline import EffectRandomizer
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")


effect_randomizer_C0={
  "92": EffectRandomizer(sample_rate=44100, distributions_dict=get_distributions_Cluster0_vocals(sample_rate=44100), device=device),
  "2": EffectRandomizer(sample_rate=44100, distributions_dict=get_distributions_Cluster0_bass(sample_rate=44100), device=device),
  "11": EffectRandomizer(sample_rate=44100, distributions_dict=get_distributions_Cluster0_drums(sample_rate=44100), device=device),
}

effect_randomizer_C1={
  "92": EffectRandomizer(sample_rate=44100, distributions_dict=get_distributions_Cluster1_vocals(sample_rate=44100), device=device),
  "2": EffectRandomizer(sample_rate=44100, distributions_dict=get_distributions_Cluster1_bass(sample_rate=44100), device=device),
  "11": EffectRandomizer(sample_rate=44100, distributions_dict=get_distributions_Cluster1_drums(sample_rate=44100), device=device),
}


inputs=[]  # List to store input audio tensors
outputs_ref=[]

i=0
for data in val_loader:
    collated_data= collate_multitrack_sim(data, max_tracks=3)
    x=collated_data['x'].to(device)  # x is a tensor of shape [B, N, C, L] where B is the batch size, N is the number of tracks, C is the number of channels and L is the length of the audio
    cluster=collated_data['clusters'].to(device)  # cluster is a tensor of shape [B, N] where B is the batch size and N is the number of tracks
    taxonomy=collated_data['taxonomies']  # taxonomy is a list of lists of taxonomies, each list is a track, each taxonomy is a string of 2 digits
    masks=collated_data['masks'].to(device)  # masks is a tensor of shape [B, N] where B is the batch size and N is the number of tracks, it is used to mask the tracks that are not present in the batch

    preds=generate_samples(x, input_type="dry")  # Generate samples using the model
    print(f"Generated samples shape: {preds.shape}")  # preds is a tensor of shape [B, N, C, L] where B is the batch size, N is the number of tracks, C is the number of channels and L is the length of the audio


    res= simulate_effects(x, cluster, taxonomy, effect_randomizer_C0, effect_randomizer_C1)

    res_list= res.cpu().unbind(dim=0)

    for j, output in enumerate(res_list):
            inputs[i*batch_size + j] = x[j].cpu()  # Store the input audio
            outputs_ref[i*batch_size + j] = output

    i += 1


4 num_examples


4it [01:18, 19.53s/it]

test_samples 4 num_examples 4 num_skips 0





UnboundLocalError: cannot access local variable 'cond' where it is not associated with a value