In [8]:
import os
import re
import json
import torch
import utils.dnnlib as dnnlib
from utils.torch_utils import distributed as dist
import utils.setup as setup
from training.trainer import Trainer
from omegaconf import OmegaConf

import warnings


In [9]:
def parse_int_list(s):
    """Parse a comma-separated list of integers and inclusive ranges.

    Examples: ``"1,2,5-7"`` -> ``[1, 2, 5, 6, 7]``; ``"1, 3-5"`` -> ``[1, 3, 4, 5]``.

    Args:
        s: Either a list (returned unchanged, same object) or a string of
            comma-separated tokens, each an integer (``"4"``) or an inclusive
            range (``"2-6"``). Whitespace around each token is ignored.

    Returns:
        A list of ints in the order the tokens appear.

    Raises:
        ValueError: If a token is neither an integer nor an ``a-b`` range
            (this includes empty tokens such as a trailing comma).
    """
    if isinstance(s, list):
        return s
    range_re = re.compile(r'^(\d+)-(\d+)$')
    ranges = []
    for token in s.split(','):
        # Strip first so inputs like "1, 3-5" still match the range pattern.
        token = token.strip()
        m = range_re.match(token)
        if m:
            # Ranges are inclusive on both ends, hence the +1.
            ranges.extend(range(int(m.group(1)), int(m.group(2)) + 1))
        else:
            ranges.append(int(token))
    return ranges

In [56]:
def worker_init_fn(worker_id):
    """Re-seed NumPy's global RNG for a DataLoader worker.

    The new seed is the third element of the legacy ``np.random.get_state()``
    tuple plus ``worker_id``, so workers started from the same RNG state get
    different seeds.

    Args:
        worker_id: Integer id of the worker; added to the base value.
    """
    # Imported here because no other cell imports numpy; this also keeps the
    # function self-contained if it is run in a separate worker process.
    import numpy as np

    st = np.random.get_state()[2]
    np.random.seed(st + worker_id)

In [82]:
from omegaconf import OmegaConf

args = OmegaConf.create({
    # Output directory for this run, relative to the notebook's working directory.
    # The model-directory setup cell reads args.model_dir, and the recorded Trainer
    # output reports "Missing key model_dir" / "Could not resume from checkpoint"
    # when this key is absent.
    # NOTE(review): "experiments/cqt" is taken from the recorded "Model directory"
    # output of this notebook -- confirm it is the intended location.
    "model_dir": "experiments/cqt",

    # Logging --- Base logging
    "logging": {
        "log": True,
        "log_interval": 1000,
        "heavy_log_interval": 50000,  # same as save_interval
        "save_model": True,
        "save_interval": 50000,

        "num_sigma_bins": 20,
        "freq_cqt_logging": 100,

        "print_model_summary": False,  # original is True

        "profiling": {
            "enabled": True,
            "wait": 5,
            "warmup": 10,
            "active": 2,
            "repeat": 1,
        },

        "stft": {
            "win_size": 1024,
            "hop_size": 256,
        },

        "cqt": {
            "hop_length": 1024,
            "num_octs": 6,
            "fmin": 70,
            "bins_per_oct": 1,
        },

        "log_feature_stats": True,
        "log_feature_stats_interval": 50000,
    },

    # Dataset configuration
    # NOTE: the paths below are machine-specific absolute paths; update them before running.
    "dset": {
        "name": "musicnet",
        "callable": "datasets.audiofolder.AudioFolderDataset",
        "path": r"E:\Class\ECE661\audio-inpainting-diffusion\musicnet\train_data",

        "test": {
            "callable": "datasets.audiofolder_test.AudioFolderDatasetTest",
            "num_samples": 4,
            "batch_size": 1,
            "path": r"E:\Class\ECE661\audio-inpainting-diffusion\musicnet\test_data",
        },
    },

    # Network configuration
    "network": {
        "name": "unet_cqt_oct_with_attention",  # adaLN_2
        "callable": "networks.unet_cqt_oct_with_projattention_adaLN_2.Unet_CQT_oct_with_attention",

        "use_fencoding": False,
        "use_norm": True,
        "filter_out_cqt_DC_Nyq": True,

        "depth": 7,
        "emb_dim": 256,

        "Ns": [64, 96, 96, 128, 128, 256, 256],
        "Ss": [2, 2, 2, 2, 2, 2, 2],
        "num_dils": [2, 3, 4, 5, 6, 7, 7],

        "attention_layers": [0, 0, 0, 0, 1, 1, 1, 1],
        "bottleneck_type": "res_dil_convs",
        "num_bottleneck_layers": 1,

        "cqt": {
            "window": "kaiser",
            "beta": 1,
            "num_octs": 7,
            "bins_per_oct": 64,
        },

        "attention_dict": {
            "num_heads": 8,
            "attn_dropout": 0.0,
            "bias_qkv": False,
            "N": 0,
            "rel_pos_num_buckets": 32,
            "rel_pos_max_distance": 64,
            "use_rel_pos": False,
            "Nproj": 8,
        },

        # Optional transformer block (uncomment if needed)
        # "transformer": {
        #     "num_heads": 8,
        #     "dim_head": 64,
        #     "num_layers": 16,
        #     "channels": 512,
        #     "attn_dropout": 0.1,
        #     "multiplier_ff": 4,
        #     "activation": "gelu",
        # }
    },

    # Diffusion parameters
    "diff_params": {
        "callable": "diff_params.edm.EDM",
        "sigma_data": 0.063,
        "sigma_min": 1e-5,
        "sigma_max": 10,
        "P_mean": -1.2,
        "P_std": 1.2,
        "ro": 13,
        "ro_train": 10,
        "Schurn": 5,
        "Snoise": 1,
        "Stmin": 0,
        "Stmax": 50,
        "aweighting": {
            "use_aweighting": False,
            "ntaps": 101,
        },
    },

    # Tester configuration
    # here use inpainting_tester
    "tester": {
        "do_test": True,
        "name": "inpainting_tester",
        "callable": "testing.tester_inpainting.Tester",
        "sampler_callable": "testing.edm_sampler_inpainting.Sampler",

        "modes": ["inpainting"],

        "T": 35,
        "order": 2,
        "filter_out_cqt_DC_Nyq": True,
        "checkpoint": "experiments/54/22k_8s-790000.pt",

        "unconditional": {
            "num_samples": 4,
            "audio_len": 184184,
        },

        "posterior_sampling": {
            "xi": 0.25,
            "norm": 2,
            "smoothl1_beta": 1,
        },

        "data_consistency": {
            "use": True,
            "type": "always",
            "smooth": True,
            "hann_size": 50,
        },

        # Sampling-time diffusion parameters (separate from the training "diff_params" above).
        "diff_params": {
            "same_as_training": False,
            "sigma_data": 0.063,
            "sigma_min": 1e-4,
            "sigma_max": 1,
            "P_mean": -1.2,
            "P_std": 1.2,
            "ro": 13,
            "ro_train": 13,
            "Schurn": 10,
            "Snoise": 1.0,
            "Stmin": 0,
            "Stmax": 50,
        },

        "autoregressive": {
            "overlap": 0.25,
            "num_samples": 4,
        },

        "sampler": "stochastic",
        "noise_in_observations_SNR": None,

        "inpainting": {
            "mask_mode": "long",
            "long": {
                "gap_length": 1500,
                "start_gap_idx": None,
            },
            "short": {
                "num_gaps": 4,
                "gap_length": 25,
                "start_gap_idx": None,
            },
        },

        "spectrogram_inpainting": {
            "stft": {
                "window": "hann",
                "n_fft": 1024,
                "hop_length": 256,
                "win_length": 1024,
            },
            "time_mask_length": 2000,
            "time_start_idx": None,
            "min_masked_freq": 300,
            "max_masked_freq": 2000,
        },

        "STN_inpainting": {
            "STN_params": {
                "nwin1": 4096,
                "G1": 0.65,
                "G2": 0.7,
            },
            "type": "T",
        },

        "comp_sens": {
            "percentage": 5,
        },

        "max_thresh_grads": 1,
        "type_spec": "linear",

        "declipping": {
            "SDR": 3,
        },
    },

    # Experiment configuration
    "exp": {
        "exp_name": "musicnet44k_4s_Duke",
        "trainer_callable": "training.trainer.Trainer",
        "model_dir": None,  # overwritten by the model-directory setup cell
        "optimizer": {
            "type": "adam",
            "beta1": 0.9,
            "beta2": 0.999,
            "eps": 1e-8,
        },

        "wandb": {
            "entity": "eloimoliner",
            "project": "A-diffusion",
        },
        "lr": 2e-4,
        "lr_rampup_it": 10000,
        "scheduler_step_size": 60000,
        "scheduler_gamma": 0.8,
        "batch": 4,
        "batch_gpu": 4,
        "num_accumulation_rounds": 1,
        "use_fp16": False,
        "num_workers": 4,
        "seed": 42,
        "resume": True,
        "resume_checkpoint": None,
        "sample_rate": 44100,
        "audio_len": 184184,
        "resample_factor": 1,
        "device": "cpu",
        "use_cqt_DC_correction": False,
        "ema_rate": 0.9999,
        "ema_rampup": 10000,
        "use_grad_clip": True,
        "max_grad_norm": 1,
        "restore": False,
        "checkpoint_id": None,
        "augmentations": {
            "rev_polarity": True,
            "pitch_shift": {
                "use": False,
                "min_semitones": -6,
                "max_semitones": 6,
            },
            "gain": {
                "use": False,
                "min_db": -3,
                "max_db": 3,
            },
        },
    },
})


In [11]:
# Setup device
# Use the GPU when one is visible to torch, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [12]:
# Setup model directory
# __file__ is not defined in a notebook, so the literal string "__file__" is
# resolved against the current working directory; its dirname is that directory.
dirname = os.path.dirname(os.path.abspath("__file__"))

# The config may lack a top-level "model_dir"; .get() returns None instead of
# raising on a missing key. Fall back to the directory shown in this notebook's
# recorded output.
# NOTE(review): confirm "experiments/cqt" is the intended default.
configured_model_dir = args.get("model_dir") or "experiments/cqt"

# os.path.join drops `dirname` when the second part is already absolute, so
# re-running this cell leaves the path unchanged.
args.model_dir = os.path.join(dirname, str(configured_model_dir))
# exist_ok avoids the check-then-create race and makes the cell re-runnable.
os.makedirs(args.model_dir, exist_ok=True)
args.exp.model_dir = args.model_dir

print(f"Model directory: {args.model_dir}")

Model directory: e:\Class\ECE661\audio-inpainting-diffusion\experiments/cqt


In [13]:
# Setup multiprocessing
# force=True keeps this cell re-runnable: without it, calling set_start_method a
# second time in the same kernel raises RuntimeError("context has already been set").
torch.multiprocessing.set_start_method('spawn', force=True)

In [None]:
# check the dataset path
# Sanity check that args.dset.path in the config cell points at a folder that
# contains .wav files on this machine, by reading one of them.
import glob
import random
import soundfile as sf

path = args.dset.path
# Sorted so the file that gets read does not depend on directory listing order.
filelist = sorted(glob.glob(os.path.join(path, "*.wav")))
if not filelist:
    raise FileNotFoundError(
        f"No .wav files found in {path!r}; update args.dset.path in the config cell."
    )

# Read the second file when there is one, otherwise the only file available.
num = min(1, len(filelist) - 1)
file = filelist[num]
data, samplerate = sf.read(file)
print(f"Found {len(filelist)} .wav files; read {file}: shape={data.shape}, samplerate={samplerate}")

In [14]:
import sys
import os

# Add the project root directory to Python path
# dirname('__file__') is the empty string, so abspath() resolves to the current
# working directory (a notebook has no real __file__).
project_root = os.path.abspath(os.path.dirname('__file__'))
if project_root not in sys.path:
    sys.path.append(project_root)
    print(f"Added {project_root} to Python path")

# Later cells import the dataset modules by bare name (e.g. `from audiofolder import ...`),
# so the datasets folder itself must be importable. It is derived from project_root
# rather than hard-coded, and guarded so re-running the cell does not add duplicates.
datasets_dir = os.path.join(project_root, "datasets")
if datasets_dir not in sys.path:
    sys.path.append(datasets_dir)

Added e:\Class\ECE661\audio-inpainting-diffusion to Python path


In [58]:
# Setup training dataset
from audiofolder import AudioFolderDataset

# Sample rate and segment length are both scaled by the resample factor.
train_fs = args.exp.sample_rate * args.exp.resample_factor
train_seg_len = args.exp.audio_len * args.exp.resample_factor
dset_obj = AudioFolderDataset(
    dset_args=args.dset,
    fs=train_fs,
    seg_len=train_seg_len,
    overfit=False,
)

# Per the original note: the dataset has an __iter__ method and yields random
# audio segments; with the config above each segment is 184184 samples
# (about 4.17 seconds at 44100 Hz).
# train_set is the iterator over the loader, not the loader itself.
train_loader = torch.utils.data.DataLoader(
    dataset=dset_obj,
    batch_size=args.exp.batch,
    num_workers=args.exp.num_workers,
    pin_memory=True,
    worker_init_fn=worker_init_fn,
)
train_set = iter(train_loader)

In [60]:
# set up diff model
# Build the EDM diffusion-parameter object from the full config.
# The result is bound to `diff_parameters`; later cells must use that exact name.
from diff_params.edm import EDM
diff_parameters=EDM(args)
print("Diffusion parameters setup complete")

Diffusion parameters setup complete


In [50]:
# Setup network
# Instantiate the CQT U-Net with attention from the full config and the selected device.
from networks.unet_cqt_oct_with_projattention_adaLN_2 import Unet_CQT_oct_with_attention
network = Unet_CQT_oct_with_attention(args, device)
print("Network setup complete")

using a kaiser window with beta= 1
Attention layer at (down) octave 4
Attention layer at (down) octave 5
Attention layer at (down) octave 6
Attention layer at (up) oct layer 6
Attention layer at (up) oct layer 5
Attention layer at (up) oct layer 4
Network setup complete


In [51]:
# Setup optimizer
# Adam over all network parameters; hyper-parameters come from args.exp / args.exp.optimizer.
adam_cfg = args.exp.optimizer
optimizer = torch.optim.Adam(
    network.parameters(),
    lr=args.exp.lr,
    betas=(adam_cfg.beta1, adam_cfg.beta2),
    eps=adam_cfg.eps,
)
print("Optimizer setup complete")

Optimizer setup complete


In [57]:
# Setup test dataset
from audiofolder_test import AudioFolderDatasetTest

# Same sample-rate / segment-length scaling as the training dataset.
test_fs = args.exp.sample_rate * args.exp.resample_factor
test_seg_len = args.exp.audio_len * args.exp.resample_factor
test_set_obj = AudioFolderDatasetTest(
    dset_args=args.dset,
    fs=test_fs,
    seg_len=test_seg_len,
    num_samples=args.dset.test.num_samples,
)

# Unlike train_set, test_set is the DataLoader itself (not an iterator over it).
test_set = torch.utils.data.DataLoader(
    dataset=test_set_obj,
    batch_size=args.dset.test.batch_size,
    num_workers=args.exp.num_workers,
    pin_memory=True,
    worker_init_fn=worker_init_fn,
)

In [63]:
# Setup tester
# NOTE(review): args.tester.callable names "testing.tester_inpainting.Tester", while this
# cell imports testing.tester.Tester -- confirm which class is intended.
from testing.tester import Tester

# The EDM object is bound to `diff_parameters` in the diffusion setup cell; the name
# `diff_params` is never assigned in this notebook, so passing it raises NameError
# on a fresh kernel.
tester = Tester(
    args=args,
    network=network,
    test_set=test_set,
    diff_params=diff_parameters,
    device=device,
)
print("Tester setup complete")

Tester setup complete


In [None]:
# Setup trainer
# Wire together the training iterator, network, optimizer, diffusion parameters and tester.
from training.trainer import Trainer

trainer = Trainer(
    args,
    dset=train_set,
    network=network,
    optimizer=optimizer,
    diff_params=diff_parameters,
    tester=tester,
    device=device,
)
print("Trainer setup complete")

total_params:  186.279616 M
trying to load a project checkpoint
checkpoint_id None
Missing key model_dir
    full_key: model_dir
    object_type=dict
Could not resume from checkpoint
training from scratch


wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
wandb: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:wandb: Paste an API key from your profile and hit enter:

In [69]:
# Print Training related information
# Collect the summary lines first, then emit them with a single print call.
gpu_count = 1 if torch.cuda.is_available() else 0
summary_lines = [
    '\nTraining options:',
    f'Network architecture:    {args.network.callable}',
    f'Diffusion parameterization:  {args.diff_params.callable}',
    f'Batch size:              {args.exp.batch}',
    f'Number of GPUs:          {gpu_count}',
    f'Mixed-precision:         {args.exp.use_fp16}',
]
print('\n'.join(summary_lines))


Training options:
Network architecture:    networks.unet_cqt_oct_with_projattention_adaLN_2.Unet_CQT_oct_with_attention
Diffusion parameterization:  diff_params.edm.EDM
Batch size:              4
Number of GPUs:          1
Mixed-precision:         False


In [None]:
# Start training
# Runs the Trainer's training loop using the objects wired up in the trainer setup cell.
trainer.training_loop()