In [13]:
import torch
import os
import sys

# Expose only physical GPU 0 to this process; inside the kernel it is then
# addressed simply as "cuda".
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Put the parent of the current working directory on the import path
# (presumably so the project packages imported in later cells resolve).
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(parent_dir)

# Run on the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# Track to inspect (6448 is another ID that was used here).
#file_id = 6448
file_id = 23114

base_path = "/home/eloi/projects/project_mfm_eloi/audio_examples/TM_val/"

# Each track has a "dry" and a "wet" folder named after its ID.
track_name = str(file_id)
dry_dir = os.path.join(base_path, "dry", track_name)
wet_dir = os.path.join(base_path, "wet", track_name)

# Raw vocal stems.
file_dry = os.path.join(dry_dir, "vocals.wav")
file_wet = os.path.join(wet_dir, "vocals.wav")

# "vocals_normalized" variants of the same stems.
file_dry_fxnorm = os.path.join(dry_dir, "vocals_normalized.wav")
file_wet_fxnorm = os.path.join(wet_dir, "vocals_normalized.wav")

# "vocals_normalized_dr" variants of the same stems.
file_dry_fxnorm_dr = os.path.join(dry_dir, "vocals_normalized_dr.wav")
file_wet_fxnorm_dr = os.path.join(wet_dir, "vocals_normalized_dr.wav")

import soundfile as sf

# always_2d=True makes soundfile return (frames, channels) even for mono files,
# so the transposes below always produce (channels, frames).
print("Loading dry file:", file_dry)
dry, sr = sf.read(file_dry, always_2d=True)
print("Loading wet file:", file_wet)
wet, sr_wet = sf.read(file_wet, always_2d=True)
if sr_wet != sr:
    # start_t below is expressed in samples of `sr`; a different wet rate would
    # silently misalign the two excerpts.
    raise ValueError(f"Sample-rate mismatch: dry={sr} Hz, wet={sr_wet} Hz")
print("files loaded")

# Optional: the FxNorm variants can be loaded the same way, e.g.
# dry_fxnorm, _ = sf.read(file_dry_fxnorm_dr, always_2d=True)

# (frames, channels) -> (1, channels, frames)
dry = torch.from_numpy(dry.T).float().unsqueeze(0)
wet = torch.from_numpy(wet.T).float().unsqueeze(0)

# Down-mix the dry signal to mono while keeping the channel axis: (1, 1, frames).
dry = dry.mean(dim=1, keepdim=True)

# Excerpt used throughout the notebook: starts 15 s in, 524288 samples long.
start_t = 15 * sr
segment_length = 524288
wet_segment = wet[..., start_t:start_t + segment_length]


import pyloudnorm as pyln

meter = pyln.Meter(sr)


def normaliser(x):
    """Gain ``x`` so that its integrated loudness, as measured by ``meter``, is -18 LUFS.

    Args:
        x: numpy array laid out as (samples, channels), the layout documented
           by pyloudnorm.

    Returns:
        The gained copy returned by ``pyln.normalize.loudness``.
    """
    return pyln.normalize.loudness(x, meter.integrated_loudness(x), -18.0)


print("dry", dry.shape, dry.min(), dry.max())

# Drop the batch axis before handing the signal to pyloudnorm: transposing the
# full (1, channels, frames) tensor would give a 3-D (frames, channels, 1)
# array instead of the documented 2-D (samples, channels) layout.
# .cpu() keeps the cell re-runnable once `dry` already lives on the GPU.
dry_np = dry[0].cpu().numpy().T
dry = torch.from_numpy(normaliser(dry_np).T).float().unsqueeze(0).to(device)
# NOTE: `wet` is left at its original level; its normalisation is disabled.

print("dry", dry.shape, dry.min(), dry.max())
dry_segment = dry[..., start_t:start_t + segment_length]


from IPython.display import Audio

# Play back the normalised dry excerpt at its true level (no peak re-scaling).
dry_preview = dry_segment[0].cpu().numpy()
Audio(dry_preview, rate=sr, normalize=False)
# Alternative: listen to the wet excerpt instead.
#Audio(wet_segment[0].cpu().numpy(), rate=sr, normalize=True)

Using device: cuda
Loading files: /home/eloi/projects/project_mfm_eloi/audio_examples/TM_val/dry/23114/vocals.wav /home/eloi/projects/project_mfm_eloi/audio_examples/TM_val/wet/23114/vocals.wav
Loading files: /home/eloi/projects/project_mfm_eloi/audio_examples/TM_val/dry/23114/vocals.wav /home/eloi/projects/project_mfm_eloi/audio_examples/TM_val/wet/23114/vocals.wav
files loaded
dry torch.Size([1, 1, 7014686]) tensor(-0.7464) tensor(0.7499)
dry torch.Size([1, 1, 7014686]) tensor(-0.8735, device='cuda:0') tensor(0.8776, device='cuda:0')


In [2]:

from typing import List
from utils.ITOMaster_loss import compute_rms, compute_crest_factor, compute_stereo_width, compute_stereo_imbalance, compute_barkspectrum, compute_log_spread

def compute_log_rms(x: torch.Tensor, **kwargs):
    """Compute the RMS level of a signal in decibels.

    Args:
        x: audio tensor, forwarded unchanged to ``compute_rms``
           (documented by the original author as (bs, 1, seq_len)).
        **kwargs: accepted so the function can be called like the other
           feature functions (e.g. with ``sample_rate=...``); ignored.

    Returns:
        ``20 * log10(rms)``, with ``rms`` floored at 1e-8 (i.e. -160 dB) so
        that silent input does not produce ``-inf``. Same shape as
        ``compute_rms(x)``.
    """
    linear_rms = compute_rms(x)
    floored_rms = linear_rms.clamp(min=1e-8)
    return 20 * torch.log10(floored_rms)

import loudness

def compute_loudness(x: torch.Tensor, sample_rate=44100):
    """Measure the integrated loudness of every item in a batch.

    Args:
        x: audio tensor indexed as (batch, channels, time).
        sample_rate: sample rate passed to ``loudness.integrated_loudness``.

    Returns:
        Tensor of shape (batch, channels) on ``x.device``. Each item yields a
        single loudness value, which is repeated across its channel entries.
    """
    # The original referenced an undefined name `B`; take the batch size from x.
    B = x.shape[0]
    lufs_out = torch.zeros((B, x.shape[1]), device=x.device)
    for b in range(B):
        # The loudness routine works on a (time, channels) numpy array.
        x_i = x[b].detach().cpu().numpy().T
        lufs_in = loudness.integrated_loudness(x_i, sample_rate)
        lufs_out[b] = torch.tensor(lufs_in, device=x.device)

    return lufs_out

class AudioFeatureLoss(torch.nn.Module):
    def __init__(
        self,
        weights: List[float],
        sample_rate: int,
    ) -> None:
        """Weighted MSE loss between audio features of two signals.

        Args:
            weights: one weight per feature, in the order of ``self.transforms``
                (rms, crest factor, log spread, stereo width, stereo imbalance,
                bark spectrum); its length must equal the number of transforms.
            sample_rate: sample rate of the audio, forwarded to every transform.

        Based on features proposed in:

        Man, B. D., et al.
        "An analysis and evaluation of audio features for multitrack music mixtures."
        (2014).

        """
        super().__init__()
        self.sample_rate = sample_rate
        self.weights = weights

        # Only a single source (the full mix) is evaluated.
        self.sources_list = ["mix"]
        self.source_weights = [1.0]

        self.transforms = [
            compute_rms,
            compute_crest_factor,
            compute_log_spread,
            compute_stereo_width,
            compute_stereo_imbalance,
            compute_barkspectrum,
        ]

        assert len(self.transforms) == len(weights)

    def forward(self, input: torch.Tensor, target: torch.Tensor):
        """Return a dict mapping ``"<source>-<feature>"`` to its weighted MSE.

        The feature name is the transform's function name with everything up to
        and including the first underscore removed (``compute_rms`` -> ``rms``).
        """
        losses = {}

        # Insert a stem axis at dim 1, so each input is treated as one stem.
        input_stems = input.unsqueeze(1)
        target_stems = target.unsqueeze(1)

        for stem_idx in range(input_stems.shape[1]):
            source_name = self.sources_list[stem_idx]
            source_weight = self.source_weights[stem_idx]
            input_stem = input_stems[:, stem_idx, ...]
            target_stem = target_stems[:, stem_idx, ...]

            for transform, weight in zip(self.transforms, self.weights):
                feature_name = transform.__name__.partition("_")[2]
                input_feature = transform(input_stem, sample_rate=self.sample_rate)
                target_feature = transform(target_stem, sample_rate=self.sample_rate)
                mse = torch.nn.functional.mse_loss(input_feature, target_feature)
                losses[f"{source_name}-{feature_name}"] = weight * mse * source_weight

        return losses

def get_AF(x: torch.Tensor):
    """Compute the summary audio features of ``x``.

    Every feature function is called as ``fn(x, sample_rate=sr)``, where ``sr``
    is the notebook-level sample rate.

    Returns:
        Tuple ``(log_rms, crest_factor, log_spread, stereo_width, stereo_imbalance)``.
    """
    feature_fns = (
        compute_log_rms,
        compute_crest_factor,
        compute_log_spread,
        compute_stereo_width,
        compute_stereo_imbalance,
    )
    return tuple(fn(x, sample_rate=sr) for fn in feature_fns)

# Disabled AudioFeatureLoss example. NOTE: the class asserts one weight per
# transform (it registers six), so the five weights below need a sixth entry
# before this can be re-enabled.
#af_loss = AudioFeatureLoss(
#    weights=[0.1, 0.001, 1.0, 1.0, 0.1],
#    sample_rate=sr,
#).to(device)
#losses = af_loss(x, y)

# Duplicate the mono dry excerpt onto two channels; the wet excerpt is used as is.
x = dry_segment.expand(-1, 2, -1).to(device)
y = wet_segment.to(device)

# Audio features of the wet excerpt.
features = get_AF(y)


In [3]:

import numpy as np
from datasets.tency_mastering_multitrack_random import TencyMastering_Test
import omegaconf    

# Normalisation settings handed to the dataset: dry tracks are normalised by
# RMS ("rms_dry" mode) with a value of -25.0.
normalize_params = omegaconf.OmegaConf.create(
    {
        "normalize_mode": "rms_dry",
        #"loudness_dry": -18.0,  # Target loudness for dry tracks
        "rms_dry": -25.0,
    }
)

# NOTE: despite the `_val` / `_Test` names, the dataset is built in "train"
# mode from the training split CSV.
dataset_kwargs = dict(
    mode="train",
    segment_length=525312,
    fs=44100,
    stereo=True,
    tracks="all",
    num_tracks=50,
    path_csv="/data5/eloi/TencyMastering/PANNs_country_pop/train_split.csv",
    normalize_params=normalize_params,
    num_examples=-1,  # use all examples
    RMS_threshold_dB=-40.0,
    seed=42,
)
dataset_val = TencyMastering_Test(**dataset_kwargs)

# One example per batch, loaded by a single worker process.
val_loader = torch.utils.data.DataLoader(
    dataset=dataset_val,
    batch_size=1,
    num_workers=1,
)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")




-1 num_examples


50it [18:51, 22.63s/it]

test_samples 7100 num_examples -1 num_skips 0





In [10]:
def compute_loudness(x: torch.Tensor, sample_rate=44100):
    """Measure the integrated loudness of every item in a batch.

    Args:
        x: 3-D audio tensor unpacked as (batch, channels, time).
        sample_rate: sample rate passed to ``loudness.integrated_loudness``.

    Returns:
        Tensor of shape (batch, channels) on ``x.device``. Each item yields a
        single loudness value, which is repeated across its channel entries.
    """
    n_items, n_channels, _ = x.shape
    result = torch.zeros((n_items, n_channels), device=x.device)
    for idx, item in enumerate(x):
        # (time, channels) array with a constant 1e-6 offset added to every
        # sample (presumably so silent input does not measure as -inf).
        samples = item.cpu().numpy().T + 1e-6
        item_lufs = loudness.integrated_loudness(samples, sample_rate)
        result[idx] = torch.tensor(item_lufs, device=x.device)

    return result

def get_AF(x: torch.Tensor):
    """Compute the summary audio features of ``x``.

    Every feature function is called as ``fn(x, sample_rate=sr)``, where ``sr``
    is the notebook-level sample rate.

    Returns:
        Tuple ``(loudness, crest_factor, log_spread, stereo_width, stereo_imbalance)``.
    """
    feature_fns = (
        compute_loudness,
        compute_crest_factor,
        compute_log_spread,
        compute_stereo_width,
        compute_stereo_imbalance,
    )
    return tuple(fn(x, sample_rate=sr) for fn in feature_fns)

# Gather the audio features of up to `num_iterations` target examples and
# summarise them with a per-feature mean and standard deviation.
features = {}
num_iterations = 1000
for i, (x, y) in enumerate(val_loader):

    if i >= num_iterations:
        break

    y = y.to(device)

    AF = get_AF(y)

    # Skip examples whose features contain NaN or Inf.
    if any(not torch.isfinite(af).all() for af in AF):
        print(f"Error in iteration {i}: NaN or Inf found in audio features")
        continue

    print(AF)
    # Create each list on first use rather than only when i == 0: if the very
    # first example is skipped above, later iterations must still be able to
    # append without a KeyError.
    for j, af in enumerate(AF):
        features.setdefault(j, []).append(af.cpu().numpy())


# Mean and standard deviation of each feature across the collected examples.
features_mean = {}
features_std = {}

for key, value in features.items():
    print(f"Feature {key}: {len(value)} samples")
    features_mean[key] = np.mean(value, axis=0)
    features_std[key] = np.std(value, axis=0)



(tensor([[-18.9472, -18.9472]], device='cuda:0'), tensor([[15.2208, 15.4792]], device='cuda:0'), tensor([[-23.1845, -22.8997]], device='cuda:0'), tensor([0.0396], device='cuda:0'), tensor([0.0050], device='cuda:0'))
(tensor([[-14.6372, -14.6372]], device='cuda:0'), tensor([[18.2267, 17.5565]], device='cuda:0'), tensor([[-43.7256, -43.6388]], device='cuda:0'), tensor([0.1230], device='cuda:0'), tensor([-0.0001], device='cuda:0'))
(tensor([[-19.7144, -19.7144]], device='cuda:0'), tensor([[21.3269, 21.4871]], device='cuda:0'), tensor([[-11.5512, -10.6673]], device='cuda:0'), tensor([0.0706], device='cuda:0'), tensor([-0.0323], device='cuda:0'))
(tensor([[-19.8179, -19.8179]], device='cuda:0'), tensor([[10.2042, 10.2042]], device='cuda:0'), tensor([[-8.4387, -8.4387]], device='cuda:0'), tensor([0.], device='cuda:0'), tensor([0.], device='cuda:0'))
(tensor([[-15.4821, -15.4821]], device='cuda:0'), tensor([[21.8226, 21.2081]], device='cuda:0'), tensor([[-92.1364, -92.3869]], device='cuda:0')

In [None]:

def apply_loud_normalization(x, lufs=-23, sample_rate=44100,device=None):
    """Scale every item of ``x`` to a target integrated loudness.

    Args:
        x: audio tensor of shape (batch_size, channels, time). Inputs with
           another number of dimensions (at least 2) are flattened to
           (-1, channels, time) for processing and returned in their
           original shape.
        lufs: target integrated loudness.
        sample_rate: sample rate passed to ``loudness.integrated_loudness``.
        device: device of the returned tensor; defaults to ``x.device``.

    Returns:
        Tensor with the same shape and dtype as ``x``, located on ``device``.
        Items whose measured loudness is not finite (e.g. digital silence)
        are returned unscaled instead of being turned into NaN/Inf.

    Raises:
        ValueError: if ``x`` has fewer than 2 dimensions.
    """
    import math  # local import: keeps this cell independent of other cells' imports

    # `Tensor.ndim` is a property, so the original `x.ndim()` raised TypeError.
    if x.dim() < 2:
        raise ValueError(
            f"expected at least 2 dimensions (channels, time), got shape {tuple(x.shape)}"
        )

    # Remember the caller's shape before flattening so it can be restored below.
    # (The original re-tested the already reshaped tensor and never restored it.)
    orig_shape = x.shape
    x_flat = x.reshape(-1, orig_shape[-2], orig_shape[-1])

    if device is None:
        device = x.device

    x_out = torch.empty(x_flat.shape, dtype=x.dtype, device=device)
    for b in range(x_flat.shape[0]):
        item = x_flat[b].detach()
        # The loudness routine works on a (time, channels) numpy array.
        x_i = item.cpu().numpy().T
        lufs_in = float(loudness.integrated_loudness(x_i, sample_rate))

        if math.isfinite(lufs_in):
            delta_loudness = lufs - lufs_in
            gain = 10.0 ** (delta_loudness / 20.0)  # Convert dB to linear gain
        else:
            gain = 1.0

        x_out[b] = (item * gain).to(device)

    return x_out.reshape(orig_shape)




In [18]:
# Loudness-normalise the wet excerpt to -23 LUFS and confirm its shape is kept.
wet_segment_norm = apply_loud_normalization(
    wet_segment,
    lufs=-23,
    sample_rate=sr,
    device=device,
)
print("wet_segment_norm shape:", wet_segment_norm.shape)

wet_segment_norm shape: torch.Size([1, 2, 524288])


In [20]:
# Sanity check: the measured loudness should now sit at the -23 LUFS target.
lufs_out = compute_loudness(
    wet_segment_norm,
    sample_rate=sr,
)
print("Loudness of wet segment after normalization:", lufs_out)

Loudness of wet segment after normalization: tensor([[-23.0090, -23.0090]])


In [None]:
import math

    
from utils.ITOMaster_loss import compute_log_rms, compute_crest_factor, compute_stereo_width, compute_stereo_imbalance

class AF_RFF_embedding:
    """Deterministic Fourier-feature embedding of a small audio-feature vector.

    Four audio features (log RMS, crest factor, stereo width, stereo imbalance)
    are standardised, concatenated into one vector, projected with a fixed
    cosine matrix and mapped to ``[cos(p), sin(p)]``. ``decode`` approximately
    inverts the mapping from the phases of the embedding.

    The concatenation in ``transform`` and the slicing in ``decode`` assume six
    values per item: two for log RMS, two for crest factor, one each for stereo
    width and stereo imbalance. ``input_dim`` must therefore stay at 6 for
    ``encode``/``decode`` to work.
    """

    def __init__(self,
                input_dim=6,
                output_dim=64,
                sigma=1.0,
                log_rms_shift=-26.5, #calculated as the mean from the dataset
                log_rms_scale=7.0, #calculated as the std from the dataset
                crest_shift=16.7, #calculated as the mean from the dataset
                crest_scale=6.3,
                stereo_width_shift=0.28,
                stereo_width_scale=0.39,
                stereo_imbalance_shift=0.0,
                stereo_imbalance_scale=0.35,
                device="cpu"
                ):
        """
        Deterministic Fourier feature transformer using fixed cosine-based projection

        Args:
            input_dim: number of scalar feature values per item.
            output_dim: requested embedding size; raised to at least
                ``2 * input_dim`` and rounded up to an even number.
            sigma: scale applied to the projection matrix.
            *_shift / *_scale: offset and divisor used to standardise each
                feature (dataset mean / std according to the original author).
            device: device on which the projection matrix is stored.
        """

        self.device = device
        # Ensure output_dim is even and >= 2 * input_dim
        self.output_dim = max(input_dim * 2, output_dim)
        if self.output_dim % 2 != 0:
            self.output_dim += 1

        self.input_dim = input_dim
        self.sigma = sigma

        # Create deterministic projection matrix of shape (input_dim, output_dim // 2)
        self.projection = self._create_deterministic_projection(input_dim, self.output_dim // 2, sigma)
        self.projection = self.projection.to(self.device)

        # Normalization factor
        self.scale_factor = math.sqrt(2.0 / self.output_dim)

        self.log_rms_shift = log_rms_shift
        self.log_rms_scale = log_rms_scale
        self.crest_shift = crest_shift
        self.crest_scale = crest_scale
        self.stereo_width_shift = stereo_width_shift
        self.stereo_width_scale = stereo_width_scale
        self.stereo_imbalance_shift = stereo_imbalance_shift
        self.stereo_imbalance_scale = stereo_imbalance_scale

    def _create_deterministic_projection(self, input_dim, proj_dim, sigma):
        """
        Create a deterministic projection matrix using a cosine basis

        Entry (i, j) is ``sigma * cos(pi * (i + 0.5) * (j + 1) / proj_dim)``.
        Returns a tensor of shape (input_dim, proj_dim) in the default dtype.
        """
        # Build the whole matrix at once instead of looping in Python; evaluate
        # in float64 first so the values match a math.cos-based construction.
        rows = torch.arange(input_dim, dtype=torch.float64).unsqueeze(1) + 0.5
        cols = torch.arange(proj_dim, dtype=torch.float64).unsqueeze(0) + 1.0
        projection = torch.cos(math.pi * rows * cols / proj_dim)

        return projection.to(torch.get_default_dtype()) * sigma

    def encode(self, x):
        """Extract the four audio features from ``x`` and embed them.

        Returns:
            ``(embedding, (log_rms, crest_factor, stereo_width, stereo_imbalance))``
            where the features in the second element are the raw
            (un-standardised) values.
        """
        log_rms = compute_log_rms(x)
        crest_factor = compute_crest_factor(x)
        stereo_width = compute_stereo_width(x)
        stereo_imbalance = compute_stereo_imbalance(x)

        log_rms_std, crest_factor_std, stereo_width_std, stereo_imbalance_std = self.standardize_features(
            log_rms, crest_factor, stereo_width, stereo_imbalance
        )

        embedding = self.transform(
            log_rms_std, crest_factor_std, stereo_width_std, stereo_imbalance_std
        )

        return embedding, (log_rms, crest_factor, stereo_width, stereo_imbalance)

    def decode(self, fourier_features):
        """
        Invert Fourier features back to original feature space
        (approximate due to phase-only reconstruction)

        Accepts any number of leading batch dimensions. Stereo width and stereo
        imbalance are returned with a trailing axis of size 1.
        """
        reconstructed = self.inverse_transform(fourier_features)

        # Split the last axis back into the individual features
        log_rms = reconstructed[..., 0:2]
        crest_factor = reconstructed[..., 2:4]
        stereo_width = reconstructed[..., 4:5]
        stereo_imbalance = reconstructed[..., 5:6]

        log_rms, crest_factor, stereo_width, stereo_imbalance = self.destandardize_features(
            log_rms, crest_factor, stereo_width, stereo_imbalance
        )

        return log_rms, crest_factor, stereo_width, stereo_imbalance

    def standardize_features(self, log_rms, crest_factor, stereo_width, stereo_imbalance):
        """
        Standardize features using pre-computed mean and std
        """
        log_rms = (log_rms - self.log_rms_shift) / self.log_rms_scale
        crest_factor = (crest_factor - self.crest_shift) / self.crest_scale
        stereo_width = (stereo_width - self.stereo_width_shift) / self.stereo_width_scale
        stereo_imbalance = (stereo_imbalance - self.stereo_imbalance_shift) / self.stereo_imbalance_scale

        return log_rms, crest_factor, stereo_width, stereo_imbalance

    def destandardize_features(self, log_rms, crest_factor, stereo_width, stereo_imbalance):
        """
        Reverse standardization to get back to original feature space
        """
        log_rms = log_rms * self.log_rms_scale + self.log_rms_shift
        crest_factor = crest_factor * self.crest_scale + self.crest_shift
        stereo_width = stereo_width * self.stereo_width_scale + self.stereo_width_shift
        stereo_imbalance = stereo_imbalance * self.stereo_imbalance_scale + self.stereo_imbalance_shift

        return log_rms, crest_factor, stereo_width, stereo_imbalance

    def transform(self, log_rms, crest_factor, stereo_width, stereo_imbalance):
        """
        Transform features using the stored projection matrix

        ``stereo_width`` and ``stereo_imbalance`` get a trailing axis before
        being concatenated with ``log_rms`` and ``crest_factor`` along the last
        dimension. Returns ``[cos(p), sin(p)] * scale_factor`` with last
        dimension ``2 * projection.shape[1]``.
        """

        flat_features = torch.cat(
            [log_rms, crest_factor, stereo_width.unsqueeze(-1), stereo_imbalance.unsqueeze(-1)],
            dim=-1,
        )

        # Project and transform
        projected = flat_features @ self.projection
        cos_features = torch.cos(projected)
        sin_features = torch.sin(projected)

        # Concatenate and normalize
        return torch.cat([cos_features, sin_features], dim=-1) * self.scale_factor

    def inverse_transform(self, fourier_features):
        """
        Invert Fourier features back to original feature space
        (approximate due to phase-only reconstruction)

        The recovered phases lie in (-pi, pi], so the inversion is only exact
        while the projected values stay inside that range. Works on the last
        axis, so any number of leading batch dimensions is accepted.
        """
        # Split into cosine and sine components along the last axis
        feature_dim = fourier_features.shape[-1] // 2
        cos_features = fourier_features[..., :feature_dim]
        sin_features = fourier_features[..., feature_dim:]

        # Denormalize
        cos_features = cos_features / self.scale_factor
        sin_features = sin_features / self.scale_factor

        # Compute phase angles
        phases = torch.atan2(sin_features, cos_features)

        # Use pseudo-inverse for approximate inversion
        projection_pinv = torch.pinverse(self.projection)
        reconstructed = phases @ projection_pinv

        return reconstructed


# Round-trip check of the embedding on a batch of four copies of the wet excerpt.
#AF_embedding = AF_RFF_embedding(device=device, output_dim=128)
AF_embedding = AF_RFF_embedding(device=device)
y = wet_segment.expand(4, 2, -1).to(device)
#print("y shape:", y.shape, y.std(), y.mean(), y.min(), y.max())

embedding, extracted = AF_embedding.encode(y)
log_rms, crest_factor, stereo_width, stereo_imbalance = extracted

print("Embedding shape:", embedding.shape, embedding.std())

decoded = AF_embedding.decode(embedding)
log_rms_rec, crest_factor_rec, stereo_width_rec, stereo_imbalance_rec = decoded

# Print each original feature next to its reconstruction.
comparisons = (
    ("log_rms", log_rms, log_rms_rec),
    ("crest_factor", crest_factor, crest_factor_rec),
    ("stereo_width", stereo_width, stereo_width_rec),
    ("stereo_imbalance", stereo_imbalance, stereo_imbalance_rec),
)
for label, original, recovered in comparisons:
    print(label, original, recovered)

Embedding shape: torch.Size([4, 64]) tensor(0.1148, device='cuda:0')
