In [2]:

import librosa
import numpy as np
import torch
import IPython
from music2latent4 import Inferencer
import soundfile as sf

# Initialize the Inferencer
encdec = Inferencer()

# Load an example audio file (or use your own!)
#audio_path = librosa.example('trumpet')
audio_path="/home/eloi/projects/project_mfm_eloi/src/audio_examples/9220_ld_multi.wav"
wv, sr=sf.read(audio_path, dtype='float32')
#wv, sr = librosa.load(audio_path, sr=44100)  # Ensure 44.1kHz sample rate
wv=torch.from_numpy(wv).permute(1,0)  # Add batch dimension

print(wv.shape)
length=2**19+1024
start=15*sr
end=start+length
wv=wv[...,start:end]  # Truncate to 5 seconds for faster processing


print("wv", wv.shape)

# Encode the audio
latent = encdec.encode(wv)

def latent2seq(latent):
    """
    Convert the latent representation to a sequence of latent vectors.
    """
    # Reshape the latent representation to match the expected input shape
    latent = latent.view(latent.size(0), -1)
    return latent

def seq2latent(latent_sequence):
    """
    Convert the sequence of latent vectors back to the original latent representation.
    """
    # Reshape the latent sequence to match the expected output shape
    latent = latent_sequence.view(latent_sequence.size(0), -1, 64)
    return latent


latent=latent2seq(latent)
print("latent", latent.shape)

# Decode the latent representation back to a waveform
latent=seq2latent(latent)
wv_rec = encdec.decode(latent)

print("wv_rec", wv_rec.shape)

# Listen to the original and reconstructed audio 
print('Original')
IPython.display.display(IPython.display.Audio(wv, rate=sr, normalize=False))
print('Reconstructed')
IPython.display.display(IPython.display.Audio(wv_rec.squeeze().cpu().numpy(), rate=sr, normalize=False))

torch.Size([2, 11065214])
wv torch.Size([2, 525312])
latent torch.Size([16, 512])
wv_rec torch.Size([2, 525312])
Original


Reconstructed


In [3]:
def read_wav_segment(file_path, start, end, dtype="float32"):
    """
    Reads a specific segment from a .wav file efficiently.

    Args:
        file_path (str): Path to the .wav file.
        start (int): Start frame index.
        end (int): End frame index.

    Returns:
        numpy.ndarray: Audio data for the specified segment.
        int: Sample rate of the audio file.
    """
    # Open the .wav file
    with sf.SoundFile(file_path) as audio_file:
        # Read only the required frames
        audio_file.seek(start)
        data = audio_file.read(frames=end-start, dtype=dtype)
        samplerate = audio_file.samplerate

    return data, samplerate

def get_audio_length(file_path):
    """
    Retrieves the length of an audio file in seconds and frames.

    Args:
        file_path (str): Path to the audio file.

    Returns:
        float: Length of the audio file in seconds.
        int: Total number of frames in the audio file.
        int: Sample rate of the audio file.
    """
    with sf.SoundFile(file_path) as audio_file:
        total_frames = len(audio_file)  # Total number of frames
        samplerate = audio_file.samplerate  # Sample rate
        duration = total_frames / samplerate  # Duration in seconds

    return duration, total_frames, samplerate

audio_path="/home/eloi/projects/project_mfm_eloi/src/audio_examples/9220_ld_multi.wav"

duration, total_frames, samplerate = get_audio_length(audio_path)
start = 175*samplerate
end = start + 2**19+1024
#end = 2**19+1024

x, sr = read_wav_segment(audio_path, start, end)
x=torch.from_numpy(x).permute(1,0)  # Add batch dimension
rms= torch.sqrt(torch.mean(x**2, dim=-1))
rms_dB = 20 * torch.log10(rms)  # Convert to dB
print("rms", rms_dB)
IPython.display.display(IPython.display.Audio(x, rate=sr, normalize=False))

rms tensor([-20.4003, -20.4316])


In [4]:
print("x", x.shape)


x torch.Size([2, 525312])


In [8]:
x=x.mean(dim=0)  # Average across the batch dimension
y=torch.roll(x, shifts=100, dims=-1)  # Shift the tensor along the last dimension
N = x.size(-1)
M = y.size(-1)

X = torch.fft.rfft(x, n=N + M - 1)
Y = torch.fft.rfft(y, n=N + M - 1)
corr = torch.fft.irfft(X.conj() * Y)

In [9]:
print(corr.shape)

torch.Size([1050622])


In [10]:

corr

tensor([ -802.3566,  -583.9730,  -356.5474,  ..., -1355.2931, -1190.1633,
        -1005.3810])

In [14]:
import plotly.express as px

px.line(x=torch.arange(corr[0:3000].shape[-1]), y=corr[0:3000].cpu().numpy(), title="Correlation", labels={'x': 'Sample', 'y': 'Amplitude'}).show()