In [3]:
import torch
import torch.nn as nn
torch.__version__

'2.9.1+cpu'

In [4]:
class Encoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=16, kernel_size=4, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv1d(in_channels=16, out_channels=32, kernel_size=4, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv1d(in_channels=32, out_channels=6, kernel_size=4, stride=2, padding=1),
            nn.ReLU(),
        )

    def forward(self, x):
        return self.encoder(x)

class Decoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.decoder = nn.Sequential(
            nn.ConvTranspose1d(in_channels=6, out_channels=32, kernel_size=4, stride=2, padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=32, out_channels=16, kernel_size=4, stride=2, padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=16, out_channels=1, kernel_size=4, stride=2, padding=1),
            nn.Tanh(),
        )

    def forward(self, x):
        return self.decoder(x)

In [5]:
# Create batches of frames of a given length n (frame_size) every k (hop_length) samples in the audio
def audio_to_frames(audio_tensor, audio_length, frame_size = 1024, hop_length = 512):
    # If, for iteration i, i * hop_length + frame_size > audio_length, torch.unfold will drop any data at the end of the 1D tensor
    # So here we calculate the number of 0s we need to pad to the end of the 1D tensor
    padding = (frame_size - audio_length % hop_length) % frame_size
    padded_audio_tensor = torch.nn.functional.pad(audio_tensor, (0, padding))

    frames = padded_audio_tensor.unfold(0, frame_size, hop_length)
    frames = frames.unsqueeze(1) # Add channel dimension
    return frames

# Reconstruct audio given a tensor of frames
def frames_to_audio(audio_frames, audio_length, frame_size = 1024, hop_length = 512):
    num_frames = audio_frames.shape[0]
    total_length = hop_length * (num_frames - 1) + frame_size
    reconstructed = torch.zeros(total_length)
    weight = torch.zeros(total_length)

    for i, frame in enumerate(audio_frames.squeeze(1)):
        start = i * hop_length
        end = start + frame_size
        reconstructed[start:end] += frame
        weight[start:end] += 1

    reconstructed = (reconstructed / weight)[:audio_length]
    return reconstructed

In [6]:
import librosa
import IPython.display as ipd
import os

# Load test audio with a sample rate of 44khz
SAMPLE_RATE = 44100
audios = []
nb_audios = 2
compteur = 0

for file in os.listdir("audios"):
    audio, sr = librosa.load("audios/"+file, sr=SAMPLE_RATE)
    audios.append(audio)
    compteur += 1
    if compteur >= nb_audios:
        break

print(len(audios))


2


In [7]:
# Transform numpy array into torch tensor
audio_tensor = []
for audio in audios:
    audio_tensor.append(torch.from_numpy(audio))

In [8]:
frame_size = 1024
hop_length = 512

frames = []
for tensor in audio_tensor:
    frames.append(audio_to_frames(tensor, len(tensor), frame_size, hop_length))
frames = torch.cat(frames)
print(frames.shape)

torch.Size([3601, 1, 1024])


In [9]:
# Basic test for the output of our encoder -> decoder models
encoder_model = Encoder()
decoder_model = Decoder()

encoded_data = encoder_model(frames[0].unsqueeze(0))
print(encoded_data.shape)
decoded_data = decoder_model(encoded_data)

# Does the output have the same shape as the input?
frames[0].unsqueeze(0).shape, decoded_data.shape

torch.Size([1, 6, 128])


(torch.Size([1, 1, 1024]), torch.Size([1, 1, 1024]))

In [10]:
# Training the model
from torch.utils.data import DataLoader, TensorDataset

dataset = TensorDataset(frames)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

epochs = 10
loss_fn = nn.L1Loss() # L1 loss against the original audio sample works fine
optimizer = torch.optim.Adam(list(encoder_model.parameters()) + list(decoder_model.parameters()), lr=0.01)

print("Beginning Training...")
for epoch in range(epochs):
    print(f"---- Epoch {epoch} ----")
    encoder_model.train()
    decoder_model.train()

    train_loss = 0
    for batch_tuple in dataloader:
        batch = batch_tuple[0]  # shape (batch_size, 1, frame_size)
        latent = encoder_model(batch)
        reconstructed = decoder_model(latent)

        loss = loss_fn(reconstructed, batch)
        train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_loss /= len(dataloader)

    print(f"Train Loss: {train_loss:.5f}")



Beginning Training...
---- Epoch 0 ----
Train Loss: 0.03305
---- Epoch 1 ----
Train Loss: 0.02175
---- Epoch 2 ----
Train Loss: 0.02143
---- Epoch 3 ----
Train Loss: 0.02195
---- Epoch 4 ----
Train Loss: 0.02119
---- Epoch 5 ----
Train Loss: 0.02130
---- Epoch 6 ----
Train Loss: 0.02151
---- Epoch 7 ----
Train Loss: 0.02137
---- Epoch 8 ----
Train Loss: 0.02115
---- Epoch 9 ----
Train Loss: 0.02193


In [11]:
encoder_model.eval()
decoder_model.eval()

with torch.inference_mode():
    encoded_audio = encoder_model(frames)
    decoded_audio = decoder_model(encoded_audio)

    from sklearn.metrics import mean_absolute_error

    print(mean_absolute_error(frames.squeeze(1), decoded_audio.squeeze(1)))

0.020875651389360428


In [12]:
reconstructed_audio = frames_to_audio(decoded_audio, len(audios[0]), frame_size, hop_length)
ipd.Audio(reconstructed_audio, rate=SAMPLE_RATE)

In [13]:
import pandas as pd

encoded_dataFrame = pd.DataFrame(encoded_audio.flatten(start_dim=1).numpy())
print(encoded_dataFrame.shape)
encoded_dataFrame.to_csv("encoded_csv.csv", index=False)

(3601, 768)


In [None]:
encoded_audio_quantized = torch.from_numpy(pd.read_csv("encoded_quantized_csv.csv").to_numpy()).float()
n, m = encoded_audio_quantized.shape
encoded_audio_quantized = encoded_audio_quantized.view(n, 6, m//6)

torch.float32
torch.float32


In [46]:
decoder_model.eval()

with torch.inference_mode():
    print(encoded_audio_quantized.shape, encoded_audio.shape)
    decoded_audio_quantized = decoder_model(encoded_audio_quantized)

    from sklearn.metrics import mean_absolute_error
    print(mean_absolute_error(frames.squeeze(1), decoded_audio_quantized.squeeze(1)))
    print(f"Compression : {(1-(encoded_audio.shape[1]*encoded_audio.shape[2]*8)/(frames.shape[2]*32))*100}%")


torch.Size([3601, 6, 128]) torch.Size([3601, 6, 128])
0.023608900606632233
Compression : 81.25%
