In [None]:
# Embeddings/Autoencoder.ipynb

import os
import glob
from PIL import Image

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
from tqdm.auto import tqdm


In [None]:
# Collect only aligned_spectrogram images: one level of per-song folders under
# data_root, each holding files named aligned_spectrogram_*.png.
data_root = "../Results/Dev"
pattern = os.path.join(data_root, "*", "aligned_spectrogram_*.png")
# Sorted so the file order (and therefore any saved embedding order) is stable
# across runs and platforms.
filenames = sorted(glob.glob(pattern))

print(f"Found {len(filenames)} spectrogram images")
if not filenames:
    # Fail fast with an actionable message instead of an IndexError here or a
    # confusing sampler error further down the notebook.
    raise FileNotFoundError(
        f"No spectrogram images matched {pattern!r}; check data_root and the working directory"
    )
print(f"Example: {filenames[0]}")


In [None]:
class SpectrogramDataset(Dataset):
    """Dataset of spectrogram images, served as grayscale square tensors.

    Each item is the image at ``self.files[idx]`` converted to single-channel
    ("L") mode, resized to ``(img_size, img_size)`` and passed through
    ``ToTensor``.
    """

    def __init__(self, filenames=None, root_dir=None, img_size=128):
        """
        Args:
            filenames (list[str]): Explicit list of image paths to load. Takes
                precedence over ``root_dir`` when both are given.
            root_dir (str): Directory containing song folders with spectrograms.
                If provided, will auto-discover aligned_spectrogram images.
            img_size (int): Final resize to (img_size, img_size).

        Raises:
            ValueError: If neither ``filenames`` nor ``root_dir`` is provided.
        """
        if filenames is not None:
            # Copy so later mutation of the caller's list cannot silently
            # change the dataset length or ordering.
            self.files = list(filenames)
        elif root_dir is not None:
            self.files = self._discover_files(root_dir)
        else:
            raise ValueError("Must provide either filenames or root_dir")

        self.transform = T.Compose([
            T.Resize((img_size, img_size)),
            T.ToTensor(),  # scales to [0,1]
        ])

    @staticmethod
    def _discover_files(root_dir):
        """Return the sorted aligned_spectrogram PNG paths one level below ``root_dir``."""
        # ``glob`` is imported as a module, so the function is ``glob.glob``.
        # The name filter lives in the pattern's basename component, so a
        # directory whose name happens to contain "aligned_spectrogram" does
        # not cause unrelated PNGs to match.
        pattern = os.path.join(root_dir, "*", "*aligned_spectrogram*.png")
        return sorted(glob.glob(pattern))

    def __len__(self):
        """Number of image files in the dataset."""
        return len(self.files)

    def __getitem__(self, idx):
        """Load image ``idx`` as grayscale and return the transformed tensor."""
        # The context manager releases the file handle deterministically; the
        # transform runs inside it, on the already-converted in-memory copy.
        with Image.open(self.files[idx]) as img:
            return self.transform(img.convert("L"))  # grayscale


# Data-loading hyperparameters: square edge length in pixels and mini-batch size.
img_size, batch_size = 128, 32

# Wrap the spectrogram paths collected earlier and serve them in shuffled batches.
dataset = SpectrogramDataset(filenames=filenames, img_size=img_size)
dataloader = DataLoader(
    dataset,
    shuffle=True,
    batch_size=batch_size,
)

print("Dataset ready:", len(dataset), "images")


In [None]:
class SimpleAutoencoder(nn.Module):
    """Fully connected autoencoder for single-channel square images.

    The encoder flattens the input and maps it through one hidden layer to an
    ``embedding_dim`` vector; the decoder mirrors it and ends in a sigmoid so
    reconstructions lie in [0, 1].
    """

    def __init__(self, embedding_dim=128, img_size=128):
        """
        Args:
            embedding_dim (int): Size of the bottleneck embedding.
            img_size (int): Edge length of the square input images. Stored on
                the instance so the model does not depend on a notebook-level
                global that may be rebound after construction.
        """
        super().__init__()
        self.img_size = img_size
        n_pixels = img_size * img_size

        # Encoder
        self.encoder = nn.Sequential(
            nn.Flatten(),
            nn.Linear(n_pixels, 512),
            nn.ReLU(),
            nn.Linear(512, embedding_dim)
        )
        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(embedding_dim, 512),
            nn.ReLU(),
            nn.Linear(512, n_pixels),
            nn.Sigmoid(),  # outputs in [0,1]
        )

    def forward(self, x):
        """Encode ``x`` and reconstruct it.

        Returns:
            tuple: ``(recon, z)`` where ``recon`` has shape
            ``(batch, 1, img_size, img_size)`` and ``z`` is the encoder output.
        """
        z = self.encoder(x)
        recon = self.decoder(z)
        # Reshape the flat decoder output back to image layout using the size
        # fixed at construction time.
        recon = recon.view(-1, 1, self.img_size, self.img_size)
        return recon, z

# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Build the dense autoencoder and move its parameters to the chosen device.
model = SimpleAutoencoder(embedding_dim=128)
model = model.to(device)

# Pixel-wise mean squared reconstruction error, optimised with Adam.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

print("Model ready on", device)


In [None]:
epochs = 10  # adjust as needed

for epoch in range(epochs):
    # Explicitly (re-)enter training mode so re-running this cell after the
    # evaluation cell (which calls model.eval()) still trains correctly.
    model.train()
    running_loss = 0.0
    for imgs in dataloader:
        imgs = imgs.to(device)
        recon, z = model(imgs)
        loss = criterion(recon, imgs)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # criterion returns a per-batch mean; weight it by the batch size so a
        # smaller final batch does not skew the epoch average.
        running_loss += loss.item() * imgs.size(0)

    # Per-sample mean over the whole dataset.
    epoch_loss = running_loss / len(dataset)
    print(f"Epoch {epoch+1}/{epochs} - Loss: {epoch_loss:.6f}")


In [None]:
# Put the model in evaluation mode before extracting embeddings.
model.eval()
all_embeddings = []

# A fresh loader without shuffle=True, so rows follow the dataset's own order.
# Gradients are not needed for inference.
with torch.no_grad():
    for imgs in DataLoader(dataset, batch_size=batch_size):
        imgs = imgs.to(device)
        _, z = model(imgs)
        # Move each batch of embeddings to the CPU before collecting it.
        all_embeddings.append(z.to("cpu"))

# Stack the per-batch results along the batch axis -> (N, embedding_dim).
embeddings_tensor = torch.cat(all_embeddings)
print("Final embeddings shape:", embeddings_tensor.shape)

# Persist the embeddings, creating the target directory if it is missing.
save_path = "../Results/EmbeddingData/autoencoder_embeddings.pt"
os.makedirs(os.path.split(save_path)[0], exist_ok=True)
torch.save(embeddings_tensor, save_path)
print(f"Saved embeddings to {save_path}")


In [None]:
# --------------------------
# Conv Autoencoder
# --------------------------
class ConvAutoencoder(nn.Module):
    """Convolutional autoencoder for single-channel square images.

    Four stride-2 convolutions halve the spatial size each time (a factor of 16
    overall), a linear layer maps the flattened feature map to the latent
    vector, and the decoder mirrors this with four stride-2 transposed
    convolutions that each double the spatial size.
    """

    def __init__(self, latent_dim=128, img_size=128):
        """
        Args:
            latent_dim (int): Size of the latent embedding.
            img_size (int): Edge length of the square input images. Must be a
                positive multiple of 16 so the four stride-2 stages divide it
                evenly and the decoder restores exactly the input size.

        Raises:
            ValueError: If ``img_size`` is not a positive multiple of 16.
        """
        super().__init__()
        if img_size <= 0 or img_size % 16 != 0:
            raise ValueError(
                f"img_size must be a positive multiple of 16, got {img_size}"
            )
        self.img_size = img_size
        # Channel count and spatial edge of the bottleneck feature map
        # (128 x 8 x 8 for the default 128 x 128 input).
        self.feat_channels = 128
        self.feat_size = img_size // 16
        flat_dim = self.feat_channels * self.feat_size * self.feat_size

        # Encoder (shape comments are for the default img_size=128)
        self.encoder = nn.Sequential(
            nn.Conv2d(1, 16, 3, stride=2, padding=1),  # 1x128x128 -> 16x64x64
            nn.ReLU(True),
            nn.Conv2d(16, 32, 3, stride=2, padding=1), # 32x32x32
            nn.ReLU(True),
            nn.Conv2d(32, 64, 3, stride=2, padding=1), # 64x16x16
            nn.ReLU(True),
            nn.Conv2d(64, 128, 3, stride=2, padding=1),# 128x8x8
            nn.ReLU(True),
        )
        self.flatten = nn.Flatten()
        self.fc_enc = nn.Linear(flat_dim, latent_dim)

        # Decoder
        self.fc_dec = nn.Linear(latent_dim, flat_dim)
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(128, 64, 3, stride=2, padding=1, output_padding=1),  # 64x16x16
            nn.ReLU(True),
            nn.ConvTranspose2d(64, 32, 3, stride=2, padding=1, output_padding=1),   # 32x32x32
            nn.ReLU(True),
            nn.ConvTranspose2d(32, 16, 3, stride=2, padding=1, output_padding=1),   # 16x64x64
            nn.ReLU(True),
            nn.ConvTranspose2d(16, 1, 3, stride=2, padding=1, output_padding=1),    # 1x128x128
            nn.Sigmoid(),  # keep values in [0,1]
        )

    def encode(self, x):
        """Map a batch of images to latent vectors of shape ``(batch, latent_dim)``."""
        h = self.encoder(x)
        h = self.flatten(h)
        z = self.fc_enc(h)
        return z

    def decode(self, z):
        """Map latent vectors back to images of shape ``(batch, 1, img_size, img_size)``."""
        h = self.fc_dec(z)
        # Restore the bottleneck feature-map layout expected by the decoder.
        h = h.view(-1, self.feat_channels, self.feat_size, self.feat_size)
        x_recon = self.decoder(h)
        return x_recon

    def forward(self, x):
        """Return the reconstruction of ``x`` (the latent vector is not returned)."""
        z = self.encode(x)
        x_recon = self.decode(z)
        return x_recon

# --------------------------
# Training Setup
# --------------------------
# Use the GPU when PyTorch can see one, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


# Data-loading hyperparameters: square edge length in pixels and mini-batch size.
img_size, batch_size = 128, 32

# Rebuild the dataset and a shuffled loader for the convolutional model.
dataset = SpectrogramDataset(filenames=filenames, img_size=img_size)
dataloader = DataLoader(
    dataset,
    shuffle=True,
    batch_size=batch_size,
)

# Convolutional autoencoder trained with MSE reconstruction loss and Adam.
model = ConvAutoencoder(latent_dim=128)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)



In [None]:
# --------------------------
# Training Loop
# --------------------------
n_epochs = 10
for epoch in range(n_epochs):
    # Make sure the model is in training mode at the start of every epoch.
    model.train()
    running_loss = 0.0

    for imgs in tqdm(dataloader, desc=f"Epoch {epoch+1}/{n_epochs}"):
        imgs = imgs.to(device)

        # Clear gradients from the previous step before the new forward/backward pass.
        optimizer.zero_grad()

        recon = model(imgs)
        loss = criterion(recon, imgs)
        loss.backward()
        optimizer.step()

        # The criterion yields a batch mean; scale by batch size to accumulate a sum.
        running_loss = running_loss + loss.item() * imgs.shape[0]

    # Average reconstruction loss per sample over the whole dataset.
    epoch_loss = running_loss / len(dataset)
    print(f"Epoch [{epoch+1}/{n_epochs}], Loss: {epoch_loss:.6f}")
