# Variational Auto Encoder
The logic for this has been taken from the original paper and Lecture 13 of CS231n

## Model

In [2]:
import torch
import torch.nn as nn
from datasets import tqdm

In [11]:
class VariationalAutoencoder(nn.Module):

    def __init__(self, input_dim, hidden_dim = 128*128, latent_dim = 32*32):
        super(VariationalAutoencoder, self).__init__()

        #Encoding
        self.img2hid = nn.Linear(input_dim, hidden_dim)
        self.hid2mu = nn.Linear(hidden_dim, latent_dim)
        self.hid2var = nn.Linear(hidden_dim, latent_dim)

        #Decoding
        self.z2hid = nn.Linear(latent_dim, hidden_dim)
        self.hid2img = nn.Linear(hidden_dim, input_dim)

        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def encode(self, x):

        h = self.relu(self.img2hid(x))
        mu = self.hid2mu(h)
        sigma = self.hid2var(h)

        return mu, sigma

    def decode(self, z):

        h = self.relu(self.z2hid(z))

        return self.sigmoid(self.hid2img(h))

    def forward(self, x):

        mu, sigma = self.encode(x)
        epsilon = torch.randn_like(sigma)
        z_reparametarized = mu + epsilon * sigma
        reconstructed = self.decode(z_reparametarized)

        return reconstructed, mu, torch.log(sigma)

In [14]:
if __name__ == '__main__':
    x = torch.randn(10, 256 * 256)
    vae = VariationalAutoencoder(256*256)
    x_reconstructed, mu, sigma = vae(x)
    print(x_reconstructed.shape)
    print(mu.shape)
    print(sigma.shape)

torch.Size([10, 65536])
torch.Size([10, 1024])
torch.Size([10, 1024])


## Loading dataset

This block is mostly AI generated.

In [15]:
import os
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from pathlib import Path

In [16]:
def create_image_dataloader(root_dir='./dataset', batch_size=16, target_size=(256, 256)):
    """
    Creates a DataLoader for unlabeled images optimized for CUDA

    Args:
        root_dir (str): Directory containing JPG images
        batch_size (int): Batch size for DataLoader
        target_size (tuple): Target size for image resizing (H, W)

    Returns:
        DataLoader: PyTorch DataLoader ready for training
        dict: Dataset information dictionary
    """
    # Check if CUDA is available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    if device.type == 'cuda':
        print(f"GPU: {torch.cuda.get_device_name(0)}")
        print(f"Total GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

    class UnlabeledImageDataset(Dataset):
        def __init__(self, root_dir, transform=None, target_size=target_size):
            self.root_dir = Path(root_dir)
            self.transform = transform
            self.target_size = target_size

            # Collect all JPG images
            self.image_paths = list(self.root_dir.glob('*.jpg'))

            if not self.image_paths:
                raise RuntimeError(f"No JPG images found in {root_dir}")

            print(f"Found {len(self.image_paths)} images in dataset")

            # Pre-cache images if using CUDA
            self.cache = {}
            if device.type == 'cuda':
                print("Pre-caching images for GPU acceleration...")
                for i in range(len(self.image_paths)):
                    self.__getitem__(i)
                print("Image caching completed")

        def __len__(self):
            return len(self.image_paths)

        def __getitem__(self, idx):
            # Use cached image if available
            if idx in self.cache:
                return self.cache[idx]

            img_path = self.image_paths[idx]

            try:
                # Load image and ensure RGB format
                image = Image.open(img_path).convert('RGB')

                # Apply transformations if specified
                if self.transform:
                    image = self.transform(image)

                # Cache the image tensor
                self.cache[idx] = image
                return image

            except Exception as e:
                print(f"Error loading image {img_path}: {str(e)}")
                # Return a blank image if there's an error
                blank = torch.zeros(3, *self.target_size)
                self.cache[idx] = blank
                return blank

    # Define transformations
    transform = transforms.Compose([
        transforms.Resize(target_size),          # Resize to target size
        transforms.ToTensor(),                   # Convert to tensor [0, 1]
        transforms.Normalize(mean=[0.485, 0.456, 0.406],  # ImageNet stats
                             std=[0.229, 0.224, 0.225])
    ])

    # Create dataset
    dataset = UnlabeledImageDataset(
        root_dir=root_dir,
        transform=transform
    )

    # Create data loader
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=0,  # Set to 0 for Windows compatibility
        pin_memory=(device.type == 'cuda')  # Pin memory for faster GPU transfers
    )

    # Prepare dataset information
    dataset_info = {
        'num_images': len(dataset),
        'image_paths': [str(p) for p in dataset.image_paths],
        'original_size': (1920, 1080),
        'processed_size': target_size,
        'device': str(device),
        'batch_size': batch_size
    }

    print("\nDataLoader created successfully")
    print(f"  Total images: {len(dataset)}")
    print(f"  Batch size: {batch_size}")
    print(f"  Number of batches: {len(dataloader)}")

    if device.type == 'cuda':
        print(f"  CUDA Memory Allocated: {torch.cuda.memory_allocated()/1024**2:.2f} MB")

    return dataloader, dataset_info

## Training

In [17]:
from tqdm import tqdm

In [19]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = VariationalAutoencoder(256*256, 128*128, 32*32).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Of the allocated memory 12.39 GiB is allocated by PyTorch, and 1.48 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [8]:
num_epochs = 1

In [9]:
if __name__ == "__main__":

    dataloader, dataset_info = create_image_dataloader()

    torch.save(dataset_info, "./dataset/dataset_info.pth")

    for epoch in range(num_epochs):
        iter = tqdm(dataloader)
        for batch_idx, data in enumerate(iter):

            data = data.to(device).reshape(-1, 3, 256, 256)
            print(data.shape)
            output, mu, logvar = model(data)


            reconstruction_loss = criterion(output, data)
            kl_loss = -torch.sum(1 + 2*logvar - mu.pow(2) - logvar.exp()/2)

            net_loss = reconstruction_loss + kl_loss

            optimizer.zero_grad()
            net_loss.backward()
            optimizer.step()
            iter.set_postfix(loss=net_loss.item())



Using device: cuda
GPU: NVIDIA GeForce RTX 4060 Laptop GPU
Total GPU Memory: 8.00 GB
Found 748 images in dataset
Pre-caching images for GPU acceleration...
Image caching completed

DataLoader created successfully
  Total images: 748
  Batch size: 16
  Number of batches: 47
  CUDA Memory Allocated: 8384.38 MB


  0%|          | 0/47 [00:00<?, ?it/s]


RuntimeError: mat1 and mat2 shapes cannot be multiplied (12288x256 and 65536x16384)

## Predicitons
