In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.utils import save_image
from torchvision.models import vgg16, VGG16_Weights
from PIL import Image
import os
import numpy as np
import matplotlib.pyplot as plt
import kagglehub

# Configurations
# Prefer the GPU when one is visible to PyTorch, otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Model / data dimensions.
# NOTE(review): `channels` is not referenced by the code visible in this
# notebook excerpt (the encoder/decoder use a literal 3) - confirm before removing.
image_size, channels = 128, 3
latent_dim = 256
num_classes = 5

# Optimisation settings.
batch_size = 64
epochs = 3000
learning_rate = 3e-4
beta_annealing_epochs = 100  # epochs over which the KL weight ramps up to 1.0

# Checkpoint / sample output.
save_interval = 50
output_dir = "FL_VEHICLE_CVAE_IMPROVED"
os.makedirs(output_dir, exist_ok=True)

# Dataset Loader
class VehicleTypeDataset(torch.utils.data.Dataset):
    """Image dataset whose class labels are derived from directory names.

    ``root_dir`` is walked recursively. Every directory that directly contains
    at least one image file becomes a class named after that directory's
    basename; directories sharing a basename share a class index.

    Directories and files are visited in sorted order so that the
    class-name -> index mapping (and the sample order) is reproducible across
    machines and runs instead of depending on filesystem enumeration order.

    Attributes are documented inline in ``__init__``.
    """

    # Matched case-insensitively against the end of the file name. The leading
    # dot prevents names such as "fakejpg" from being treated as images.
    IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

    def __init__(self, root_dir, transform=None):
        """Index every image under ``root_dir``.

        Args:
            root_dir: Directory to scan recursively.
            transform: Optional callable applied to each PIL image in
                ``__getitem__``.
        """
        self.root_dir = root_dir
        self.transform = transform
        # images: file paths; labels: class index per path (parallel lists)
        # class_names: index -> name; class_to_idx: name -> index
        self.images, self.labels, self.class_names, self.class_to_idx = [], [], [], {}

        for root, dirs, files in os.walk(root_dir):
            # Sorting in place makes os.walk descend in a deterministic order.
            dirs.sort()
            image_files = sorted(
                f for f in files if f.lower().endswith(self.IMAGE_EXTENSIONS)
            )
            if not image_files:
                continue

            class_name = os.path.basename(root)
            if class_name not in self.class_to_idx:
                self.class_to_idx[class_name] = len(self.class_names)
                self.class_names.append(class_name)

            label = self.class_to_idx[class_name]
            for img_file in image_files:
                self.images.append(os.path.join(root, img_file))
                self.labels.append(label)

    def __len__(self):
        """Return the number of indexed images."""
        return len(self.images)

    def __getitem__(self, idx):
        """Load the ``idx``-th image as RGB and return ``(image, label)``.

        The transform, when provided, is applied to the RGB PIL image.
        """
        # The context manager releases the file handle as soon as the pixel
        # data has been converted, even if conversion raises.
        with Image.open(self.images[idx]) as img:
            image = img.convert("RGB")
        label = self.labels[idx]
        if self.transform:
            image = self.transform(image)
        return image, label

# Preprocessing: resize to a square image, convert to a tensor, then shift and
# scale every channel with mean 0.5 / std 0.5.
transform = transforms.Compose([
    transforms.Resize(size=(image_size, image_size)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

# Fetch the dataset via kagglehub and index it with the folder-based dataset
# class; batches are reshuffled every epoch.
path = kagglehub.dataset_download("sujaykapadnis/vehicle-type-image-dataset")
dataset = VehicleTypeDataset(path, transform=transform)
train_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

# Enhanced Encoder
class Encoder(nn.Module):
    """Convolutional encoder conditioned on a class label.

    The label is one-hot encoded, broadcast to a stack of constant planes and
    concatenated to the image along the channel axis. Five stride-2
    convolutions (each followed by batch norm and LeakyReLU) reduce the
    spatial size, and two linear heads produce the latent mean and
    log-variance. The heads expect ``512 * 4 * 4`` flattened features.
    """

    def __init__(self):
        super().__init__()
        # Channel widths from the label-augmented input to the last conv stage.
        widths = [3 + num_classes, 64, 128, 256, 512, 512]
        stages = []
        for c_in, c_out in zip(widths[:-1], widths[1:]):
            stages += [
                nn.Conv2d(c_in, c_out, kernel_size=4, stride=2, padding=1),
                nn.BatchNorm2d(c_out),
                nn.LeakyReLU(0.2),
            ]
        self.conv = nn.Sequential(*stages)

        flat_features = 512 * 4 * 4
        self.fc_mu = nn.Linear(flat_features, latent_dim)
        self.fc_logvar = nn.Linear(flat_features, latent_dim)

    def forward(self, x, y):
        """Return ``(mu, logvar)`` for image batch ``x`` and integer labels ``y``."""
        height, width = x.size(2), x.size(3)
        onehot = F.one_hot(y, num_classes=num_classes).float()
        # (batch, num_classes) -> (batch, num_classes, H, W) constant planes
        label_planes = onehot.unsqueeze(2).unsqueeze(3).expand(-1, -1, height, width)
        stacked = torch.cat([x, label_planes], dim=1)
        features = self.conv(stacked).view(stacked.size(0), -1)
        return self.fc_mu(features), self.fc_logvar(features)

# Enhanced Decoder
class Decoder(nn.Module):
    """Transposed-convolution decoder conditioned on a class label.

    The latent vector is concatenated with the one-hot label, projected to a
    ``512 x 4 x 4`` feature map and upsampled by five stride-2 transposed
    convolutions. The final layer emits 3 channels through ``tanh``.
    """

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(latent_dim + num_classes, 512 * 4 * 4)

        # Hidden upsampling stages share the ConvTranspose -> BN -> LeakyReLU pattern.
        widths = [512, 512, 256, 128, 64]
        stages = []
        for c_in, c_out in zip(widths[:-1], widths[1:]):
            stages += [
                nn.ConvTranspose2d(c_in, c_out, kernel_size=4, stride=2, padding=1),
                nn.BatchNorm2d(c_out),
                nn.LeakyReLU(0.2),
            ]
        # Output stage: no normalisation, tanh activation.
        stages += [
            nn.ConvTranspose2d(64, 3, kernel_size=4, stride=2, padding=1),
            nn.Tanh(),
        ]
        self.deconv = nn.Sequential(*stages)

    def forward(self, z, y):
        """Decode latent batch ``z`` with integer labels ``y`` into images."""
        onehot = F.one_hot(y, num_classes=num_classes).float()
        conditioned = torch.cat([z, onehot], dim=1)
        feature_map = self.fc(conditioned).view(-1, 512, 4, 4)
        return self.deconv(feature_map)

# CVAE Model
class CVAE(nn.Module):
    """Conditional VAE that pairs the label-conditioned encoder and decoder."""

    def __init__(self):
        super().__init__()
        self.encoder = Encoder()
        self.decoder = Decoder()

    def reparameterize(self, mu, logvar):
        """Draw ``z = mu + eps * exp(0.5 * logvar)`` with ``eps ~ N(0, I)``.

        Sampling the noise separately keeps ``z`` differentiable with respect
        to ``mu`` and ``logvar``.
        """
        sigma = torch.exp(0.5 * logvar)
        noise = torch.randn_like(sigma)
        return mu + noise * sigma

    def forward(self, x, y):
        """Encode ``x`` given ``y``, sample a latent, and decode it.

        Returns:
            Tuple ``(reconstruction, mu, logvar)``.
        """
        mu, logvar = self.encoder(x, y)
        latent = self.reparameterize(mu, logvar)
        reconstruction = self.decoder(latent, y)
        return reconstruction, mu, logvar

# Loss Function
def cvae_loss(recon, x, mu, logvar, beta):
    """Compute the beta-weighted CVAE objective.

    Args:
        recon: Reconstructed batch.
        x: Target batch, compared element-wise with ``recon``.
        mu: Latent means.
        logvar: Latent log-variances.
        beta: Scalar weight applied to the KL term.

    Returns:
        Tuple ``(total, recon_loss, kl)`` with ``total = recon_loss + beta * kl``.
        Both terms are summed over every element rather than averaged, so
        their magnitude grows with the batch size.
    """
    reconstruction_term = F.mse_loss(recon, x, reduction='sum')
    # Closed-form KL divergence between N(mu, exp(logvar)) and N(0, I).
    # NOTE(review): logvar is not bounded here, so exp() can become very large.
    kl_integrand = 1 + logvar - mu.pow(2) - logvar.exp()
    kl_term = -0.5 * kl_integrand.sum()
    total = reconstruction_term + beta * kl_term
    return total, reconstruction_term, kl_term

# Training
model = CVAE().to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Halve the learning rate when the summed epoch loss has not improved for 50 epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=50, factor=0.5)

# Number of images generated per class each time samples are saved.
samples_per_class = 2

for epoch in range(1, epochs + 1):
    model.train()
    # Running sums over all batches of the epoch (not averaged).
    train_loss, recon_sum, kl_sum = 0, 0, 0
    # Linear KL warm-up: beta ramps from 1/beta_annealing_epochs up to 1.0.
    beta = min(1.0, epoch / beta_annealing_epochs)

    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        x_recon, mu, logvar = model(x, y)
        loss, recon, kl = cvae_loss(x_recon, x, mu, logvar, beta)
        loss.backward()
        # Cap the global gradient norm at 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        train_loss += loss.item()
        recon_sum += recon.item()
        kl_sum += kl.item()

    print(f"Epoch {epoch} | Loss: {train_loss:.2f} | Recon: {recon_sum:.2f} | KL: {kl_sum:.2f} | Beta: {beta:.2f}")
    scheduler.step(train_loss)

    # Save samples and a checkpoint every `save_interval` epochs.
    if epoch % save_interval == 0:
        model.eval()
        with torch.no_grad():
            # Derive the sample count and labels from num_classes instead of a
            # hard-coded 5 so this stays valid if the class count changes.
            z = torch.randn(samples_per_class * num_classes, latent_dim).to(device)
            y = torch.arange(0, num_classes).repeat(samples_per_class).to(device)
            samples = model.decoder(z, y)
            # Map the tanh output range [-1, 1] to [0, 1] for image saving.
            samples = (samples + 1) / 2
            # One class per column: labels repeat 0..num_classes-1, so each grid
            # row holds exactly one image of every class.
            save_image(samples, f"{output_dir}/sample_epoch_{epoch}.png", nrow=num_classes)
        torch.save(model.state_dict(), f"{output_dir}/cvae_epoch_{epoch}.pth")


Epoch 1 | Loss: 30286965955.06 | Recon: 46536266.78 | KL: 3024042991624.71 | Beta: 0.01
Epoch 2 | Loss: 25191744.06 | Recon: 25039430.53 | KL: 7615673.18 | Beta: 0.02
Epoch 3 | Loss: 20543412.41 | Recon: 20360132.02 | KL: 6109348.21 | Beta: 0.03
Epoch 4 | Loss: 17473174.67 | Recon: 17252687.67 | KL: 5512175.94 | Beta: 0.04
Epoch 5 | Loss: 15381701.03 | Recon: 15135035.36 | KL: 4933312.24 | Beta: 0.05
Epoch 6 | Loss: 13799258.20 | Recon: 13514835.44 | KL: 4740378.98 | Beta: 0.06
Epoch 7 | Loss: 12479590.44 | Recon: 12164020.66 | KL: 4508140.09 | Beta: 0.07
Epoch 8 | Loss: 11393143.95 | Recon: 11044985.43 | KL: 4351981.79 | Beta: 0.08
Epoch 9 | Loss: 10504969.70 | Recon: 10126468.88 | KL: 4205565.06 | Beta: 0.09
Epoch 10 | Loss: 9736519.02 | Recon: 9329616.96 | KL: 4069020.36 | Beta: 0.10
Epoch 11 | Loss: 9150034.96 | Recon: 8713737.41 | KL: 3966341.15 | Beta: 0.11
Epoch 12 | Loss: 8570001.77 | Recon: 8106426.13 | KL: 3863130.21 | Beta: 0.12
Epoch 13 | Loss: 8181143.84 | Recon: 7692216.8

In [2]:
## Batch size is decreased to 8

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.utils import save_image
from torchvision.models import vgg16, VGG16_Weights
from PIL import Image
import os
import numpy as np
import matplotlib.pyplot as plt
import kagglehub

# Configurations
# Prefer the GPU when one is visible to PyTorch, otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Model / data dimensions.
# NOTE(review): `channels` is not referenced by the code visible in this
# notebook excerpt (the encoder/decoder use a literal 3) - confirm before removing.
image_size, channels = 128, 3
latent_dim = 256
num_classes = 5

# Optimisation settings.
batch_size = 8  # Changed from 64 to 8 to address blurry images
epochs = 3000
learning_rate = 3e-4
beta_annealing_epochs = 100  # epochs over which the KL weight ramps up to 1.0

# Checkpoint / sample output.
save_interval = 50
output_dir = "FL_VEHICLE_CVAE_IMPROVED_8"
os.makedirs(output_dir, exist_ok=True)

# Dataset Loader
class VehicleTypeDataset(torch.utils.data.Dataset):
    """Image dataset whose class labels are derived from directory names.

    ``root_dir`` is walked recursively. Every directory that directly contains
    at least one image file becomes a class named after that directory's
    basename; directories sharing a basename share a class index.

    Directories and files are visited in sorted order so that the
    class-name -> index mapping (and the sample order) is reproducible across
    machines and runs instead of depending on filesystem enumeration order.

    Attributes are documented inline in ``__init__``.
    """

    # Matched case-insensitively against the end of the file name. The leading
    # dot prevents names such as "fakejpg" from being treated as images.
    IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

    def __init__(self, root_dir, transform=None):
        """Index every image under ``root_dir``.

        Args:
            root_dir: Directory to scan recursively.
            transform: Optional callable applied to each PIL image in
                ``__getitem__``.
        """
        self.root_dir = root_dir
        self.transform = transform
        # images: file paths; labels: class index per path (parallel lists)
        # class_names: index -> name; class_to_idx: name -> index
        self.images, self.labels, self.class_names, self.class_to_idx = [], [], [], {}

        for root, dirs, files in os.walk(root_dir):
            # Sorting in place makes os.walk descend in a deterministic order.
            dirs.sort()
            image_files = sorted(
                f for f in files if f.lower().endswith(self.IMAGE_EXTENSIONS)
            )
            if not image_files:
                continue

            class_name = os.path.basename(root)
            if class_name not in self.class_to_idx:
                self.class_to_idx[class_name] = len(self.class_names)
                self.class_names.append(class_name)

            label = self.class_to_idx[class_name]
            for img_file in image_files:
                self.images.append(os.path.join(root, img_file))
                self.labels.append(label)

    def __len__(self):
        """Return the number of indexed images."""
        return len(self.images)

    def __getitem__(self, idx):
        """Load the ``idx``-th image as RGB and return ``(image, label)``.

        The transform, when provided, is applied to the RGB PIL image.
        """
        # The context manager releases the file handle as soon as the pixel
        # data has been converted, even if conversion raises.
        with Image.open(self.images[idx]) as img:
            image = img.convert("RGB")
        label = self.labels[idx]
        if self.transform:
            image = self.transform(image)
        return image, label

# Preprocessing: resize to a square image, convert to a tensor, then shift and
# scale every channel with mean 0.5 / std 0.5.
transform = transforms.Compose([
    transforms.Resize(size=(image_size, image_size)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

# Fetch the dataset via kagglehub and index it with the folder-based dataset
# class; batches are reshuffled every epoch.
path = kagglehub.dataset_download("sujaykapadnis/vehicle-type-image-dataset")
dataset = VehicleTypeDataset(path, transform=transform)
train_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

# Enhanced Encoder
class Encoder(nn.Module):
    """Convolutional encoder conditioned on a class label.

    The label is one-hot encoded, broadcast to a stack of constant planes and
    concatenated to the image along the channel axis. Five stride-2
    convolutions (each followed by batch norm and LeakyReLU) reduce the
    spatial size, and two linear heads produce the latent mean and
    log-variance. The heads expect ``512 * 4 * 4`` flattened features.
    """

    def __init__(self):
        super().__init__()
        # Channel widths from the label-augmented input to the last conv stage.
        widths = [3 + num_classes, 64, 128, 256, 512, 512]
        stages = []
        for c_in, c_out in zip(widths[:-1], widths[1:]):
            stages += [
                nn.Conv2d(c_in, c_out, kernel_size=4, stride=2, padding=1),
                nn.BatchNorm2d(c_out),
                nn.LeakyReLU(0.2),
            ]
        self.conv = nn.Sequential(*stages)

        flat_features = 512 * 4 * 4
        self.fc_mu = nn.Linear(flat_features, latent_dim)
        self.fc_logvar = nn.Linear(flat_features, latent_dim)

    def forward(self, x, y):
        """Return ``(mu, logvar)`` for image batch ``x`` and integer labels ``y``."""
        height, width = x.size(2), x.size(3)
        onehot = F.one_hot(y, num_classes=num_classes).float()
        # (batch, num_classes) -> (batch, num_classes, H, W) constant planes
        label_planes = onehot.unsqueeze(2).unsqueeze(3).expand(-1, -1, height, width)
        stacked = torch.cat([x, label_planes], dim=1)
        features = self.conv(stacked).view(stacked.size(0), -1)
        return self.fc_mu(features), self.fc_logvar(features)

# Enhanced Decoder
class Decoder(nn.Module):
    """Transposed-convolution decoder conditioned on a class label.

    The latent vector is concatenated with the one-hot label, projected to a
    ``512 x 4 x 4`` feature map and upsampled by five stride-2 transposed
    convolutions. The final layer emits 3 channels through ``tanh``.
    """

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(latent_dim + num_classes, 512 * 4 * 4)

        # Hidden upsampling stages share the ConvTranspose -> BN -> LeakyReLU pattern.
        widths = [512, 512, 256, 128, 64]
        stages = []
        for c_in, c_out in zip(widths[:-1], widths[1:]):
            stages += [
                nn.ConvTranspose2d(c_in, c_out, kernel_size=4, stride=2, padding=1),
                nn.BatchNorm2d(c_out),
                nn.LeakyReLU(0.2),
            ]
        # Output stage: no normalisation, tanh activation.
        stages += [
            nn.ConvTranspose2d(64, 3, kernel_size=4, stride=2, padding=1),
            nn.Tanh(),
        ]
        self.deconv = nn.Sequential(*stages)

    def forward(self, z, y):
        """Decode latent batch ``z`` with integer labels ``y`` into images."""
        onehot = F.one_hot(y, num_classes=num_classes).float()
        conditioned = torch.cat([z, onehot], dim=1)
        feature_map = self.fc(conditioned).view(-1, 512, 4, 4)
        return self.deconv(feature_map)

# CVAE Model
class CVAE(nn.Module):
    """Conditional VAE that pairs the label-conditioned encoder and decoder."""

    def __init__(self):
        super().__init__()
        self.encoder = Encoder()
        self.decoder = Decoder()

    def reparameterize(self, mu, logvar):
        """Draw ``z = mu + eps * exp(0.5 * logvar)`` with ``eps ~ N(0, I)``.

        Sampling the noise separately keeps ``z`` differentiable with respect
        to ``mu`` and ``logvar``.
        """
        sigma = torch.exp(0.5 * logvar)
        noise = torch.randn_like(sigma)
        return mu + noise * sigma

    def forward(self, x, y):
        """Encode ``x`` given ``y``, sample a latent, and decode it.

        Returns:
            Tuple ``(reconstruction, mu, logvar)``.
        """
        mu, logvar = self.encoder(x, y)
        latent = self.reparameterize(mu, logvar)
        reconstruction = self.decoder(latent, y)
        return reconstruction, mu, logvar

# Loss Function
def cvae_loss(recon, x, mu, logvar, beta):
    """Compute the beta-weighted CVAE objective.

    Args:
        recon: Reconstructed batch.
        x: Target batch, compared element-wise with ``recon``.
        mu: Latent means.
        logvar: Latent log-variances.
        beta: Scalar weight applied to the KL term.

    Returns:
        Tuple ``(total, recon_loss, kl)`` with ``total = recon_loss + beta * kl``.
        Both terms are summed over every element rather than averaged, so
        their magnitude grows with the batch size.
    """
    reconstruction_term = F.mse_loss(recon, x, reduction='sum')
    # Closed-form KL divergence between N(mu, exp(logvar)) and N(0, I).
    # NOTE(review): logvar is not bounded here, so exp() can become very large.
    kl_integrand = 1 + logvar - mu.pow(2) - logvar.exp()
    kl_term = -0.5 * kl_integrand.sum()
    total = reconstruction_term + beta * kl_term
    return total, reconstruction_term, kl_term

# Training
model = CVAE().to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Halve the learning rate when the summed epoch loss has not improved for 50 epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=50, factor=0.5)

# Number of images generated per class each time samples are saved.
samples_per_class = 2

for epoch in range(1, epochs + 1):
    model.train()
    # Running sums over all batches of the epoch (not averaged).
    train_loss, recon_sum, kl_sum = 0, 0, 0
    # Linear KL warm-up: beta ramps from 1/beta_annealing_epochs up to 1.0.
    beta = min(1.0, epoch / beta_annealing_epochs)

    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        x_recon, mu, logvar = model(x, y)
        loss, recon, kl = cvae_loss(x_recon, x, mu, logvar, beta)
        loss.backward()
        # Cap the global gradient norm at 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        train_loss += loss.item()
        recon_sum += recon.item()
        kl_sum += kl.item()

    print(f"Epoch {epoch} | Loss: {train_loss:.2f} | Recon: {recon_sum:.2f} | KL: {kl_sum:.2f} | Beta: {beta:.2f}")
    scheduler.step(train_loss)

    # Save samples and a checkpoint every `save_interval` epochs.
    if epoch % save_interval == 0:
        model.eval()
        with torch.no_grad():
            # Derive the sample count and labels from num_classes instead of a
            # hard-coded 5 so this stays valid if the class count changes.
            z = torch.randn(samples_per_class * num_classes, latent_dim).to(device)
            y = torch.arange(0, num_classes).repeat(samples_per_class).to(device)
            samples = model.decoder(z, y)
            # Map the tanh output range [-1, 1] to [0, 1] for image saving.
            samples = (samples + 1) / 2
            # One class per column: labels repeat 0..num_classes-1, so each grid
            # row holds exactly one image of every class.
            save_image(samples, f"{output_dir}/sample_epoch_{epoch}.png", nrow=num_classes)
        torch.save(model.state_dict(), f"{output_dir}/cvae_epoch_{epoch}.pth")

Epoch 1 | Loss: 27974349.74 | Recon: 27762668.89 | KL: 21168086.47 | Beta: 0.01
Epoch 2 | Loss: 20417541.37 | Recon: 18827611.17 | KL: 79496509.14 | Beta: 0.02
Epoch 3 | Loss: 16241337.95 | Recon: 15887804.80 | KL: 11784437.91 | Beta: 0.03
Epoch 4 | Loss: 14514451.87 | Recon: 14133008.79 | KL: 9536077.31 | Beta: 0.04
Epoch 5 | Loss: 12870571.52 | Recon: 12485384.77 | KL: 7703735.55 | Beta: 0.05
Epoch 6 | Loss: 11736583.21 | Recon: 11347478.85 | KL: 6485072.84 | Beta: 0.06
Epoch 7 | Loss: 10598821.45 | Recon: 10205605.59 | KL: 5617369.11 | Beta: 0.07
Epoch 8 | Loss: 9621446.55 | Recon: 9224304.73 | KL: 4964272.91 | Beta: 0.08
Epoch 9 | Loss: 8898323.58 | Recon: 8488985.66 | KL: 4548199.03 | Beta: 0.09
Epoch 10 | Loss: 8184528.27 | Recon: 7765335.99 | KL: 4191922.87 | Beta: 0.10
Epoch 11 | Loss: 7682895.01 | Recon: 7248593.00 | KL: 3948200.17 | Beta: 0.11
Epoch 12 | Loss: 7232259.58 | Recon: 6782735.77 | KL: 3746031.81 | Beta: 0.12
Epoch 13 | Loss: 6838030.91 | Recon: 6373474.58 | KL: 35