### Mounting Google Drive in Colab

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset stored on Drive can be read with ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Importing Libraries and Modules and Data

In [None]:
import os
import json
import torch
from PIL import Image
from torchvision import transforms

In [None]:
# Path to dataset
# Root folder on the mounted Drive; the loading loop looks here for the
# per-category caption JSON files and the per-category image sub-folders.
dataset_path = '/content/drive/MyDrive/dataset'
# Category names. Each one is used both in the caption file name
# (cap.<category>.train.json) and as the name of the image sub-folder.
categories = ['dress', 'shirt', 'toptee']

### Image Preprocessing

In [None]:
# Image preprocessing
# Resize every image to 256x256, convert it to a float tensor in [0, 1], then
# rescale each RGB channel to [-1, 1] via (x - 0.5) / 0.5.
# The rescale matters for GAN training: the generator's last layer is Tanh,
# so generated images lie in [-1, 1]. Real images must share that range,
# otherwise the discriminator can tell real from fake by pixel range alone.
transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

In [None]:
# Accumulates one (preprocessed image tensor, captions) tuple for every
# caption entry whose image file is found on disk.
data_list = []

### Loading Data from Each Category

In [None]:
# Load data from each category
# For every category, read its caption JSON file and pair each entry's target
# image (preprocessed into a tensor) with that entry's captions. Entries whose
# image file is missing are reported and skipped.
for category in categories:
    json_file = os.path.join(dataset_path, f'cap.{category}.train.json')
    with open(json_file, 'r') as file:
        data = json.load(file)

    for item in data:
        image_path = os.path.join(dataset_path, category, f"{item['target']}.jpg")
        if os.path.exists(image_path):
            # Open inside a context manager so the underlying file handle is
            # released immediately instead of staying open for every image.
            # Force 3-channel RGB: grayscale, palette or RGBA files would
            # otherwise produce tensors whose channel count does not match
            # the models' Conv2d(3, ...) input layers (and could not be
            # stacked into a batch with RGB images).
            with Image.open(image_path) as pil_image:
                image = transform(pil_image.convert('RGB'))
            data_list.append((image, item['captions']))
        else:
            print(f"Image not found: {image_path}")

### Importing Libraries and Modules, Dataset, and Models Initialization

1. **Importing Libraries and Modules**: The code begins by importing necessary libraries and modules, including torch, torch.nn, DataLoader, Dataset from torch.utils.data, and BertModel, BertTokenizer from transformers.

2. **Device Configuration**: It checks if a CUDA-enabled GPU is available and sets the device accordingly (either "cuda" or "cpu").

3. **Custom Dataset Class (FashionDataset)**: This class defines a custom dataset for the fashion images and captions. It wraps the data_list, which contains tuples of preprocessed image tensors and captions. The `__getitem__` method tokenizes one caption with a BERT tokenizer and returns the image together with the caption's input IDs and attention mask.

4. **Generator Model**: The Generator class defines the generator model for the GAN. It consists of a BERT text encoder, an image encoder, a linear layer to match the number of channels in image features, and a decoder to reconstruct the image.

5. **Discriminator Model**: The Discriminator class defines the discriminator model for the GAN. It consists of two convolutional layers followed by a single linear layer and a sigmoid, which together output the probability that an image is real rather than fake.

6. **Initializing Models and Dataset**: It initializes the generator, discriminator, and the FashionDataset using the provided data_list. It also initializes the dataloader for batch processing.

7. **Optimizers**: It defines Adam optimizers for both the generator and discriminator models.

8. **Loss Function**: It defines the adversarial loss (BCELoss) for training the GAN.


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertModel, BertTokenizer

# Configuration for device
# Use the GPU when CUDA is available, otherwise fall back to the CPU. The
# models and every training batch are moved to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Custom Dataset Class
class FashionDataset(Dataset):
    """Dataset of (image, captions) pairs with captions tokenized for BERT.

    Each element of ``data_list`` is an ``(image, captions)`` pair. Indexing
    returns the image unchanged, together with the token ids and attention
    mask of a single caption padded or truncated to 128 tokens.
    """

    def __init__(self, data_list):
        self.data = data_list
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        image, captions = self.data[idx]

        # A list of captions is reduced to its first entry; any other value is
        # handed to the tokenizer as-is. Adjust to your data structure.
        if isinstance(captions, list):
            text = captions[0]
        else:
            text = captions

        tokens = self.tokenizer(
            text,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=128,
        )

        # The tokenizer output carries a leading batch dimension; drop it so
        # the DataLoader can add its own when collating samples.
        return (
            image,
            tokens['input_ids'].squeeze(0),
            tokens['attention_mask'].squeeze(0),
        )

# Generator Model
class Generator(nn.Module):
    """Text-conditioned image-to-image generator.

    A pretrained BERT model encodes the caption and a small convolutional
    encoder encodes the input image. The pooled text embedding is projected
    to the same 256x64x64 shape as the image feature map, the two are summed,
    and a transposed-convolution decoder maps the sum back to a 3-channel
    image with values in [-1, 1] (Tanh).

    The fixed 256x64x64 reshape means input images must be 256x256: each of
    the two stride-2 convolutions halves the spatial size (256 -> 128 -> 64).
    """

    def __init__(self):
        super().__init__()
        self.text_encoder = BertModel.from_pretrained('bert-base-uncased').to(device)

        encoder_layers = [
            nn.Conv2d(3, 128, kernel_size=4, stride=2, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.Conv2d(128, 256, kernel_size=4, stride=2, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
        ]
        self.image_encoder = nn.Sequential(*encoder_layers).to(device)

        # Projects the 768-dim pooled BERT output to one value per element of
        # the image feature map (256 channels x 64 x 64).
        self.text_linear = nn.Linear(768, 256 * 64 * 64).to(device)

        decoder_layers = [
            nn.ConvTranspose2d(256, 128, kernel_size=4, stride=2, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.ConvTranspose2d(128, 3, kernel_size=4, stride=2, padding=1),
            nn.Tanh(),
        ]
        self.decoder = nn.Sequential(*decoder_layers).to(device)

    def forward(self, images, input_ids, attention_mask):
        """Return generated images for a batch of images and tokenized captions."""
        bert_out = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
        text_map = self.text_linear(bert_out.pooler_output)
        # Reshape the flat projection so it can be added to the image features.
        text_map = text_map.view(text_map.size(0), 256, 64, 64)
        fused = self.image_encoder(images) + text_map
        return self.decoder(fused)


# Discriminator Model
class Discriminator(nn.Module):
    """Binary real/fake classifier for 3x256x256 images.

    Two stride-2 convolutions reduce the input to a 256x64x64 feature map,
    which is flattened and mapped by a single linear layer plus sigmoid to
    one probability per image. The hard-coded ``256 * 64 * 64`` input size of
    the linear layer is what fixes the expected image resolution at 256x256.
    """

    def __init__(self):
        super().__init__()
        layers = [
            nn.Conv2d(3, 128, kernel_size=4, stride=2, padding=1),
            nn.LeakyReLU(0.2),
            nn.Conv2d(128, 256, kernel_size=4, stride=2, padding=1),
            nn.LeakyReLU(0.2),
            nn.Flatten(),
            nn.Linear(256 * 64 * 64, 1),
            nn.Sigmoid(),
        ]
        self.model = nn.Sequential(*layers).to(device)

    def forward(self, images):
        """Return a (batch, 1) tensor of probabilities that each image is real."""
        scores = self.model(images)
        return scores

# Initializing models and dataset
# Both model classes move their own layers to `device` in their constructors.
generator = Generator()
discriminator = Discriminator()
dataset = FashionDataset(data_list)
# Batches of 2 samples, reshuffled at the start of every epoch.
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Optimizers
# One Adam optimizer per network so the two can be stepped independently.
# NOTE: generator.parameters() includes the BERT text encoder's weights, so
# the text encoder is trained together with the rest of the generator.
optimizer_G = torch.optim.Adam(generator.parameters(), lr=0.0002, betas=(0.5, 0.999))
optimizer_D = torch.optim.Adam(discriminator.parameters(), lr=0.0002, betas=(0.5, 0.999))

# Loss function
# Binary cross-entropy applied to the discriminator's sigmoid output.
adversarial_loss = nn.BCELoss()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Below is the training loop for a Generative Adversarial Network (GAN) designed for fashion image generation. The loop iterates over a specified number of epochs, and each epoch consists of a series of steps that train the discriminator and generator components of the GAN.

During each iteration, the loop fetches batches of images, input IDs, and attention masks from the data loader, ensuring that these tensors are moved to the GPU if available for faster computation. The training process begins with training the discriminator. For this, it computes the loss on real images (`real_loss`) using real labels (assigned as 1), and on fake images (`fake_loss`) generated by the generator using fake labels (assigned as 0). The total discriminator loss (`d_loss`) is then calculated as the average of `real_loss` and `fake_loss`. The gradients of `d_loss` are used to update the discriminator's parameters, improving its ability to distinguish between real and generated images.

Following the discriminator training, the loop proceeds to train the generator. It generates fake images using the generator and computes the generator loss (`g_loss`) based on the discriminator's output on these fake images. The gradients of `g_loss` are used to update the generator's parameters, enhancing its ability to generate more realistic images that can fool the discriminator. 

Throughout the training loop, the losses of both the discriminator and generator are printed for each epoch and batch, allowing for monitoring of the training progress. The loop concludes with a message indicating the completion of the training process, at which point the model should be trained and ready for evaluation or further use. Adjustments to the number of epochs and other parameters can be made as needed based on the specific requirements of the model and dataset.


In [None]:
# Training loop
# Alternates one discriminator update and one generator update per batch,
# for `num_epochs` passes over the dataloader.
num_epochs = 50  
for epoch in range(num_epochs):
    for images, input_ids, attention_mask in dataloader:
        # Ensure all tensors are on the GPU if available
        images = images.to(device)
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        # Targets for BCE, sized to the actual batch: 1 = real, 0 = fake.
        real_labels = torch.ones(images.size(0), 1, device=device)
        fake_labels = torch.zeros(images.size(0), 1, device=device)

        # Train Discriminator: maximize log(D(x)) + log(1 - D(G(z)))
        # zero_grad also clears the discriminator gradients left over from the
        # previous iteration's generator step.
        optimizer_D.zero_grad()

        # Compute the discriminator losses on real images
        real_output = discriminator(images)
        real_loss = adversarial_loss(real_output, real_labels)

        # Generate fake images
        # detach() cuts the graph so this backward pass does not compute
        # gradients for the generator.
        fake_images = generator(images, input_ids, attention_mask)
        fake_output = discriminator(fake_images.detach())
        fake_loss = adversarial_loss(fake_output, fake_labels)

        # Backpropagate the total discriminator loss
        # (the mean of the real and fake losses).
        d_loss = (real_loss + fake_loss) / 2
        d_loss.backward()
        optimizer_D.step()

        # Train Generator: maximize log(D(G(z)))
        # The same fake batch is scored again by the just-updated
        # discriminator; using real_labels as the target rewards the generator
        # when its images are classified as real.
        optimizer_G.zero_grad()
        output = discriminator(fake_images)  # This time do not detach
        g_loss = adversarial_loss(output, real_labels)
        # This also accumulates gradients in the discriminator's parameters;
        # they are discarded by optimizer_D.zero_grad() on the next iteration.
        g_loss.backward()
        optimizer_G.step()

        # Logged once per batch, so each epoch prints one line per batch.
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss D: {d_loss.item()}, Loss G: {g_loss.item()}')

print("Training complete.")


Epoch [1/50], Loss D: 0.5619608163833618, Loss G: 4.997202396392822
Epoch [1/50], Loss D: 0.32781508564949036, Loss G: 3.6902079582214355
Epoch [1/50], Loss D: 0.07392589747905731, Loss G: 2.5140812397003174
Epoch [1/50], Loss D: 0.05293041467666626, Loss G: 2.6695785522460938
Epoch [1/50], Loss D: 0.03877795487642288, Loss G: 3.571597099304199
Epoch [2/50], Loss D: 0.017652057111263275, Loss G: 4.375607967376709
Epoch [2/50], Loss D: 0.013916470110416412, Loss G: 4.827449798583984
Epoch [2/50], Loss D: 0.0214516744017601, Loss G: 4.859555721282959
Epoch [2/50], Loss D: 0.004736026283353567, Loss G: 4.957368850708008
Epoch [2/50], Loss D: 0.014230530709028244, Loss G: 4.870750904083252
Epoch [3/50], Loss D: 0.009708729572594166, Loss G: 4.836544990539551
Epoch [3/50], Loss D: 0.006310194730758667, Loss G: 4.880497932434082
Epoch [3/50], Loss D: 0.008388573303818703, Loss G: 4.929664611816406
Epoch [3/50], Loss D: 0.010319022461771965, Loss G: 4.939871788024902
Epoch [3/50], Loss D: 0.0

In the output shown above, the two losses do not move together. The discriminator loss (Loss D) drops quickly, from about 0.56 to below 0.01 within three epochs, which indicates that the discriminator is becoming very accurate at distinguishing real images from generated ones. The generator loss (Loss G) dips during the first epoch (from about 5.0 to about 2.5) but then climbs back to roughly 4.9 and stays there, which indicates that the generator is increasingly failing to fool the discriminator. A discriminator loss near zero combined with a high generator loss is a common sign that the discriminator is overpowering the generator, so these numbers alone do not show that the generated images are becoming more realistic. To judge whether the model is learning effectively, inspect generated samples over time and consider rebalancing the two networks (for example, by making sure real and generated images share the same value range, lowering the discriminator's learning rate, or using label smoothing).