In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Index every file under the read-only input directory.  The joined path
# used to be built and immediately discarded, so nothing was listed; keep the
# paths and show only a count plus a short preview so the cell output stays
# readable even when the directory holds many files.
input_files = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]
print(f'{len(input_files)} files found under /kaggle/input')
for path in input_files[:5]:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Install/upgrade torchtext and spaCy, then fetch the small English spaCy model
# that CUBDataset loads with spacy.load('en_core_web_sm').
# NOTE(review): versions are not pinned and -U forces an upgrade.  The output
# recorded below shows this pulled spaCy 3.8.5, which pip reports as
# incompatible with the already-installed en-core-web-sm 3.7.1; the download on
# the next line then collects en-core-web-sm 3.8.0.  Consider pinning versions
# (and using %pip so the install targets the kernel's environment).
# NOTE(review): no cell visible in this notebook imports torchtext — confirm it is needed.
!pip install -qU torchtext spacy
!python -m spacy download en_core_web_sm

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.2/29.2 MB[0m [31m62.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m68.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.5/11.5 MB[0m [31m95.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
en-core-web-sm 3.7.1 requires spacy<3.8.0,>=3.7.2, but you have spacy 3.8.5 which is incompatible.[0m[31m
[0mCollecting en-core-web-sm==3.8.0
  Downloadin

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import pandas as pd
import numpy as np
import spacy
import os

In [4]:

# Dataset configuration
class CUBDataset(Dataset):
    """Training split of CUB-200-2011 images paired with one caption each.

    Items are ``(image, caption)`` tuples: the RGB image (passed through
    ``transform`` when one is given) and the raw text of the first non-blank
    line of the image's caption file.

    Args:
        root_dir: Directory that contains ``CUB_200_2011/`` (images plus the
            ``images.txt`` / ``train_test_split.txt`` index files) and
            ``cvpr2016_cub/text_c10/`` (one caption file per image).
        transform: Optional callable applied to the loaded PIL image.
    """

    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.image_dir = os.path.join(root_dir, 'CUB_200_2011/images')
        self.text_path = os.path.join(root_dir, 'cvpr2016_cub/text_c10')

        # Load image metadata
        self.image_df = pd.read_csv(os.path.join(root_dir, 'CUB_200_2011/images.txt'),
                                    sep=' ', names=['img_id', 'img_path'])
        self.split_df = pd.read_csv(os.path.join(root_dir, 'CUB_200_2011/train_test_split.txt'),
                                    sep=' ', names=['img_id', 'is_training'])

        # Filter training images
        self.train_images = self.image_df.merge(self.split_df, on='img_id').query('is_training == 1')

        # Text processing.  Kept as a public attribute for backward
        # compatibility; __getitem__ returns raw captions, so the pipeline is
        # no longer run on every item only to have its tokens thrown away.
        self.nlp = spacy.load('en_core_web_sm')

    def __len__(self):
        """Number of training images."""
        return len(self.train_images)

    def _caption_file(self, img_rel_path):
        """Return the caption file that mirrors an image's relative path."""
        # Swap only the real extension; str.replace('.jpg', ...) would also
        # rewrite a '.jpg' that happened to appear earlier in the path.
        stem, _ = os.path.splitext(img_rel_path)
        return os.path.join(self.text_path, stem + '.txt')

    def _read_caption(self, img_rel_path):
        """Return the first non-blank caption line for an image ('' if none)."""
        with open(self._caption_file(img_rel_path), 'r', encoding='utf-8') as f:
            for line in f:
                caption = line.strip()
                if caption:
                    return caption
        return ''

    def __getitem__(self, idx):
        """Load the ``idx``-th training image and its caption.

        Returns:
            ``(image, caption)`` where ``caption`` is the raw caption string.

        Raises:
            FileNotFoundError: If the image or its caption file is missing.
        """
        # Look the row up once instead of once per use.
        img_rel_path = self.train_images.iloc[idx]['img_path']

        # The context manager closes the underlying file as soon as the pixel
        # data has been converted, instead of waiting for garbage collection.
        with Image.open(os.path.join(self.image_dir, img_rel_path)) as img:
            image = img.convert('RGB')

        caption = self._read_caption(img_rel_path)

        if self.transform:
            image = self.transform(image)

        return image, caption


# Image transformations
# Resize to the 64x64 resolution the Generator/Discriminator below are built
# for, convert to a tensor, then normalise each channel with mean 0.5 / std 0.5
# (the sample-saving code later undoes this with (x + 1) / 2).
transform = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# Initialize dataset
# Each item is an (image, caption) pair; the loader shuffles every epoch and
# uses two worker processes.
dataset = CUBDataset(root_dir='/kaggle/input/cub2002011',
                    transform=transform)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True, num_workers=2)

# %% [code]
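# Model Architecture (text-conditional DCGAN in the style of GAN-INT-CLS; the training loop below uses only matching image/caption pairs)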
class Generator(nn.Module):
    """DCGAN-style generator conditioned on a text embedding.

    The noise vector and the text embedding are concatenated along the channel
    axis, treated as a 1x1 feature map, and upsampled by five transposed
    convolutions (1 -> 4 -> 8 -> 16 -> 32 -> 64) into a 3-channel image whose
    values lie in [-1, 1] (Tanh).

    Args:
        text_embed_dim: Length of the text embedding vector.
        noise_dim: Length of the noise vector.
    """

    def __init__(self, text_embed_dim=128, noise_dim=100):
        super().__init__()
        # Remember the configured sizes so forward() reshapes with them rather
        # than with hard-coded 128/100, which broke any non-default setup.
        self.text_embed_dim = text_embed_dim
        self.noise_dim = noise_dim
        self.main = nn.Sequential(
            nn.ConvTranspose2d(noise_dim + text_embed_dim, 512, 4, 1, 0, bias=False),
            nn.BatchNorm2d(512),
            nn.ReLU(True),
            nn.ConvTranspose2d(512, 256, 4, 2, 1, bias=False),
            nn.BatchNorm2d(256),
            nn.ReLU(True),
            nn.ConvTranspose2d(256, 128, 4, 2, 1, bias=False),
            nn.BatchNorm2d(128),
            nn.ReLU(True),
            nn.ConvTranspose2d(128, 64, 4, 2, 1, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(True),
            nn.ConvTranspose2d(64, 3, 4, 2, 1, bias=False),
            nn.Tanh()
        )

    def forward(self, noise, text_embed):
        """Generate images from noise and text embeddings.

        Args:
            noise: Tensor reshapeable to ``(batch, noise_dim, 1, 1)``.
            text_embed: Tensor reshapeable to ``(batch, text_embed_dim, 1, 1)``.

        Returns:
            Tensor of shape ``(batch, 3, 64, 64)``.
        """
        noise = noise.view(-1, self.noise_dim, 1, 1)
        text_embed = text_embed.view(-1, self.text_embed_dim, 1, 1)
        combined = torch.cat([noise, text_embed], 1)
        return self.main(combined)

class Discriminator(nn.Module):
    """Convolutional critic that scores an image together with a text embedding.

    Four stride-2 convolutions turn the image into 512 feature channels.  The
    text embedding is linearly projected to 512 values that scale those
    channels, and a final 4x4 convolution turns the result into one logit per
    sample.

    Args:
        text_embed_dim: Length of the text embedding vector.
    """

    def __init__(self, text_embed_dim=128):
        super().__init__()
        slope = 0.2

        # First stage has no batch norm; the remaining three share one pattern.
        stages = [
            nn.Conv2d(3, 64, 4, 2, 1, bias=False),
            nn.LeakyReLU(slope, inplace=True),
        ]
        for in_ch, out_ch in ((64, 128), (128, 256), (256, 512)):
            stages += [
                nn.Conv2d(in_ch, out_ch, 4, 2, 1, bias=False),
                nn.BatchNorm2d(out_ch),
                nn.LeakyReLU(slope, inplace=True),
            ]
        self.main = nn.Sequential(*stages)

        self.text_embed_proj = nn.Linear(text_embed_dim, 512)
        self.discriminator = nn.Conv2d(512, 1, 4, 1, 0, bias=False)

    def forward(self, image, text_embed):
        """Return one raw (pre-sigmoid) score per image/text pair as a 1-D tensor."""
        # Per-channel multipliers derived from the text, broadcast over H x W.
        channel_scale = self.text_embed_proj(text_embed).view(-1, 512, 1, 1)
        conditioned = self.main(image) * channel_scale
        logits = self.discriminator(conditioned)
        return logits.view(-1)

# Modified Text Encoder with proper token handling
class TextEncoder(nn.Module):
    """Whitespace-token GRU encoder that maps caption strings to 128-d vectors.

    Captions are split on whitespace, mapped to vocabulary indices, embedded,
    run through a single-layer GRU, and the final hidden state is projected to
    128 features.

    Args:
        vocab_size: Minimum number of rows in the embedding table.  The table
            is enlarged automatically when the built vocabulary is bigger, so
            no word index can fall outside it.
        embed_dim: Size of each word embedding fed to the GRU.
        captions: Optional iterable of caption strings used to build the
            vocabulary.  When omitted, captions are read from the module-level
            ``dataset`` of ``(image, caption)`` pairs, as before.
    """

    PAD_IDX = 0
    UNK_IDX = 1

    def __init__(self, vocab_size=10000, embed_dim=128, captions=None):
        super().__init__()

        # Build vocabulary first so the embedding table can be sized to fit it.
        self.word2idx = {}
        self.idx2word = {}
        self._build_vocab(captions)

        num_embeddings = max(vocab_size, len(self.word2idx))
        self.embedding = nn.Embedding(num_embeddings, embed_dim)
        self.rnn = nn.GRU(embed_dim, 256, batch_first=True)
        self.linear = nn.Linear(256, 128)

    def _build_vocab(self, captions=None):
        """Populate ``word2idx`` / ``idx2word`` from caption strings."""
        if captions is None:
            # Legacy path: walk the notebook's global dataset.  Every item is
            # fully loaded just to reach its caption, so pass ``captions``
            # explicitly when a list of strings is already available.
            captions = (caption for _, caption in dataset)

        words = set()
        for caption in captions:
            words.update(caption.split())
        # The two reserved tokens always own indices 0 and 1.
        words -= {'<pad>', '<unk>'}

        # Sorted order makes the word -> index mapping reproducible across
        # runs; iterating a set of strings directly is not.
        self.word2idx = {'<pad>': self.PAD_IDX, '<unk>': self.UNK_IDX}
        for idx, word in enumerate(sorted(words), start=2):
            self.word2idx[word] = idx
        self.idx2word = {v: k for k, v in self.word2idx.items()}

    def _text_to_indices(self, text):
        """Map a caption to word indices; unknown words become ``<unk>``."""
        indices = [self.word2idx.get(word, self.UNK_IDX) for word in text.split()]
        # A caption without tokens would produce a zero-length sequence, which
        # pack_padded_sequence rejects; represent it as a single <unk>.
        return indices or [self.UNK_IDX]

    def forward(self, text_list):
        """Encode a batch of caption strings.

        Args:
            text_list: Sequence of caption strings.

        Returns:
            Float tensor of shape ``(len(text_list), 128)`` on the same device
            as the module's parameters, in the order of ``text_list``.
        """
        # Use the module's own device instead of a notebook-level global.
        module_device = self.embedding.weight.device

        indexed = [self._text_to_indices(text) for text in text_list]
        if not indexed:
            return self.linear.weight.new_zeros((0, self.linear.out_features))

        # Lengths stay on the CPU, as pack_padded_sequence requires.
        lengths = torch.tensor([len(seq) for seq in indexed], dtype=torch.long)
        padded = nn.utils.rnn.pad_sequence(
            [torch.tensor(seq, dtype=torch.long) for seq in indexed],
            batch_first=True,
            padding_value=self.PAD_IDX,
        ).to(module_device)

        embedded = self.embedding(padded)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths,
                                                   batch_first=True, enforce_sorted=False)
        _, hidden = self.rnn(packed)
        return self.linear(hidden[-1])



    




In [5]:
# Create image saving directory
import torchvision.utils as vutils
import matplotlib.pyplot as plt
# exist_ok=True keeps this cell re-runnable when the directory already exists.
os.makedirs('/kaggle/working/generated_images', exist_ok=True)

def save_generated_images(epoch, netG, text_encoder, device):
    """Save a 3x3 grid of images generated from a fixed set of text prompts.

    The grid is written to
    ``/kaggle/working/generated_images/epoch_{epoch}.png``.

    Args:
        epoch: Epoch number used in the figure title and the file name.
        netG: Generator called as ``netG(noise, text_embed)``.
        text_encoder: Module that turns a list of strings into embeddings.
        device: Device on which the noise and embeddings are placed.
    """
    output_dir = '/kaggle/working/generated_images'
    # Create the directory here too so the function does not depend on
    # another cell having been run first.
    os.makedirs(output_dir, exist_ok=True)

    # Sample text prompts (modify based on your dataset)
    sample_texts = [
        "a small blue bird", "a red bird with black wings",
        "yellow bird sitting on branch", "brown and white sparrow",
        "black crow with shiny feathers", "white egret in flight",
        "green parrot with red beak", "woodpecker with striped head",
        "flamingo standing in water"
    ]

    # Remember each module's mode so it is restored exactly, even when
    # generation fails, instead of unconditionally switching to train().
    generator_was_training = netG.training
    encoder_was_training = text_encoder.training
    netG.eval()
    text_encoder.eval()
    try:
        with torch.no_grad():
            # Process text embeddings
            text_embed = text_encoder(sample_texts).to(device)

            # One noise vector per prompt; fall back to 100 dimensions when
            # the generator does not expose its noise size.
            noise_dim = getattr(netG, 'noise_dim', 100)
            noise = torch.randn(len(sample_texts), noise_dim).to(device)
            fake_images = netG(noise, text_embed).cpu()
    finally:
        netG.train(generator_was_training)
        text_encoder.train(encoder_was_training)

    # Denormalize images from [-1,1] to [0,1]
    fake_images = (fake_images + 1) / 2

    # Create and save grid
    grid = vutils.make_grid(fake_images, nrow=3, padding=2)
    fig = plt.figure(figsize=(8,8))
    try:
        plt.axis("off")
        plt.title(f"Generated Images - Epoch {epoch}")
        # imshow expects channels last: (C, H, W) -> (H, W, C).
        plt.imshow(grid.permute(1, 2, 0))
        plt.savefig(f'/kaggle/working/generated_images/epoch_{epoch}.png')
    finally:
        # Always release the figure so repeated calls do not accumulate them.
        plt.close(fig)

In [6]:
# Training Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize models
netG = Generator().to(device)
netD = Discriminator().to(device)
text_encoder = TextEncoder().to(device)

# Optimizers.  Only the generator and discriminator are optimised; no
# optimizer owns the text encoder's parameters, so it stays at its initial
# weights and acts as a fixed feature extractor throughout training.
optimizerG = optim.Adam(netG.parameters(), lr=0.0002, betas=(0.5, 0.999))
optimizerD = optim.Adam(netD.parameters(), lr=0.0002, betas=(0.5, 0.999))

# Loss function (the discriminator outputs raw logits)
criterion = nn.BCEWithLogitsLoss()

# Training Loop
num_epochs = 200
for epoch in range(num_epochs):
    for real_images, captions in dataloader:
        # Prepare data
        real_images = real_images.to(device)
        batch_size = real_images.size(0)
        real_labels = torch.ones(batch_size, device=device)
        fake_labels = torch.zeros(batch_size, device=device)

        # The encoder is never updated, so build its output without an
        # autograd graph: the generator and discriminator updates are the
        # same, minus a backward pass into parameters nobody steps.
        with torch.no_grad():
            text_embed = text_encoder(captions).to(device)

        # One generator forward pass serves both updates: detached for the
        # discriminator step, with its graph intact for the generator step.
        # (Batch-norm running statistics are now updated once per iteration
        # rather than twice.)
        noise = torch.randn(batch_size, 100).to(device)
        fake_images = netG(noise, text_embed)

        # Train Discriminator
        netD.zero_grad()
        output_real = netD(real_images, text_embed)
        errD_real = criterion(output_real, real_labels)
        output_fake = netD(fake_images.detach(), text_embed)
        errD_fake = criterion(output_fake, fake_labels)

        # Discriminator backward
        errD = errD_real + errD_fake
        errD.backward()
        optimizerD.step()

        # Train Generator: score the fakes with the just-updated discriminator
        # and push them towards the "real" label.
        netG.zero_grad()
        output = netD(fake_images, text_embed)
        errG = criterion(output, real_labels)

        # Generator backward
        errG.backward()
        optimizerG.step()

    if (epoch + 1) % 25 == 0 or epoch == 0:
        save_generated_images(epoch+1, netG, text_encoder, device)

    # Losses shown are those of the last batch of the epoch.
    print(f'Epoch [{epoch+1}/{num_epochs}] Loss D: {errD.item():.4f} Loss G: {errG.item():.4f}')

# Save models
torch.save(netG.state_dict(), 'generator.pth')
torch.save(netD.state_dict(), 'discriminator.pth')
# The generator was trained against this particular encoder and vocabulary;
# without them the saved generator cannot be conditioned on text again.
torch.save({'state_dict': text_encoder.state_dict(),
            'word2idx': text_encoder.word2idx}, 'text_encoder.pth')

Epoch [1/200] Loss D: 0.5645 Loss G: 3.2629
Epoch [2/200] Loss D: 0.8347 Loss G: 3.2872
Epoch [3/200] Loss D: 0.7153 Loss G: 5.2462
Epoch [4/200] Loss D: 0.8308 Loss G: 2.1147
Epoch [5/200] Loss D: 0.8490 Loss G: 3.5733
Epoch [6/200] Loss D: 0.5237 Loss G: 2.8441
Epoch [7/200] Loss D: 0.7282 Loss G: 5.4472
Epoch [8/200] Loss D: 0.2940 Loss G: 4.1658
Epoch [9/200] Loss D: 0.6381 Loss G: 3.5027
Epoch [10/200] Loss D: 0.5702 Loss G: 3.6230
Epoch [11/200] Loss D: 0.5026 Loss G: 5.1574
Epoch [12/200] Loss D: 0.9674 Loss G: 1.9625
Epoch [13/200] Loss D: 1.4796 Loss G: 2.2233
Epoch [14/200] Loss D: 0.4612 Loss G: 2.1228
Epoch [15/200] Loss D: 0.3734 Loss G: 4.4133
Epoch [16/200] Loss D: 0.2362 Loss G: 5.5360
Epoch [17/200] Loss D: 0.5464 Loss G: 2.9254
Epoch [18/200] Loss D: 0.2781 Loss G: 3.7782
Epoch [19/200] Loss D: 0.5230 Loss G: 2.5578
Epoch [20/200] Loss D: 0.3070 Loss G: 5.3695
Epoch [21/200] Loss D: 0.2167 Loss G: 5.1027
Epoch [22/200] Loss D: 0.2797 Loss G: 6.9065
Epoch [23/200] Loss

In [7]:
from IPython.display import FileLink

# Compress directory
# The source is given as an absolute path, so archive entries are stored under
# kaggle/working/generated_images/ (see the "adding:" lines in the cell output).
!zip -r generated_images.zip /kaggle/working/generated_images

# Create download link
# Left as the cell's last expression so the notebook displays the FileLink object.
FileLink('generated_images.zip')


  adding: kaggle/working/generated_images/ (stored 0%)
  adding: kaggle/working/generated_images/epoch_125.png (deflated 5%)
  adding: kaggle/working/generated_images/epoch_25.png (deflated 5%)
  adding: kaggle/working/generated_images/epoch_75.png (deflated 5%)
  adding: kaggle/working/generated_images/epoch_200.png (deflated 5%)
  adding: kaggle/working/generated_images/epoch_1.png (deflated 4%)
  adding: kaggle/working/generated_images/epoch_50.png (deflated 5%)
  adding: kaggle/working/generated_images/epoch_100.png (deflated 5%)
  adding: kaggle/working/generated_images/epoch_175.png (deflated 5%)
  adding: kaggle/working/generated_images/epoch_150.png (deflated 5%)
