In [23]:
import torch
import json
from transformers import CLIPTokenizer,CLIPTextModel

In [39]:
from torch.utils.data import Dataset
from torchvision.transforms import transforms
from PIL import Image
import numpy as np

# Image preprocessing pipeline: resize to 256x256, convert to a tensor, then
# normalize every channel with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [
        transforms.Resize((256, 256)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ]
)


class CocoWithAnnotations(Dataset):
    """COCO 2014 caption dataset that yields noised images for diffusion training.

    Each item pairs one caption annotation with its image. The image is
    transformed, mixed with Gaussian noise at a randomly drawn timestep of a
    linear beta schedule, and returned together with the noise, the timestep
    and the caption's text-encoder hidden states.

    Args:
        path: Root directory containing ``annotations/`` and the image
            folders (``train2014/`` or ``val2014/``).
        tokenizer: Callable used to tokenize the caption (called with
            ``padding``, ``truncation``, ``max_length`` and ``return_tensors``).
        text_model: Text encoder; its output must expose ``last_hidden_state``.
        transform: Callable applied to the loaded RGB PIL image; expected to
            return a tensor.
        timesteps: Number of steps in the noise schedule.
        train: Use the training split when True, the validation split otherwise.
    """

    def __init__(self, path, tokenizer, text_model, transform, timesteps=1000, train=True):
        super().__init__()
        self.path = path
        self.data = None
        self.transform = transform
        self.tokenizer = tokenizer
        self.text_model = text_model
        self.timesteps = timesteps
        self.train = train
        self.open_json()

        # Define noise schedule
        self.betas = np.linspace(0.0001, 0.02, self.timesteps)
        self.alphas = 1 - self.betas
        self.alpha_cumprod = np.cumprod(self.alphas)

    def _split(self):
        """Return the split name used in COCO file and folder names."""
        return 'train' if self.train else 'val'

    def _image_path(self, image_id):
        """Build the image path for ``image_id`` inside the active split.

        The id is zero-padded to 12 digits, which matches the original
        ``'000000' + zfill(6)`` naming for ids of up to six digits.
        """
        split = self._split()
        return f'{self.path}/{split}2014/COCO_{split}2014_{int(image_id):012d}.jpg'

    def open_json(self):
        """Load the caption annotations of the active split into ``self.data``."""
        split = self._split()
        label = 'training' if self.train else 'validation'
        print(f'======================= Loading {label} annotations =======================')
        # Read-only mode: the file is never written, and 'r+' would require
        # write permission. Both splits use the same path layout so absolute
        # roots work for validation too.
        with open(f'{self.path}/annotations/captions_{split}2014.json', 'r', encoding='utf-8') as stream:
            self.data = json.load(stream)['annotations']
        print('======================= ANNOTATIONS LOADED =======================')

    def __getitem__(self, index):
        """Return ``(noised_image, noise, t, text_embs)`` for annotation ``index``.

        ``t`` is a Python int drawn uniformly from ``[0, timesteps)``;
        ``noise`` has the same shape as the transformed image.
        """
        annot = self.data[index]

        # Load and transform the image. Converting to RGB keeps grayscale
        # files compatible with a 3-channel Normalize, and the context manager
        # closes the file handle once the pixels have been read.
        with Image.open(self._image_path(annot["image_id"])) as raw_image:
            image = self.transform(raw_image.convert('RGB'))

        # Get text embeddings. No gradient tracking: the encoder is only used
        # as a feature extractor here, so samples should not carry a graph.
        tokens = self.tokenizer(annot['caption'], padding="max_length", truncation=True, max_length=77, return_tensors="pt")
        with torch.no_grad():
            text_embs = self.text_model(**tokens).last_hidden_state.squeeze(0)

        # Add noise to the image. torch's RNG is used for the timestep because
        # DataLoader seeds it per worker, unlike numpy's global RNG.
        t = int(torch.randint(0, self.timesteps, (1,)).item())
        alpha_t = torch.tensor(self.alpha_cumprod[t]).float()
        noise = torch.randn_like(image)
        noised_image = (alpha_t.sqrt() * image) + ((1 - alpha_t).sqrt() * noise)

        return noised_image, noise, t, text_embs

    def __len__(self):
        """Return the number of caption annotations in the active split."""
        return len(self.data)

In [40]:
def linear_beta_schedule(timesteps, beta_start=0.0001, beta_end=0.02):
    """Return a linearly spaced beta schedule.

    Args:
        timesteps: Number of values in the schedule.
        beta_start: First beta value (default 0.0001).
        beta_end: Last beta value, inclusive (default 0.02).

    Returns:
        1-D tensor of ``timesteps`` values from ``beta_start`` to ``beta_end``.
    """
    return torch.linspace(beta_start, beta_end, timesteps)

# Define alpha values (cumulative product of 1 - beta)
def compute_alpha_cumprod(beta_schedule):
    """Return the running product of ``1 - beta`` along dimension 0."""
    return torch.cumprod(1.0 - beta_schedule, dim=0)

# Module-level diffusion schedule: step count, per-step betas, and the
# cumulative product of (1 - beta).
timesteps = 1_000
beta_schedule = linear_beta_schedule(timesteps)
alpha_cumprod = compute_alpha_cumprod(beta_schedule)

In [6]:
from torch.utils.data import DataLoader
from tqdm import tqdm
from utils.unet import UNet
import torch.nn.functional as F

# Hyperparameters
epochs = 5
batch_size = 16
learning_rate = 1e-4
timesteps = 1000

# Use the GPU when present; fall back to CPU instead of crashing on .cuda().
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer and embedder
tokenizer = CLIPTokenizer.from_pretrained('openai/clip-vit-base-patch32')
text_model = CLIPTextModel.from_pretrained('openai/clip-vit-base-patch32')
# The text encoder is never optimized here (only unet.parameters() are), so
# freeze it: the embeddings produced per sample then carry no autograd graph.
text_model.eval()
text_model.requires_grad_(False)

# Prepare data and model
dataset = CocoWithAnnotations("./coco", tokenizer, text_model, transform, timesteps)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
# NOTE(review): the recorded run of this cell hit a CUDA out-of-memory error on
# the first batch. UNet lives in utils.unet and is not visible here — confirm
# its memory footprint at image_size=256 / batch_size=16 before re-running.
unet = UNet(image_size=256).to(device)
optimizer = torch.optim.AdamW(unet.parameters(), lr=learning_rate)

# Training loop
for epoch in range(epochs):
    unet.train()
    epoch_loss = 0.0
    for noised_image, noise, t, context in tqdm(dataloader):
        noised_image = noised_image.to(device)
        noise = noise.to(device)
        context = context.to(device)

        # Predict noise using UNet
        # NOTE(review): the sampled timestep `t` is not passed to the model —
        # confirm whether UNet expects timestep conditioning.
        predicted_noise = unet(noised_image, context)

        # Compute loss
        loss = F.mse_loss(predicted_noise, noise)
        epoch_loss += loss.item()

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}/{epochs} - Loss: {epoch_loss / len(dataloader):.4f}")




  0%|          | 0/25883 [00:06<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 GiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Of the allocated memory 10.02 GiB is allocated by PyTorch, and 620.86 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)