In [1]:
import os
import sys

# Make the project's `src` directory importable from this notebook.
module_path = os.path.join(os.getcwd(), '../src')
# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if module_path not in sys.path:
    sys.path.append(module_path)

import torch 
from gan_t2i.models.CLIP import CLIPModel
from gan_t2i.utils.model_loading import download_CLIP_model , CLIP_DATASETS


from gan_t2i.models.GAN import WGAN

from gan_t2i.datasets.DatasetFactory import DatasetFactory
import torchvision.transforms as transforms
import clip
from PIL import Image

In [2]:
# Sanity check: print whether PyTorch reports a usable CUDA device.
print(torch.cuda.is_available())

True


# Loading the CLIP model from a checkpoint

In [3]:
# Ask the project helper for the FLOWERS CLIP weights and keep what it returns.
# Presumably this is the local path of the weights file (the value is handed to
# CLIPModel.load in the next cell) — TODO confirm against the helper.
checkpoints_path = download_CLIP_model(CLIP_DATASETS.FLOWERS)

[92mCLIP model FLOWERS already exits at /home/xxx/Desktop/Deep Learning/Deep-Learning-Final-Project/examples/models_weights/CLIP/CLIP~FT_FLOWERS/CLIP~FT_FLOWERS.pt[0m


In [4]:
# Load the CLIP model from the value obtained from download_CLIP_model.
clip_model = CLIPModel.load(checkpoints_path)

Model loaded on device: cuda


------------------------------------

In [5]:
from torch.utils.data import DataLoader , SubsetRandomSampler

# Loading dataset

In [6]:
def tokenize_text(text):
    """Tokenize a caption with the CLIP tokenizer.

    Text has to be tokenized before it is passed to the model.

    Args:
        text: The caption to tokenize.

    Returns:
        The first (and only) row of what ``clip.tokenize`` returns for the
        caption.

    Notes:
        ``clip.tokenize`` raises ``RuntimeError`` when the caption does not
        fit in the context size (77 tokens by default). In that case only the
        part of the caption before the first "." is kept, and that part is
        truncated if it is still too long, so an over-long caption never
        makes the dataset fail. Any other exception is propagated instead of
        being silently swallowed.
    """
    try:
        return clip.tokenize([text])[0]
    except RuntimeError:
        # Caption too long for the context size: keep only the first sentence
        # and let the tokenizer cut whatever still does not fit.
        first_sentence = text.split(".")[0]
        return clip.tokenize([first_sentence], truncate=True)[0]
    

In [7]:
# Image transformations passed to the dataset as `transform_img`.
transform_img = transforms.Compose([
    # Bicubic resize to 224. torchvision's own InterpolationMode enum is used
    # instead of the PIL integer constant; it selects the same filter.
    transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.CenterCrop(224),
    transforms.ToTensor(),

    # Previously computed mean and std
    transforms.Normalize([0.4355, 0.3777, 0.2879], [0.2571, 0.2028, 0.2101])
])

In [8]:
# Build the Flowers dataset with <cwd>/../data as its directory argument, passing
# transform_img as the image transform and tokenize_text as the caption transform.
dataset = DatasetFactory.Flowers(os.path.join(os.getcwd(), "..", "data"), transform_img=transform_img, transform_caption=tokenize_text)

Captions already downloaded
images already downloaded
Captions already extracted
images already extracted
[92mThe dataset is already stored in HDF5 format[0m


In [9]:
# Build the train / validation / test splits.
# NOTE: only a small subset of the dataset is used in this example. This may
#       (and will) cause overfitting; it is only a demo.
dataset_len = len(dataset)
train_size = int(0.05 * dataset_len)
val_size = int(0.02 * dataset_len)
test_size = int(0.02 * dataset_len)

# Consecutive, non-overlapping index ranges laid out as [train | val | test]
val_start = train_size
test_start = val_start + val_size
train_indices = list(range(0, val_start))
val_indices = list(range(val_start, test_start))
test_indices = list(range(test_start, test_start + test_size))

# One random sampler per split
train_sampler, val_sampler, test_sampler = (
    SubsetRandomSampler(split_indices)
    for split_indices in (train_indices, val_indices, test_indices)
)

# One dataloader per split, all built with the same settings
train_loader, val_loader, test_loader = (
    DataLoader(dataset, batch_size=16, sampler=split_sampler, pin_memory=True)
    for split_sampler in (train_sampler, val_sampler, test_sampler)
)

---------------------------------------

# Creating the GAN model and training it

- Get the image and text embedding dimensions

In [10]:
# Get the output dimensions of the model's last layer (image side and text side).
dim_img_size , dim_text_size = clip_model.get_output_dimensions()
# Index 1 of each result is printed; presumably the embedding width
# (the recorded output shows 512 for both) — TODO confirm.
print("Ultimo livello del modello:", dim_img_size[1] , dim_text_size[1])

tensor([[49406, 34246,  4160, 49407,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0]], device='cuda:0',
       dtype=torch.int32)
Ultimo livello del modello: 512 512


In [11]:
# Text embedding size, read from index 1 of the text output dimensions.
embedding_size = dim_text_size[1]
# NOTE(review): presumably the size the text embedding is projected to inside
# the GAN — confirm against the WGAN constructor.
p_emb_dim = 128
# Build the WGAN from the loaded CLIP model and the two sizes above.
WGAN_model = WGAN(clip_model,embedding_size,p_emb_dim)


In [12]:
# Directory under the current working directory, passed to fit() as save_path.
# NOTE(review): this rebinds `checkpoints_path`, which earlier in the notebook
# held the value returned by download_CLIP_model; re-running the CLIP loading
# cell after this one would receive this directory instead. Consider a
# distinct name.
checkpoints_path = os.path.join(os.getcwd(), "checkpoints")

# Train for a single epoch using the train and validation dataloaders.
WGAN_model.fit(
    train_dataloader = train_loader, 
    val_dataloader = val_loader,
    num_epochs = 1, 
    save_path=checkpoints_path
)

Training on device:  cuda
One : tensor([1.], device='cuda:0') | Mone : tensor([-1.], device='cuda:0') 
epoch iter 0 


Epoch [1/1] Batch [1/256] :00<02:27,  1.73it/s]
  => Loss Discriminator: -0.040023 
 => Loss Generator: 0.250033 
Epoch [1/1] Batch [1/256] [00:00<02:27,  1.73it/s]
  => Loss Discriminator: -0.040023 
 => Loss Generator: 0.250033 
Epoch [1/1] Batch [2/256] [00:00<01:40,  2.53it/s]
  => Loss Discriminator: 0.002061 
 => Loss Generator: 0.248786 
Epoch [1/1] Batch [2/256] [00:00<01:40,  2.53it/s] 
  => Loss Discriminator: 0.002061 
 => Loss Generator: 0.248786 
Epoch [1/1] Batch [3/256] [00:01<01:23,  3.02it/s]
  => Loss Discriminator: 0.043221 
 => Loss Generator: 0.248746 
Epoch [1/1] Batch [3/256] [00:01<01:23,  3.02it/s]
  => Loss Discriminator: 0.043221 
 => Loss Generator: 0.248746 
Epoch [1/1] Batch [4/256] [00:01<01:15,  3.34it/s]
  => Loss Discriminator: 0.052907 
 => Loss Generator: 0.249045 
Epoch [1/1] Batch [4/256] [00:01<01:15,  3.34it/s]
  => Loss Discriminator: 0.052907 
 => Loss Generator: 0.249045 
Epoch [1/1] Batch [5/256] [00:01<01:10,  3.55it/s]
  => Loss Discriminat

Epoch [1/1] Summary: 

	=> Train Generator Loss: 0.248668 
 
	=> Train Discriminator Loss: 0.069705 
 



----------------------------------------------------

# Load the trained model and generate an image from a caption

In [13]:
# Checkpoint file to restore. Presumably the file written by fit() for epoch 1
# into the "checkpoints" directory — TODO confirm the naming scheme.
checkpoints_full_path = os.path.join(os.getcwd(), "checkpoints/WGAN_epoch-1.pt")
# A fresh WGAN is built with the same arguments as before; WGAN_model is then
# rebound to whatever load() returns, not to aux_model itself.
aux_model = WGAN(clip_model,embedding_size,p_emb_dim)
WGAN_model = aux_model.load(model_pt_filepath=checkpoints_full_path)

Checkpoint loaded. Resuming training from epoch 1.


In [14]:
import matplotlib.pyplot as plt

def show_image(image_tensor):
    """Display an image with matplotlib, with the axes hidden.

    Args:
        image_tensor: Anything ``plt.imshow`` accepts, or a ``torch.Tensor``.
            A tensor is detached from the autograd graph, moved to the CPU
            and converted to a NumPy array first, because matplotlib cannot
            convert CUDA tensors or tensors that require grad on its own.
            The layout of the data is not changed.
    """
    if isinstance(image_tensor, torch.Tensor):
        image_tensor = image_tensor.detach().cpu().numpy()
    plt.imshow(image_tensor)
    plt.axis('off')
    plt.show()


In [15]:
# Example usage of the predict function
import random

# test_loader draws its indices through a SubsetRandomSampler, so the first
# batch it yields is already a random one. Taking it directly avoids loading
# every test batch into a list just to keep one of them.
random_batch = next(iter(test_loader))

# Unpack images and captions from the batch
images, captions, _ = random_batch

# Pick a single random caption from the batch instead of passing the whole
# batch to predict(): the run recorded for this cell failed with a 3-axis
# permute applied to a 4-D tensor when all captions were passed at once.
random_index = random.randrange(len(captions))
device = ("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): a leading batch dimension of 1 is kept on the assumption that
# predict() expects batched token tensors — confirm against WGAN.predict.
random_captions = captions[random_index:random_index + 1].to(device)

generated_image = WGAN_model.predict(random_captions)
show_image(generated_image)

RuntimeError: permute(sparse_coo): number of dimensions in the tensor input does not match the length of the desired ordering of dimensions i.e. input.dim() = 4 is not equal to len(dims) = 3