In [50]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
from tensorflow.keras import layers
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 
import string
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import PIL
import random
import time
from pathlib import Path

import re
from IPython import display

In [None]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Restrict TensorFlow to only use the first GPU
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')

        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

1 Physical GPUs, 1 Logical GPUs


I0000 00:00:1763478550.257210    1196 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5561 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9


: 

: 

<h2> Preprocess Text </h2>

In [None]:
dictionary_path = './dictionary'
vocab = np.load(dictionary_path + '/vocab.npy')
print('there are {} vocabularies in total'.format(len(vocab)))

word2Id_dict = dict(np.load(dictionary_path + '/word2Id.npy'))
id2word_dict = dict(np.load(dictionary_path + '/id2Word.npy'))
print('Word to id mapping, for example: %s -> %s' % ('flower', word2Id_dict['flower']))
print('Id to word mapping, for example: %s -> %s' % ('1', id2word_dict['1']))
print('Tokens: <PAD>: %s; <RARE>: %s' % (word2Id_dict['<PAD>'], word2Id_dict['<RARE>']))

there are 5427 vocabularies in total
Word to id mapping, for example: flower -> 1
Id to word mapping, for example: 1 -> flower
Tokens: <PAD>: 5427; <RARE>: 5428


: 

: 

In [None]:
def sent2IdList(line, MAX_SEQ_LENGTH=20):
    MAX_SEQ_LIMIT = MAX_SEQ_LENGTH
    padding = 0
    
    # data preprocessing, remove all puntuation in the texts
    prep_line = re.sub('[%s]' % re.escape(string.punctuation), ' ', line.rstrip())
    prep_line = prep_line.replace('-', ' ')
    prep_line = prep_line.replace('-', ' ')
    prep_line = prep_line.replace('  ', ' ')
    prep_line = prep_line.replace('.', '')
    tokens = prep_line.split(' ')
    tokens = [
        tokens[i] for i in range(len(tokens))
        if tokens[i] != ' ' and tokens[i] != ''
    ]
    l = len(tokens)
    padding = MAX_SEQ_LIMIT - l
    
    # make sure length of each text is equal to MAX_SEQ_LENGTH, and replace the less common word with <RARE> token
    for i in range(padding):
        tokens.append('<PAD>')
    line = [
        word2Id_dict[tokens[k]]
        if tokens[k] in word2Id_dict else word2Id_dict['<RARE>']
        for k in range(len(tokens))
    ]

    return line

text = "the flower shown has yellow anther red pistil and bright red petals."
print(text)
print(sent2IdList(text))

the flower shown has yellow anther red pistil and bright red petals.
[np.str_('9'), np.str_('1'), np.str_('82'), np.str_('5'), np.str_('11'), np.str_('70'), np.str_('20'), np.str_('31'), np.str_('3'), np.str_('29'), np.str_('20'), np.str_('2'), np.str_('5427'), np.str_('5427'), np.str_('5427'), np.str_('5427'), np.str_('5427'), np.str_('5427'), np.str_('5427'), np.str_('5427')]


: 

<h2> Dataset</h2>

In [51]:
data_path = './dataset'
df = pd.read_pickle(data_path + '/text2ImgData.pkl')
num_training_sample = len(df)
n_images_train = num_training_sample
print('There are %d image in training data' % (n_images_train))

There are 7370 image in training data


<h2>Create Dataset by Dataset API </h2>

In [71]:
# in this competition, you have to generate image in size 64x64x3
IMAGE_HEIGHT = 64
IMAGE_WIDTH = 64
IMAGE_CHANNEL = 3
IMAGE_SIZE_CROPPED = 32

def training_data_generator(caption, image_path):
    # load in the image according to image path
    img = tf.io.read_file(image_path)
    img = tf.image.decode_image(img, channels=3)
    img = tf.image.convert_image_dtype(img, tf.float32)
    img.set_shape([None, None, 3])
    img = tf.image.resize(img, size=[IMAGE_HEIGHT, IMAGE_WIDTH])
    img.set_shape([IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNEL])

    distorted_image = tf.image.random_crop(img, [IMAGE_SIZE_CROPPED,IMAGE_SIZE_CROPPED,IMAGE_CHANNEL])
    distorted_image = tf.image.resize(distorted_image, [IMAGE_HEIGHT, IMAGE_WIDTH])
    distorted_image = tf.image.random_flip_left_right(distorted_image)
    distorted_image = tf.image.random_brightness(distorted_image, max_delta=0.2)
    distorted_image = tf.image.random_contrast(distorted_image, lower=0.8, upper=1.2)
    distorted_image = tf.image.per_image_standardization(distorted_image)
    distorted_image = tf.clip_by_value(distorted_image, 0.0, 1.0)

    caption = tf.cast(caption, tf.int32)

    return distorted_image, caption

def dataset_generator(filenames, batch_size, data_generator):
    # load the training data into two NumPy arrays
    df = pd.read_pickle(filenames)
    captions = df['Captions'].values
    caption = []
    # each image has 1 to 10 corresponding captions
    # we choose one of them randomly for training
    for i in range(len(captions)):
        caption.append(random.choice(captions[i]))
    caption = np.asarray(caption)
    caption = caption.astype(int)
    image_path = df['ImagePath'].values
    
    # assume that each row of `features` corresponds to the same row as `labels`.
    assert caption.shape[0] == image_path.shape[0]
    
    dataset = tf.data.Dataset.from_tensor_slices((caption, image_path))
    dataset = dataset.map(data_generator, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.shuffle(len(caption)).batch(batch_size, drop_remainder=True)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    return dataset

In [72]:
BATCH_SIZE = 64
dataset = dataset_generator(data_path + '/text2ImgData.pkl', BATCH_SIZE, training_data_generator)

<h1> Conditional GAN Model </h1>


<h2> Text Encoder </h2>
A RNN encoder that captures the meaning of input text.

Input: text, which is a list of ids. <br>
Output: embedding, or hidden representation of input text.

In [74]:
class TextEncoder(tf.keras.Model):
    """
    Encode text (a caption) into hidden representation
    input: text, which is a list of ids
    output: embedding, or hidden representation of input text in dimension of RNN_HIDDEN_SIZE
    """
    def __init__(self, hparas):
        super(TextEncoder, self).__init__()
        self.hparas = hparas
        self.batch_size = self.hparas['BATCH_SIZE']
        
        # embedding with tensorflow API
        self.embedding = layers.Embedding(self.hparas['VOCAB_SIZE'], self.hparas['EMBED_DIM'])
        # RNN, here we use GRU cell, another common RNN cell similar to LSTM
        self.gru = layers.GRU(self.hparas['RNN_HIDDEN_SIZE'],
                              return_sequences=True,
                              return_state=True,
                              recurrent_initializer='glorot_uniform')
    
    def call(self, text, hidden):
        text = self.embedding(text)
        output, state = self.gru(text, initial_state = hidden)
        return output[:, -1, :], state
    
    def initialize_hidden_state(self):
        return tf.zeros((self.hparas['BATCH_SIZE'], self.hparas['RNN_HIDDEN_SIZE']))

<h2> Generator</h2>
A image generator which generates the target image illustrating the input text.

Input: hidden representation of input text and random noise z with random seed. <br>
Output: target image, which is conditioned on the given text, in size 64x64x3.

In [75]:
class Generator(tf.keras.Model):
    """
    Generate fake image based on given text(hidden representation) and noise z
    input: text and noise
    output: fake image with size 64*64*3
    """
    def __init__(self, hparas):
        super(Generator, self).__init__()
        self.hparas = hparas
        self.flatten = tf.keras.layers.Flatten()
        self.d1 = tf.keras.layers.Dense(self.hparas['DENSE_DIM'])
        self.d2 = tf.keras.layers.Dense(64*64*3)
        
    def call(self, text, noise_z):
        text = self.flatten(text)
        text = self.d1(text)
        text = tf.nn.leaky_relu(text)
        
        # concatenate input text and random noise
        text_concat = tf.concat([noise_z, text], axis=1)
        text_concat = self.d2(text_concat)
        
        logits = tf.reshape(text_concat, [-1, 64, 64, 3])
        output = tf.nn.tanh(logits)
        
        return logits, output

<h2>Discriminator</h2>
A binary classifier which can discriminate the real and fake image:

1. Real image
    - Input: real image and the paired text
    - Output: a floating number representing the result, which is expected to be 1.
2. Fake Image
    - Input: generated image and paired text
    - Output: a floating number representing the result, which is expected to be 0.

In [76]:
class Discriminator(tf.keras.Model):
    """
    Differentiate the real and fake image
    input: image and corresponding text
    output: labels, the real image should be 1, while the fake should be 0
    """
    def __init__(self, hparas):
        super(Discriminator, self).__init__()
        self.hparas = hparas
        self.flatten = tf.keras.layers.Flatten()
        self.d_text = tf.keras.layers.Dense(self.hparas['DENSE_DIM'])
        self.d_img = tf.keras.layers.Dense(self.hparas['DENSE_DIM'])
        self.d = tf.keras.layers.Dense(1)
    
    def call(self, img, text):
        text = self.flatten(text)
        text = self.d_text(text)
        text = tf.nn.leaky_relu(text)
        
        img = self.flatten(img)
        img = self.d_img(img)
        img = tf.nn.leaky_relu(img)
        
        # concatenate image with paired text
        img_text = tf.concat([text, img], axis=1)
        
        logits = self.d(img_text)
        output = tf.nn.sigmoid(logits)
        
        return logits, output

In [77]:
hparas = {
    'MAX_SEQ_LENGTH': 20,                     # maximum sequence length
    'EMBED_DIM': 256,                         # word embedding dimension
    'VOCAB_SIZE': len(word2Id_dict),          # size of dictionary of captions
    'RNN_HIDDEN_SIZE': 128,                   # number of RNN neurons
    'Z_DIM': 512,                             # random noise z dimension
    'DENSE_DIM': 128,                         # number of neurons in dense layer
    'IMAGE_SIZE': [64, 64, 3],                # render image size
    'BATCH_SIZE': 64,
    'LR': 1e-4,
    'LR_DECAY': 0.5,
    'BETA_1': 0.5,
    'N_EPOCH': 500,                            # number of epoch for demo
    'N_SAMPLE': num_training_sample,          # size of training data
    'CHECKPOINTS_DIR': './checkpoints/demo',  # checkpoint path
    'PRINT_FREQ': 1                           # printing frequency of loss
}

In [78]:
text_encoder = TextEncoder(hparas)
generator = Generator(hparas)
discriminator = Discriminator(hparas)

<h2> Loss Function and Optimization </h2>
Although the conditional GAN model is quite complex, the loss function used to optimize the network is relatively simple. Actually, it is simply a binary classification task, thus we use cross entropy as our loss.

In [59]:
# This method returns a helper function to compute cross entropy loss
cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)

In [79]:
def discriminator_loss(real_logits, fake_logits):
    # output value of real image should be 1
    real_loss = cross_entropy(tf.ones_like(real_logits), real_logits)
    # output value of fake image should be 0
    fake_loss = cross_entropy(tf.zeros_like(fake_logits), fake_logits)
    total_loss = real_loss + fake_loss
    return total_loss

def generator_loss(fake_output):
    # output value of fake image should be 0
    return cross_entropy(tf.ones_like(fake_output), fake_output)

In [80]:
# we use seperated optimizers for training generator and discriminator
generator_optimizer = tf.keras.optimizers.Adam(hparas['LR'])
discriminator_optimizer = tf.keras.optimizers.Adam(hparas['LR'])

In [81]:
# one benefit of tf.train.Checkpoint() API is we can save everything seperately
checkpoint_dir = hparas['CHECKPOINTS_DIR']
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(generator_optimizer=generator_optimizer,
                                 discriminator_optimizer=discriminator_optimizer,
                                 text_encoder=text_encoder,
                                 generator=generator,
                                 discriminator=discriminator)

In [83]:
@tf.function
def train_step(real_image, caption, hidden):
    # random noise for generator
    noise = tf.random.normal(shape=[hparas['BATCH_SIZE'], hparas['Z_DIM']], mean=0.0, stddev=1.0)
    
    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        text_embed, hidden = text_encoder(caption, hidden)
        _, fake_image = generator(text_embed, noise)
        real_logits, real_output = discriminator(real_image, text_embed)
        fake_logits, fake_output = discriminator(fake_image, text_embed)

        g_loss = generator_loss(fake_logits)
        d_loss = discriminator_loss(real_logits, fake_logits)

    grad_g = gen_tape.gradient(g_loss, generator.trainable_variables)
    grad_d = disc_tape.gradient(d_loss, discriminator.trainable_variables)

    generator_optimizer.apply_gradients(zip(grad_g, generator.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(grad_d, discriminator.trainable_variables))
    
    return g_loss, d_loss

In [84]:
@tf.function
def test_step(caption, noise, hidden):
    text_embed, hidden = text_encoder(caption, hidden)
    _, fake_image = generator(text_embed, noise)
    return fake_image

<h2> Visualiztion </h2>
During training, we can visualize the generated image to evaluate the quality of generator. The followings are some functions helping visualization.

In [85]:
def merge(images, size):
    h, w = images.shape[1], images.shape[2]
    img = np.zeros((h * size[0], w * size[1], 3))
    for idx, image in enumerate(images):
        i = idx % size[1]
        j = idx // size[1]
        img[j*h:j*h+h, i*w:i*w+w, :] = image
    return img

def imsave(images, size, path):
    # getting the pixel values between [0, 1] to save it
    return plt.imsave(path, merge(images, size)*0.5 + 0.5)

def save_images(images, size, image_path):
    return imsave(images, size, image_path)

In [86]:
def sample_generator(caption, batch_size):
    caption = np.asarray(caption)
    caption = caption.astype(int)
    dataset = tf.data.Dataset.from_tensor_slices(caption)
    dataset = dataset.batch(batch_size)
    return dataset

We always use same random seed and same senteces during training, which is more convenient for us to evaluate the quality of generated image.

In [87]:
ni = int(np.ceil(np.sqrt(hparas['BATCH_SIZE'])))
sample_size = hparas['BATCH_SIZE']
sample_seed = np.random.normal(loc=0.0, scale=1.0, size=(sample_size, hparas['Z_DIM'])).astype(np.float32)
sample_sentence = ["the flower shown has yellow anther red pistil and bright red petals."] * int(sample_size/ni) + \
                  ["this flower has petals that are yellow, white and purple and has dark lines"] * int(sample_size/ni) + \
                  ["the petals on this flower are white with a yellow center"] * int(sample_size/ni) + \
                  ["this flower has a lot of small round pink petals."] * int(sample_size/ni) + \
                  ["this flower is orange in color, and has petals that are ruffled and rounded."] * int(sample_size/ni) + \
                  ["the flower has yellow petals and the center of it is brown."] * int(sample_size/ni) + \
                  ["this flower has petals that are blue and white."] * int(sample_size/ni) +\
                  ["these white flowers have petals that start off white in color and end in a white towards the tips."] * int(sample_size/ni)

for i, sent in enumerate(sample_sentence):
    sample_sentence[i] = sent2IdList(sent)
sample_sentence = sample_generator(sample_sentence, hparas['BATCH_SIZE'])

<h2> Training</h2>

In [88]:
if not os.path.exists('samples/demo'):
    os.makedirs('samples/demo')

In [89]:
def train(dataset, epochs):
    # hidden state of RNN
    hidden = text_encoder.initialize_hidden_state()
    steps_per_epoch = int(hparas['N_SAMPLE']/hparas['BATCH_SIZE'])
    
    for epoch in range(hparas['N_EPOCH']):
        g_total_loss = 0
        d_total_loss = 0
        start = time.time()
        
        for image, caption in dataset:
            g_loss, d_loss = train_step(image, caption, hidden)
            g_total_loss += g_loss
            d_total_loss += d_loss
            
        time_tuple = time.localtime()
        time_string = time.strftime("%m/%d/%Y, %H:%M:%S", time_tuple)
            
        print("Epoch {}, gen_loss: {:.4f}, disc_loss: {:.4f}".format(epoch+1,
                                                                     g_total_loss/steps_per_epoch,
                                                                     d_total_loss/steps_per_epoch))
        print('Time for epoch {} is {:.4f} sec'.format(epoch+1, time.time()-start))
        
        # save the model
        if (epoch + 1) % 50 == 0:
            checkpoint.save(file_prefix = checkpoint_prefix)
        
        # visualization
        if (epoch + 1) % hparas['PRINT_FREQ'] == 0:
            for caption in sample_sentence:
                fake_image = test_step(caption, sample_seed, hidden)
            save_images(fake_image, [ni, ni], 'samples/demo/train_{:02d}.jpg'.format(epoch))

In [90]:
train(dataset, hparas['N_EPOCH'])

Epoch 1, gen_loss: 0.4744, disc_loss: 1.0821
Time for epoch 1 is 13.0698 sec
Epoch 2, gen_loss: 0.5088, disc_loss: 1.0445
Time for epoch 2 is 10.1853 sec


2025-11-18 23:47:28.651011: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:453] ShuffleDatasetV3:24: Filling up shuffle buffer (this may take a while): 6454 of 7370
2025-11-18 23:47:29.588771: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:483] Shuffle buffer filled.


Epoch 3, gen_loss: 0.7372, disc_loss: 0.8094
Time for epoch 3 is 12.1837 sec
Epoch 4, gen_loss: 1.2155, disc_loss: 0.4767
Time for epoch 4 is 9.0302 sec
Epoch 5, gen_loss: 2.1637, disc_loss: 0.1637
Time for epoch 5 is 8.6122 sec
Epoch 6, gen_loss: 1.8494, disc_loss: 0.2246
Time for epoch 6 is 8.5663 sec


2025-11-18 23:47:57.282643: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch 7, gen_loss: 2.2657, disc_loss: 0.1518
Time for epoch 7 is 6.2509 sec
Epoch 8, gen_loss: 2.4750, disc_loss: 0.1345
Time for epoch 8 is 5.9473 sec
Epoch 9, gen_loss: 3.3259, disc_loss: 0.0817
Time for epoch 9 is 6.2755 sec
Epoch 10, gen_loss: 3.1633, disc_loss: 0.1257
Time for epoch 10 is 6.3384 sec
Epoch 11, gen_loss: 3.6073, disc_loss: 0.1250
Time for epoch 11 is 5.8538 sec
Epoch 12, gen_loss: 3.4676, disc_loss: 0.2188
Time for epoch 12 is 9.3209 sec
Epoch 13, gen_loss: 3.7564, disc_loss: 0.2967
Time for epoch 13 is 6.2543 sec
Epoch 14, gen_loss: 3.2343, disc_loss: 0.4113
Time for epoch 14 is 6.3133 sec
Epoch 15, gen_loss: 3.1046, disc_loss: 0.5961
Time for epoch 15 is 5.8222 sec
Epoch 16, gen_loss: 3.6075, disc_loss: 0.6133
Time for epoch 16 is 5.8780 sec
Epoch 17, gen_loss: 3.5659, disc_loss: 0.5888
Time for epoch 17 is 9.1776 sec
Epoch 18, gen_loss: 3.0460, disc_loss: 0.8053
Time for epoch 18 is 6.1880 sec
Epoch 19, gen_loss: 2.6490, disc_loss: 0.8931
Time for epoch 19 is 6.4

2025-11-18 23:49:45.446332: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch 23, gen_loss: 2.8307, disc_loss: 0.6429
Time for epoch 23 is 7.7397 sec
Epoch 24, gen_loss: 2.7131, disc_loss: 0.5649
Time for epoch 24 is 7.0300 sec
Epoch 25, gen_loss: 2.5156, disc_loss: 0.5978
Time for epoch 25 is 7.0146 sec
Epoch 26, gen_loss: 2.3961, disc_loss: 0.6570
Time for epoch 26 is 8.8056 sec
Epoch 27, gen_loss: 2.3936, disc_loss: 0.6855
Time for epoch 27 is 6.9505 sec
Epoch 28, gen_loss: 2.1838, disc_loss: 0.7968
Time for epoch 28 is 7.0657 sec
Epoch 29, gen_loss: 2.5868, disc_loss: 0.6139
Time for epoch 29 is 6.5204 sec
Epoch 30, gen_loss: 2.5624, disc_loss: 0.6878
Time for epoch 30 is 6.8514 sec
Epoch 31, gen_loss: 2.8060, disc_loss: 0.5265
Time for epoch 31 is 9.5227 sec
Epoch 32, gen_loss: 2.2354, disc_loss: 0.6818
Time for epoch 32 is 6.9282 sec
Epoch 33, gen_loss: 2.1495, disc_loss: 0.5810
Time for epoch 33 is 6.8299 sec
Epoch 34, gen_loss: 1.3416, disc_loss: 0.7964
Time for epoch 34 is 6.2176 sec
Epoch 35, gen_loss: 1.3697, disc_loss: 0.6988
Time for epoch 35 

2025-11-18 23:53:32.227630: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch 55, gen_loss: 1.2459, disc_loss: 1.0362
Time for epoch 55 is 5.9513 sec
Epoch 56, gen_loss: 1.1941, disc_loss: 1.1944
Time for epoch 56 is 9.0275 sec
Epoch 57, gen_loss: 1.2770, disc_loss: 1.0603
Time for epoch 57 is 6.6623 sec
Epoch 58, gen_loss: 1.3282, disc_loss: 1.0069
Time for epoch 58 is 5.8260 sec
Epoch 59, gen_loss: 1.2671, disc_loss: 1.1458
Time for epoch 59 is 6.5665 sec
Epoch 60, gen_loss: 1.4645, disc_loss: 1.0229
Time for epoch 60 is 6.6010 sec
Epoch 61, gen_loss: 1.7480, disc_loss: 0.8756
Time for epoch 61 is 9.4505 sec
Epoch 62, gen_loss: 1.7490, disc_loss: 0.8418
Time for epoch 62 is 6.3509 sec
Epoch 63, gen_loss: 1.9214, disc_loss: 0.6115
Time for epoch 63 is 6.1128 sec
Epoch 64, gen_loss: 1.7322, disc_loss: 0.6158
Time for epoch 64 is 6.2754 sec
Epoch 65, gen_loss: 1.8946, disc_loss: 0.5795
Time for epoch 65 is 6.3628 sec
Epoch 66, gen_loss: 1.8037, disc_loss: 0.6154
Time for epoch 66 is 8.3870 sec
Epoch 67, gen_loss: 1.7925, disc_loss: 0.7135
Time for epoch 67 

2025-11-19 00:00:55.611914: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch 119, gen_loss: 1.5469, disc_loss: 0.9483
Time for epoch 119 is 6.0726 sec
Epoch 120, gen_loss: 1.4798, disc_loss: 1.0447
Time for epoch 120 is 5.9917 sec
Epoch 121, gen_loss: 1.6131, disc_loss: 0.9563
Time for epoch 121 is 5.7368 sec
Epoch 122, gen_loss: 1.4579, disc_loss: 1.0411
Time for epoch 122 is 8.6366 sec
Epoch 123, gen_loss: 1.4323, disc_loss: 1.0303
Time for epoch 123 is 5.7960 sec
Epoch 124, gen_loss: 1.3881, disc_loss: 1.0528
Time for epoch 124 is 5.8756 sec
Epoch 125, gen_loss: 1.5445, disc_loss: 0.9842
Time for epoch 125 is 5.7908 sec
Epoch 126, gen_loss: 1.4696, disc_loss: 1.0187
Time for epoch 126 is 5.6987 sec
Epoch 127, gen_loss: 1.4810, disc_loss: 0.9828
Time for epoch 127 is 9.7869 sec
Epoch 128, gen_loss: 1.4557, disc_loss: 1.0399
Time for epoch 128 is 6.9226 sec
Epoch 129, gen_loss: 1.2879, disc_loss: 1.1703
Time for epoch 129 is 6.5490 sec
Epoch 130, gen_loss: 1.5021, disc_loss: 1.0193
Time for epoch 130 is 6.2184 sec
Epoch 131, gen_loss: 1.4767, disc_loss: 

2025-11-19 00:09:53.133746: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:453] ShuffleDatasetV3:24: Filling up shuffle buffer (this may take a while): 5808 of 7370
2025-11-19 00:09:54.716111: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:483] Shuffle buffer filled.


Epoch 196, gen_loss: 1.5020, disc_loss: 1.1874
Time for epoch 196 is 12.8369 sec
Epoch 197, gen_loss: 1.6647, disc_loss: 1.0103
Time for epoch 197 is 9.3718 sec
Epoch 198, gen_loss: 1.5587, disc_loss: 1.0023
Time for epoch 198 is 6.1760 sec
Epoch 199, gen_loss: 1.5826, disc_loss: 1.0789
Time for epoch 199 is 10.2270 sec
Epoch 200, gen_loss: 1.4385, disc_loss: 1.1876
Time for epoch 200 is 6.4763 sec
Epoch 201, gen_loss: 1.5747, disc_loss: 1.0684
Time for epoch 201 is 10.0416 sec
Epoch 202, gen_loss: 1.4573, disc_loss: 1.1758
Time for epoch 202 is 6.0276 sec
Epoch 203, gen_loss: 1.5890, disc_loss: 1.0697
Time for epoch 203 is 6.3927 sec
Epoch 204, gen_loss: 1.4824, disc_loss: 1.0583
Time for epoch 204 is 7.2034 sec
Epoch 205, gen_loss: 1.5384, disc_loss: 1.0545
Time for epoch 205 is 6.4880 sec
Epoch 206, gen_loss: 1.4727, disc_loss: 1.0805
Time for epoch 206 is 10.2695 sec
Epoch 207, gen_loss: 1.4575, disc_loss: 1.1106
Time for epoch 207 is 6.1055 sec
Epoch 208, gen_loss: 1.4821, disc_lo

2025-11-19 00:15:47.540746: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch 247, gen_loss: 1.7208, disc_loss: 0.9575
Time for epoch 247 is 8.8089 sec
Epoch 248, gen_loss: 1.5742, disc_loss: 0.9886
Time for epoch 248 is 6.2782 sec
Epoch 249, gen_loss: 1.6643, disc_loss: 0.8866
Time for epoch 249 is 7.0054 sec
Epoch 250, gen_loss: 1.4031, disc_loss: 1.1704
Time for epoch 250 is 6.1672 sec
Epoch 251, gen_loss: 1.5151, disc_loss: 1.0678
Time for epoch 251 is 6.5822 sec
Epoch 252, gen_loss: 1.5337, disc_loss: 1.0863
Time for epoch 252 is 9.1965 sec
Epoch 253, gen_loss: 1.5235, disc_loss: 1.0815
Time for epoch 253 is 6.2257 sec
Epoch 254, gen_loss: 1.5091, disc_loss: 1.1236
Time for epoch 254 is 6.2603 sec
Epoch 255, gen_loss: 1.6282, disc_loss: 0.9375
Time for epoch 255 is 5.8817 sec
Epoch 256, gen_loss: 1.4227, disc_loss: 1.1569
Time for epoch 256 is 6.3715 sec
Epoch 257, gen_loss: 1.5686, disc_loss: 0.9964
Time for epoch 257 is 8.8179 sec
Epoch 258, gen_loss: 1.5061, disc_loss: 1.1102
Time for epoch 258 is 6.3685 sec
Epoch 259, gen_loss: 1.6218, disc_loss: 

<h1> Evaluation </h1>
dataset/testData.pkl is a pandas dataframe containing testing text with attributes 'ID' and 'Captions'.

- 'ID': text ID used to name generated image.
- 'Captions': text used as condition to generate image.

For each captions, you need to generate inference_ID.png to evaluate quality of generated image. You must name the generated image in this format, otherwise we cannot evaluate your images.

<h2> Testing Dataset </h2>

If you change anything during preprocessing of training dataset, you must make sure same operations have be done in testing dataset.

In [91]:
def testing_data_generator(caption, index):
    caption = tf.cast(caption, tf.float32)
    return caption, index

def testing_dataset_generator(batch_size, data_generator):
    data = pd.read_pickle('./dataset/testData.pkl')
    captions = data['Captions'].values
    caption = []
    for i in range(len(captions)):
        caption.append(captions[i])
    caption = np.asarray(caption)
    caption = caption.astype(int)
    index = data['ID'].values
    index = np.asarray(index)
    
    dataset = tf.data.Dataset.from_tensor_slices((caption, index))
    dataset = dataset.map(data_generator, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.repeat().batch(batch_size)
    
    return dataset

In [92]:
testing_dataset = testing_dataset_generator(hparas['BATCH_SIZE'], testing_data_generator)

In [93]:
data = pd.read_pickle('./dataset/testData.pkl')
captions = data['Captions'].values

NUM_TEST = len(captions)
EPOCH_TEST = int(NUM_TEST / hparas['BATCH_SIZE'])

<h2> Inferece </h2>

In [94]:
if not os.path.exists('./inference/demo'):
    os.makedirs('./inference/demo')

In [95]:
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)

In [96]:
def inference(dataset):
    hidden = text_encoder.initialize_hidden_state()
    sample_size = hparas['BATCH_SIZE']
    sample_seed = np.random.normal(loc=0.0, scale=1.0, size=(sample_size, hparas['Z_DIM'])).astype(np.float32)
    
    step = 0
    start = time.time()
    for captions, idx in dataset:
        if step > EPOCH_TEST:
            break
        
        fake_image = test_step(captions, sample_seed, hidden)
        step += 1
        for i in range(hparas['BATCH_SIZE']):
            plt.imsave('./inference/demo/inference_{:04d}.jpg'.format(idx[i]), fake_image[i].numpy()*0.5 + 0.5)
            
    print('Time for inference is {:.4f} sec'.format(time.time()-start))

In [97]:
latest_ckpt = tf.train.latest_checkpoint(checkpoint_dir)
print("Restoring from:", latest_ckpt)
checkpoint.restore(latest_ckpt)


Restoring from: ./checkpoints/demo/ckpt-10


<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x727096a40500>

In [98]:
inference(testing_dataset)

Time for inference is 5.9873 sec
