<a href="https://colab.research.google.com/github/Sakib-lite/text-to-image-generation-deep-learning-model/blob/main/text2image.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Importing necessary libraries**

In [1]:
import os
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from transformers import AutoTokenizer


**Mounting Google Drive**


In [2]:
# Colab-only step: mount Google Drive at /content/drive so the dataset archive
# stored under MyDrive can be read by the extraction cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Extracting the dataset zip file**

In [3]:
import zipfile
import os

# Archive location on the mounted Drive and the local directory it is unpacked into.
zip_file_path = '/content/drive/MyDrive/Colab_Datasets/BNATURE.zip'
extract_to = '/content/bnature'

# Make sure the destination exists; an already existing directory is reused.
os.makedirs(extract_to, exist_ok=True)

# Unpack every member of the archive into the destination directory.
with zipfile.ZipFile(file=zip_file_path, mode='r') as archive:
    archive.extractall(path=extract_to)

print("Extraction complete. Contents are now in:", extract_to)

Extraction complete. Contents are now in: /content/bnature


**Defining picture and caption file paths**

In [4]:
# Root of the extracted dataset; the images and the caption file live below it.
dataset_folder = '/content/bnature/'
image_folder, caption_file = (
    os.path.join(dataset_folder, relative_path)
    for relative_path in ('Pictures', 'caption/caption.txt')
)

**Checking total images**

In [5]:
# Keep only regular files from the image directory (sub-directories are ignored).
image_files = list(
    filter(
        lambda name: os.path.isfile(os.path.join(image_folder, name)),
        os.listdir(image_folder),
    )
)
image_count = len(image_files)

print(f"Total images: {image_count}")

Total images: 8000


**Loading Captions**

In [6]:
def load_captions(caption_file):
    """Parse the caption file into a mapping of image file name -> captions.

    Every useful line consists of an image name ending in ``.jpg`` followed by
    the caption text. Lines without ``.jpg`` are skipped, and so are lines
    whose caption part is empty (an empty caption carries no text to
    condition on).

    Args:
        caption_file: Path to the UTF-8 encoded caption text file.

    Returns:
        dict mapping each image file name (including ``.jpg``) to the list of
        its captions, in the order they appear in the file.
    """
    caption_mapping = {}
    # 'utf-8-sig' reads plain UTF-8 unchanged but also drops a leading BOM,
    # which would otherwise stay glued to the first image name (str.strip()
    # does not remove it) and make that image impossible to look up.
    with open(caption_file, 'r', encoding='utf-8-sig') as f:
        for line in f:
            # Split on the first '.jpg' only, so a caption that itself
            # mentions '.jpg' is kept intact.
            name, extension, caption = line.strip().partition('.jpg')
            if not extension:
                continue
            caption = caption.strip()
            if not caption:
                continue
            image_file = name.strip() + '.jpg'
            caption_mapping.setdefault(image_file, []).append(caption)
    return caption_mapping

captions = load_captions(caption_file)

# Preview the first three (image name, captions) entries to confirm the parse.
sample_entries = list(captions.items())[:3]
print("Sample captions:", sample_entries)

Sample captions: [('1.jpg', ['গ্রামে হাঁটা দুই শিশু।', 'দুই শিশু গ্রামে হাঁটছে।', 'একটা নীল জামা পড়া শিশু।', 'দুই শিশু খেলা করে।', 'দুইটা শিশু দাঁড়িয়ে আছে।']), ('2.jpg', ['কিছু লোক নদী পার করছে।', 'কিছু লোক বিল পার করছে।', 'অনেকগুলো লোক নদী পার করছে।', 'জমির ভিতর অনেকগুলো লোক।', 'কিছু লোক নদী পার করছে।']), ('3.jpg', ['এক কৃষক ক্ষেতে কাজ করছে।', 'এক কৃষক ধান ক্ষেতে কাজ করছে।', 'রোদ এর নিচে একটা লোক দাঁড়িয়ে আছে।', 'এক কৃষক ধান দেখছে।', 'শার্ট পড়া একটা লোক।'])]


**STEPS**


1.   ***Import necessary libraries***
2.   ***Mount Google Drive***
3.   ***Extract ZIP***
4.   ***Load captions***
5.   ***Preprocess images (resize)***
6.   ***Tokenize captions with bangla-bert***




**Pre process image function**

In [7]:
def preprocess_image(image_path, target_size=(128, 128)):
    """Load one image from disk, resize it and scale its pixels to [0, 1].

    Args:
        image_path: Path of the image file to read.
        target_size: Size handed to ``load_img`` for resizing.

    Returns:
        The array produced by ``img_to_array`` divided by 255.
    """
    resized = load_img(image_path, target_size=target_size)
    pixels = img_to_array(resized)
    return pixels / 255.0


# Smoke test: preprocess the image named '1.jpg' and print the resulting array
# shape to confirm resizing worked.
sample_image_path = os.path.join(image_folder, '1.jpg')
sample_image = preprocess_image(sample_image_path)
print("Sample image shape:", sample_image.shape)


Sample image shape: (128, 128, 3)


**Tokenize caption with bangla-bert**

In [8]:
# Load the pretrained tokenizer of the "sagorsarker/bangla-bert-base" checkpoint;
# tokenize_caption below uses this module-level object.
tokenizer = AutoTokenizer.from_pretrained("sagorsarker/bangla-bert-base")

def tokenize_caption(caption, max_length=20):
    """Tokenize one caption with the module-level ``tokenizer``.

    The caption is padded or truncated to exactly ``max_length`` tokens.

    Args:
        caption: Caption text to encode.
        max_length: Fixed length of the returned sequences. The default of 20
            is the length the dataset's output signature in this notebook is
            written for; change both together.

    Returns:
        Tuple ``(input_ids, attention_mask)``; the leading batch axis added by
        the tokenizer is squeezed away from each tensor.
    """
    tokenized = tokenizer(
        caption,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="tf",
    )
    # The tokenizer returns tensors with a leading batch axis of size 1.
    input_ids = tf.squeeze(tokenized["input_ids"], axis=0)
    attention_mask = tf.squeeze(tokenized["attention_mask"], axis=0)
    return input_ids, attention_mask

# Smoke test: tokenize the first caption of image '1.jpg' and print its token ids.
sample_caption = captions['1.jpg'][0]
input_ids, attention_mask = tokenize_caption(sample_caption)
print("Tokenized caption:", input_ids.numpy())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/491 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/2.24M [00:00<?, ?B/s]

Tokenized caption: [  101 32993 33532 70919  7932 72990  1014   102     0     0     0     0
     0     0     0     0     0     0     0     0]


**Create Dataset**

In [9]:
def create_dataset(image_folder, captions, batch_size=8, target_size=(128, 128)):
    """Build a shuffled, batched dataset of (image, (input_ids, attention_mask)).

    Every caption of every image becomes one sample, so an image with several
    captions appears several times. Images are loaded and captions tokenized
    lazily while the dataset is iterated.

    Args:
        image_folder: Directory containing the image files.
        captions: Mapping of image file name -> list of caption strings.
        batch_size: Number of samples per batch.
        target_size: (height, width) the images are resized to.

    Returns:
        A ``tf.data.Dataset`` yielding ``(images, (input_ids, attention_masks))``
        batches, reshuffled on every iteration over the dataset.
    """
    image_paths = []
    all_captions = []

    for image_file, caption_list in captions.items():
        image_path = os.path.join(image_folder, image_file)
        for caption in caption_list:
            image_paths.append(image_path)
            all_captions.append(caption)

    def generator():
        # Shuffle sample indices rather than decoded samples. A
        # Dataset.shuffle buffer of len(image_paths) would have to hold every
        # decoded float32 image in memory at once, and a buffer size of 0
        # (empty caption mapping) is rejected by tf.data. The generator is
        # re-run for each pass over the dataset, so each epoch gets a new order.
        for index in np.random.permutation(len(image_paths)):
            img_array = preprocess_image(image_paths[index], target_size)
            input_ids, attention_mask = tokenize_caption(all_captions[index])
            yield img_array, (input_ids, attention_mask)

    dataset = tf.data.Dataset.from_generator(
        generator,
        output_signature=(
            tf.TensorSpec(shape=(target_size[0], target_size[1], 3), dtype=tf.float32),
            (
                # 20 is the token length tokenize_caption pads/truncates to.
                tf.TensorSpec(shape=(20,), dtype=tf.int32),
                tf.TensorSpec(shape=(20,), dtype=tf.int32),
            ),
        ),
    )

    dataset = dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset

# Build the training dataset in batches of 4 samples.
dataset = create_dataset(image_folder, captions, batch_size=4)

In [10]:
# Pull a single batch and print the shape of each component as a sanity check.
for sample_batch in dataset.take(1):
    batch_images, (batch_input_ids, batch_attention_masks) = sample_batch
    print("Batch images shape:", batch_images.shape)
    print("Batch input IDs shape:", batch_input_ids.shape)
    print("Batch attention masks shape:", batch_attention_masks.shape)


Batch images shape: (4, 128, 128, 3)
Batch input IDs shape: (4, 20)
Batch attention masks shape: (4, 20)


In [11]:
import tensorflow as tf
from tensorflow.keras import layers, Model





> The ConditioningAugmentation class is a custom TensorFlow/Keras layer designed to process text embeddings for further use in tasks like image generation. It applies a series of transformations to the input text embeddings, including dimensionality reduction and the addition of noise, to make the embeddings more robust and suitable for downstream tasks.

**embedding_dim**: The dimensionality of the input text embedding.

**latent_dim**: The target dimensionality of the processed embedding (often smaller than embedding_dim).

**self.dense**: A fully connected layer (Dense) that maps the high-dimensional text embedding (embedding_dim) to a lower-dimensional latent space (latent_dim).

**self.activation**: A LeakyReLU activation function to introduce non-linearity and handle negative values in a more nuanced way than regular ReLU.

**self.noise**: A utility for generating random noise (Gaussian noise in this case).



In [12]:
class ConditioningAugmentation(layers.Layer):
    """Project a text embedding to ``latent_dim`` and perturb it with noise.

    The input goes through a Dense layer and a LeakyReLU; standard-normal
    noise of the same shape is then added to the result.

    Args:
        embedding_dim: Dimensionality of the incoming text embedding. It is
            stored for serialization only; the Dense layer infers its input
            size when first called.
        latent_dim: Number of units of the Dense projection.
        **kwargs: Standard Keras layer arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, embedding_dim, latent_dim, **kwargs):
        # Forward the base-layer arguments so the layer can be rebuilt from
        # the config returned by get_config (which includes e.g. ``name``).
        super(ConditioningAugmentation, self).__init__(**kwargs)
        self.embedding_dim = embedding_dim
        self.latent_dim = latent_dim
        self.dense = layers.Dense(latent_dim)
        self.activation = layers.LeakyReLU()
        self.noise = tf.random.normal

    def call(self, text_embedding):
        """Return the activated projection of ``text_embedding`` plus noise."""
        projected = self.dense(text_embedding)
        conditioned = self.activation(projected)
        # Fresh noise is drawn on every call, including outside training.
        noise = self.noise(tf.shape(conditioned))
        return conditioned + noise

    def get_config(self):
        """Return the constructor arguments so models using this layer can be saved.

        Keras versions that require custom layers with constructor arguments
        to override ``get_config`` cannot serialize a model containing this
        layer (e.g. ``model.save(...)``) without it.
        """
        config = super(ConditioningAugmentation, self).get_config()
        config.update(
            {
                "embedding_dim": self.embedding_dim,
                "latent_dim": self.latent_dim,
            }
        )
        return config


> The upsampling_block function defines a block for upsampling in a deep learning model, typically used in image generation tasks or semantic segmentation. Upsampling increases the spatial dimensions (width and height) of the feature map while applying transformations to extract meaningful features.

In [13]:
def upsampling_block(x, filters, kernel_size=4, strides=2, padding="same"):
    """Apply transposed convolution, batch normalization and ReLU to ``x``.

    Args:
        x: Input feature map.
        filters: Number of output channels of the transposed convolution.
        kernel_size: Kernel size of the transposed convolution.
        strides: Stride of the transposed convolution.
        padding: Padding mode of the transposed convolution.

    Returns:
        The transformed feature map.
    """
    deconv = layers.Conv2DTranspose(filters, kernel_size, strides=strides, padding=padding)
    features = deconv(x)
    features = layers.BatchNormalization()(features)
    return layers.ReLU()(features)


> This function build_generator defines the generator model for a GAN (Generative Adversarial Network). The generator's goal is to create synthetic images based on the provided text embeddings and noise, progressively upsampling a low-resolution representation into a high-resolution image.

In [14]:
def build_generator(text_embedding_dim, noise_dim, image_size=(128, 128, 3)):
    """Build the generator: (text embedding, noise) -> image in [-1, 1].

    The text embedding goes through ``ConditioningAugmentation`` (128 units),
    is concatenated with the noise vector, projected to a 4x4x512 feature map
    and then doubled in size by ``upsampling_block`` until it reaches the
    requested resolution. A final tanh transposed convolution produces the
    image channels.

    Args:
        text_embedding_dim: Width of the text-embedding input.
        noise_dim: Width of the noise input.
        image_size: ``(height, width, channels)`` of the generated image.
            Height and width must be equal and be 4 times a power of two
            (4, 8, 16, ..., 128, 256, ...).

    Returns:
        A Keras ``Model`` named "Generator" with inputs ``[text, noise]``.

    Raises:
        ValueError: If ``image_size`` cannot be reached by repeatedly doubling
            the 4x4 base feature map.
    """
    height, width, channels = image_size
    base_size = 4

    # The spatial size was previously ignored (the output was always 128x128);
    # derive the number of stride-2 stages from the requested size instead.
    scale = height // base_size
    reachable = (
        height == width
        and height % base_size == 0
        and scale >= 1
        and scale & (scale - 1) == 0  # power of two
    )
    if not reachable:
        raise ValueError(
            f"image_size {tuple(image_size)} is not supported: height and width must be "
            f"equal and be {base_size} times a power of two"
        )
    num_upsamples = scale.bit_length() - 1  # log2(scale)

    text_input = layers.Input(shape=(text_embedding_dim,))
    noise_input = layers.Input(shape=(noise_dim,))

    # Conditioning augmentation
    conditioning = ConditioningAugmentation(text_embedding_dim, 128)
    conditioned_text = conditioning(text_input)

    # Combine text embeddings and noise
    combined_input = layers.Concatenate()([conditioned_text, noise_input])

    # Initial dense layer to form a low-resolution feature map
    x = layers.Dense(base_size * base_size * 512, activation="relu")(combined_input)
    x = layers.Reshape((base_size, base_size, 512))(x)

    # Each block doubles height and width. Filters halve from 256 down to a
    # floor of 16, which gives 256, 128, 64, 32, 16 for the default 128x128.
    for stage in range(num_upsamples):
        x = upsampling_block(x, max(512 >> (stage + 1), 16))

    # Output layer: tanh keeps pixel values in [-1, 1]
    output = layers.Conv2DTranspose(channels, kernel_size=3, strides=1, padding="same", activation="tanh")(x)

    return Model(inputs=[text_input, noise_input], outputs=output, name="Generator")


**Test the Generator**

In [15]:
# Hyperparameters shared by the rest of the notebook
text_embedding_dim = 768  # Output dimension of Bangla BERT
noise_dim = 100  # Dimension of random noise vector
image_size = (128, 128, 3)

# Instantiate the generator
generator = build_generator(text_embedding_dim, noise_dim, image_size)

# Smoke test with random stand-ins for a batch of text embeddings and noise
num_samples = 8
sample_text_embedding = tf.random.normal((num_samples, text_embedding_dim))
sample_noise = tf.random.normal((num_samples, noise_dim))

# One forward pass; the batch should come out as (8, 128, 128, 3)
generated_images = generator([sample_text_embedding, sample_noise])
print("Generated images shape:", generated_images.shape)


Generated images shape: (8, 128, 128, 3)


**Discriminator Implementation**


In [16]:
def build_discriminator(image_size=(128, 128, 3), text_embedding_dim=768):
    """Build the discriminator: (image, text embedding) -> (real/fake, match).

    The image passes through four stride-2 convolutions with LeakyReLU (batch
    normalization on all but the first), is flattened, concatenated with the
    text embedding and fed to a shared 512-unit dense layer. Two sigmoid heads
    read from that layer.

    Args:
        image_size: ``(height, width, channels)`` of the image input.
        text_embedding_dim: Width of the text-embedding input.

    Returns:
        A Keras ``Model`` named "Discriminator" with inputs ``[image, text]``
        and outputs ``[adversarial_score, matching_score]``.
    """
    image_input = layers.Input(shape=image_size)
    text_input = layers.Input(shape=(text_embedding_dim,))

    # (filters, use_batch_norm) per convolution stage; every stage uses
    # stride 2, so each one halves height and width.
    conv_stages = ((64, False), (128, True), (256, True), (512, True))

    features = image_input
    for filters, use_batch_norm in conv_stages:
        features = layers.Conv2D(filters, kernel_size=4, strides=2, padding="same")(features)
        if use_batch_norm:
            features = layers.BatchNormalization()(features)
        features = layers.LeakyReLU()(features)
    features = layers.Flatten()(features)

    # Fuse the image features with the text embedding
    fused = layers.Concatenate()([features, text_input])
    fused = layers.Dense(512, activation="relu")(fused)

    # Head 1: real vs. fake; head 2: text-image alignment score
    adv_output = layers.Dense(1, activation="sigmoid", name="Adversarial")(fused)
    match_output = layers.Dense(1, activation="sigmoid", name="Matching")(fused)

    return Model(
        inputs=[image_input, text_input],
        outputs=[adv_output, match_output],
        name="Discriminator",
    )


**Test the Discriminator**



In [17]:
image_size = (128, 128, 3)
text_embedding_dim = 768

discriminator = build_discriminator(image_size, text_embedding_dim)

# Smoke test with random stand-ins for a batch of images and text embeddings
num_samples = 8
sample_images = tf.random.normal((num_samples,) + image_size)
sample_text_embeddings = tf.random.normal((num_samples, text_embedding_dim))

# Both heads should return one score per sample, i.e. shape (8, 1)
adv_preds, match_preds = discriminator([sample_images, sample_text_embeddings])
print("Adversarial predictions shape:", adv_preds.shape)
print("Matching predictions shape:", match_preds.shape)


Adversarial predictions shape: (8, 1)
Matching predictions shape: (8, 1)


**Implement Loss Functions**

In [18]:
def adversarial_loss(real_preds, fake_preds):
    """Real/fake loss for the discriminator.

    Sums the binary cross-entropy of ``real_preds`` against all-ones targets
    and of ``fake_preds`` against all-zeros targets. Predictions are
    probabilities, not logits.
    """
    criterion = tf.keras.losses.BinaryCrossentropy(from_logits=False)
    loss_on_real = criterion(tf.ones_like(real_preds), real_preds)
    loss_on_fake = criterion(tf.zeros_like(fake_preds), fake_preds)
    return loss_on_real + loss_on_fake


**Generator Loss**

In [19]:
def generator_loss(fake_preds):
    """Binary cross-entropy of ``fake_preds`` against all-ones targets.

    The loss is low when the discriminator scores generated images as real.
    Predictions are probabilities, not logits.
    """
    criterion = tf.keras.losses.BinaryCrossentropy(from_logits=False)
    real_labels = tf.ones_like(fake_preds)
    return criterion(real_labels, fake_preds)


**Matching Loss**

In [20]:
def matching_loss(real_text_preds, fake_text_preds):
    """Text-image matching loss.

    Sums the binary cross-entropy of ``real_text_preds`` against all-ones
    targets and of ``fake_text_preds`` against all-zeros targets. Predictions
    are probabilities, not logits.
    """
    criterion = tf.keras.losses.BinaryCrossentropy(from_logits=False)
    matched_labels = tf.ones_like(real_text_preds)
    mismatched_labels = tf.zeros_like(fake_text_preds)
    return criterion(matched_labels, real_text_preds) + criterion(mismatched_labels, fake_text_preds)


**Testing the Loss Functions**

In [21]:
# Simulate predictions
real_preds = tf.random.uniform((8, 1), 0.7, 1.0)  # Discriminator's output for real images
fake_preds = tf.random.uniform((8, 1), 0.0, 0.3)  # Discriminator's output for fake images
real_text_preds = tf.random.uniform((8, 1), 0.7, 1.0)  # Matching head output for real text-image pair
fake_text_preds = tf.random.uniform((8, 1), 0.0, 0.3)  # Matching head output for fake text-image pair

# Calculate losses
disc_loss = adversarial_loss(real_preds, fake_preds)
gen_loss = generator_loss(fake_preds)
match_loss = matching_loss(real_text_preds, fake_text_preds)

print("Discriminator Loss:", disc_loss.numpy())
print("Generator Loss:", gen_loss.numpy())
print("Matching Loss:", match_loss.numpy())


Discriminator Loss: 0.38897952
Generator Loss: 2.3805423
Matching Loss: 0.38053903


**Define Optimizers**

In [22]:
# One Adam instance per network (created generator first, then discriminator),
# both with the same learning rate and beta_1.
generator_optimizer, discriminator_optimizer = (
    tf.keras.optimizers.Adam(learning_rate=0.0002, beta_1=0.5) for _ in range(2)
)


**Training Step Function**

In [23]:
@tf.function
def train_step(images, text_embeddings, attention_masks, generator, discriminator, noise_dim=100):
    """Run one discriminator update followed by one generator update.

    Uses the module-level ``discriminator_optimizer`` and
    ``generator_optimizer``.

    Args:
        images: Batch of real images scaled to [0, 1], as produced by
            ``preprocess_image``.
        text_embeddings: Batch of text conditioning vectors fed to both
            networks. NOTE(review): must match the networks' text input width;
            confirm the caller passes embeddings rather than raw token ids.
        attention_masks: Accepted for interface compatibility; not used.
        generator: Generator model taking ``[text_embeddings, noise]``.
        discriminator: Discriminator model taking ``[images, text_embeddings]``
            and returning ``(adversarial_preds, matching_preds)``.
        noise_dim: Width of the noise vector; must match the generator's
            noise input.

    Returns:
        Tuple ``(d_loss, g_loss)`` of scalar losses for this step.
    """
    batch_size = tf.shape(images)[0]
    random_noise = tf.random.normal((batch_size, noise_dim))

    # The generator ends in tanh and therefore emits values in [-1, 1]. Bring
    # the real images into the same range, otherwise the discriminator can
    # tell real from fake by value range alone.
    real_images = images * 2.0 - 1.0

    # Train Discriminator
    with tf.GradientTape() as disc_tape:
        # Real images
        real_preds, real_match_preds = discriminator([real_images, text_embeddings], training=True)

        # Fake images
        generated_images = generator([text_embeddings, random_noise], training=True)
        fake_preds, fake_match_preds = discriminator([generated_images, text_embeddings], training=True)

        # Discriminator Loss
        d_loss = adversarial_loss(real_preds, fake_preds) + matching_loss(real_match_preds, fake_match_preds)

    disc_gradients = disc_tape.gradient(d_loss, discriminator.trainable_variables)
    discriminator_optimizer.apply_gradients(zip(disc_gradients, discriminator.trainable_variables))

    # Train Generator
    with tf.GradientTape() as gen_tape:
        # Generate fake images and score them with the updated discriminator
        generated_images = generator([text_embeddings, random_noise], training=True)
        fake_preds, fake_match_preds = discriminator([generated_images, text_embeddings], training=True)

        # The generator wants both heads to score its images as 1 (real and
        # matching the text). Reusing matching_loss here would label the fake
        # pairs 0, i.e. train the generator toward the discriminator's goal;
        # its real-pair term also carries no gradient for the generator.
        g_loss = generator_loss(fake_preds) + generator_loss(fake_match_preds)

    gen_gradients = gen_tape.gradient(g_loss, generator.trainable_variables)
    generator_optimizer.apply_gradients(zip(gen_gradients, generator.trainable_variables))

    return d_loss, g_loss


**Full Training Loop**

In [24]:
def train(dataset, generator, discriminator, epochs, text_encoder=None):
    """Train the GAN for ``epochs`` passes over ``dataset``.

    Losses are printed every 100 steps. Every 5th epoch both models are saved
    to ``.h5`` files and up to 8 sample images are generated from the last
    batch's text conditioning and written via ``save_generated_images``.

    NOTE(review): the dataset built by ``create_dataset`` in this notebook
    yields token ids of length 20 in the slot consumed here as
    ``text_embeddings``, while the generator and discriminator are built with
    a 768-wide text input. Supply ``text_encoder`` to turn the ids into
    embeddings of the expected width; confirm which encoder is intended.

    Args:
        dataset: Iterable of ``(images, (text_embeddings, attention_masks))``
            batches.
        generator: Generator model with inputs ``[text, noise]``.
        discriminator: Discriminator model.
        epochs: Number of passes over ``dataset``.
        text_encoder: Optional callable
            ``(text_embeddings, attention_masks) -> embeddings`` applied to
            each batch before the training step. ``None`` passes the batch
            through unchanged.
    """
    # Read the noise width from the generator's second input instead of
    # relying on a global variable defined in another cell.
    sample_noise_dim = int(generator.inputs[1].shape[-1])

    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        text_embeddings = None  # stays None if the dataset yields no batch
        for step, (images, (text_embeddings, attention_masks)) in enumerate(dataset):
            if text_encoder is not None:
                text_embeddings = text_encoder(text_embeddings, attention_masks)

            d_loss, g_loss = train_step(images, text_embeddings, attention_masks, generator, discriminator)

            if step % 100 == 0:
                print(f"Step {step}: Discriminator Loss = {d_loss.numpy()}, Generator Loss = {g_loss.numpy()}")

        # Save model and generate sample images
        if (epoch + 1) % 5 == 0:
            generator.save(f"generator_epoch_{epoch + 1}.h5")
            discriminator.save(f"discriminator_epoch_{epoch + 1}.h5")

            if text_embeddings is None:
                # Empty dataset: there is no text to condition samples on.
                continue

            # The last batch may hold fewer than 8 rows (the notebook trains
            # with batches of 4), so size the noise to the text actually used.
            sample_text = text_embeddings[:8]
            random_noise = tf.random.normal((tf.shape(sample_text)[0], sample_noise_dim))
            generated_images = generator([sample_text, random_noise])
            save_generated_images(generated_images, epoch + 1)


**Save Sample Images**

In [25]:
import matplotlib.pyplot as plt
import os

def save_generated_images(images, epoch):
    """Write each image of a batch to ``generated_images/`` as a PNG file.

    Args:
        images: Batch of images with values in [-1, 1]; they are mapped to
            [0, 1] before saving.
        epoch: Epoch number embedded in the file names.
    """
    output_dir = "generated_images"
    os.makedirs(output_dir, exist_ok=True)
    rescaled = (images + 1) / 2.0  # [-1, 1] -> [0, 1]
    for i, image in enumerate(rescaled):
        target_path = os.path.join(output_dir, f"epoch_{epoch}_img_{i}.png")
        plt.imsave(target_path, image.numpy())


**Train the Model**

In [None]:
# Optimizer settings: fresh Adam instances for this training run
generator_lr = 0.0002
discriminator_lr = 0.0002

generator_optimizer = tf.keras.optimizers.Adam(generator_lr, beta_1=0.5)
discriminator_optimizer = tf.keras.optimizers.Adam(discriminator_lr, beta_1=0.5)

# Training hyperparameters
epochs = 20
noise_dim = 100

# Start training (the former first line of this cell was not valid Python and
# would have prevented the cell from running at all).
train(dataset, generator, discriminator, epochs)

Epoch 1/20
