In [None]:
import os
os.environ["KERAS_BACKEND"] = "tensorflow"  # must be set before keras is imported

import json
import re

import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import keras
from keras import layers
from keras.applications import efficientnet
from keras.layers import TextVectorization
from nltk.translate.bleu_score import sentence_bleu

# Seed the random number generators once, up front, so a Restart & Run All is
# repeatable. train_val_split shuffles with np.random, so it depends on this
# call having run first (keras.utils.set_random_seed is documented to seed
# Python's `random`, NumPy and the backend - confirm for the installed version).
keras.utils.set_random_seed(111)

In [None]:
pip install datasets

In [None]:
!wget -q https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip
!wget -q https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip
!unzip -qq Flickr8k_Dataset.zip
!unzip -qq Flickr8k_text.zip
!rm Flickr8k_Dataset.zip Flickr8k_text.zip

In [None]:
from datasets import load_dataset

# Load the "explanation" configuration of the New Yorker caption-contest
# dataset; the trailing expression makes the notebook display the result.
dset = load_dataset(
    path="jmhessel/newyorker_caption_contest",
    name="explanation",
)
dset

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive (Colab only).
drive.mount(mountpoint='/content/drive')

In [None]:
# get ms coco dataset
!git clone https://github.com/tylin/coco-caption.git

In [None]:
# ---- Data ------------------------------------------------------------------
IMAGES_PATH = "Flicker8k_Dataset"   # directory holding the images
IMAGE_SIZE = (299, 299)             # target image dimensions

# ---- Text ------------------------------------------------------------------
VOCAB_SIZE = 10000                  # vocabulary size (max_tokens of the vectorizer)
SEQ_LENGTH = 25                     # every caption is brought to this fixed token length

# ---- Model -----------------------------------------------------------------
EMBED_DIM = 512                     # size of both the image and the token embeddings
FF_DIM = 512                        # units per layer in the feed-forward network

# ---- Training --------------------------------------------------------------
BATCH_SIZE = 64
EPOCHS = 30
AUTOTUNE = tf.data.AUTOTUNE

In [None]:
def preprocess_caption(caption):
    """Normalise a raw caption and wrap it in ``<start>`` / ``<end>`` markers.

    The caption is lower-cased, every character that is neither a word
    character nor whitespace is removed, and the remaining words are joined
    with single spaces.

    Args:
        caption: Raw caption text.

    Returns:
        The cleaned caption as ``"<start> word word ... <end>"``. An empty (or
        punctuation-only) caption yields ``"<start> <end>"``.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', caption.lower())
    # Split *after* the punctuation is gone: trimming first (as before) left a
    # dangling space for captions such as "a dog ." and produced
    # "<start> a dog  <end>" with a double space.
    words = without_punctuation.split()
    return ' '.join(['<start>', *words, '<end>'])

# Flickr8k
IMAGES_PATH_FLICKR8K = "Flicker8k_Dataset"
def load_flickr8k_data(filename):
    """Read a Flickr8k token file into an ``{image_path: [captions]}`` mapping.

    Every non-blank line must hold an image reference and a caption separated
    by a tab. Anything after ``#`` in the image reference is dropped and the
    remaining name is joined onto ``IMAGES_PATH_FLICKR8K``. Captions are passed
    through ``preprocess_caption``.

    Args:
        filename: Path of the token file.

    Returns:
        dict mapping each image path to the list of its preprocessed captions,
        in file order.

    Raises:
        ValueError: If a non-blank line contains no tab separator.
    """
    image_captions = {}
    # Stream the file instead of materialising every line with readlines().
    with open(filename, encoding="utf-8") as file:
        for line_number, raw_line in enumerate(file, start=1):
            line = raw_line.strip()
            if not line:
                continue  # tolerate blank / trailing lines
            # Split on the first tab only, so a tab inside the caption text
            # does not break the two-way unpacking.
            image_ref, separator, caption = line.partition('\t')
            if not separator:
                raise ValueError(
                    f"{filename}:{line_number}: expected '<image>#<n><TAB><caption>', got {line!r}"
                )
            image_path = os.path.join(IMAGES_PATH_FLICKR8K, image_ref.split('#')[0])
            image_captions.setdefault(image_path, []).append(preprocess_caption(caption))
    return image_captions

# Build the {image path: [captions]} mapping for Flickr8k.
flickr8k_captions = load_flickr8k_data(filename="Flickr8k.token.txt")

# MS COCO
BASE_PATH_COCO = '../input/coco-2017-dataset/coco2017'
def load_coco_data(filename):
    """Read a COCO captions annotation file into ``{image_path: [captions]}``.

    The file is parsed as JSON and its ``'annotations'`` list is walked. Each
    entry's ``image_id`` is zero-padded to 12 digits to form the image file
    name under ``BASE_PATH_COCO/train2017``, and its ``caption`` is passed
    through ``preprocess_caption``.

    Args:
        filename: Path of the JSON annotation file.

    Returns:
        dict mapping each image path to the list of its preprocessed captions.
    """
    # Requires the `json` module (imported in the notebook's import cell).
    # JSON is UTF-8 by specification, so do not rely on the platform default.
    with open(filename, 'r', encoding='utf-8') as file:
        annotations = json.load(file)['annotations']

    image_captions = {}
    for item in annotations:
        image_path = os.path.join(BASE_PATH_COCO, 'train2017', f'{item["image_id"]:012d}.jpg')
        image_captions.setdefault(image_path, []).append(preprocess_caption(item['caption']))
    return image_captions

def train_val_split(caption_data, train_size=0.8, shuffle=True):
    """Split caption data into a training part and a validation part.

    Args:
        caption_data: Either a mapping (e.g. ``{image: captions}``) or a
            sequence of samples (e.g. ``(image, caption)`` pairs). The input
            container itself is never modified.
        train_size: Fraction of the entries that goes into the training part;
            the count is ``int(len(caption_data) * train_size)``.
        shuffle: Whether to shuffle the entries with NumPy's global RNG before
            splitting.

    Returns:
        ``(training_data, validation_data)``: two dicts when a mapping was
        passed, two lists when a sequence was passed.
    """
    # Previously only mappings worked (`.keys()` was called unconditionally),
    # although the notebook passes a list of (image, caption) pairs.
    is_mapping = hasattr(caption_data, "keys")

    # Work on a copy so the shuffle cannot reorder the caller's container.
    entries = list(caption_data.keys()) if is_mapping else list(caption_data)

    if shuffle:
        np.random.shuffle(entries)

    n_train = int(len(entries) * train_size)
    train_entries, valid_entries = entries[:n_train], entries[n_train:]

    if not is_mapping:
        return train_entries, valid_entries

    training_data = {key: caption_data[key] for key in train_entries}
    validation_data = {key: caption_data[key] for key in valid_entries}
    return training_data, validation_data


coco_captions = load_coco_data(f'{BASE_PATH_COCO}/annotations/captions_train2017.json')

# Merge both sources into one {image path: [captions]} mapping; on a key clash
# the COCO entry replaces the Flickr8k one.
combined_captions = dict(flickr8k_captions)
combined_captions.update(coco_captions)

# Flatten the mapping into one (image, caption) pair per caption.
combined_data = [
    (img, cap)
    for img in combined_captions
    for cap in combined_captions[img]
]

# Hold out part of the pairs for validation.
# NOTE(review): the split is per (image, caption) pair, so different captions
# of the same image can end up in both sets.
train_data, valid_data = train_val_split(combined_data)
print("Number of training samples: ", len(train_data))
print("Number of validation samples: ", len(valid_data))

In [None]:
def custom_standardization(input_string):
    """Lower-case the input and delete every character listed in ``strip_chars``.

    ``strip_chars`` is a module-level string that is looked up when the
    function is called; it is escaped and wrapped in a regex character class.

    Args:
        input_string: String tensor to standardise.

    Returns:
        The lower-cased tensor with all ``strip_chars`` characters removed.
    """
    strip_pattern = f"[{re.escape(strip_chars)}]"
    lowered = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowered, strip_pattern, "")


# Characters removed by custom_standardization. "<" and ">" are taken out of
# the set so the "<start>" / "<end>" markers survive standardisation.
# The backslash is spelled "\\": the former "\]" was an invalid escape sequence
# in a non-raw string (a SyntaxWarning on recent Python versions). The
# resulting string value is unchanged.
strip_chars = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
strip_chars = strip_chars.replace("<", "")
strip_chars = strip_chars.replace(">", "")

vectorization = TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=SEQ_LENGTH,
    standardize=custom_standardization,
)

# `text_data` was never defined, so `adapt` raised a NameError on a fresh
# kernel. Learn the vocabulary from every preprocessed caption of the combined
# (image, caption) pairs.
text_data = [caption for _, caption in combined_data]
vectorization.adapt(text_data)

# Data augmentation for image data
# Assemble the augmentation pipeline layer by layer, in application order.
image_augmentation = keras.Sequential()
image_augmentation.add(layers.RandomFlip("horizontal"))
image_augmentation.add(layers.RandomRotation(0.2))
image_augmentation.add(layers.RandomContrast(0.3))
