### Imports

In [20]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.applications import efficientnet
from tensorflow.keras.layers import TextVectorization

### Constants

In [37]:
SUBMISSIONS_DATA = "../data/submissions.csv" 
COMMENTS_DATA = "../data/top_comments_50000_100.csv"
IMAGE_SIZE = (299, 299)
VOCAB_SIZE = 30000
SEQ_LENGTH = 50
EMBED_DIM = 512
FF_DIM = 512
BATCH_SIZE = 64
EPOCHS = 30
AUTOTUNE = tf.data.AUTOTUNE
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
tf.random.set_seed(RANDOM_STATE)

### Functions

In [41]:
def load_data():
    submissions = pd.read_csv(SUBMISSIONS_DATA)
    comments = pd.read_csv(COMMENTS_DATA)

    # Filter out comments with less than SEQ_LENGTH tokens
    tokens = comments.body.apply(lambda body: body.strip().lower().split())
    word_count = tokens.apply(len)
    comments = comments[word_count <= SEQ_LENGTH]

    # Add start and end token to comments
    comments["caption"] = comments.body.apply(lambda body: f"<start> {body} <end>")

    # Map images to comments
    caption_map = defaultdict(list)
    for _, comment in comments.iterrows():
        image_path = f"../data/images/{comment.parent_id}.jpg"
        if os.path.isfile(image_path):
            caption_map[image_path].append(comment.caption)

    return caption_map, comments.caption

def train_val_split(caption_data, train_size=0.8):
    # Shuffle dataset
    image_paths = list(caption_data.keys())
    np.random.shuffle(image_paths)

    # Split data
    train_size = int(len(caption_data) * train_size)
    train = {img_path: caption_data[img_path] for img_path in image_paths[:train_size]}
    val = {img_path: caption_data[img_path] for img_path in image_paths[train_size:]}

    return train, val

def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "")

def decode_and_resize(img_path):
    img = tf.io.read_file(img_path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, IMAGE_SIZE)
    img = tf.image.convert_image_dtype(img, tf.float32)
    return img

def process_input(img_path, captions):
    return decode_and_resize(img_path), vectorization(captions)

def make_dataset(images, captions):
    dataset = tf.data.Dataset.from_tensor_slices((images, captions))
    dataset = dataset.shuffle(BATCH_SIZE * 8)
    dataset = dataset.map(process_input, num_parallel_calls=AUTOTUNE)
    dataset = dataset.batch(BATCH_SIZE).prefetch(AUTOTUNE)

    return dataset

### Load Data

In [31]:
comment_image_map, comment_data = load_data()

  submissions = pd.read_csv(SUBMISSIONS_DATA)


### Split Data

In [39]:
train, val = train_val_split(comment_image_map)
print(f"Train size: {len(train):,}")
print(f"Val Size: {len(val):,}")

Train size: 5,000
Val Size: 1,250


### Vectorise Data

In [38]:
strip_chars = "!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"
strip_chars = strip_chars.replace("<", "")
strip_chars = strip_chars.replace(">", "")

vectorization = TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=SEQ_LENGTH,
    standardize=custom_standardization,
)
vectorization.adapt(comment_data)

### Augment Data

In [40]:
image_augmentation = keras.Sequential(
    [
        layers.RandomContrast(0.3),
        layers.RandomBrightness(0.3)
    ]
)

### Dataset Pipeline

In [43]:
train_dataset = make_dataset(list(train.keys()), list(train.values()))
val_dataset = make_dataset(list(valid.keys()), list(valid.values()))

ValueError: Can't convert non-rectangular Python sequence to Tensor.