In [1]:
import concurrent.futures
import collections
import math
import os
import pathlib
import random
import re
import string
import time
import shutil

import einops
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
import tqdm

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import tensorflow_datasets as tfds

In [2]:
def flickr8k():
    path = pathlib.Path.cwd()
    
    if len(list(path.rglob('*'))) < 16197:
        tf.keras.utils.get_file(
            origin='https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip',
            cache_dir=path,
            cache_subdir='datasets',
            extract=True)
        tf.keras.utils.get_file(
            origin='https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip',
            cache_dir=path,
            cache_subdir='datasets',
            extract=True)

    path = pathlib.Path(str(path) + '\\datasets')
    
    captions = (path/"Flickr8k.token.txt").read_text().splitlines()
    captions = (line.split('\t') for line in captions)
    captions = ((fname.split('#')[0], caption) for (fname, caption) in captions)
    
    cap_dict = collections.defaultdict(list)
    for fname, cap in captions:
        cap_dict[fname].append(cap)
    
    train_files = (path/'Flickr_8k.trainImages.txt').read_text().splitlines()
    train_captions = [(str(path/'Flicker8k_Dataset'/fname), cap_dict[fname]) for fname in train_files]
    
    test_files = (path/'Flickr_8k.testImages.txt').read_text().splitlines()
    test_captions = [(str(path/'Flicker8k_Dataset'/fname), cap_dict[fname]) for fname in test_files]
    
    train_ds = tf.data.experimental.from_list(train_captions)
    test_ds = tf.data.experimental.from_list(test_captions)
    
    return train_ds, test_ds

In [3]:
train_raw, test_raw = flickr8k()

In [4]:
for ex_path, ex_captions in train_raw.take(1):
    print(ex_path)
    print(ex_captions)

tf.Tensor(b'D:\\Projects\\Image_Caption\\Image-Caption\\Model\\datasets\\Flicker8k_Dataset\\2513260012_03d33305cf.jpg', shape=(), dtype=string)
tf.Tensor(
[b'A black dog is running after a white dog in the snow .'
 b'Black dog chasing brown dog through snow'
 b'Two dogs chase each other across the snowy ground .'
 b'Two dogs play together in the snow .'
 b'Two dogs running through a low lying body of water .'], shape=(5,), dtype=string)


In [5]:
IMAGE_SHAPE=(224, 224, 3)
mobilenet = tf.keras.applications.MobileNetV3Large(
    input_shape=IMAGE_SHAPE,
    include_top=False,
    include_preprocessing=True)
mobilenet.trainable=False

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v3/weights_mobilenet_v3_large_224_1.0_float_no_top_v2.h5


In [6]:
if not os.path.exists(pathlib.Path.cwd() / 'Model_Data'):
    os.mkdir(pathlib.Path.cwd() / 'Model_Data')
shutil.move(pathlib.Path.home() / '.keras/models/weights_mobilenet_v3_large_224_1.0_float_no_top_v2.h5', pathlib.Path.cwd() / 'Model_Data/mobilenet_v3_large_weights.h5')

WindowsPath('D:/Projects/Image_Caption/Image-Caption/Model/Model_Data/mobilenet_v3_large_weights.h5')

In [7]:
def load_image(image_path):
    img = tf.io.read_file(image_path)
    img = tf.io.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, IMAGE_SHAPE[:-1])
    return img

In [8]:
test_img_batch = load_image(ex_path)[tf.newaxis, :]

print(test_img_batch.shape)
print(mobilenet(test_img_batch).shape)

(1, 224, 224, 3)
(1, 7, 7, 960)


In [9]:
def standardize(s):
    s = tf.strings.lower(s)
    s = tf.strings.regex_replace(s, f'[{re.escape(string.punctuation)}]', '')
    s = tf.strings.join(['[START]', s, '[END]'], separator=' ')
    return s

In [10]:
vocabulary_size = 5000
tokenizer = tf.keras.layers.TextVectorization(
    max_tokens=vocabulary_size,
    standardize=standardize,
    ragged=True)

In [11]:
tokenizer.adapt(train_raw.map(lambda fp,txt: txt).unbatch().batch(1024))

In [12]:
def match_shapes(images, captions):
    caption_shape = einops.parse_shape(captions, 'b c')
    captions = einops.rearrange(captions, 'b c -> (b c)')
    images = einops.repeat(
      images, 'b ... -> (b c) ...',
      c = caption_shape['c'])
    return images, captions

In [13]:
def prepare_txt(imgs, txts):
    tokens = tokenizer(txts)
    
    input_tokens = tokens[..., :-1]
    label_tokens = tokens[..., 1:]
    return (imgs, input_tokens), label_tokens

In [14]:
def prepare_dataset(ds, tokenizer, batch_size=32, shuffle_buffer=1000):
    ds = (ds
        .shuffle(10000)
        .map(lambda path, caption: (load_image(path), caption))
        .apply(tf.data.experimental.ignore_errors())
        .batch(batch_size))
    
    def to_tensor(inputs, labels):
        (images, in_tok), out_tok = inputs, labels
        return (images, in_tok.to_tensor()), out_tok.to_tensor()
    
    return (ds
          .map(match_shapes, tf.data.AUTOTUNE)
          .unbatch()
          .shuffle(shuffle_buffer)
          .batch(batch_size)
          .map(prepare_txt, tf.data.AUTOTUNE)
          .map(to_tensor, tf.data.AUTOTUNE)
          )

In [15]:
train_ds = prepare_dataset(train_raw, tokenizer)
test_ds = prepare_dataset(test_raw, tokenizer)

In [16]:
# Caching
def save_dataset(ds, save_path, image_model, tokenizer, shards=10, batch_size=32):
    # Load the images and make batches.
    ds = (ds
        .map(lambda path, caption: (load_image(path), caption))
        .apply(tf.data.experimental.ignore_errors())
        .batch(batch_size))
    
    # Run the feature extractor on each batch
    # Don't do this in a .map, because tf.data runs on the CPU. 
    def gen():
        for (images, captions) in tqdm.tqdm(ds): 
            feature_maps = image_model(images)
            
            feature_maps, captions = match_shapes(feature_maps, captions)
            yield feature_maps, captions
    
    # Wrap the generator in a new tf.data.Dataset.
    new_ds = tf.data.Dataset.from_generator(
      gen,
      output_signature=(
          tf.TensorSpec(shape=image_model.output_shape),
          tf.TensorSpec(shape=(None,), dtype=tf.string)))
    
    # Apply the tokenization 
    new_ds = (new_ds
            .map(prepare_txt, tf.data.AUTOTUNE)
            .unbatch()
            .shuffle(1000))
    
    # Save the dataset into shard files.
    def shard_func(i, item):
        return i % shards
    new_ds.enumerate().save(save_path, shard_func=shard_func)

In [18]:
save_dataset(train_raw, str(pathlib.Path.cwd() / 'Model_Data/train_cache'), mobilenet, tokenizer)
save_dataset(test_raw, str(pathlib.Path.cwd() / 'Model_Data/test_cache'), mobilenet, tokenizer)

188it [00:42,  4.38it/s]
32it [00:07,  4.55it/s]


In [20]:
import pickle
with open(str(pathlib.Path.cwd() / 'Model_Data/tokenizer.pkl'), 'wb') as f:
    pickle.dump(tokenizer, f)

InvalidArgumentError: Cannot convert a Tensor of dtype resource to a NumPy array.