In [1]:
import os
import collections
import json
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_hub as hub
import tensorflow_addons as tfa
import tensorflow_text as text
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

2021-09-24 19:41:01.909407: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [2]:
tf.get_logger().setLevel("ERROR")

In [3]:
!nvidia-smi

Fri Sep 24 19:41:08 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.142.00   Driver Version: 450.142.00   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:00:1E.0 Off |                    0 |
| N/A   31C    P0    25W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Data prep

In [6]:
root_dir = "../data"
sub_root_dir = os.path.join(root_dir, "gt_processed")
images_dir = os.path.join(sub_root_dir, "images")
tfrecords_dir = os.path.join(sub_root_dir, "tfrecords")
lp_file = os.path.join(sub_root_dir, "correct_data.json")

# Model

In [7]:
def project_embeddings(
    embeddings, num_projection_layers, projection_dims, dropout_rate
):
    projected_embeddings = layers.Dense(units=projection_dims)(embeddings)
    for _ in range(num_projection_layers):
        x = tf.nn.gelu(projected_embeddings)
        x = layers.Dense(projection_dims)(x)
        x = layers.Dropout(dropout_rate)(x)
        x = layers.Add()([projected_embeddings, x])
        projected_embeddings = layers.LayerNormalization()(x)
    return projected_embeddings

In [8]:
def create_vision_encoder(
    num_projection_layers, projection_dims, dropout_rate, trainable=False
):
    xception = keras.applications.Xception(
        include_top=False, weights="imagenet", pooling="avg"
    )
    for layer in xception.layers:
        layer.trainable = trainable
    inputs = layers.Input(shape=(299, 299, 3), name="image_input")
    xception_input = tf.keras.applications.xception.preprocess_input(inputs)
    embeddings = xception(xception_input)
    outputs = project_embeddings(
        embeddings, num_projection_layers, projection_dims, dropout_rate
    )
    return keras.Model(inputs, outputs, name="vision_encoder")

In [9]:
def create_text_encoder(
    num_projection_layers, projection_dims, dropout_rate, trainable=False
):
    preprocess = hub.KerasLayer(
        "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2",
        name="text_preprocessing",
    )
    bert = hub.KerasLayer(
        "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1",
        "bert",
    )
    bert.trainable = trainable
    inputs = layers.Input(shape=(), dtype=tf.string, name="text_input")
    bert_inputs = preprocess(inputs)
    embeddings = bert(bert_inputs)["pooled_output"]
    outputs = project_embeddings(
        embeddings, num_projection_layers, projection_dims, dropout_rate
    )
    return keras.Model(inputs, outputs, name="text_encoder")

In [10]:
class IGTCPreTrainer(keras.Model):
    def __init__(self, text_encoder, image_encoder, **kwargs):
        super(IGTCPreTrainer, self).__init__(**kwargs)
        self.text_encoder = text_encoder
        self.image_encoder = image_encoder
        self.temp = tf.Variable(1.)
        self.loss_tracker = keras.metrics.Mean(name="loss")

    @property
    def metrics(self):
        return [self.loss_tracker]

    def call(self, features, training=False):
        lp_embeddings = text_encoder(features["lp"], training=training)
        image_embeddings = vision_encoder(features["image"], training=training)
        return lp_embeddings, image_embeddings, tf.math.exp(tf.math.scalar_mul(self.temp, tf.math.log(1/0.07)))

    def compute_loss(self, lp_embeddings, image_embeddings, logit_scale):
        logits = (
            tf.matmul(lp_embeddings, image_embeddings, transpose_b=True)
            * logit_scale
        )
        images_similarity = tf.matmul(
            image_embeddings, image_embeddings, transpose_b=True
        )
        lp_similarity = tf.matmul(
            lp_embeddings, lp_embeddings, transpose_b=True
        )
        targets = keras.activations.softmax(
            (lp_similarity + images_similarity) / 2
        )
        lp_loss = keras.losses.categorical_crossentropy(
            y_true=targets, y_pred=logits, from_logits=True
        )
        images_loss = keras.losses.categorical_crossentropy(
            y_true=tf.transpose(targets), y_pred=tf.transpose(logits), from_logits=True
        )
        return (lp_loss + images_loss) / 2

    def train_step(self, features):
        with tf.GradientTape() as tape:
            lp_embeddings, image_embeddings, logit_scale = self(features, training=True)
            logit_scale = tf.clip_by_value(logit_scale, clip_value_min=0, clip_value_max=4.6052)
            loss = self.compute_loss(lp_embeddings, image_embeddings, logit_scale)
        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        self.loss_tracker.update_state(loss)
        return {"loss": self.loss_tracker.result()}
 
    def test_step(self, features):
        lp_embeddings, image_embeddings, logit_scale = self(features, training=False)
        logit_scale = tf.clip_by_value(logit_scale, clip_value_min=0, clip_value_max=4.6052)
        loss = self.compute_loss(lp_embeddings, image_embeddings, logit_scale)
        self.loss_tracker.update_state(loss)
        return {"loss": self.loss_tracker.result()}

In [11]:
vision_encoder = create_vision_encoder(
    num_projection_layers=1, projection_dims=256, dropout_rate=0.2)
text_encoder = create_text_encoder(
    num_projection_layers=1, projection_dims=256, dropout_rate=0.1)
pretrainer = IGTCPreTrainer(text_encoder, vision_encoder)
pretrainer.compile(
    optimizer=tfa.optimizers.AdamW(learning_rate=0.001, weight_decay=0.001)
)

2021-09-24 19:41:53.276853: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-09-24 19:41:53.280413: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2021-09-24 19:41:53.400520: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-24 19:41:53.401522: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:00:1e.0 name: Tesla V100-SXM2-16GB computeCapability: 7.0
coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 15.78GiB deviceMemoryBandwidth: 836.37GiB/s
2021-09-24 19:41:53.401558: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-09-24 19:41:53.430324: I tensorflow/stream_executor/platform/defau

In [12]:
def decode_fn(sample):
    features = tf.io.parse_single_example(
        sample,
        {
           "lp": tf.io.FixedLenFeature([], dtype=tf.string),
           "image": tf.io.FixedLenFeature([], dtype=tf.string)
        }
    )
    features['image'] = tf.io.decode_jpeg(
        features['image'], channels=3
    )
    return features
    

def fetch_dataset_tfrecord(string_pattern, batch_size, shuffle_size):
    return (
        tf.data.TFRecordDataset(
            tf.data.Dataset.list_files(string_pattern))
        .map(decode_fn, num_parallel_calls=8)
        .shuffle(shuffle_size)
        .batch(batch_size)
    )

In [15]:
train_dataset = fetch_dataset_tfrecord(
    os.path.join(tfrecords_dir, "train-*.tfrecord"),
    batch_size,
    train_size
)
valid_dataset = fetch_dataset_tfrecord(
    os.path.join(tfrecords_dir, "valid-*.tfrecord"),
    batch_size,
    valid_size
)

In [28]:
pretrainer.built = True
pretrainer.load_weights('experiments/clipPretrained_model_05-4.89.h5')

In [32]:
img_em, lp_em, temp = pretrainer.predict(valid_dataset)

TypeError: 'BatchDataset' object is not subscriptable

In [30]:
cos_sim = tf.keras.losses.cosine_similarity(img_em, lp_em)

In [31]:
cos_sim

<tf.Tensor: shape=(2400,), dtype=float32, numpy=
array([-0.24133351, -0.23939323, -0.23878878, ..., -0.24384187,
       -0.23763987, -0.23753689], dtype=float32)>

In [33]:
temp

(19,)

In [43]:
sim = tf.matmul(img_em, lp_em, transpose_b=True)

In [52]:
k = tf.matmul([lp_em[110]], img_em, transpose_b=True) 

In [55]:
110 in tf.math.top_k(k, 100).indices.numpy()

False