In [None]:
import math
import matplotlib.pyplot as plt
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import os

In [None]:
import zipfile
# Unpack the image archive into ./data
archive_path = '../input/notebook36cfb71da3/database-full.zip'
with zipfile.ZipFile(archive_path, 'r') as archive:
    archive.extractall('./data')

In [None]:
import shutil

# Remove the one image this notebook excludes from the dataset, plus the macOS
# metadata folder that came with the archive. Both deletions are guarded so
# the cell can be re-run on its own without raising once the targets are gone.
excluded_image = './data/database-full/ image11396.jpg'
if os.path.exists(excluded_image):
    os.remove(excluded_image)

macosx_dir = './data/__MACOSX'
if os.path.isdir(macosx_dir):
    shutil.rmtree(macosx_dir)

In [None]:
def balance(X, y_, limit=15):
    """Cap every class at `limit` samples.

    Args:
        X: array of samples, indexed along the first axis.
        y_: labels for `X`, shape (n,) or (n, 1).
        limit: maximum number of samples kept per class.

    Returns:
        A tuple `(X_balanced, y_balanced)`. Samples are grouped by class in
        ascending label order, keeping the first `limit` occurrences of each
        class in their original order. `y_balanced` has shape (m, 1).
    """
    X = np.asarray(X)
    y = np.asarray(y_).reshape(-1)

    if y.size == 0:
        return X[:0], y.reshape(-1, 1)

    # Iterate over the labels that actually occur. The previous
    # `range(y.max())` stopped one short and silently dropped the highest
    # class; it also assumed labels were contiguous starting at 0.
    kept = np.concatenate(
        [np.flatnonzero(y == label)[:limit] for label in np.unique(y)]
    )
    return X[kept], y[kept].reshape(-1, 1)

In [None]:
# Label table; later cells read its `ID` column (image file names) and its
# `new_cluster` column (class labels).
df = pd.read_csv('../input/aux-cluster/aux_cluster.csv')

In [None]:
# `x in df.ID` tests membership in the Series *index* (the row labels), not in
# its values, so the old filter never excluded anything. Compare against the
# set of labeled file names instead.
labeled_ids = set(df.ID)
X_unlabeled = [
    './data/database-full/' + x
    for x in os.listdir('./data/database-full/')
    if x not in labeled_ids
]

In [None]:
# Full paths of the labeled images and their labels as an (n, 1) column
labeled_paths = ['./data/database-full/' + image_id for image_id in df['ID']]
X_labeled = np.array(labeled_paths)
y_labeled = df['new_cluster'].values.reshape(-1, 1)

In [None]:
# Cap every class at `balance`'s default limit of 15 samples.
X_labeled, y_labeled = balance(X_labeled, y_labeled)

In [None]:
# Stratified split: 33% of the labeled samples are held out for validation;
# the seed is fixed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(X_labeled, y_labeled, stratify = y_labeled, test_size=0.33, random_state=42)

In [None]:
# Flatten the (n, 1) label columns to 1-D vectors
y_val, y_train = y_val.squeeze(), y_train.squeeze()

In [None]:
# Dataset hyperparameters
unlabeled_dataset_size = len(X_unlabeled)
# Count only the labeled *training* split: prepare_dataset() batches X_train
# and derives the labeled batch size from this value. Counting the validation
# rows too (len(X_labeled)) made the labeled stream run out early, and the
# zipped training dataset stops at its shorter input.
labeled_dataset_size = len(X_train)
image_size = 256
image_channels = 3

# Algorithm hyperparameters
num_epochs = 20
batch_size = 24
width = 128
temperature = 0.1
# Stronger augmentations for contrastive, weaker ones for supervised training
contrastive_augmentation = {"min_area": 0.25, "brightness": 0.6, "jitter": 0.2}
classification_augmentation = {"min_area": 0.75, "brightness": 0.3, "jitter": 0.1}

In [None]:
def load_image_from_path(path):
    """Read a JPEG file and return it as a float image scaled to [0, 1].

    Args:
        path: scalar string (or string tensor) with the file path.

    Returns:
        A float tensor of shape (image_size, image_size, image_channels).
    """
    image = tf.io.read_file(path)
    # Force the channel count: with the default (channels=0) a grayscale JPEG
    # decodes to one channel, which cannot be batched with RGB images or fed
    # to the models built for `image_channels` channels.
    image = tf.io.decode_jpeg(image, channels=image_channels)
    image = tf.image.resize(image, size=(image_size, image_size))

    return image / 255.

In [None]:
def prepare_dataset():
    """Build the tf.data pipelines used for training and evaluation.

    Reads the module-level `X_unlabeled`, `X_train`, `y_train`, `X_val`,
    `y_val` and the dataset/batch size hyperparameters.

    Returns:
        A tuple `(train_dataset, labeled_train_dataset, test_dataset)`:
        - train_dataset: zip of unlabeled image batches and labeled
          (image, label) batches;
        - labeled_train_dataset: the labeled (image, label) batches alone;
        - test_dataset: validation (image, label) batches of `batch_size`.
    """
    # Labeled and unlabeled samples are loaded synchronously
    # with batch sizes selected accordingly.
    # The max(1, ...) guards cover small datasets: a step count of 0 would
    # divide by zero, and a batch size of 0 is rejected by Dataset.batch.
    steps_per_epoch = max(
        1, (unlabeled_dataset_size + labeled_dataset_size) // batch_size
    )
    unlabeled_batch_size = max(1, unlabeled_dataset_size // steps_per_epoch)
    labeled_batch_size = max(1, labeled_dataset_size // steps_per_epoch)
    print(
        f"batch size is {unlabeled_batch_size} (unlabeled) + {labeled_batch_size} (labeled)"
    )

    # Images are decoded in parallel; element order is still deterministic.
    unlabeled_train_dataset = (
        tf.data.Dataset.from_tensor_slices(X_unlabeled)
        .shuffle(buffer_size=10 * unlabeled_batch_size)
        .map(load_image_from_path, num_parallel_calls=tf.data.AUTOTUNE)
        .batch(unlabeled_batch_size)
    )

    labeled_train_dataset = (
        tf.data.Dataset.from_tensor_slices((X_train, y_train))
        .shuffle(buffer_size=10 * labeled_batch_size)
        .map(
            lambda x, y: (load_image_from_path(x), y),
            num_parallel_calls=tf.data.AUTOTUNE,
        )
        .batch(labeled_batch_size)
    )

    test_dataset = (
        tf.data.Dataset.from_tensor_slices((X_val, y_val))
        .map(
            lambda x, y: (load_image_from_path(x), y),
            num_parallel_calls=tf.data.AUTOTUNE,
        )
        .batch(batch_size)
        .prefetch(buffer_size=tf.data.AUTOTUNE)
    )

    # Labeled and unlabeled datasets are zipped together
    train_dataset = tf.data.Dataset.zip(
        (unlabeled_train_dataset, labeled_train_dataset)
    ).prefetch(buffer_size=tf.data.AUTOTUNE)

    return train_dataset, labeled_train_dataset, test_dataset


train_dataset, labeled_train_dataset, test_dataset = prepare_dataset()

In [None]:
# Distorts the color distributions of images
class RandomColorAffine(layers.Layer):
    """Random per-image color transform, applied only in training mode.

    Each image is multiplied by its own 3x3 color matrix: the identity scaled
    by a random brightness factor plus a random jitter matrix. The result is
    clipped to [0, 1].
    """

    def __init__(self, brightness=0, jitter=0, **kwargs):
        super().__init__(**kwargs)
        self.brightness, self.jitter = brightness, jitter

    def get_config(self):
        """Return the layer config including `brightness` and `jitter`."""
        return {
            **super().get_config(),
            "brightness": self.brightness,
            "jitter": self.jitter,
        }

    def call(self, images, training=True):
        """Return `images` unchanged unless `training` is truthy."""
        if not training:
            return images

        num_images = tf.shape(images)[0]

        # One brightness factor per image, shared by all color channels
        brightness_scales = 1 + tf.random.uniform(
            (num_images, 1, 1, 1), minval=-self.brightness, maxval=self.brightness
        )
        # One 3x3 perturbation per image, different for every channel pair
        jitter_matrices = tf.random.uniform(
            (num_images, 1, 3, 3), minval=-self.jitter, maxval=self.jitter
        )

        identity = tf.eye(3, batch_shape=[num_images, 1])
        color_transforms = identity * brightness_scales + jitter_matrices
        return tf.clip_by_value(tf.matmul(images, color_transforms), 0, 1)


# Image augmentation module
def get_augmenter(min_area, brightness, jitter):
    """Build the augmentation pipeline as a Keras Sequential model.

    Args:
        min_area: sets the zoom/translation strength via
            `zoom_factor = 1 - sqrt(min_area)`.
        brightness: brightness range passed to RandomColorAffine.
        jitter: color jitter range passed to RandomColorAffine.

    No Rescaling layer is included: load_image_from_path already divides the
    pixel values by 255.
    """
    zoom_factor = 1.0 - math.sqrt(min_area)
    half_shift = zoom_factor / 2
    zoom_range = (-zoom_factor, 0.0)

    augmentation_layers = [
        keras.Input(shape=(image_size, image_size, image_channels)),
        layers.RandomFlip("horizontal"),
        layers.RandomTranslation(half_shift, half_shift),
        layers.RandomZoom(zoom_range, zoom_range),
        RandomColorAffine(brightness, jitter),
    ]
    return keras.Sequential(augmentation_layers)


def visualize_augmentations(num_images):
    """Plot up to `num_images` images next to their augmented versions.

    The figure has four rows: the original images, one weakly augmented
    version and two independently drawn strongly augmented versions.
    """
    # Take the unlabeled part of the first training batch
    originals = next(iter(train_dataset))[0][:num_images]

    # Apply augmentations
    weak = get_augmenter(**classification_augmentation)(originals)
    strong_first = get_augmenter(**contrastive_augmentation)(originals)
    strong_second = get_augmenter(**contrastive_augmentation)(originals)

    row_titles = [
        "Original:",
        "Weakly augmented:",
        "Strongly augmented:",
        "Strongly augmented:",
    ]

    plt.figure(figsize=(num_images * 2.2, 4 * 2.2), dpi=100)
    per_image_versions = zip(originals, weak, strong_first, strong_second)
    for column, versions in enumerate(per_image_versions):
        for row, (title, image) in enumerate(zip(row_titles, versions)):
            plt.subplot(4, num_images, row * num_images + column + 1)
            plt.imshow(image)
            # Only the first column carries the row title
            if column == 0:
                plt.title(title, loc="left")
            plt.axis("off")
    plt.tight_layout()


visualize_augmentations(num_images=8)


In [None]:
# Define the encoder architecture
def get_encoder():
    """Build the encoder: four stride-2 3x3 convolutions, then a dense layer.

    Returns a Sequential model named "encoder" that maps an image to a
    `width`-dimensional feature vector.
    """
    encoder_layers = [keras.Input(shape=(image_size, image_size, image_channels))]
    for _ in range(4):
        encoder_layers.append(
            layers.Conv2D(width, kernel_size=3, strides=2, activation="relu")
        )
    encoder_layers.append(layers.Flatten())
    encoder_layers.append(layers.Dense(width, activation="relu"))

    return keras.Sequential(encoder_layers, name="encoder")


In [None]:
# Baseline supervised training with random initialization
baseline_layers = [
    keras.Input(shape=(image_size, image_size, image_channels)),
    get_augmenter(**classification_augmentation),
    get_encoder(),
    # 167 output logits (presumably the number of clusters; confirm against the labels)
    layers.Dense(167),
]
baseline_model = keras.Sequential(baseline_layers, name="baseline_model")

baseline_model.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")],
)

baseline_history = baseline_model.fit(
    labeled_train_dataset,
    validation_data=test_dataset,
    epochs=num_epochs,
)

best_baseline_val_acc = max(baseline_history.history["val_acc"])
print("Maximal validation accuracy: {:.2f}%".format(best_baseline_val_acc * 100))


In [None]:
# Define the contrastive model with model-subclassing
class ContrastiveModel(keras.Model):
    """Contrastive pretraining model with an on-the-fly linear probe.

    The encoder and projection head are trained with a contrastive loss on two
    augmented views of every image. A single dense layer (the linear probe) is
    trained on the labeled images on top of the encoder features; its
    gradients are applied to the probe weights only.
    """

    def __init__(self):
        """Build the augmenters, encoder, projection head and linear probe, and print their summaries."""
        super().__init__()

        self.temperature = temperature
        self.contrastive_augmenter = get_augmenter(**contrastive_augmentation)
        self.classification_augmenter = get_augmenter(**classification_augmentation)
        self.encoder = get_encoder()
        # Non-linear MLP as projection head
        self.projection_head = keras.Sequential(
            [
                keras.Input(shape=(width,)),
                layers.Dense(width, activation="relu"),
                layers.Dense(width),
            ],
            name="projection_head",
        )
        # Single dense layer for linear probing
        self.linear_probe = keras.Sequential(
            [layers.Input(shape=(width,)), layers.Dense(167)], name="linear_probe"
        )

        self.encoder.summary()
        self.projection_head.summary()
        self.linear_probe.summary()

    def compile(self, contrastive_optimizer, probe_optimizer, **kwargs):
        """Store the two optimizers and create the probe loss and the tracked metrics.

        Args:
            contrastive_optimizer: optimizer for the encoder and projection head.
            probe_optimizer: optimizer for the linear probe.
            **kwargs: forwarded to `keras.Model.compile`.
        """
        super().compile(**kwargs)

        self.contrastive_optimizer = contrastive_optimizer
        self.probe_optimizer = probe_optimizer

        # self.contrastive_loss will be defined as a method
        self.probe_loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)

        self.contrastive_loss_tracker = keras.metrics.Mean(name="c_loss")
        self.contrastive_accuracy = keras.metrics.SparseCategoricalAccuracy(
            name="c_acc"
        )
        self.probe_loss_tracker = keras.metrics.Mean(name="p_loss")
        self.probe_accuracy = keras.metrics.SparseCategoricalAccuracy(name="p_acc")

    @property
    def metrics(self):
        """Tracked metrics; the order matters because test_step reports `metrics[2:]` (the probe metrics)."""
        return [
            self.contrastive_loss_tracker,
            self.contrastive_accuracy,
            self.probe_loss_tracker,
            self.probe_accuracy,
        ]

    def contrastive_loss(self, projections_1, projections_2):
        """Symmetrized contrastive loss between two sets of projections.

        Row i of `projections_1` and row i of `projections_2` come from the
        same image. Also updates `self.contrastive_accuracy` as a side effect.

        Returns:
            The average of the two cross-entropy terms (1->2 and 2->1).
        """
        # InfoNCE loss (information noise-contrastive estimation)
        # NT-Xent loss (normalized temperature-scaled cross entropy)

        # Cosine similarity: the dot product of the l2-normalized feature vectors
        projections_1 = tf.math.l2_normalize(projections_1, axis=1)
        projections_2 = tf.math.l2_normalize(projections_2, axis=1)
        similarities = (
            tf.matmul(projections_1, projections_2, transpose_b=True) / self.temperature
        )

        # The similarity between the representations of two augmented views of the
        # same image should be higher than their similarity with other views
        batch_size = tf.shape(projections_1)[0]
        contrastive_labels = tf.range(batch_size)
        self.contrastive_accuracy.update_state(contrastive_labels, similarities)
        self.contrastive_accuracy.update_state(
            contrastive_labels, tf.transpose(similarities)
        )

        # The temperature-scaled similarities are used as logits for cross-entropy
        # a symmetrized version of the loss is used here
        loss_1_2 = keras.losses.sparse_categorical_crossentropy(
            contrastive_labels, similarities, from_logits=True
        )
        loss_2_1 = keras.losses.sparse_categorical_crossentropy(
            contrastive_labels, tf.transpose(similarities), from_logits=True
        )
        return (loss_1_2 + loss_2_1) / 2

    def train_step(self, data):
        """Run one contrastive update and one linear-probe update.

        Args:
            data: `(unlabeled_images, (labeled_images, labels))`.

        Returns:
            A dict mapping every metric name to its current value.
        """
        unlabeled_images, (labeled_images, labels) = data

        # Both labeled and unlabeled images are used, without labels
        images = tf.concat((unlabeled_images, labeled_images), axis=0)
        # Each image is augmented twice, differently
        augmented_images_1 = self.contrastive_augmenter(images, training=True)
        augmented_images_2 = self.contrastive_augmenter(images, training=True)
        with tf.GradientTape() as tape:
            features_1 = self.encoder(augmented_images_1, training=True)
            features_2 = self.encoder(augmented_images_2, training=True)
            # The representations are passed through a projection mlp
            projections_1 = self.projection_head(features_1, training=True)
            projections_2 = self.projection_head(features_2, training=True)
            contrastive_loss = self.contrastive_loss(projections_1, projections_2)
        gradients = tape.gradient(
            contrastive_loss,
            self.encoder.trainable_weights + self.projection_head.trainable_weights,
        )
        self.contrastive_optimizer.apply_gradients(
            zip(
                gradients,
                self.encoder.trainable_weights + self.projection_head.trainable_weights,
            )
        )
        self.contrastive_loss_tracker.update_state(contrastive_loss)

        # Labels are only used in evaluation for an on-the-fly logistic regression
        preprocessed_images = self.classification_augmenter(
            labeled_images, training=True
        )
        with tf.GradientTape() as tape:
            # the encoder is used in inference mode here to avoid regularization
            # and updating the batch normalization parameters if they are used
            features = self.encoder(preprocessed_images, training=False)
            class_logits = self.linear_probe(features, training=True)
            probe_loss = self.probe_loss(labels, class_logits)
        # Only the probe weights receive gradients from the probe loss
        gradients = tape.gradient(probe_loss, self.linear_probe.trainable_weights)
        self.probe_optimizer.apply_gradients(
            zip(gradients, self.linear_probe.trainable_weights)
        )
        self.probe_loss_tracker.update_state(probe_loss)
        self.probe_accuracy.update_state(labels, class_logits)

        return {m.name: m.result() for m in self.metrics}

    def test_step(self, data):
        """Evaluate the linear probe on one `(labeled_images, labels)` batch.

        Returns:
            A dict with the probe loss and probe accuracy only.
        """
        labeled_images, labels = data

        # For testing the components are used with a training=False flag
        preprocessed_images = self.classification_augmenter(
            labeled_images, training=False
        )
        features = self.encoder(preprocessed_images, training=False)
        class_logits = self.linear_probe(features, training=False)
        probe_loss = self.probe_loss(labels, class_logits)
        self.probe_loss_tracker.update_state(probe_loss)
        self.probe_accuracy.update_state(labels, class_logits)

        # Only the probe metrics are logged at test time
        return {m.name: m.result() for m in self.metrics[2:]}


# Contrastive pretraining
pretraining_model = ContrastiveModel()

# Separate Adam instances: one for encoder + projection head, one for the probe
contrastive_adam = keras.optimizers.Adam()
probe_adam = keras.optimizers.Adam()
pretraining_model.compile(
    contrastive_optimizer=contrastive_adam,
    probe_optimizer=probe_adam,
)

pretraining_history = pretraining_model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=num_epochs,
)

best_probe_val_acc = max(pretraining_history.history["val_p_acc"])
print("Maximal validation accuracy: {:.2f}%".format(best_probe_val_acc * 100))


In [None]:
# Supervised finetuning of the pretrained encoder
finetuning_layers = [
    layers.Input(shape=(image_size, image_size, image_channels)),
    get_augmenter(**classification_augmentation),
    # The encoder object is shared with pretraining_model, so finetuning
    # continues from (and updates) the pretrained weights
    pretraining_model.encoder,
    layers.Dense(167),
]
finetuning_model = keras.Sequential(finetuning_layers, name="finetuning_model")

finetuning_model.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")],
)

finetuning_history = finetuning_model.fit(
    labeled_train_dataset,
    validation_data=test_dataset,
    epochs=num_epochs,
)

best_finetuning_val_acc = max(finetuning_history.history["val_acc"])
print("Maximal validation accuracy: {:.2f}%".format(best_finetuning_val_acc * 100))


In [None]:
# Persist the fine-tuned model to 'SimCLR.h5'.
# NOTE(review): the model contains the custom RandomColorAffine layer, so
# reloading it presumably needs that class passed via custom_objects - confirm.
finetuning_model.save('SimCLR.h5')

In [None]:
# Query image file names; the CSV has no header row.
queries = pd.read_csv('../input/taller3data/names_test_queries.csv', header=None)

In [None]:
# Rebinds `queries` from the DataFrame read above to an array of full image
# paths. Re-running this cell alone fails, because `queries` no longer has a
# `.values` attribute after the first run.
queries = np.array(['./data/database-full/' + name for name in queries.values]).squeeze()

In [None]:
# Batched image pipeline over the query paths (no shuffling, order preserved)
queries_dataset = (
    tf.data.Dataset.from_tensor_slices(queries)
    .map(load_image_from_path)
    .batch(batch_size)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

In [None]:
# Class logits of the fine-tuned model for every query image.
preds = finetuning_model.predict(queries_dataset)

In [None]:
# Query names (re-read, since `queries` now holds paths) with the predicted class
queries_2 = pd.read_csv(
    '../input/taller3data/names_test_queries.csv', header=None
).assign(preds=preds.argmax(1))

In [None]:
# Display the query names together with their predicted class.
queries_2

In [None]:
# Paths of every image in the database, in os.listdir order
X_all = ['./data/database-full/' + file_name for file_name in os.listdir('./data/database-full/')]

# Batched image pipeline over all paths (no shuffling, order matches X_all)
all_dataset = (
    tf.data.Dataset.from_tensor_slices(X_all)
    .map(load_image_from_path)
    .batch(batch_size)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

In [None]:
# Class logits for every image in the database (rows follow the order of X_all).
preds_all = finetuning_model.predict(all_dataset)

In [None]:
# Replace the logits with the index of the highest-scoring class per image.
# Re-running this cell alone fails: `preds_all` is 1-D after the first run,
# so axis 1 no longer exists.
preds_all = preds_all.argmax(1)

In [None]:
# Print the layer overview of the fine-tuned model.
finetuning_model.summary()

In [None]:
# Feature extractor: the fine-tuned model without its final Dense classifier
m = keras.Sequential(list(finetuning_model.layers[:-1]))

In [None]:
# Embed every database image with `m`, the fine-tuned model without its
# final Dense layer.
encode_preds = m.predict(all_dataset)

In [None]:
# Sanity check of the embedding matrix dimensions.
encode_preds.shape

In [None]:
# One row per database image: path, predicted cluster, embedding vector
result_SimCLR = pd.DataFrame(
    {
        'paths': X_all,
        'cluster': preds_all,
        'encode': list(encode_preds),
    }
)

In [None]:
# Write paths, predicted clusters and embeddings to CSV without the index.
# The 'most_similars' column is only added by a later cell, so it is not
# part of this file.
result_SimCLR.to_csv('result_SimCLR_Kaggle.csv', index=None)

In [None]:
# Display the result table.
result_SimCLR

In [None]:
# Length of the common path prefix up to and including ' image'. Later cells
# slice paths with [27:-4] to cut off this prefix and the 4-character
# extension.
print(len('./data/database-full/ image'))

In [None]:
from sklearn.metrics import pairwise_distances
from tqdm import tqdm


def sorting(tup, reverse=True):
    """Return the items of `tup` sorted by their first element (descending by default).

    No longer used by the loop below; kept so existing callers still work.
    """
    return sorted(tup, key=lambda x: x[0], reverse=reverse)


# Paths are sliced as path[ID_START:-4] to get the image ID. This assumes
# every path looks like './data/database-full/ image<ID>.jpg' - the same
# assumption the original hard-coded [27:-4] made.
ID_START = len('./data/database-full/ image')
# Maximum number of neighbours reported per query
TOP_K = 100

for query in tqdm(queries):
    # Locate the query row once instead of re-filtering the frame three times
    query_mask = result_SimCLR['paths'] == query
    query_row = result_SimCLR[query_mask].iloc[0]

    # Candidates: every other image predicted to be in the query's cluster
    neighborhood = result_SimCLR[
        (result_SimCLR['cluster'] == query_row['cluster']) & ~query_mask
    ]

    if len(neighborhood) == 0:
        # The query is alone in its cluster; pairwise_distances would raise
        # on an empty candidate matrix, so report no neighbours instead.
        to_add = ""
    else:
        neighborhood_latents = np.stack(neighborhood['encode'].values)
        dist = pairwise_distances(
            query_row['encode'].reshape(1, -1), neighborhood_latents, metric='cosine'
        )[0]
        # The stable sort keeps the original order among equal distances,
        # like the previous sorted() call did
        nearest = np.argsort(dist, kind='stable')[:TOP_K]
        nearest_paths = neighborhood['paths'].values[nearest]
        to_add = " ".join(path[ID_START:-4] for path in nearest_paths)

    result_SimCLR.loc[query_mask, 'most_similars'] = to_add

In [None]:
# Rows of the result table that belong to query images
query_rows = result_SimCLR[result_SimCLR.paths.isin(queries)]
IDs = [path[27:-4] for path in query_rows.paths.values]
Expecteds = query_rows.most_similars

In [None]:
# Submission table indexed by query ID, with the space-separated neighbour IDs
# in 'Expected'.
submit_df = pd.DataFrame({'Id': IDs, 'Expected':Expecteds}).set_index('Id')

In [None]:
# Query IDs in the order of `queries`, cut out of the paths with the same
# [27:-4] slice used for `IDs`.
indices_queries = [x[27:-4] for x in queries]

In [None]:
# Reorder the submission rows to follow the order of the query list.
# DataFrame.append was removed in pandas 2.0 and re-copied the frame on every
# iteration, so the pieces are collected first and concatenated once.
# The empty seed frame is kept on purpose: it makes the result identical to
# what the append loop produced (an 'Id' column plus an unnamed index holding
# the query IDs), which the following reset_index/rename cell relies on.
final_submit = pd.concat(
    [pd.DataFrame(columns=['Id', 'Expected'])]
    + [submit_df[submit_df.index == i] for i in indices_queries]
)

In [None]:
# reset_index() exposes the index as a column named 'index' (this assumes the
# index is unnamed at this point - TODO confirm). The existing 'Id' column is
# dropped, and the former index is renamed to 'Id' and set as the index again.
final_submit = final_submit.reset_index().drop('Id', axis=1).rename(columns={'index':'Id'}).set_index('Id')

In [None]:
# Write the submission file; the 'Id' index is written as the first column.
final_submit.to_csv("SimCLR.csv")