# [Happywhale - Whale and Dolphin Identification](https://www.kaggle.com/c/happy-whale-and-dolphin/overview)
## Identify whales and dolphins by unique characteristics

<img src="https://storage.googleapis.com/kaggle-competitions/kaggle/22962/logos/header.png?t=2021-03-17-22-44-0">

If you like this work, please upvote!

Please also check out this notebook!

[Happywhale - 🐋 & 🐬 ID [EDA] | Kaggle](https://www.kaggle.com/tomato0813/happywhale-identification-eda)

# References for this notebook:

## Code:
[1] [HappyWhale ArcFace Baseline (TPU) | Kaggle](https://www.kaggle.com/ks2019/happywhale-arcface-baseline-tpu/notebook)

[2] [[GLRec] ResNet50 ArcFace (TF2.2) | Kaggle](https://www.kaggle.com/akensert/glrec-resnet50-arcface-tf2-2)

[3] [Explanation of MAP5 scoring metric | Kaggle](https://www.kaggle.com/pestipeti/explanation-of-map5-scoring-metric)

## Discussions:
None

## Others:
[Load and preprocess images &nbsp;|&nbsp; TensorFlow Core](https://www.tensorflow.org/tutorials/load_data/images)

[Transfer learning and fine-tuning &nbsp;|&nbsp; TensorFlow Core](https://www.tensorflow.org/tutorials/images/transfer_learning)

[Data augmentation &nbsp;|&nbsp; TensorFlow Core](https://www.tensorflow.org/tutorials/images/data_augmentation)

[arcface-pytorch/metrics.py at master · ronghuaiyang/arcface-pytorch](https://github.com/ronghuaiyang/arcface-pytorch)

[TPUs in Colab](https://colab.research.google.com/notebooks/tpu.ipynb#scrollTo=ovFDeMgtjqW4)

Please upvote these works too!

Thanks guys!

# Setup

In [None]:
!pip install -q efficientnet

In [None]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold

import PIL
import PIL.Image

import tensorflow as tf
from tensorflow.keras.applications import MobileNetV2
import efficientnet.tfkeras as efn

from kaggle_datasets import KaggleDatasets

AUTOTUNE = tf.data.experimental.AUTOTUNE

In [None]:
# Pick a tf.distribute strategy: TPUStrategy when a TPU cluster can be resolved,
# otherwise whatever strategy TensorFlow currently reports as the default.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except ValueError:
    # A ValueError raised by either line above is treated as "no TPU available".
    tpu = None

if tpu:
    # Connect to the cluster and initialise the TPU system before creating the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

# BATCH_SIZE (defined in the next cell) is scaled by this replica count.
print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# NOTE(review): SEED is not referenced in the visible notebook; the KFold cell
# hard-codes random_state=42 instead.
SEED = 42

# N_TRAIN_DATA is subtracted from test image ids to get positions in the
# submission frame; N_TEST_DATA sizes the final reindex of that frame.
# NOTE(review): presumably the row counts of train.csv / sample_submission.csv — confirm.
N_TRAIN_DATA = 51033
N_TEST_DATA = 27956
# Number of output classes passed to create_model().
N_CLASSES = 15587
# Integer label used for the 'new_individual' prediction (registered in
# individual_id_to_label and inserted during threshold post-processing).
NEW_INDIVIDUAL_CLASSE = 15587


# GCS location of the TFRecord dataset attached to the notebook.
GCS_PATH = KaggleDatasets().get_gcs_path('happywhale-tfrecords-5743-v1')

# All TFRecord shards for the train and test splits.
TRAIN_FILES = tf.io.gfile.glob(GCS_PATH + "/happywhale-train-*.tfrecord")
TEST_FILES = tf.io.gfile.glob(GCS_PATH + "/happywhale-test-*.tfrecord")


# 80/20 split of the shard count.
# NOTE(review): neither value is used later in the visible notebook (KFold is used instead).
TRAIN_SIZE = int(0.8 * len(TRAIN_FILES))
VALIDATION_SIZE = int(len(TRAIN_FILES) - TRAIN_SIZE)

SHUFFLE_BUFFER_SIZE = 512
# Global batch size: 32 examples per replica.
BATCH_SIZE = 32 * strategy.num_replicas_in_sync
# Images are resized to IMAGE_SIZE x IMAGE_SIZE in decode_image().
IMAGE_SIZE = 512

In [None]:
GCS_PATH

# CSV

In [None]:
# Training metadata; later cells read its 'image' and 'individual_id' columns.
train_df = pd.read_csv('../input/happy-whale-and-dolphin/train.csv')
train_df.head()

In [None]:
# The sample submission is used as the list of test images (its 'image' column).
test_df = pd.read_csv('../input/happy-whale-and-dolphin/sample_submission.csv')
test_df.head()

In [None]:
# Image filenames of both splits, train rows first and test rows after them.
concat_df = pd.concat([train_df['image'], test_df['image']])
concat_df.head()

In [None]:
# Give every distinct image filename an integer id in order of first appearance
# (train filenames come first because of the concat order above).
image_name_to_image_id = {
    image_name: index for index, image_name in enumerate(concat_df.unique())
}
# Reverse lookup: integer id -> image filename.
image_id_to_image_name = {
    image_id: image_name for image_name, image_id in image_name_to_image_id.items()
}

In [None]:
# Attach the integer image id of every training row as a new column.
image_ids = [image_name_to_image_id[image_name] for image_name in train_df['image']]
train_df['image_id'] = image_ids
train_df.head(20)

In [None]:
# Number the individuals in order of first appearance in train.csv.
individual_id_to_label = {
    i_id: index for index, i_id in enumerate(train_df['individual_id'].unique())
}
# Reserve a dedicated label for the 'new_individual' prediction.
individual_id_to_label['new_individual'] = NEW_INDIVIDUAL_CLASSE

# Reverse lookup used when writing the submission: label -> individual id.
label_to_individual_id = {
    label: i_id for i_id, label in individual_id_to_label.items()
}

In [None]:
# Integer class label for every training row, looked up from its individual id.
train_df['label'] = [individual_id_to_label[i_id] for i_id in train_df['individual_id']]
train_df.head(20)

# Images

# Create a dataset

In [None]:
def decode_image(image_raw):
    """Decode JPEG bytes into a 3-channel float32 image.

    The decoded image is resized to IMAGE_SIZE x IMAGE_SIZE and divided by
    255.0 before being returned.
    """
    decoded = tf.image.decode_jpeg(image_raw, channels=3)
    resized = tf.image.resize(decoded, [IMAGE_SIZE, IMAGE_SIZE])

    return tf.cast(resized, tf.float32) / 255.0


def read_tfrecord(raw_image_dataset):
    """Parse one serialized example into ``(image_id, image, label)``.

    The record must contain the scalar features ``image_id`` (int64),
    ``image_raw`` (string) and ``label`` (int64). The id and label are cast to
    int32 and the raw bytes are decoded with ``decode_image``.
    """
    schema = {
        "image_id": tf.io.FixedLenFeature([], tf.int64),
        "image_raw": tf.io.FixedLenFeature([], tf.string),
        "label": tf.io.FixedLenFeature([], tf.int64),
    }
    example = tf.io.parse_single_example(raw_image_dataset, schema)

    return (
        tf.cast(example['image_id'], tf.int32),
        decode_image(example['image_raw']),
        tf.cast(example['label'], tf.int32),
    )


def load_dataset(filenames):
    """Return a dataset of ``(image_id, image, label)`` parsed from TFRecord files.

    Both the file reads and the per-record parsing are parallelised with AUTOTUNE.
    """
    records = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE)

    return records.map(read_tfrecord, num_parallel_calls=AUTOTUNE)


def check_format(image_id, image, label):
    """Keep only the image id of a parsed record; image and label are dropped."""
    return image_id


def arcface_format(image_id, image, label):
    """Map a parsed record to ``(inputs, target)`` for the ArcFace model.

    The label appears twice: as the ``'input/label'`` model input and as the
    target. The image id is dropped.
    """
    model_inputs = {'input/image': image, 'input/label': label}
    return model_inputs, label


def arcface_evaluation_format(image_id, image, label):
    """Keep only the image of a parsed record (input of the embedding model)."""
    return image

def arcface_test_format(image_id, image, label):
    """Return ``(image_id, image)`` from a parsed record, dropping the label."""
    pair = (image_id, image)
    return pair


def augment(image):
    """Apply random photometric augmentations and a random horizontal flip.

    Order of operations: left/right flip, hue (max delta 0.01), saturation
    (factor in [0.70, 1.30]), contrast (factor in [0.80, 1.20]) and brightness
    (max delta 0.10). The result is not clipped afterwards.
    """
    flipped = tf.image.random_flip_left_right(image)
    hue_shifted = tf.image.random_hue(flipped, 0.01)
    saturated = tf.image.random_saturation(hue_shifted, 0.70, 1.30)
    contrasted = tf.image.random_contrast(saturated, 0.80, 1.20)

    return tf.image.random_brightness(contrasted, 0.10)


def get_check_dataset(filenames):
    """Return an unbatched dataset that yields only the image id of each record."""
    return (
        load_dataset(filenames)
        .map(check_format, num_parallel_calls=AUTOTUNE)
        .prefetch(buffer_size=AUTOTUNE)
    )

    
def get_training_dataset(filenames):
    """Build the infinite, shuffled, augmented training pipeline.

    Yields ``({'input/image': images, 'input/label': labels}, labels)`` batches
    of BATCH_SIZE examples. Because the dataset repeats forever, the caller
    must bound each epoch (e.g. with ``steps_per_epoch``).

    Every ``map`` stage uses ``num_parallel_calls=AUTOTUNE``, matching the
    other pipelines in this file.
    """
    ds = load_dataset(filenames)
    # The image id is not needed for training.
    ds = ds.map(lambda image_id, image, label: (image, label),
                num_parallel_calls=AUTOTUNE)
    # repeat() comes before shuffle(), so the shuffle buffer may mix elements
    # from consecutive passes over the data.
    ds = ds.repeat()
    ds = ds.shuffle(buffer_size=SHUFFLE_BUFFER_SIZE)
    ds = ds.batch(BATCH_SIZE)
    # Augmentation is applied to whole batches.
    ds = ds.map(lambda image, label: (augment(image), label),
                num_parallel_calls=AUTOTUNE)
    ds = ds.map(lambda image, label: ({'input/image': image, 'input/label': label}, label),
                num_parallel_calls=AUTOTUNE)
    ds = ds.prefetch(buffer_size=AUTOTUNE)

    return ds
    
    
def get_validation_dataset(filenames):
    """Return batched ``(inputs, label)`` validation data.

    The data is neither shuffled, repeated nor augmented.
    """
    return (
        load_dataset(filenames)
        .map(arcface_format, num_parallel_calls=AUTOTUNE)
        .batch(BATCH_SIZE)
        .prefetch(buffer_size=AUTOTUNE)
    )
    

def get_evaluation_dataset(filenames):
    """Return batches of images only, for computing embeddings."""
    return (
        load_dataset(filenames)
        .map(arcface_evaluation_format, num_parallel_calls=AUTOTUNE)
        .batch(BATCH_SIZE)
        .prefetch(buffer_size=AUTOTUNE)
    )


def get_test_dataset(filenames):
    """Return batches of test images only.

    The pipeline is the same as ``get_evaluation_dataset``: image ids are not
    emitted, so callers recover them separately.
    """
    return (
        load_dataset(filenames)
        .map(arcface_evaluation_format, num_parallel_calls=AUTOTUNE)
        .batch(BATCH_SIZE)
        .prefetch(buffer_size=AUTOTUNE)
    )

In [None]:
# Split the list of TFRecord shards (not individual images) into 5 folds.
kf = KFold(n_splits=5, random_state=42, shuffle=True)

# Only one split is used for this run: the last one the splitter produces.
train_index, val_index = list(kf.split(TRAIN_FILES))[-1]
train_filenames = [TRAIN_FILES[i] for i in train_index]
val_filenames = [TRAIN_FILES[i] for i in val_index]

In [None]:
train_filenames

In [None]:
val_filenames

In [None]:
# Id-only views of both splits, used to find which rows each split contains.
check_train_ds = get_check_dataset(train_filenames)
check_val_ds = get_check_dataset(val_filenames)

In [None]:
%%time
# Image ids of the training split, in dataset iteration order.
train_image_id = [i.numpy() for i in check_train_ds]

In [None]:
# Image ids are used as positional row indices into train_df, so the resulting
# frame has one row per training record, in dataset order.
train_ds_df = train_df.iloc[train_image_id].copy()
train_ds_df.head(20)

In [None]:
%%time
# Image ids of the validation split, in dataset iteration order.
val_image_id = [i.numpy() for i in check_val_ds]

In [None]:
# Same positional lookup as for the training split.
val_ds_df = train_df.iloc[val_image_id].copy()
val_ds_df.head(20)

In [None]:
# Number of examples per split; used below to derive steps per epoch.
train_ds_count = len(train_ds_df)
print("train data count: ", train_ds_count)

val_ds_count = len(val_ds_df)
# Fixed the misspelled "valdation" in the printed message.
print("validation data count: ", val_ds_count)

In [None]:
# Model-ready pipelines: augmented/infinite for training, plain batches for validation.
train_ds = get_training_dataset(train_filenames)
val_ds = get_validation_dataset(val_filenames)

In [None]:
# Grid layout for the preview: at most 10 rows of 8 images, limited by the batch size.
row, col = 10, 8
row = min(row, BATCH_SIZE // col)

# Show the first training batch, one augmented image per cell titled with its label.
for sample, label in train_ds.take(1):
    img = sample['input/image']
    plt.figure(figsize=(25, int(25 * row / col)))
    for j in range(row * col):
        plt.subplot(row, col, j + 1)
        plt.title(label[j].numpy())
        plt.axis('off')
        plt.imshow(img[j])
    plt.show()
print(img.shape)

# Train a model

In [None]:
class ArcMarginProduct(tf.keras.layers.Layer):
    '''
    Implements large margin arc distance.

    The layer takes ``[embeddings, labels]`` and returns, for every class, the
    cosine between the L2-normalised embedding and the L2-normalised class
    weight vector, multiplied by ``s``. For the class given by the label the
    cosine is replaced by ``cos(theta + m)``.

    Arguments:
        n_classes: number of classes (columns of the weight matrix).
        s: scale applied to the output.
        m: additive angular margin, in radians.
        easy_margin: if True, the margin is only applied where cosine > 0;
            otherwise it is applied where cosine > cos(pi - m) and
            ``cosine - sin(pi - m) * m`` is used elsewhere.
        ls_eps: label-smoothing factor applied to the one-hot mask (0 disables it).

    Reference:
        https://arxiv.org/pdf/1801.07698.pdf
        https://github.com/lyakaap/Landmark2019-1st-and-3rd-Place-Solution/
            blob/master/src/modeling/metric_learning.py
    '''
    def __init__(self, n_classes, s=30, m=0.50, easy_margin=False, ls_eps=0.0, **kwargs):
        super(ArcMarginProduct, self).__init__(**kwargs)

        self.n_classes = n_classes
        self.s = s
        self.m = m
        self.ls_eps = ls_eps
        self.easy_margin = easy_margin
        # Constants derived from the margin, computed once.
        self.cos_m = tf.math.cos(m)
        self.sin_m = tf.math.sin(m)
        self.th = tf.math.cos(math.pi - m)
        self.mm = tf.math.sin(math.pi - m) * m


    def build(self, input_shape):
        # input_shape is a pair: (embedding shape, label shape).
        super(ArcMarginProduct, self).build(input_shape[0])

        # One weight column per class, with the same length as the embedding.
        self.W = self.add_weight(
            name='W',
            shape=(int(input_shape[0][-1]), self.n_classes),
            initializer='glorot_uniform',
            dtype='float32',
            trainable=True,
            regularizer=None)


    def call(self, inputs):
        X, y = inputs
        y = tf.cast(y, dtype=tf.int32)
        cosine = tf.matmul(tf.math.l2_normalize(X, axis=1), tf.math.l2_normalize(self.W, axis=0))
        # Rounding can push cosine**2 slightly above 1, which would make the
        # square root return NaN; clip the argument to [0, 1] first.
        sine = tf.math.sqrt(tf.clip_by_value(1.0 - tf.math.pow(cosine, 2), 0.0, 1.0))
        # cos(theta + m) = cos(theta) * cos(m) - sin(theta) * sin(m)
        phi = cosine * self.cos_m - sine * self.sin_m

        if self.easy_margin:
            phi = tf.where(cosine > 0, phi, cosine)
        else:
            phi = tf.where(cosine > self.th, phi, cosine - self.mm)

        one_hot = tf.cast(tf.one_hot(y, depth=self.n_classes), dtype=cosine.dtype)

        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.n_classes

        # Use the margin-adjusted value for the labelled class, plain cosine elsewhere.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s

        return output


    def get_config(self):
        # Expose the constructor arguments so the layer can be re-created from its config.
        config = super(ArcMarginProduct, self).get_config()
        config.update({
            'n_classes': self.n_classes,
            's': self.s,
            'm': self.m,
            'easy_margin': self.easy_margin,
            'ls_eps': self.ls_eps,
        })
        return config

In [None]:
def create_model(input_shape,
                 n_classes,
                 dense_units=512,
                 dropout_rate=0.0,
                 scale=30,
                 margin=0.3):
    """Build the ArcFace classifier and an embedding model that shares its layers.

    The backbone is ``efn.EfficientNetB5`` (no top, 'noisy-student' weights,
    fully trainable), followed by global average pooling, dropout and a dense
    layer whose output is the embedding. No ``preprocess_input`` step is
    applied inside the model.

    Parameters
    ----------
    input_shape : shape of one input image.
    n_classes : number of classes for the ArcMarginProduct head.
    dense_units : size of the embedding.
    dropout_rate : dropout applied before the embedding layer.
    scale, margin : ``s`` and ``m`` of the ArcMarginProduct head.

    Returns
    -------
    (model, embed_model)
        ``model`` maps ``[image, label]`` to softmax class probabilities and is
        compiled with Adam, sparse categorical cross-entropy, and accuracy /
        top-5 accuracy metrics. ``embed_model`` maps ``image`` to the embedding.
    """
    backbone = efn.EfficientNetB5(include_top=False, weights='noisy-student')
    backbone.trainable = True

    # Head layers.
    pooling_layer = tf.keras.layers.GlobalAveragePooling2D(name='head/pooling')
    dropout_layer = tf.keras.layers.Dropout(dropout_rate, name='head/dropout')
    dense_layer = tf.keras.layers.Dense(dense_units, name='head/dense')
    arc_margin_layer = ArcMarginProduct(n_classes=n_classes,
                                        s=scale,
                                        m=margin,
                                        name='head/arc_margin',
                                        dtype='float32')
    softmax_layer = tf.keras.layers.Softmax(dtype='float32')

    # Inputs: the image and its label (the label is consumed by the margin head).
    image = tf.keras.layers.Input(input_shape, name='input/image')
    label = tf.keras.layers.Input((), name='input/label')

    features = backbone(image)
    features = pooling_layer(features)
    features = dropout_layer(features)
    embed = dense_layer(features)
    outputs = softmax_layer(arc_margin_layer([embed, label]))

    model = tf.keras.Model(inputs=[image, label], outputs=outputs)
    embed_model = tf.keras.Model(inputs=image, outputs=embed)

    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss=[tf.keras.losses.SparseCategoricalCrossentropy()],
        metrics=[
            tf.keras.metrics.SparseCategoricalAccuracy(),
            tf.keras.metrics.SparseTopKCategoricalAccuracy(k=5),
        ],
    )

    return model, embed_model

In [None]:
# Instantiate both models with dropout 0.2; the other arguments repeat the
# defaults of create_model().
with strategy.scope(): # creating the model in the TPUStrategy scope means we will train the model on the TPU
    model, embed_model = create_model(input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3),
                                      n_classes=N_CLASSES,
                                      dense_units=512,
                                      dropout_rate=0.2,
                                      scale=30,
                                      margin=0.3)

In [None]:
model.summary()

In [None]:
class ExpDecayScheduler:
    """Learning-rate schedule: linear warm-up followed by exponential decay.

    Epochs below 1 are treated as epoch 1. Up to and including
    ``warmup_epochs`` the rate is ``warmup_lr_limit * epoch / warmup_epochs``;
    afterwards it is ``warmup_lr_limit * lr_decay ** (epoch - warmup_epochs)``.
    """

    def __init__(self, warmup_lr_limit=0.001, warmup_epochs=4, lr_decay=0.9):
        self.warmup_lr_limit = warmup_lr_limit
        self.warmup_epochs = warmup_epochs
        self.lr_decay = lr_decay

    def __call__(self, epoch):
        """Return the learning rate for ``epoch``."""
        epoch = max(epoch, 1)

        if epoch > self.warmup_epochs:
            epochs_after_warmup = epoch - self.warmup_epochs
            return self.warmup_lr_limit * (self.lr_decay ** epochs_after_warmup)

        # Still warming up: scale linearly towards the limit.
        return self.warmup_lr_limit * epoch / self.warmup_epochs

    
class CosineDecayScheduler:
    """Learning-rate schedule: linear warm-up followed by a half-cosine decay.

    Epochs below 1 are treated as epoch 1. Up to and including
    ``warmup_epochs`` the rate is ``warmup_lr_limit * epoch / warmup_epochs``.
    Afterwards it is ``warmup_lr_limit * (cos(pi * (epoch - 1) / max_epochs) + 1) / 2``;
    the cosine phase is measured from epoch 1, not from the end of warm-up.
    """

    def __init__(self, max_epochs, warmup_lr_limit=0.001, warmup_epochs=4):
        self.max_epochs = max_epochs
        self.warmup_lr_limit = warmup_lr_limit
        self.warmup_epochs = warmup_epochs

    def __call__(self, epoch):
        """Return the learning rate for ``epoch``."""
        epoch = max(epoch, 1)

        if epoch > self.warmup_epochs:
            angle = math.pi * (epoch - 1) / self.max_epochs
            weight = (math.cos(angle) + 1.0) / 2
            return self.warmup_lr_limit * weight

        # Still warming up: scale linearly towards the limit.
        return self.warmup_lr_limit * epoch / self.warmup_epochs

    
# The peak learning rate grows linearly with the global batch size.
scheduler = ExpDecayScheduler(warmup_lr_limit=0.000005*BATCH_SIZE, warmup_epochs=4, lr_decay=0.9)  
lr_scheduler_callback = tf.keras.callbacks.LearningRateScheduler(scheduler)

In [None]:
# Save weights only, once per epoch, and only when val_loss improves (lower is better).
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(f'model_weights.h5', 
                                             monitor='val_loss', 
                                             verbose=0, 
                                             save_best_only=True,
                                             save_weights_only=True, 
                                             mode='min', 
                                             save_freq='epoch')

In [None]:
# train_ds repeats forever, so steps_per_epoch defines the epoch length.
# Both step counts use floor division, so a final partial batch is not counted.
model.fit(train_ds,
          validation_data=val_ds,
          epochs=1,
          verbose=1,
          callbacks=[lr_scheduler_callback, model_checkpoint],
          steps_per_epoch=(train_ds_count//BATCH_SIZE),
          validation_steps=(val_ds_count//BATCH_SIZE))

In [None]:
# Restore the weights written by the ModelCheckpoint callback (best val_loss).
model.load_weights(f'model_weights.h5')

# Evaluation

In [None]:
eval_ds_df = val_ds_df.copy()

# Validation rows whose individual never occurs in the training split are
# relabelled as 'new_individual', both in the id column and in the label column.
ids_set = set(train_ds_df['individual_id'].unique())
unseen_mask = ~eval_ds_df['individual_id'].isin(ids_set)
eval_ds_df.loc[unseen_mask, 'individual_id'] = 'new_individual'
eval_ds_df.loc[unseen_mask, 'label'] = NEW_INDIVIDUAL_CLASSE
eval_ds_df.head(20)

In [None]:
# Image-only, unshuffled views of both splits for embedding extraction.
eval_train_ds = get_evaluation_dataset(train_filenames)
eval_val_ds = get_evaluation_dataset(val_filenames)

In [None]:
# One embedding per image.
# NOTE(review): row i is later matched with row i of train_ds_df / val_ds_df, which
# assumes these datasets iterate in the same order as the id-only check datasets.
train_embeddings = embed_model.predict(eval_train_ds, verbose=1)
val_embeddings = embed_model.predict(eval_val_ds, verbose=1)

In [None]:
from sklearn.neighbors import NearestNeighbors
# Cosine-distance neighbour index over the training-split embeddings.
eval_nn = NearestNeighbors(n_neighbors=150, metric='cosine')
eval_nn.fit(train_embeddings)

In [None]:
%%time
# For every validation embedding, fetch its 150 nearest training embeddings.
eval_distances, eval_indices = eval_nn.kneighbors(val_embeddings, n_neighbors=150, return_distance=True)

In [None]:
# Turn neighbour lists into up to five distinct label predictions per validation row.
eval_indices_list = eval_indices.tolist()
# Confidence = 1 - cosine distance.
eval_confidences_list = (1 - eval_distances).tolist()

eval_ds_df['prediction_labels'] = None
eval_ds_df['confidence'] = None

for i, (eval_inds, confs) in enumerate(zip(eval_indices_list, eval_confidences_list)):
    prediction_labels = []
    confidence = []
    
    for eval_ind, conf in zip(eval_inds, confs):
        # Neighbour indices are row positions in train_ds_df.
        pred = train_ds_df.at[train_ds_df.index[eval_ind], 'label']
        
        # Keep only the first occurrence of each label.
        if not (pred in prediction_labels):
            prediction_labels.append(pred)
            confidence.append(conf)
            
            # The same list objects are stored in the frame each time, so later
            # appends are visible through the cell as well.
            eval_ds_df.at[eval_ds_df.index[i], 'prediction_labels'] = prediction_labels
            eval_ds_df.at[eval_ds_df.index[i], 'confidence'] = confidence
            
            # Stop once five distinct labels have been collected.
            if len(prediction_labels) == 5:
                break

In [None]:
eval_ds_df.head()

In [None]:
def map_per_image(label, predictions):
    """Score a single image for MAP@5.

    Parameters
    ----------
    label : object
            The true label of the image
    predictions : list
            Ranked predictions, best first; only the first five are considered

    Returns
    -------
    score : float
            ``1 / rank`` of the first occurrence of ``label`` among the first
            five predictions (rank starting at 1), or ``0.0`` if it is absent
    """
    top_five = predictions[:5]
    try:
        position = top_five.index(label)
    except ValueError:
        # The true label is not among the first five predictions.
        return 0.0
    return 1 / (position + 1)

In [None]:
# Grid-search the confidence threshold below which 'new_individual' is inserted
# into the ranked predictions, scoring each candidate with MAP@5 on the
# validation split.
thresholds = np.linspace(0, 1.0, 11)
best_threshold = 0
best_cv = 0


for th in thresholds:
    temp_val = eval_ds_df.copy()
    
    for i, (confs, preds) in enumerate(zip(temp_val['confidence'], temp_val['prediction_labels'])):
        # DataFrame.copy() does not copy the list objects stored in the cells,
        # so `preds` is the very list held by eval_ds_df. Work on a private copy;
        # otherwise every threshold would be scored on lists already modified by
        # the previous thresholds.
        temp_preds = list(preds)
        
        for j, conf in enumerate(confs):
            if conf <= th:
                # Insert 'new_individual' at the first rank whose confidence is
                # at or below the threshold, and drop the last entry so the
                # list keeps its length.
                temp_preds.insert(j, NEW_INDIVIDUAL_CLASSE)
                temp_preds.pop()
                break
            
        temp_val.at[temp_val.index[i], 'prediction_labels'] = temp_preds
        

    # Per-image score for this threshold, stored as a new column of eval_ds_df.
    eval_ds_df[f'threshold_{th}'] = None
    for i, (label, preds) in enumerate(zip(temp_val['label'], temp_val['prediction_labels'])):
        eval_ds_df.at[temp_val.index[i], f'threshold_{th}'] = map_per_image(label, preds)
  
    cv = eval_ds_df[f'threshold_{th}'].mean()
    print(f"Threshold: {th}, CV: {cv}")
    
    if cv > best_cv:
        best_threshold = th
        best_cv = cv

        
print("Best threshold", best_threshold)
print("Best cv", best_cv)

In [None]:
eval_ds_df

In [None]:
## Adjustment: Since Public lb has nearly 10% 'new_individual' (Be Careful for private LB)
# Per-image scores for every threshold, plus a flag marking the validation rows
# whose true answer is 'new_individual'.
threshold_df = eval_ds_df[[f'threshold_{th}' for th in thresholds]]
eval_ds_df['is_new_individual'] = eval_ds_df['individual_id']=='new_individual'
print(eval_ds_df.is_new_individual.value_counts().to_dict())
threshold_df

In [None]:
# Re-weight the per-threshold score so 'new_individual' rows contribute 10% and
# all other rows 90%, then pick the threshold with the best weighted score.
is_new = eval_ds_df['is_new_individual']
adjusted_cv = threshold_df[is_new].mean() * 0.1 + threshold_df[~is_new].mean() * 0.9

eval_scores = pd.DataFrame(thresholds, columns=['threshold'])
eval_scores['adjusted_cv'] = list(adjusted_cv)

best_row = eval_scores['adjusted_cv'].idxmax()
best_threshold_adjusted = eval_scores['threshold'].iloc[best_row]
print("best_threshold",best_threshold_adjusted)
eval_scores

# Submission

In [None]:
# Start from the sample submission; its rows are filled with predictions below.
test_ds_df = pd.read_csv('../input/happy-whale-and-dolphin/sample_submission.csv')
test_ds_df.head()

In [None]:
# For the test-time search, use the embeddings of both splits (train rows first).
concat_train_embeddings = np.concatenate([train_embeddings, val_embeddings])
concat_train_embeddings.shape

In [None]:
# Metadata rows in the same order as concat_train_embeddings.
concat_train_ds_df = pd.concat([train_ds_df, val_ds_df])
concat_train_ds_df.shape

In [None]:
# Cosine-distance neighbour index over all labelled embeddings.
test_nn = NearestNeighbors(n_neighbors=150, metric='cosine')
test_nn.fit(concat_train_embeddings)

In [None]:
%%time
# Test image ids were numbered after the training images, so subtracting
# N_TRAIN_DATA gives each record's row position in the sample submission.
check_test_ds = get_check_dataset(TEST_FILES)
test_image_id = [i.numpy() - N_TRAIN_DATA for i in check_test_ds]

In [None]:
test_image_id[:10]

In [None]:
# Reorder the submission rows to follow the TFRecord iteration order.
test_ds_df = test_ds_df.reindex(test_image_id)
test_ds_df.head()

In [None]:
# NOTE(review): assumes test_ds yields records in the same order as check_test_ds,
# so that embedding i belongs to row i of the reordered test_ds_df.
test_ds = get_test_dataset(TEST_FILES)
test_embeddings = embed_model.predict(test_ds, verbose=1)

In [None]:
%%time
# For every test embedding, fetch its 150 nearest labelled embeddings.
test_distances, test_indices = test_nn.kneighbors(test_embeddings, n_neighbors=150, return_distance=True)

In [None]:
# Turn neighbour lists into up to five distinct label predictions per test row.
test_indices_list = test_indices.tolist()
# Confidence = 1 - cosine distance.
test_confidences_list = (1 - test_distances).tolist()

test_ds_df['prediction_labels'] = None
test_ds_df['confidence'] = None

for i, (test_inds, confs) in enumerate(zip(test_indices_list, test_confidences_list)):
    prediction_labels = []
    confidence = []
    
    for test_ind, conf in zip(test_inds, confs):
        # Neighbour indices are row positions in concat_train_ds_df.
        pred = concat_train_ds_df.at[concat_train_ds_df.index[test_ind], 'label']
        
        # Keep only the first occurrence of each label.
        if not (pred in prediction_labels):
            prediction_labels.append(pred)
            confidence.append(conf)
            
            # The same list objects are stored in the frame each time, so later
            # appends are visible through the cell as well.
            test_ds_df.at[test_ds_df.index[i], 'prediction_labels'] = prediction_labels
            test_ds_df.at[test_ds_df.index[i], 'confidence'] = confidence
            
            # Stop once five distinct labels have been collected.
            if len(prediction_labels) == 5:
                break

In [None]:
test_ds_df.head()

In [None]:
# Put the rows back into sample-submission order (index 0 .. N_TEST_DATA - 1).
submission_order = list(range(N_TEST_DATA))
test_ds_df = test_ds_df.reindex(submission_order)
test_ds_df.head()

In [None]:
# Apply the chosen threshold: insert 'new_individual' at the first rank whose
# confidence is at or below it, dropping the last prediction so the list keeps
# its length. The lists stored in the frame are modified in place.
for row_pos, (confs, preds) in enumerate(zip(test_ds_df['confidence'], test_ds_df['prediction_labels'])):
    insert_at = next(
        (rank for rank, conf in enumerate(confs) if conf <= best_threshold_adjusted),
        None,
    )
    if insert_at is not None:
        preds.insert(insert_at, NEW_INDIVIDUAL_CLASSE)
        preds.pop()

    test_ds_df.at[test_ds_df.index[row_pos], 'prediction_labels'] = preds

test_ds_df.head()

In [None]:
# Translate each row's predicted labels back to individual ids and join them
# with single spaces, one string per test row.
predictions = [
    " ".join(label_to_individual_id[pred_label] for pred_label in pred_labels)
    for pred_labels in test_ds_df['prediction_labels']
]

In [None]:
# Store the prediction strings and remove the helper columns so that only the
# remaining columns are written to the submission file.
test_ds_df['predictions'] = predictions
_ = test_ds_df.pop('prediction_labels')
_ = test_ds_df.pop('confidence')

test_ds_df.head()

In [None]:
# Write the submission without the DataFrame index column.
test_ds_df.to_csv('submission.csv', index=False)