# Happy Whales and Dolphins 🐬

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import cv2

In [None]:
# List the top-level entries of the competition input directory.
print(f"Competition Files and Folders: {os.listdir('/kaggle/input/happy-whale-and-dolphin')}")

Loading `train.csv` and `sample_submission.csv`.

In [None]:
# Load the training index and the sample submission file from the competition data.
train_df = pd.read_csv('/kaggle/input/happy-whale-and-dolphin/train.csv')
samp_submission_df = pd.read_csv('/kaggle/input/happy-whale-and-dolphin/sample_submission.csv')

Looking at `train_df` contents.

In [None]:
# Preview the first rows of the training index.
train_df.head()

Correcting misspelled species labels so that each species appears under a single name:
`kiler_whale` → `killer_whale`
`bottlenose_dolpin` → `bottlenose_dolphin`

In [None]:
# Map each misspelled species label onto its correct spelling in one pass.
train_df['species'] = train_df['species'].replace({
    'kiler_whale': 'killer_whale',
    'bottlenose_dolpin': 'bottlenose_dolphin',
})

Looking at `sample_submission` contents.

In [None]:
# Preview the first rows of the sample submission file.
samp_submission_df.head()

How many images, species, and individual IDs are present?

In [None]:
# Distinct values per key column of the training index.
print(f"Images in train index file: {train_df['image'].nunique()}")
print(f"Species in train index file: {train_df['species'].nunique()}")
print(f"Individual IDs in train index file: {train_df['individual_id'].nunique()}")

# Number of directory entries in each image folder.
print(f"Images in train images folder: {len(os.listdir('/kaggle/input/happy-whale-and-dolphin/train_images'))}")
print(f"Images in test images folder: {len(os.listdir('/kaggle/input/happy-whale-and-dolphin/test_images'))}")

Species frequency within the train dataset.

In [None]:
# Number of training images per species (value_counts sorts descending).
spec_freq = train_df["species"].value_counts()
df = pd.DataFrame({'Species': spec_freq.index,
                   'Images': spec_freq.values})

sns.set_color_codes("deep")
# Explicit figure/axes instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x="Species", y="Images", data=df, ax=ax)
ax.set_title('Distribution of Species Images - Train Dataset')
# Rotate the existing tick labels rather than re-setting them, so the
# locator/formatter pair created by seaborn is left untouched.
ax.tick_params(axis='x', labelrotation=90)
plt.show()

Visualizing the Individual IDs found associated with each species.

In [None]:
# Number of distinct individual IDs recorded for each species.
id_freq = train_df.groupby(["species"])["individual_id"].nunique()
df = pd.DataFrame({'Species': id_freq.index,
                   'Unique ID Count': id_freq.values
                  })
# groupby returns species in alphabetical order; sort so the bars descend.
df = df.sort_values(['Unique ID Count'], ascending=False)

sns.set_color_codes("deep")
# Explicit figure/axes instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x='Species', y="Unique ID Count", data=df, ax=ax)
ax.set_title('Distribution of Species Individual IDs - train dataset')
# Rotate the existing tick labels rather than re-setting them.
ax.tick_params(axis='x', labelrotation=90)
plt.show()


Checking whether the images listed in `train_df` match the files found in the `train_images` folder.

In [None]:
# Compare the image names in the index with the files on disk, in both directions.
train_df_list = list(train_df.image.unique())
train_images_list = list(os.listdir('/kaggle/input/happy-whale-and-dolphin/train_images'))

# Build each set once and reuse it for all comparisons.
index_images = set(train_df_list)
folder_images = set(train_images_list)

delta = index_images & folder_images  # names present in both the index and the folder
minus = index_images - folder_images  # listed in the index but missing from the folder
extra = folder_images - index_images  # present in the folder but not listed in the index
print(f"Images in train dataset: {len(train_df_list)}\nImages in train folder: {len(train_images_list)}\nIntersection: {len(delta)}\nDifference: {len(minus)}\nFiles not in index: {len(extra)}")

All the images present in `train_df` are also present in the `train_images` folder.

Creating a helper function which returns the shape of an image within `train_images`

In [None]:
def show_image_size(file_name):
    """Return the shape of one training image as a list.

    Args:
        file_name: File name of an image inside the `train_images` folder.

    Returns:
        `list(image.shape)` of the array loaded by OpenCV. OpenCV arrays are
        laid out as (rows, columns, channels), i.e. [height, width, channels].

    Raises:
        OSError: If OpenCV cannot read the file (missing or undecodable).
    """
    path = '/kaggle/input/happy-whale-and-dolphin/train_images/' + file_name
    image = cv2.imread(path)
    # cv2.imread signals failure by returning None instead of raising.
    if image is None:
        raise OSError(f"cv2.imread could not read image: {path}")
    return list(image.shape)

Using a sample size of 2500 images, let's determine the image dimensions

We also import `time` to measure how long reading the sample takes, since notebook run time matters in the competition setting.

In [None]:
import time

sample_size = 2500
time_alpha = time.time() # start time
# Fixed seed so the sample, and the sizes reported from it, are reproducible.
train_sample_df = train_df.sample(sample_size, random_state=42)
# One [height, width, channels] row per sampled image.
sample_img_func = np.stack(train_sample_df['image'].apply(show_image_size))
# OpenCV shapes are (rows, cols, channels) = (height, width, channels), so the
# first column is the height. Reuse the sample's index so that these rows stay
# aligned with `train_sample_df` in any later index-based join.
dimensions_df = pd.DataFrame(sample_img_func,
                             columns=['height', 'width', 'c_channels'],
                             index=train_sample_df.index)
print(f"Total run time for {sample_size} images: {round(time.time()-time_alpha, 2)} sec.")


Now let's see how many different image dimensions are present in just 2500 image samples.

In [None]:
# pd.concat(axis=1) aligns on the index. The sample keeps its original row
# labels, so give the dimensions frame the same index (rows are in the same
# order) to attach each shape to the image it was measured from.
train_img_df = pd.concat(
    [train_sample_df, dimensions_df.set_index(train_sample_df.index)],
    axis=1, sort=False)
# Each distinct (width, height, channels) combination is one group.
n_sizes = train_img_df.groupby(['width', 'height', 'c_channels']).ngroups
print(f"Number of different image sizes in {len(train_sample_df)} samples: {n_sizes}")

## Data Preparation

In [None]:
import PIL
import PIL.Image
import tensorflow as tf

In [None]:
# Distinct individual IDs, in order of first appearance in train_df.
id_unique = train_df['individual_id'].unique()
id_unique

In [None]:
# Map every individual ID to its position in `id_unique` (used as the class index).
id_to_index = {name: position for position, name in enumerate(id_unique)}

In [None]:
# Class index for every row of train_df, in row order.
image_id_index = train_df['individual_id'].map(id_to_index).tolist()

In [None]:
# Peek at the first ten class indices.
image_id_index[:10]

In [None]:
# Store the integer class index next to each image row (overwrites `label` on re-run).
train_df['label'] = image_id_index
train_df.head()

Collect training image paths:

In [None]:
# Prefix every image file name with the train folder, vectorised over the column.
train_image_paths = ('/kaggle/input/happy-whale-and-dolphin/train_images/' + train_df['image']).tolist()
train_image_paths[:10]

Build helper functions to decode, resize, and rescale images.

In [None]:
def image_preprocess(image, target_size=(224, 224)):
    """Decode JPEG bytes into a resized RGB image scaled to [0, 1].

    Args:
        image: Encoded JPEG contents, as produced by `tf.io.read_file`.
        target_size: (height, width) handed to `tf.image.resize`. Defaults to
            (224, 224), the size this helper previously hard-coded.

    Returns:
        A float tensor of shape (height, width, 3) with values in [0, 1].
    """
    # channels=3 forces RGB output even for grayscale JPEGs.
    image = tf.image.decode_jpeg(image, channels=3)
    image = tf.image.resize(image, list(target_size))
    # Decoded pixels are in [0, 255]; scale them down to [0, 1].
    image = image / 255.0
    return image

In [None]:
def load_and_process(path):
    """Read the file at `path` and return it after `image_preprocess`."""
    raw_bytes = tf.io.read_file(path)
    processed = image_preprocess(raw_bytes)
    return processed

Function test:

In [None]:
# Preview the first training images in a grid, one axes per image, so that
# every image stays visible instead of each one overwriting the previous one.
n_preview = min(22, len(train_image_paths))
n_cols = 6
n_rows = max(1, -(-n_preview // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(3 * n_cols, 3 * n_rows))

# Hide ticks and frames everywhere, including the unused trailing cells.
for ax in axes.flat:
    ax.axis('off')

for i, ax in zip(range(n_preview), axes.flat):
    temp_img_path = train_image_paths[i]
    temp_label = image_id_index[i]
    ax.imshow(load_and_process(temp_img_path))
    # `id_unique` is indexed by class label, not by image position: look the
    # ID up through the image's own label so the title matches the image.
    ax.set_title(f"{id_unique[temp_label]} ({train_df['species'].iloc[i]})", fontsize=8)

plt.tight_layout()
plt.show()

## TensorFlow Dataset

In [None]:
# Dataset of image path strings, in train_df row order.
paths_ds = tf.data.Dataset.from_tensor_slices(train_image_paths)
# Decode and resize each image lazily, letting tf.data choose the parallelism.
images_ds = paths_ds.map(load_and_process, num_parallel_calls=tf.data.experimental.AUTOTUNE)
# Integer class labels in the same order as the paths.
labels_ds = tf.data.Dataset.from_tensor_slices(tf.cast(image_id_index, tf.int64))
# Pairing relies on both datasets keeping the same element order.
image_labels_ds = tf.data.Dataset.zip((images_ds, labels_ds)) # Bringing together images and their id labels (image, label)

In [None]:
batch_size = 32
# Shuffle within a 1024-element buffer, batch, then prefetch so that input
# preparation can overlap with training.
ds = (
    image_labels_ds
    .shuffle(buffer_size=1024)
    .batch(batch_size)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)
ds

In [None]:
from tensorflow.keras.applications.efficientnet import EfficientNetB0

In [None]:
# For Keras EfficientNet this function is a pass-through: the model performs
# its own input rescaling/normalisation internally.
preprocess_input = tf.keras.applications.efficientnet.preprocess_input

# ImageNet-pretrained backbone without its classification head.
# NOTE: Keras only applies `classifier_activation` when include_top=True, so it
# has no effect on this headless model; it is kept to leave the call unchanged.
base_model = EfficientNetB0(input_shape=(224, 224, 3), include_top=False, weights='imagenet', classifier_activation='softmax')
# Fine-tune the backbone weights together with the new head.
base_model.trainable=True
# One output unit per individual ID. The softmax makes the outputs
# probabilities, which is what the string loss 'sparse_categorical_crossentropy'
# (from_logits=False by default) expects; without it the loss would be computed
# on raw logits.
prediction_layer = tf.keras.layers.Dense(len(id_unique), activation='softmax')

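A previous version did not pass the `classifier_activation` parameter, and accuracy rose after it was added. Note, however, that Keras only applies `classifier_activation` when `include_top=True`; with `include_top=False`, as used here, it has no effect on the model, so the improvement most likely came from another change or from run-to-run variation.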

In [None]:
inputs = tf.keras.Input(shape=(224, 224, 3))
# The input pipeline scales pixels to [0, 1], but Keras EfficientNet contains
# its own rescaling/normalisation and expects raw [0, 255] pixel values
# (`preprocess_input` is a pass-through). Undo the scaling before the backbone.
x = tf.keras.layers.Rescaling(255.0)(inputs)
x = preprocess_input(x)
# training=False keeps the backbone's BatchNorm layers in inference mode even
# though its weights are trainable.
x = base_model(x, training=False)
# Collapse the spatial feature map into one feature vector per image.
x = tf.keras.layers.GlobalAveragePooling2D()(x)
# Fully connected classification head.
x = tf.keras.layers.Dense(1024, activation='relu')(x)
x = tf.keras.layers.Dense(1024, activation='relu')(x)
x = tf.keras.layers.Dense(1024, activation='relu')(x)
x = tf.keras.layers.Dense(1024, activation='relu')(x)
outputs = prediction_layer(x)

model = tf.keras.Model(inputs, outputs)

In [None]:
# Integer class labels -> sparse categorical cross-entropy.
# NOTE(review): the string form of this loss uses Keras' default
# from_logits=False, i.e. it expects the model to output probabilities --
# confirm that the final layer applies a softmax.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# Train for 5 epochs on the shuffled, batched dataset; no validation data is passed.
model.fit(ds, epochs=5)

Taking a look at the sample_submission.csv:

In [None]:
# Show the expected submission layout again before filling it in.
samp_submission_df.head()

In [None]:
# Test image paths in the row order of the sample submission, so predictions
# can be written back to it positionally.
test_image_paths = ['/kaggle/input/happy-whale-and-dolphin/test_images/' + img for img in samp_submission_df['image']]
test_path_ds = tf.data.Dataset.from_tensor_slices(test_image_paths)
# Same decoding/resizing as for training.
test_image_ds = test_path_ds.map(load_and_process, num_parallel_calls=tf.data.experimental.AUTOTUNE)
# No shuffling here: the output order must match `test_image_paths`.
test_ds = test_image_ds.batch(32).prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

In [None]:
%%time

# One row of per-class scores for each test image, in dataset order.
pred = model.predict(test_ds)

In [None]:
# Sort class indices by score, flip to descending order, keep the five best per row.
pred = np.argsort(pred, axis=1)[:, ::-1][:, :5]

In [None]:
# Inverse lookup: class index -> individual ID.
index_to_id = {idx: name for name, idx in id_to_index.items()}

# For every test image, join its five predicted IDs into one space-separated string.
predictions = [
    " ".join(index_to_id[pred[i][j]] for j in range(5))
    for i in range(len(pred))
]

In [None]:
# Overwrite the placeholder predictions; this relies on `predictions` being in
# the same row order as samp_submission_df.
samp_submission_df['predictions'] = predictions
samp_submission_df['predictions'].head()

In [None]:
# Write the submission file without the DataFrame index column.
samp_submission_df.to_csv('submission.csv', index=False)

In [None]:
# Read the written file back to check what was saved.
submission = pd.read_csv('submission.csv')

In [None]:
# Preview the saved submission.
submission.head()