<div style="text-align: center"><h1><font size='10' face = "Comic sans MS" color="#008e94">Happy Whale 🐳</font></h1></div>

<div class="alert alert-block alert-info"> 
Hello! This notebook is made to help you explore the data easily and also for me to share some observations. 
    
It is a good idea to fork, change the seed below and view more samples in plotly! 🍻
</div>

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from skimage import io
from skimage.color import gray2rgb
from skimage.transform import resize
from rich.jupyter import print

import os
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split, Dataset
from torchvision import transforms
import pytorch_lightning as pl
from typing import Tuple, List

seed=400 # random_state handed to every .sample() call below, so the sampled rows (and therefore the displayed images) are reproducible

# Meta Data 📄

In [None]:
def get_train_csv(
    path:str="/kaggle/input/happy-whale-and-dolphin/train.csv"
    )->Tuple[pd.DataFrame, pd.Index]:
    """Load the training metadata and normalise its species labels.

    Two misspelled species names are corrected and two sub-species labels are
    merged into ``short_finned_pilot_whale`` (per the original note this takes
    the number of unique species from 30 to 26). An integer ``species_id``
    column is then added by factorizing the cleaned ``species`` column.

    Args:
        path: CSV file to read; it must contain a ``species`` column.

    Returns:
        ``(train_df, species_index)`` where ``species_index`` holds the unique
        species names, positioned so that ``species_index[species_id]`` is the
        species name of a row.
    """
    train_df = pd.read_csv(path)

    species_fixes = {
        'bottlenose_dolpin': 'bottlenose_dolphin',
        'kiler_whale': 'killer_whale',
        'pilot_whale': 'short_finned_pilot_whale',
        'globis': 'short_finned_pilot_whale',
    }
    # Assign the result back instead of calling replace(inplace=True) on the
    # column selection: the chained in-place form mutates a temporary under
    # pandas Copy-on-Write and would leave train_df unchanged.
    train_df['species'] = train_df['species'].replace(species_fixes)

    # assign id to species
    train_df['species_id'], species_index = train_df['species'].factorize()
    return train_df, species_index

# The extended csv is the original train csv with extra shape info per image.
train_df, species_index = get_train_csv(
    path="/kaggle/input/happywhale-extended-meta/train_extended.csv",
)

# Keep only the summary rows of interest; the last expression is the cell output.
summary_rows = ['count', 'unique', 'freq', 'top']
train_df.describe(include='all').loc[summary_rows]

<div class="alert alert-block alert-info"> 
<p style="font-size:20px; display:inline">💡</p> fixing spelling of two species and merging subspecies, reduces the unique species from 30 -> 26.
    
<a href="https://www.kaggle.com/c/happy-whale-and-dolphin/discussion/305341#1677105"> Link - confirmation from competition host</a>
</div>

# Sample distributions 📈

In [None]:
# One evenly spaced hue per species: a smooth gradient without repeated colors
# (recipe from https://plotly.com/python/box-plots/).
hues = np.linspace(0, 320, len(species_index))
colors = ['hsl('+str(h)+',50%'+',50%)' for h in hues]

# --- samples per species, split by channel count (column d2) ---
fig = px.histogram(
    train_df,
    'species',
    color='d2',
    labels={'d2': '# channels'},
    title='samples per species',
)
fig.update_xaxes(categoryorder='total descending')
fig.show()

is_single_channel = train_df['d2'] == 1
print(f"[blue]🌟 {len(train_df[is_single_channel])} out of {len(train_df)} images are single channel", justify='center')

# individuals that never appear in a 3-channel image = all individuals - those seen with d2 == 3
n_all_individuals = train_df['individual_id'].nunique()
n_with_three_channels = train_df[train_df['d2'] == 3]['individual_id'].nunique()
n_only_single_channel = n_all_individuals - n_with_three_channels
print(f"[blue]🌟 {n_only_single_channel} individuals have only single channel samples", justify='center')

# --- number of distinct individuals per species ---
uniques_df = (
    train_df.groupby('species', sort=False)['individual_id']
    .nunique()
    .reset_index(name='uniques')
)
fig = px.histogram(
    uniques_df,
    x='species',
    y='uniques',
    color_discrete_sequence=['#FF6692'],
    title='individual per species',
)
fig.update_xaxes(categoryorder='total descending')
fig.show()

# --- how many images each individual has (one row per species/individual pair) ---
counts_df = (
    train_df.groupby('species', sort=False)['individual_id']
    .value_counts()
    .reset_index(name='frequency')
)
fig = px.histogram(
    counts_df,
    x='frequency',
    color_discrete_sequence=['#00CC96'],
    title='individual frequency distribution',
    log_y=True,
)
fig.show()

# --- individual frequencies across species ---
def fn(s):
    """Collapse one species' rows into a list of frequencies and a list of ids."""
    return pd.Series({"frequency": list(s["frequency"]), "id": list(s["individual_id"])})

species_freq = counts_df.groupby(['species'], sort=False).apply(fn).reset_index()

fig = go.Figure()
per_species = zip(species_freq['species'], species_freq['frequency'], species_freq['id'])
for i, (species_name, frequencies, member_ids) in enumerate(per_species):
    fig.add_trace(
        go.Box(
            y=frequencies,
            name=species_name,
            hovertext=member_ids,
            jitter=1,
            marker={'color': colors[i], 'size': 2},
            boxpoints='all',
            pointpos=0,  # draw the points centred on the box position
            # transparent fill + zero-width outline hide the box itself
            fillcolor='rgba(0,0,0,0)',
            line_width=0,
        )
    )

fig.update_layout(
    yaxis_type="log",
    title="individual frequencies across species",
    yaxis_title="frequency of an individual",
    showlegend=False,
)
fig.show()

# only needed for the histogram above
del uniques_df

#### To keep in mind:
- Should handle the observed class imbalance
- ~ 17% of the individuals have only one instance, i.e no other positive sample

# Species images - Plotly 🐬 🐋

In [None]:
def get_image_array(paths: List[str], shape: Tuple=(300,300), image_folder:str="/kaggle/input/happy-whale-and-dolphin/train_images") -> np.ndarray:
    """Read images from disk and stack them into a single array for plotly.

    Each file is read from ``image_folder``, 2-D (grayscale) images are
    converted with ``gray2rgb``, and every image is resized to ``shape`` so
    they can be stacked.

    Args:
        paths: image file names, relative to ``image_folder``.
        shape: target size passed to ``resize`` for every image.
        image_folder: directory containing the files named in ``paths``.

    Returns:
        ``np.asarray`` of the resized images, in the order of ``paths``.
    """
    images = []
    for file in paths:
        img = io.imread(f"{image_folder}/{file}")
        if img.ndim == 2:
            # single-channel image: convert so it matches the colour images
            img = gray2rgb(img)
        images.append(resize(img, shape))
    return np.asarray(images)

In [None]:
# Draw one seeded random row per species, then drop the species level that apply() adds.
per_species = train_df.groupby('species', sort=False)
samples = per_species.apply(lambda group: group.sample(1, random_state=seed)).droplevel(0)
sample_images = get_image_array(samples['image'])

# Image grid, 10 facets per row, no tick labels.
fig = px.imshow(
    sample_images,
    facet_col=0,
    binary_string=True,
    facet_col_wrap=10,
    facet_row_spacing=0.0,
    facet_col_spacing=0,
)
fig.update_xaxes(showticklabels=False)
fig.update_yaxes(showticklabels=False)

# Replace each facet title with species, individual_id, file name and [d0, d1, d2].
for i, a in enumerate(fig.layout.annotations):
    row = samples.iloc[i]
    a.text = "<br>".join([
        row['species'],
        row['individual_id'],
        row['image'],
        str(row[['d0', 'd1', 'd2']].tolist()),
    ])

fig.update_layout(
    title="Samples from all the species",
    autosize=True,
    hovermode=False,
    height=900,
    width=1500,
    margin={'l': 0, 'r': 0, 't': 150, 'b': 80},
)
fig.show()

# Individual samples 🔍🕵

Here we see all samples of an individual with fewer total samples. This might give us a better understanding of what information we have about an individual and what the model should focus on during training to achieve better results. 

We can encourage that at the data creation step by answering if it makes more sense to 
- use the whole image 
- crop the whole animal, using a shark/dolphin detector
- crop the fins alone, by training a fin detector (seems like what the competition description points to)
- ignore the images that don't have a nice view of the fins 
- ignore images that are too big for individuals with very high samples and so on.. 🤷🏻‍♂️
- Too many samples for an individual might hurt as they may be from various time periods, thus confusing the models! 🙀

In [None]:
def samples_with_freq(df:pd.DataFrame, freq:int, seed:int, n_samples:int=1, verbose:bool=True) -> pd.DataFrame:
    """Sample rows whose ``frequency`` equals ``freq`` from every species.

    Args:
        df: frame with at least ``species`` and ``frequency`` columns.
        freq: value of ``frequency`` a row must have to be eligible.
        seed: ``random_state`` used for the per-species sampling.
        n_samples: rows to draw per species; species with fewer eligible
            rows than this are skipped entirely.
        verbose: print how many of the eligible rows were sampled.

    Returns:
        The sampled rows concatenated in species-group order, or an empty
        frame with the columns of ``df`` when no species had enough
        eligible rows.
    """
    samples = []
    n_total = 0
    n_sampled = 0
    for _, g in df.groupby('species'):
        subset = g[g['frequency']==freq]
        n_total += len(subset)
        if len(subset) >= n_samples:
            n_sampled += n_samples
            samples.append(subset.sample(n_samples, random_state=seed))

    if verbose:
        # n_total is 0 when no row has this frequency; avoid dividing by zero
        pct = 100*n_sampled/n_total if n_total else 0.0
        print(f"['info'] samples_with_freq {freq} - sampled {n_sampled}/{n_total} [{pct:.2f}%]")

    if not samples:
        # pd.concat raises on an empty list; return an empty frame with the same columns
        return df.iloc[0:0]
    return pd.concat(samples)

# ⭐️ Single instance 🐬

In [None]:
# individuals that appear exactly once in the training set
n_display = 10
freq = 1

# one such individual per species, then a seeded subset of n_display of them
ids = samples_with_freq(counts_df, freq=freq, seed=seed, n_samples=1)['individual_id']
shown_ids = ids.sample(n_display, random_state=seed).tolist()
samples = train_df.loc[train_df['individual_id'].isin(shown_ids)]
print(f"['info'] displaying {n_display} ids")
sample_images = get_image_array(samples['image'])

# single-row image grid without tick labels
fig = px.imshow(
    sample_images,
    facet_col=0,
    binary_string=True,
    facet_row_spacing=0.0,
    facet_col_spacing=0,
)
fig.update_xaxes(showticklabels=False)
fig.update_yaxes(showticklabels=False)

# facet titles: species, individual_id, file name and [d0, d1, d2]
for i, a in enumerate(fig.layout.annotations):
    row = samples.iloc[i]
    a.text = "<br>".join([
        row['species'],
        row['individual_id'],
        row['image'],
        str(row[['d0', 'd1', 'd2']].tolist()),
    ])

fig.update_layout(
    title=f"Individuals with {freq} instance",
    autosize=True,
    hovermode=False,
    height=400,
    width=1500,
    margin={'l': 40, 'r': 0, 't': 140, 'b': 80},
)
fig.show()

<div style="display: flex; justify-content: flex-end">
<div class="alert alert-block alert-info"> Scroll right to see more images!
<p style="font-size:20px; display:inline">👉🏼</p></div>
</div>

### Observations
- As the output suggests, all the species have an individual with 1 instance.
- We should learn to identify these individuals without any positive pair! 😥
- There is good view of the fin! 😇
- It seems hard to identify the species by just looking at the fin 😕, for example, the spotted dolphin doesn't seem to have a spotted fin. 
    - Important, if you want to do candidate selection based on initial species classification. ⚠️
- Super zoomed out images, blind downsampling would hurt picking the individual's features ✂️
- Let's strengthen these observations after analyzing the test images as well! (in the later sections)

<div class="alert alert-block alert-warning"> 
I would encourage you to fork, change the seed or remove the random_state in the `samples_with_freq` method and explore more samples to see if these observations hold and make sure that they are generalizable</div>

# ⭐️ Many (10) instances 🐬🐬🐬

In [None]:
# individuals that appear exactly 10 times in the training set
n_display = 10
freq = 10

# one such individual per species, then a seeded subset of n_display of them
ids = samples_with_freq(counts_df, freq=freq, seed=seed, n_samples=1)['individual_id']
shown_ids = ids.sample(n_display, random_state=seed).tolist()
samples = train_df.loc[train_df['individual_id'].isin(shown_ids)]
print(f"['info'] displaying {n_display} ids")

# sanity check: every selected id should form its own group
id_groups = samples.groupby('individual_id')
if id_groups.ngroups != n_display:
    print("!!! Something is wrong when grouping same individuals, plot is wrong!")

# one figure per individual; note that `samples` is rebound to that individual's rows
for individual_id, samples in id_groups:
    sample_images = get_image_array(samples['image'])

    fig = px.imshow(
        sample_images,
        facet_col=0,
        binary_string=True,
        facet_row_spacing=0.0,
        facet_col_spacing=0,
    )
    fig.update_xaxes(showticklabels=False)
    fig.update_yaxes(showticklabels=False)

    # facet titles: file name and [d0, d1, d2]
    for i, a in enumerate(fig.layout.annotations):
        row = samples.iloc[i]
        a.text = row['image'] + "<br>" + str(row[['d0', 'd1', 'd2']].tolist())

    species_name = samples['species'].iloc[0]
    fig.update_layout(
        title=f"Instances of {individual_id} - {species_name}",
        autosize=True,
        hovermode=False,
        height=350,
        width=1500,
        margin={'l': 40, 'r': 0, 't': 140, 'b': 0},
    )
    fig.show()

### Observations 
- Adding to the observations from the single instance section, it's safe to say that just cropping the fins (contrary to the assumption made in the earlier sections) might be a bad idea.
- Cues to easily identify the individual can be seen on the body (color) and not just the fins (pattern)
- Varying colors of the water and the image tone
    - It is less likely that the water color would be a leak or a clue to identify the individual
- It wouldn't be a bad idea to not crop at all, as some individuals seem to be shot at the same distance. 
    - Probably all shot in a day? 😅

# ⭐️ *Way too many (50!)* instances 🥴

In [None]:
# individuals that appear exactly 50 times in the training set
n_display = 1
freq = 50

# one such individual per species, then a seeded subset of n_display of them
ids = samples_with_freq(counts_df, freq=freq, seed=seed, n_samples=1)['individual_id']
chosen_ids = ids.sample(n_display, random_state=seed).tolist()
samples = train_df.loc[train_df['individual_id'].isin(chosen_ids)]
print(f"['info'] displaying {n_display} ids")

# sanity check: the number of groups must match the number of ids requested
id_groups = samples.groupby('individual_id')
if not len(id_groups) == n_display:
    print("!!! Something is wrong when grouping same individuals, plot is wrong!")

# one wrapped grid (10 per row) per individual; `samples` becomes that individual's rows
for individual_id, samples in id_groups:
    sample_images = get_image_array(samples['image'])

    fig = px.imshow(
        sample_images,
        facet_col=0,
        facet_col_wrap=10,
        binary_string=True,
        facet_row_spacing=0.0,
        facet_col_spacing=0,
    )
    fig.update_xaxes(showticklabels=False)
    fig.update_yaxes(showticklabels=False)

    # facet titles: file name and [d0, d1, d2]
    for i, a in enumerate(fig.layout.annotations):
        row = samples.iloc[i]
        a.text = row['image'] + "<br>" + str(row[['d0', 'd1', 'd2']].tolist())

    first_row = samples.iloc[0]
    fig.update_layout(
        title=f"Instances of {individual_id} - {first_row.species}",
        autosize=True,
        hovermode=False,
        height=1000,
        width=1500,
        margin={'l': 40, 'r': 0, 't': 120, 'b': 10},
    )
    fig.show()

### Observations
- Why not visualize individuals with 100 or 400 samples?
    - 1. too long to viz :) 2. They are outliers, but there are many ids with freq 20-50.
- Obviously there are some extremely zoomed out images of the whale. ID-ing it aside, there is no way to even classify it as a whale or a duck! 😅

# Test dataset

In [None]:
# Test-set metadata: draw a seeded random subset of rows to display.
n_samples = 50
test_image_folder = "/kaggle/input/happy-whale-and-dolphin/test_images/"
submission_df = pd.read_csv(
    "/kaggle/input/happywhale-extended-meta/sample_submission_extended.csv"
)
samples = submission_df.sample(n_samples, random_state=seed)

# rows whose d2 column equals 1 are reported as single-channel images
single_channel_rows = submission_df[submission_df['d2'] == 1]
print(f"[blue]🌟 {len(single_channel_rows)} out of {len(submission_df)} images are single channel", justify='center')

In [None]:
# load the sampled test images from the test folder
sample_images = get_image_array(paths=samples['image'], image_folder=test_image_folder)

# wrapped image grid (10 per row) without tick labels
fig = px.imshow(
    sample_images,
    facet_col=0,
    binary_string=True,
    facet_col_wrap=10,
    facet_row_spacing=0.0,
    facet_col_spacing=0,
)
fig.update_xaxes(showticklabels=False)
fig.update_yaxes(showticklabels=False)

# facet titles: file name and [d0, d1, d2]
for i, a in enumerate(fig.layout.annotations):
    row = samples.iloc[i]
    a.text = row['image'] + "<br>" + str(row[['d0', 'd1', 'd2']].tolist())

fig.update_layout(
    title="Random test images",
    autosize=True,
    hovermode=False,
    height=1000,
    width=1500,
    margin={'l': 40, 'r': 0, 't': 120, 'b': 10},
)
fig.show()

### Observations
- From the random samples we can mostly see decent images, but a few are too far away and the animal can't be seen at all. Please explore more here.

<div class="alert alert-block alert-success"> 
Now it's time to put these insights to good use. Thanks for reading, good luck! 🍀
</div>