# 📚 Import Dependencies

In [None]:
!pip install imagesize

!pip install ipython-autotime
%load_ext autotime

In [None]:
import os
import sys
import glob

import numpy as np 
import pandas as pd 

from tqdm import tqdm

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns

import imagesize
import albumentations as A
import cv2

import wandb

from termcolor import colored
from colorama import Fore, Back, Style

import warnings
warnings.filterwarnings("ignore")

### Custom Colors 

In [None]:
# ANSI escape sequences used to emphasise printed text
class clr:
    """Start/end markers wrapped around highlighted console output."""
    S = '\033[1m\033[96m'  # start: bold + bright cyan
    E = '\033[0m'          # end: reset all attributes
    
# Foreground colour shortcuts for use inside f-string console output
y_, r_, g_, b_, m_ = (
    Fore.YELLOW,
    Fore.RED,
    Fore.GREEN,
    Fore.BLUE,
    Fore.MAGENTA,
)

# Hex palette shared by the plots in this notebook
my_colors = [
    "#21295C",
    "#1F4E78",
    "#1C7293",
    "#73ABAF",
    "#C9E4CA",
    "#87BBA2",
    "#618E83",
    "#3B6064",
]

# W & B Integration

In [None]:
from kaggle_secrets import UserSecretsClient

# Ask W&B to keep its console output quiet
os.environ["WANDB_SILENT"] = "true"

# Metadata passed as `config` to the W&B runs started below
CONFIG = dict(competition='happywhale', _wandb_kernel='ruch')

# Fetch the W&B API key from the Kaggle secret store
user_secrets = UserSecretsClient()
api_key = user_secrets.get_secret("WANDB_KEY")

## Login

In [None]:
# Log in through the Python API rather than the shell: the key is never placed on
# a command line (where it is visible to other processes and subject to shell
# word-splitting/expansion).
wandb.login(key=api_key);

# Training Data

In [None]:
# Read the training metadata CSV and preview its first rows
train_df = pd.read_csv(
    "../input/happy-whale-and-dolphin/train.csv"
)
train_df.head(5)

## Unique Species 

In [None]:
# Species labels exactly as they appear in the CSV
print(colored("Before fixing duplicate labels:", 'red'))
print("Number of unique species: ", train_df['species'].nunique())
print("\nSpecies names: ", train_df["species"].unique())

# Fixing duplicate labels: fold the misspelled variants into the correct name.
# regex=False because these are literal substrings, not patterns.
train_df['species'] = (
    train_df['species']
    .str.replace('bottlenose_dolpin', 'bottlenose_dolphin', regex=False)
    .str.replace('kiler_whale', 'killer_whale', regex=False)
)

print(colored("\nAfter fixing duplicate labels:", 'green'))
print("Number of unique species: ", train_df['species'].nunique())
print("\nSpecies names: ", train_df["species"].unique())

# Append _whale to beluga and globis.
# The result is assigned back to the column: calling replace(inplace=True) on a
# column selection operates on an intermediate object and is not guaranteed to
# update train_df (it does not under pandas copy-on-write).
train_df["species"] = train_df["species"].replace(
    {
        "beluga": "beluga_whale",
        "globis": "globis_whale",
    }
)

In [None]:
# Relative paths to train and test image directories
train_img_dir = "../input/happy-whale-and-dolphin/train_images"
test_img_dir = "../input/happy-whale-and-dolphin/test_images"

# glob returns files in arbitrary (filesystem) order; sorting makes every later
# slice of these lists select the same images on every run.
train_images_path = sorted(glob.glob(f"{train_img_dir}/*.jpg"))
test_images_path = sorted(glob.glob(f"{test_img_dir}/*.jpg"))

print(f"{y_}Number of train images: {g_} {len(train_images_path)}\n")
print(f"{y_}Number of test images: {g_} {len(test_images_path)}\n")

# Record the dataset counts in a dedicated W&B run
run = wandb.init(project='happywhale', name='count',config = CONFIG)

un_ID = train_df.individual_id.nunique()
un_sp = train_df['species'].nunique()
wandb.log(
    {
        'Training samples': len(train_images_path),
        'Test samples': len(test_images_path),
        'Number of individual IDs': un_ID,
        'Number of unique species': un_sp,
    }
)

run.finish()

In [None]:
def getShape(data, images_paths):
    """Report whether every image in ``images_paths`` has the same shape.

    Args:
        data: Label for the image set; it is the first line of the returned text.
        images_paths: Sequence of image file paths, read with ``cv2.imread``.

    Returns:
        A message of the form ``"<data>\\n\\tSame image shape - <shape>\\n"`` or
        ``"<data>\\n\\tDifferent image shape - <shape>\\n"``, where ``<shape>`` is
        the shape of the first image. For an empty sequence the message says
        that no images were found.

    Raises:
        ValueError: If an image cannot be read (``cv2.imread`` returned None).
    """
    if len(images_paths) == 0:
        return f"{data}\n\tNo images found\n"

    shape = None
    # Must start as True: it is only flipped when a mismatching image is found.
    flag = True

    for image_path in images_paths:
        image = cv2.imread(image_path)
        if image is None:
            raise ValueError(f"Could not read image: {image_path}")
        if shape is None:
            # The first image is the reference every other image is compared to
            shape = image.shape
        elif image.shape != shape:
            flag = False
            break  # one mismatch settles the answer

    if flag:
        return f"{data}\n\tSame image shape - {shape}\n"
    else:
        return f"{data}\n\tDifferent image shape - {shape}\n"
        
# Shape report for each image set
for dataset_name, dataset_paths in (
    ('Train Images', train_images_path),
    ('Test Images', test_images_path),
):
    print(getShape(dataset_name, dataset_paths))

## Display Images

In [None]:
def plot_images(img_path: list, nrows: int, ncols: int, title: str):
    """Show images in an ``nrows`` x ``ncols`` grid under a common title.

    Args:
        img_path: Sequence of image file paths. At most ``nrows * ncols`` of them
            are read and drawn; any extra paths are ignored.
        nrows: Number of grid rows.
        ncols: Number of grid columns.
        title: Figure title.

    Raises:
        ValueError: If an image cannot be read (``cv2.imread`` returned None).
    """
    # squeeze=False keeps `ax` a 2-D array even for a 1x1 grid, so ravel() always works
    figure, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=(16,8), squeeze=False)
    plt.suptitle(title, fontsize=30)

    panels = ax.ravel()
    # Switch every panel's axis off up front so unused panels stay blank
    for panel in panels:
        panel.set_axis_off()

    # zip stops at the shorter of the two, so no image is read without a panel
    for panel, im_path in zip(panels, img_path):
        img = cv2.imread(im_path)
        if img is None:
            raise ValueError(f"Could not read image: {im_path}")
        # OpenCV loads BGR; matplotlib expects RGB
        panel.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

    plt.tight_layout()
    plt.show()

In [None]:
# 5x5 grid of the first 25 training images
plot_images(train_images_path[:25], nrows=5, ncols=5, title="Train Images")

In [None]:
# 5x5 grid of the first 25 test images
plot_images(test_images_path[:25], nrows=5, ncols=5, title="Test Images")

# Distribution

In [None]:
# Coarse label: any species whose name contains "dolphin" is a dolphin, the rest are whales
train_df['label'] = [
    'dolphin' if 'dolphin' in species_name else 'whale'
    for species_name in train_df['species']
]

# Two-column frame (label, number of rows) feeding the pie chart
distdf = (
    train_df["label"]
    .value_counts()
    .rename_axis("Labels")
    .reset_index(name="Counts")
)

fig = px.pie(
    data_frame=distdf,
    names="Labels",
    values="Counts",
    hole=.4,
    title="Whales & Dolphins",
)
fig.update_traces(
    rotation=150,
    pull=[0.1, 0],
    textposition='outside',
)
fig.update_layout(
    title={"font_size": 25},
    title_x=0.5,
)
fig.show()

## Species Distribution

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(16, 8))

# Split the rows by coarse label; the species column gets a per-group name
is_whale = train_df['label'] == 'whale'
whales = train_df[is_whale].rename(columns={"species": "species_whales"})
dolphins = train_df[~is_whale].rename(columns={"species": "species_dolphins"})

# One horizontal count plot per group, bars ordered by descending frequency
panels = (
    (ax[0], whales, "species_whales", 'Whales'),
    (ax[1], dolphins, "species_dolphins", 'Dolphins'),
)
for axis, frame, column, heading in panels:
    sns.countplot(
        data=frame,
        y=column,
        order=frame[column].value_counts().index,
        palette="RdYlGn",
        ax=axis,
    )
    axis.set_title(heading)
    axis.set_ylabel(None)

plt.tight_layout()
plt.show()

In [None]:
# Headline counts for the training table
n_train_rows = len(train_df)
n_individuals = train_df["individual_id"].nunique()
print(f"{b_}Number of training images: {n_train_rows}")
print(f"{b_}\nNumber of individual IDs: {n_individuals}")

# Images by Groups 

In [None]:
def path(df, groupby, group_type):
    """Build training-image paths for the rows belonging to one group.

    Args:
        df: DataFrame with an ``image`` column plus the column selected by
            ``group_type`` (``species`` or ``individual_id``).
        groupby: Value the selected column must equal.
        group_type: ``'sp'`` to filter on ``species``, ``'id'`` to filter on
            ``individual_id``.

    Returns:
        List of paths (train image directory joined with each file name), in
        the row order of ``df``.

    Raises:
        ValueError: If ``group_type`` is neither ``'sp'`` nor ``'id'``.
    """
    PATH = "../input/happy-whale-and-dolphin/train_images"

    # Short group code -> column it filters on
    group_columns = {'sp': 'species', 'id': 'individual_id'}
    if group_type not in group_columns:
        raise ValueError(
            f"group_type must be 'sp' or 'id', got {group_type!r}"
        )

    selected = df.loc[df[group_columns[group_type]] == groupby, 'image']
    return [os.path.join(PATH, filename) for filename in selected]

def display_groups(df, group_type, lst):
    """Draw one 3x3 image grid per entry of ``lst``.

    Each entry is passed to ``path`` as the group value (with ``group_type``
    choosing the column), and the first nine resulting images are plotted
    with the entry itself as the figure title.
    """
    for group_value in lst:
        group_images = path(df, group_value, group_type)
        plot_images(group_images[:9], 3, 3, group_value)
        
def species_frequency(df, col: str, freq: str, n: int):
    """Return the ``n`` most or least frequent values of a column.

    Args:
        df: DataFrame containing ``col``.
        col: Column whose values are counted.
        freq: ``"Most"`` for the most frequent values, ``"Least"`` for the
            least frequent ones.
        n: How many values to return.

    Returns:
        List of values taken from ``df[col].value_counts()``, which is sorted
        by descending count: the first ``n`` entries for ``"Most"``, the last
        ``n`` entries for ``"Least"``.

    Raises:
        ValueError: If ``freq`` is neither ``"Most"`` nor ``"Least"``.
    """
    counts = df[col].value_counts()
    if freq == "Most":
        return counts.head(n).index.tolist()
    if freq == "Least":
        # tail(n) rather than [-n:]: with n == 0 the slice [-0:] is the whole series
        return counts.tail(n).index.tolist()
    raise ValueError(f"freq must be 'Most' or 'Least', got {freq!r}")

## Most & Least Frequent Species

In [None]:
# Five most / least frequent values of each column
most_freq_species, least_freq_species = (
    species_frequency(train_df, "species", which, 5) for which in ("Most", "Least")
)
most_freq_ID, least_freq_ID = (
    species_frequency(train_df, "individual_id", which, 5) for which in ("Most", "Least")
)

In [None]:
# Image grids for the most frequent species
display_groups(df=train_df, group_type='sp', lst=most_freq_species)

In [None]:
# Image grids for the least frequent species
display_groups(df=train_df, group_type='sp', lst=least_freq_species)

### Most & Least Frequent Whales

In [None]:
# Rename first, then count on "species": renaming a column that no longer exists
# is a no-op, so this cell can be re-run without a KeyError on "species_whales".
whales = whales.rename(columns = {"species_whales":"species"})
m_freq_species_whales = species_frequency(whales, "species", "Most", 5)
display_groups(whales, 'sp', m_freq_species_whales)

In [None]:
# Five least frequent whale species and their image grids
l_freq_species_whales = species_frequency(df=whales, col="species", freq="Least", n=5)
display_groups(df=whales, group_type='sp', lst=l_freq_species_whales)

### Most & Least Frequent Dolphins

In [None]:
# Rename first, then count on "species": renaming a column that no longer exists
# is a no-op, so this cell can be re-run without a KeyError on "species_dolphins".
dolphins = dolphins.rename(columns = {"species_dolphins":"species"})
m_freq_species_dolphins = species_frequency(dolphins, "species", "Most", 5)
display_groups(dolphins, 'sp', m_freq_species_dolphins)

In [None]:
# Five least frequent dolphin species and their image grids
l_freq_species_dolphins = species_frequency(df=dolphins, col="species", freq="Least", n=5)
display_groups(df=dolphins, group_type='sp', lst=l_freq_species_dolphins)

# Image Sizes

In [None]:
# Save image size to new columns within the training dataset.
# Each size is looked up from the row's own file name so that width/height belong
# to the row they are stored on; the glob listing of the image directory is not
# guaranteed to be in the same order as the rows of train_df.
widths, heights = [], []

# The loop variable is deliberately not called `path`, which would overwrite the
# path() helper defined earlier in the notebook.
for image_name in tqdm(train_df["image"]):
    width, height = imagesize.get(os.path.join(train_img_dir, image_name))
    widths.append(width)
    heights.append(height)

train_df["width"] = widths
train_df["height"] = heights
# Pixel count (width x height) of each image
train_df["dimension"] = train_df["width"] * train_df["height"]

In [None]:
# Preview the new size columns without rendering the whole frame
train_df.head()

In [None]:
# Per-measure views of the size columns, keeping species/label for grouping
data_w = train_df.loc[:, ["species", "width", "label"]]
data_h = train_df.loc[:, ["species", "height", "label"]]

# Extremes of each measure; the width block ends with an extra blank line
for measure, values, trailer in (
    ("WIDTH", data_w["width"], ["\n"]),
    ("HEIGHT", data_h["height"], []),
):
    print(clr.S + measure + " - Min Value:" + clr.E, values.min(), "pixels")
    print(clr.S + measure + " - Max Value:" + clr.E, values.max(), "pixels", *trailer)

# Text styling shared by both panels
heading_font = dict(size=15, color=my_colors[6], weight='bold')
label_font = dict(size=13, color=my_colors[6], weight='bold')

# Plots: width on top, height below
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(20, 19))
fig.suptitle(
    '- Image Size distribution on Species -',
    size=26,
    color=my_colors[7],
    weight='bold',
)
axs = [ax1, ax2]

# --- Width panel ---
v1 = sns.violinplot(
    x="species",
    y="width",
    hue="label",
    data=data_w,
    palette=[my_colors[1], my_colors[3]],
    ax=ax1,
)
ax1.set_title("Width", y=0.97, **heading_font)
ax1.set_xlabel("")
ax1.set_ylabel("Width", **label_font)
ax1.set_xticklabels(ax1.get_xticklabels(), rotation=45, ha='right')

# --- Height panel ---
v2 = sns.violinplot(
    x="species",
    y="height",
    hue="label",
    data=data_h,
    palette=[my_colors[6], my_colors[4]],
    ax=ax2,
)
ax2.set_title("Height", y=0.9, **heading_font)
ax2.set_ylabel("Height", **label_font)
ax2.set_xlabel("")
ax2.yaxis.set_tick_params(labelsize=13)
ax2.set_xticklabels(ax2.get_xticklabels(), rotation=45, ha='right')

sns.despine(left=True, bottom=True)
# Only the top margin is changed (room for the suptitle)
plt.subplots_adjust(top=0.93);

In [None]:
# View of the pixel-count column, keeping species/label for grouping
data_d = train_df.loc[:, ["species", "dimension", "label"]]

# Plots
fig, ax1 = plt.subplots(1, 1, figsize=(20, 5))
fig.suptitle(
    '- Image Dimension distribution on Species -',
    size=26,
    color=my_colors[7],
    weight='bold',
)

sns.violinplot(
    x="species",
    y="dimension",
    hue="label",
    data=data_d,
    palette=[my_colors[1], my_colors[3]],
    ax=ax1,
)

ax1.set_xlabel("")
ax1.set_ylabel("Dimension", size=13, color=my_colors[6], weight='bold')
ax1.set_xticklabels(ax1.get_xticklabels(), rotation=45, ha='right')

sns.despine(left=True, bottom=True)
# Only the top margin is changed (room for the suptitle)
plt.subplots_adjust(top=0.93)

# Data Augmentations

In [None]:
def plot_augmentations(images, titles, sup_title):
    """Show images in a 4-column grid, one titled panel per image.

    Args:
        images: Images to draw with ``imshow``, in display order.
        titles: Panel titles, paired with ``images`` by position.
        sup_title: Figure-level title.

    The grid has at least 3 rows and grows when more than 12 images are given.
    Only panels that received no image are hidden.
    """
    ncols = 4
    pairs = list(zip(images, titles))
    # Ceiling division for the rows needed, never fewer than the 3-row layout
    nrows = max(3, -(-len(pairs) // ncols))

    fig, axes = plt.subplots(
        figsize=(20, 16 * nrows / 3), nrows=nrows, ncols=ncols, squeeze=False
    )
    panels = axes.ravel()

    for panel, (img, title) in zip(panels, pairs):
        panel.imshow(img)
        panel.set_title(title, fontsize=15)

    plt.tight_layout()
    fig.suptitle(sup_title, fontsize = 20)
    fig.subplots_adjust(wspace=0.2, hspace=0.2, top=0.93)

    # Hide the leftover panels only; a fixed pair of grid positions would also
    # hide a panel that holds an image when 11 or 12 images are passed.
    for panel in panels[len(pairs):]:
        panel.set_visible(False)
    plt.show()
    
def augment(paths, data):
    """Plot each image next to a fixed set of albumentations transforms of it.

    For every path the image is read, converted from BGR to RGB, resized to
    224x224, passed through each transform independently, and handed to
    ``plot_augmentations`` together with the untouched image.

    Args:
        paths: Iterable of image file paths.
        data: Name of the image set; used in the figure title.

    Raises:
        ValueError: If an image cannot be read (``cv2.imread`` returned None).
    """
    # (title, transform) pairs keep every panel title next to the transform that
    # produces it, so the two can never drift apart.
    # NOTE(review): RandomSunFlare and RandomSnow use p=0.02, so their panels
    # will almost always show the unmodified image - confirm this is intended.
    named_transforms = [
        ("RandomSunFlare", A.RandomSunFlare(p=0.02)),
        ("RandomFog", A.RandomFog(p=1)),
        # RandomBrightness / RandomContrast are deprecated aliases of
        # RandomBrightnessContrast with the other limit fixed at 0.
        ("RandomBrightness",
         A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0, p=1)),
        ("Rotate", A.Rotate(p=1, limit=90)),
        ("RGBShift", A.RGBShift(p=1)),
        ("RandomSnow", A.RandomSnow(p=0.02)),
        ("HorizontalFlip", A.HorizontalFlip(p=1)),
        ("VerticalFlip", A.VerticalFlip(p=1)),
        ("RandomContrast",
         A.RandomBrightnessContrast(brightness_limit=0, contrast_limit=0.5, p=1)),
        ("HSV", A.HueSaturationValue(
            p=1, hue_shift_limit=20, sat_shift_limit=30, val_shift_limit=50)),
    ]

    # Built once; the list is never mutated inside the loop
    titles = ["Original"] + [title for title, _ in named_transforms]

    for image_path in paths:
        # File name without extension, independent of directory depth
        image_name = os.path.basename(image_path).split(".")[0]

        # reading image (OpenCV loads BGR; matplotlib expects RGB)
        image = cv2.imread(image_path)
        if image is None:
            raise ValueError(f"Could not read image: {image_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # resizing the image
        image = cv2.resize(image, (224, 224))

        # original image first, then one output per transform
        images = [image] + [
            transform(image=image)['image'] for _, transform in named_transforms
        ]

        sup_title = "Image Augmentation for " + data + " - " + image_name
        plot_augmentations(images, titles, sup_title)
        
# Augmentation demo on five training images
augment(paths=train_images_path[50:55], data='train')

# THE END