![](https://i.imgur.com/Kk8L8Ei.png)

# Import libraries 📚

In [None]:
import numpy as np 
import pandas as pd 
import os
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
import albumentations as A
import wandb

from termcolor import colored
from colorama import Fore, Back, Style
# colored output
y_ = Fore.YELLOW
r_ = Fore.RED
g_ = Fore.GREEN
b_ = Fore.BLUE
m_ = Fore.MAGENTA

import warnings
warnings.filterwarnings("ignore")

<center><img src="https://camo.githubusercontent.com/dd842f7b0be57140e68b2ab9cb007992acd131c48284eaf6b1aca758bfea358b/68747470733a2f2f692e696d6775722e636f6d2f52557469567a482e706e67"></center>

I will be integrating ```W&B``` for ```visualizations``` and ```logging artifacts```!

[Happywhale - Whale and Dolphin Identification Project on W&B Dashboard](https://wandb.ai/ruchi798/happywhale?workspace=user-ruchi798) 🏋️‍♀️

* To get an API key, first create an account on the W&B website.
* Next, store the key as a Kaggle secret so that it is never hard-coded in the notebook 🤫

In [None]:
# Read the W&B API key from the notebook's secret store (stored under the name "api_key").
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
api_key = user_secrets.get_secret("api_key")

# Shared run configuration; passed as `config=` to every wandb.init call in this notebook.
CONFIG = {'competition': 'happywhale', '_wandb_kernel': 'ruch'}

# NOTE(review): presumably silences W&B console output - confirm against the W&B docs.
os.environ["WANDB_SILENT"] = "true"

In [None]:
# Authenticate through the Python API rather than `! wandb login $api_key`,
# so the secret is never interpolated into a shell command line.
wandb.login(key=api_key)

In [None]:
# Load the training metadata; later cells read its `image`, `species` and
# `individual_id` columns.
train_df = pd.read_csv("../input/happy-whale-and-dolphin/train.csv")
train_df.head()

# Unique species and Species names 🐋 🐬

In [None]:
# Report the species labels exactly as they appear in the CSV.
print(colored("Before fixing duplicate labels:", 'green'))
print("Number of unique species: ",train_df['species'].nunique())
print("\nSpecies names: " ,train_df["species"].unique())

# fixing duplicate labels (misspelled variants of species that already exist);
# regex=False makes the literal-string match explicit on every pandas version
train_df['species'] = train_df['species'].str.replace('bottlenose_dolpin','bottlenose_dolphin', regex=False)
train_df['species'] = train_df['species'].str.replace('kiler_whale','killer_whale', regex=False)

print(colored("\nAfter fixing duplicate labels:", 'green'))
print("Number of unique species: ",train_df['species'].nunique())
print("\nSpecies names: " ,train_df["species"].unique())

# append _whale to beluga and globis
# The result is assigned back instead of calling replace(..., inplace=True) on the
# column selection: a chained in-place call operates on an intermediate object and
# does not update train_df when pandas Copy-on-Write is active.
train_df["species"] = train_df["species"].replace({"beluga": "beluga_whale", "globis": "globis_whale"})

In [None]:
# specifying directory paths
# (relative paths, resolved against the notebook's working directory)

train_jpg_directory = '../input/happy-whale-and-dolphin/train_images'
test_jpg_directory = '../input/happy-whale-and-dolphin/test_images'

def getImagePaths(path):
    """Recursively collect the path of every file found under ``path``.

    Args:
        path: Root directory to walk.

    Returns:
        list[str]: One entry per file, built by joining the containing
        directory with the file name, in the order ``os.walk`` yields them.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(path)
        for name in names
    ]

# Collect every file path under the train and test image directories.
train_images_path = getImagePaths(train_jpg_directory)
test_images_path = getImagePaths(test_jpg_directory)

# y_ / g_ are the colorama colour codes defined in the import cell.
print(f"{y_}Number of train images: {g_} {len(train_images_path)}\n")
print(f"{y_}Number of test images: {g_} {len(test_images_path)}\n")

# Headline dataset statistics to record on the W&B dashboard.
un_ID = train_df.individual_id.nunique()
un_sp = train_df['species'].nunique()

# Using the run as a context manager finishes it even if logging raises,
# so a failed cell cannot leave a dangling active run behind.
with wandb.init(project='happywhale', name='count',config = CONFIG) as run:
    run.log({'Training samples': len(train_images_path),
             'Test samples': len(test_images_path),
             'Number of individual IDs': un_ID,
             'Number of unique species': un_sp,
             })

In [None]:
def getShape(data, images_paths):
    """Report whether every image in ``images_paths`` has the same shape.

    Args:
        data: Label used as the prefix of the returned message.
        images_paths: Sequence of image file paths to read with ``cv2.imread``.

    Returns:
        str: ``"<data> - Same image shape <shape>"`` when all images share the
        shape of the first one, ``"<data> - Different image shape"`` as soon as
        one differs, or ``"<data> - No images found"`` for an empty sequence.

    Raises:
        ValueError: If ``cv2.imread`` cannot read one of the paths.
    """
    shape = None

    for image_path in images_paths:
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise ValueError("Could not read image: " + str(image_path))
        if shape is None:
            # The first readable image defines the reference shape.
            shape = image.shape
        elif image.shape != shape:
            # One mismatch is enough; stop reading further images.
            return (data +" - Different image shape")

    if shape is None:
        return (data + " - No images found")
    return (data +" - Same image shape " + str(shape))
        
# Shape consistency report per image set (getShape reads the images from disk).
print(getShape('train images', train_images_path))
print(getShape('test images', test_images_path))

# Train and test images 📷

In [None]:
# function to display multiple images

def display_multiple_img(images_paths, rows, cols,title):
    """Show images on a ``rows`` x ``cols`` grid under a common title.

    Args:
        images_paths: Paths of the images to draw, one per grid cell.
        rows: Number of grid rows.
        cols: Number of grid columns.
        title: Figure-level title.

    Paths beyond ``rows * cols`` are ignored. Cells that receive no image
    (too few paths, or a file that cannot be read) are left blank.
    """
    # squeeze=False always yields a 2-D array of axes, so a 1x1 grid works too.
    figure, ax = plt.subplots(nrows=rows,ncols=cols,figsize=(16,8), squeeze=False)
    plt.suptitle(title, fontsize=20)

    cells = ax.ravel()
    # Turn every axis off up front so unused cells do not show empty frames.
    for cell in cells:
        cell.set_axis_off()

    # zip stops at the shorter input, so surplus paths are never read from disk.
    for cell, image_path in zip(cells, images_paths):
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread returns None on failure; keep the gallery best-effort.
            print("Skipping unreadable image: " + str(image_path))
            continue
        # OpenCV loads BGR; convert to RGB for matplotlib.
        cell.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

    plt.tight_layout()
    plt.show()

In [None]:
# First 25 collected train image paths on a 5x5 grid.
display_multiple_img(train_images_path[0:25], 5, 5,"Train images")

In [None]:
# First 25 collected test image paths on a 5x5 grid.
display_multiple_img(test_images_path[0:25], 5, 5,"Test images")

# Whales and Dolphins Distribution 🐋 🐬 

In [None]:
#====== Function to plot WandB bar chart ======
def plot_wb_bar(df,col1,col2, title):
    """Log a bar chart of ``df[col2]`` against ``df[col1]`` in its own W&B run.

    Args:
        df: DataFrame holding both columns.
        col1: Column with the bar labels; also used as the run name and log key.
        col2: Column with the bar values.
        title: Chart title.
    """
    # Build the rows before opening the run so a bad column name fails early.
    dt = [[label, val] for (label, val) in zip(df[col1], df[col2])]

    # The context manager finishes the run even if table creation or logging raises.
    with wandb.init(project='happywhale', job_type='image-visualization',name=col1,config = CONFIG, anonymous="allow") as run:
        table = wandb.Table(data=dt, columns = [col1,col2])
        run.log({col1 : wandb.plot.bar(table, col1,col2,title=title)})
    
#====== Function to create a dataframe of value counts ======
def count_values(df,col,top=False):
    """Tabulate how often each value of ``df[col]`` occurs.

    Args:
        df: Source DataFrame.
        col: Column whose values are counted.
        top: When equal to ``True``, keep only the first 10 rows.

    Returns:
        pd.DataFrame: Columns ``col`` and ``"counts"``, in ``value_counts`` order.
    """
    occurrences = df[col].value_counts()
    # Going through the raw value array sidesteps whatever column names
    # reset_index assigns, so the two columns can be named explicitly.
    value_count_pairs = occurrences.reset_index().values
    table = pd.DataFrame(value_count_pairs, columns=[col, "counts"])
    return table[:10] if top == True else table

#====== Function to create a dataframe ======
def intermediate_df(col, labels, sizes):
    """Build a two-column DataFrame from parallel label and size sequences.

    Args:
        col: Name of the label column.
        labels: Values for the label column.
        sizes: Values for the ``'counts'`` column.

    Returns:
        pd.DataFrame: Columns ``col`` and ``'counts'``.
    """
    return pd.DataFrame({col: labels, 'counts': sizes})

In [None]:
# Derive a coarse whale/dolphin label from the species name.
train_df['label'] = train_df['species'].map(lambda name: 'dolphin' if 'dolphin' in name else 'whale')

# Count once and reuse for both the wedge labels and the wedge sizes.
label_counts = train_df['label'].value_counts()
labels = label_counts.index.tolist()
sizes = label_counts.values
explode = (0.05, 0.05)

# Donut chart: a pie with a white circle drawn over its centre.
fig, ax = plt.subplots(figsize=(16, 8))
fig.suptitle('Whales and Dolphins ', size=20, font="Serif")
ax.pie(
    sizes,
    explode=explode,
    startangle=60,
    labels=labels,
    autopct='%1.0f%%',
    pctdistance=0.7,
    colors=["#0077b6", "#90e0ef"],
)
ax.add_artist(plt.Circle((0, 0), 0.4, fc='white'))
plt.show()

In [None]:
# Log the whale/dolphin split (labels and sizes come from the pie-chart cell) to W&B.
plot_wb_bar(intermediate_df('label', labels, sizes),"label", 'counts', "Whales and Dolphins Distribution")

# Species Distribution 🐋 🐬

In [None]:
# Horizontal count plot of images per species, ordered by value_counts.
species_order = train_df["species"].value_counts().index

plt.figure(figsize=(20, 20))
plt.yticks(fontsize=16)
sns.countplot(
    y="species",
    data=train_df,
    order=species_order,
    palette="PuBu",
    linewidth=3,
)
plt.title("Species Distribution", font="Serif", size=20)
plt.show()

In [None]:
# Log the species value counts (top=True keeps the first 10 rows) as a W&B bar chart.
plot_wb_bar(count_values(train_df,"species", top=True),"species", 'counts', "Most frequent species")

In [None]:
# Row count of the training metadata versus the number of distinct individuals.
print("Number of training images: ",train_df.shape[0])
print("\nNumber of individual IDs: " ,train_df.individual_id.nunique())

In [None]:
def frequency(df, col, freq, n=5):
    """Return the most or least frequent values of ``df[col]``.

    Args:
        df: Source DataFrame.
        col: Column whose values are ranked by occurrence count.
        freq: ``"Most"`` for the ``n`` most frequent values, ``"Least"`` for
            the ``n`` least frequent ones.
        n: How many values to return (default 5).

    Returns:
        list: Up to ``n`` values, in ``value_counts`` order (descending count).

    Raises:
        ValueError: If ``freq`` is neither ``"Most"`` nor ``"Least"``.
    """
    counts = df[col].value_counts()
    if freq == "Most":
        return counts.head(n).index.tolist()
    if freq == "Least":
        return counts.tail(n).index.tolist()
    # Fail loudly instead of silently returning None for a mistyped selector.
    raise ValueError("freq must be 'Most' or 'Least', got " + repr(freq))
    
# Most / least frequent species and individual IDs, reused by the gallery cells below.
m_freq_species = frequency(train_df,"species", "Most")
l_freq_species = frequency(train_df,"species", "Least")
m_freq_ID = frequency(train_df,"individual_id", "Most")
l_freq_ID = frequency(train_df,"individual_id", "Least")

In [None]:
def path(df,group,group_type, image_dir="../input/happy-whale-and-dolphin/train_images"):
    """Return the image paths of all rows belonging to one species or individual.

    Args:
        df: DataFrame with an ``image`` column plus ``species`` and/or
            ``individual_id``.
        group: Value to match in the selected column.
        group_type: ``'sp'`` to match on ``species``, ``'id'`` to match on
            ``individual_id``.
        image_dir: Directory the file names are joined onto (defaults to the
            train image directory).

    Returns:
        list[str]: ``image_dir`` joined with each matching ``image`` value.

    Raises:
        ValueError: If ``group_type`` is neither ``'sp'`` nor ``'id'``.
    """
    column_by_type = {'sp': 'species', 'id': 'individual_id'}
    if group_type not in column_by_type:
        # Previously this fell through and failed later on an unbound local.
        raise ValueError("group_type must be 'sp' or 'id', got " + repr(group_type))

    # Single .loc lookup instead of chained indexing.
    filenames = df.loc[df[column_by_type[group_type]] == group, 'image']
    return [os.path.join(image_dir, filename) for filename in filenames]

In [None]:
def display_groups(df, group_type, lst):
    """Draw one 3x3 gallery per entry of ``lst``.

    Args:
        df: DataFrame passed through to ``path``.
        group_type: Selector passed through to ``path`` (``'sp'`` or ``'id'``).
        lst: Groups to display; each one is also used as its gallery title.
    """
    for group in lst:
        # At most nine images fit the 3x3 grid.
        sample_paths = path(df, group, group_type)[:9]
        display_multiple_img(sample_paths, 3, 3, group)
        
def map_species(df,group_type, lst, name, table_name):
    """Log a W&B table with up to five sample images per group.

    Args:
        df: DataFrame passed through to ``path``.
        group_type: Selector passed through to ``path`` (``'sp'`` or ``'id'``).
        lst: Groups to add, one table row each.
        name: Name of the W&B run.
        table_name: Key under which the table is logged.

    Groups with fewer than five images get ``None`` in the remaining image
    cells instead of raising ``IndexError``.
    """
    images_per_row = 5

    # The context manager finishes the run even if building or logging the table raises.
    with wandb.init(project='happywhale', job_type='image-visualization',name=name,config = CONFIG, anonymous="allow") as run:
        # Initialize an empty W&B Table
        # NOTE(review): column names are kept exactly as before ('img1', 'img2',
        # 'img_3', ...) so previously logged tables keep the same schema; the
        # first column is named 'species' even when group_type is 'id'.
        data_table = wandb.Table(columns=['species', 'img1', 'img2', 'img_3', 'img_4', 'img_5'])

        for item in lst:
            paths = path(df,item,group_type)[:images_per_row]
            cells = [wandb.Image(image_path) for image_path in paths]
            # Pad short rows so every row matches the table's column count.
            cells += [None] * (images_per_row - len(cells))
            # Add data to the table row-wise
            data_table.add_data(item, *cells)

        # Log the table
        run.log({table_name: data_table})

# Most Frequent Species 🐋 🐬

In [None]:
# Show a 3x3 gallery per most-frequent species, then log sample images to W&B.
display_groups(train_df,'sp', m_freq_species)
map_species(train_df,'sp', m_freq_species, "Most Frequent Species", "most_freq_species")

![](https://i.imgur.com/pOVECHZ.png)

# Least Frequent Species 🐋 🐬

In [None]:
# Show a 3x3 gallery per least-frequent species, then log sample images to W&B.
display_groups(train_df,'sp', l_freq_species)
map_species(train_df,'sp', l_freq_species, "Least Frequent Species", "least_freq_species")

![](https://i.imgur.com/O7CqPbl.png)

# Most frequent whales & dolphins 🐋 🐬

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(16, 8))

# Split by the coarse label; each subset gets its own species column name.
is_whale = train_df['label'] == 'whale'
whales = train_df[is_whale].rename(columns={"species": "species_whales"})
dolphins = train_df[~is_whale].rename(columns={"species": "species_dolphins"})

# One panel per subset: (axis, data, species column, title, bar colour).
panels = [
    (ax[0], whales, "species_whales", 'Most frequent whales', "#0077b6"),
    (ax[1], dolphins, "species_dolphins", 'Most frequent dolphins', "#90e0ef"),
]
for axis, subset, species_col, panel_title, bar_color in panels:
    sns.countplot(
        y=species_col,
        data=subset,
        order=subset[species_col].value_counts().index,
        ax=axis,
        color=bar_color,
    )
    axis.set_title(panel_title)
    axis.set_ylabel(None)

plt.tight_layout()
plt.show()

In [None]:
# Log the species value counts (first 10 rows) of each subset as separate W&B bar charts.
plot_wb_bar(count_values(whales,"species_whales", top=True),"species_whales", 'counts', "Most frequent whales")
plot_wb_bar(count_values(dolphins,"species_dolphins", top=True),"species_dolphins", 'counts', "Most frequent dolphins")

# Most frequent whales 🐋

In [None]:
# Rename first so the cell also works when re-run: after one execution the column
# is already called "species" and a lookup of "species_whales" would raise KeyError.
# rename() ignores a missing source column, so this is a no-op the second time.
whales = whales.rename(columns = {"species_whales":"species"})
m_freq_species_whales = frequency(whales,"species", "Most")
display_groups(whales,'sp', m_freq_species_whales)

# Most frequent dolphins 🐬

In [None]:
# Rename first so the cell also works when re-run: after one execution the column
# is already called "species" and a lookup of "species_dolphins" would raise KeyError.
# rename() ignores a missing source column, so this is a no-op the second time.
dolphins = dolphins.rename(columns = {"species_dolphins":"species"})
m_freq_species_dolphins = frequency(dolphins,"species", "Most")
display_groups(dolphins,'sp', m_freq_species_dolphins)

# Most Frequent Individual IDs 🐋 🐬

In [None]:
# One 3x3 gallery per most-frequent individual ID.
display_groups(train_df,'id', m_freq_ID)

# Data Augmentation ➕

In [None]:
def plot_augmentations(images, titles, sup_title):
    """Draw images with their titles on a 3x4 grid.

    Args:
        images: Images to show, in row-major grid order.
        titles: Title for each image, parallel to ``images``.
        sup_title: Figure-level title.

    Grid cells that receive no image are hidden, however many images are
    passed; pairs beyond the 12 available cells are ignored.
    """
    fig, axes = plt.subplots(figsize=(20, 16), nrows=3, ncols=4, squeeze=False)
    cells = axes.ravel()

    used = 0
    for cell, img, title in zip(cells, images, titles):
        cell.imshow(img)
        cell.set_title(title, fontsize=15)
        used += 1

    plt.tight_layout()
    fig.suptitle(sup_title, fontsize = 20)
    fig.subplots_adjust(wspace=0.2, hspace=0.2, top=0.93)

    # Hide whichever cells stayed empty rather than a hard-coded pair of axes,
    # so the grid stays tidy if the number of augmentations changes.
    for cell in cells[used:]:
        cell.set_visible(False)
    plt.show()
    
def augment(paths, data):
    """Plot each image next to a fixed set of albumentations transforms.

    Args:
        paths: Image file paths to process; one figure is drawn per path.
        data: Dataset label included in each figure title (e.g. ``'train'``).

    Raises:
        ValueError: If ``cv2.imread`` cannot read one of the paths.
    """
    # Each transform is stored next to its panel title so the two cannot drift
    # apart. The brightness panel was previously titled "RandomBrightnessContrast"
    # although the transform applied is A.RandomBrightness.
    # NOTE(review): RandomSunFlare and RandomSnow use p=0.02, so their panels
    # will usually show the unmodified image - confirm this is intended.
    transforms = [
        ("RandomSunFlare", A.RandomSunFlare(p=0.02)),
        ("RandomFog", A.RandomFog(p=1)),
        ("RandomBrightness", A.RandomBrightness(p=1)),
        ("Rotate", A.Rotate(p=1, limit=90)),
        ("RGBShift", A.RGBShift(p=1)),
        ("RandomSnow", A.RandomSnow(p=0.02)),
        ("HorizontalFlip", A.HorizontalFlip(p=1)),
        ("RandomContrast", A.RandomContrast(limit = 0.5,p = 1)),
        ("HSV", A.HueSaturationValue(p=1,hue_shift_limit=20, sat_shift_limit=30, val_shift_limit=50)),
    ]

    for image_path in paths:
        # File name without extension, independent of directory depth or separator.
        image_name = os.path.splitext(os.path.basename(image_path))[0]

        # reading image
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise ValueError("Could not read image: " + str(image_path))
        # OpenCV loads BGR; convert to RGB for matplotlib.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # resizing the image
        image = cv2.resize(image, (224, 224))

        # Original first, then one augmented copy per transform. The lists are
        # rebuilt for every image, so no shared list is mutated between iterations.
        images = [image] + [transform(image = image)['image'] for _, transform in transforms]
        titles = ["Original"] + [title for title, _ in transforms]

        sup_title = "Image Augmentation for " + data + " - " + image_name
        plot_augmentations(images, titles, sup_title)
        
# Draw the augmentation grid for the first two collected train images.
augment(train_images_path[0:2],'train')

This is what my [project](https://wandb.ai/ruchi798/happywhale?workspace=user-ruchi798) looks like on the W&B dashboard ⬇️

![](https://i.imgur.com/CzsCPux.png)