# Import Libraries

In [None]:
import os
import gc
import sys

import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import skimage
from skimage.feature import greycomatrix, greycoprops
from skimage.filters import sobel
from skimage import color

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from keras import layers
import keras.backend as K
from keras.models import Sequential, Model
from keras.preprocessing import image
from keras.layers import Input, Dense, Activation, Dropout
from keras.layers import Flatten, BatchNormalization, Conv2D
from keras.layers import MaxPooling2D, AveragePooling2D, GlobalAveragePooling2D 
from keras.applications.imagenet_utils import preprocess_input
from keras.applications.vgg16 import VGG16

from PIL import Image
from tqdm import tqdm
import random as rnd
import cv2
from keras.preprocessing.image import ImageDataGenerator
from numpy import expand_dims

!pip install livelossplot
from livelossplot import PlotLossesKeras

%matplotlib inline

# Loading Dataset 
We'll use pandas to load the dataset metadata into memory.

In [None]:
# Training metadata, with the full location of every training image in 'path'.
train_df = pd.read_csv('../input/happy-whale-and-dolphin/train.csv')
train_df['path'] = train_df['image'].radd('../input/happy-whale-and-dolphin/train_images/')

# The sample submission lists the test images; give it a 'path' column as well.
pred_df = pd.read_csv('../input/happy-whale-and-dolphin/sample_submission.csv')
pred_df['path'] = pred_df['image'].radd('../input/happy-whale-and-dolphin/test_images/')

## Having two csv files
* train.csv - contains the image name, species and individual_id
* sample_submission.csv - contains the image name and a dummy label for the images in the test folder
## And two folders contain the images
* train - 51033 images of different types of whales and dolphins. Their labels are provided in the train.csv file
* test - 27956 images of different types of whales and dolphins. We need to predict their labels

In [None]:
train_df.head(10)

In [None]:
print('Train samples count: ', len(train_df))
train_df.columns

In [None]:
print('Species Count: ',len(train_df['species'].value_counts()))
train_df['species'].value_counts()

# Data Cleaning
## Fixing Duplicate Labels
* bottlenose_dolpin -> bottlenose_dolphin
* kiler_whale -> killer_whale
* beluga -> beluga_whale
## Changing Label due to extreme similarities
* globis & pilot_whale -> short_finned_pilot_whale

In [None]:
print('Before fixing duplicate labels : ')
print("Number of unique species : ", train_df['species'].nunique())

# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the in-place form works on an intermediate object, pandas
# warns about it, and under copy-on-write it leaves train_df unchanged.
train_df['species'] = train_df['species'].replace({
    # misspellings / short forms -> canonical label
    'bottlenose_dolpin': 'bottlenose_dolphin',
    'kiler_whale': 'killer_whale',
    'beluga': 'beluga_whale',
    # merged because the species are extremely similar
    'globis': 'short_finned_pilot_whale',
    'pilot_whale': 'short_finned_pilot_whale',
})

print('\nAfter fixing duplicate labels : ')
print("Number of unique species : ", train_df['species'].nunique())

# Coarse class = the last '_'-separated token of the species name.
train_df['class'] = train_df['species'].apply(lambda name: name.split('_')[-1])
train_df.head()

## Checking missing data
Let's check whether there are any missing values in our dataset.

In [None]:
train_df.isna().sum()

In [None]:
len(os.listdir('../input/happy-whale-and-dolphin/train_images'))

# Visualization
### Looking at some random beauties 
It's a great deal of fun to explore the data and play around with matplotlib

In [None]:
# One randomly chosen image per species, laid out on a 7-column grid.
plt.figure(figsize=(15, 12))
species_names = train_df.species.unique()
n_cols = 7
# Ceil division; never fewer than the original 4 rows, but grows if more than
# 28 species are present so plt.subplot cannot run out of slots.
n_rows = max(4, -(-len(species_names) // n_cols))
for idx, species_name in enumerate(species_names):
    plt.subplot(n_rows, n_cols, idx + 1)
    species_df = train_df[train_df['species'] == species_name].reset_index(drop=True)
    # randrange(n) yields 0..n-1. The previous `randint(0, n) - 1` could yield
    # -1, which is not a label of the reset index and raised KeyError.
    image_path = species_df.loc[rnd.randrange(len(species_df)), 'path']
    # resize() returns a new, fully loaded image, so the file can be closed.
    with Image.open(image_path) as img:
        thumb = img.resize((224, 224))
    plt.imshow(thumb)
    plt.axis('off')
    plt.title(species_name)
plt.tight_layout()
plt.show()

In [None]:
def plot_species(df, species_name):
    """Show a grid of 32 randomly drawn images of one species.

    df must provide a 'species' column and a 'path' column of image files.
    Images are drawn with replacement, so a species with few images shows
    repeats. Prints a message and returns without plotting when no row
    matches species_name.
    """
    image_paths = df.loc[df['species'] == species_name, 'path']
    if image_paths.empty:
        # np.random.choice raises ValueError on an empty sequence.
        print(f'No images found for species: {species_name}')
        return

    plt.figure(figsize=(12, 12))
    plt.suptitle(species_name)
    for idx, image_path in enumerate(np.random.choice(image_paths, 32)):
        plt.subplot(8, 8, idx + 1)
        # resize() returns a loaded copy, so the file handle can be released.
        with Image.open(image_path) as img:
            thumb = img.resize((224, 224))
        plt.imshow(thumb)
        plt.axis('off')
    plt.tight_layout()
    plt.show()

### Plotting more images from each species

In [None]:
# Draw one image grid per distinct species label.
for species_name in train_df['species'].unique():
    plot_species(train_df, species_name)

## Lets see some image by individual_id
We have to predict the individual_id from an image, so let's see what each individual looks like.

In [None]:
def plot_individual(df, individual_id):
    """Show a grid of 24 randomly drawn images of one individual.

    df must provide an 'individual_id' column and a 'path' column of image
    files. Images are drawn with replacement, so an individual with fewer
    than 24 images shows repeats. Prints a message and returns without
    plotting when no row matches individual_id.
    """
    image_paths = df.loc[df['individual_id'] == individual_id, 'path']
    if image_paths.empty:
        # np.random.choice raises ValueError on an empty sequence.
        print(f'No images found for individual: {individual_id}')
        return

    plt.figure(figsize=(12, 12))
    plt.suptitle(individual_id)
    for idx, image_path in enumerate(np.random.choice(image_paths, 24)):
        plt.subplot(8, 8, idx + 1)
        # resize() returns a loaded copy, so the file handle can be released.
        with Image.open(image_path) as img:
            thumb = img.resize((224, 224))
        plt.imshow(thumb)
        plt.axis('off')
    plt.tight_layout()
    plt.show()

### Top 5 most frequent individuals

In [None]:
# The five individual_ids with the most training rows.
top_5_ids = train_df['individual_id'].value_counts().head(5)
for individual in top_5_ids.index:
    plot_individual(train_df, individual)

### Top 5 least frequent individual
We will see duplicate images because many individuals have only one training image.

In [None]:
# The five individual_ids at the tail of the frequency table.
last_5_ids = train_df['individual_id'].value_counts().tail(5)
for individual in last_5_ids.index:
    plot_individual(train_df, individual)

Lets see some test images

In [None]:
# Separate copy of the test listing, with the full image location in 'path'.
t_df = pd.read_csv('../input/happy-whale-and-dolphin/sample_submission.csv')
t_df['path'] = t_df['image'].radd('../input/happy-whale-and-dolphin/test_images/')

def plot_testimages(df):
    """Show a grid of 48 randomly drawn test images.

    df must provide a 'path' column of image files. Images are drawn with
    replacement. Prints a message and returns without plotting when df has
    no rows.
    """
    image_paths = df['path']
    if image_paths.empty:
        # np.random.choice raises ValueError on an empty sequence.
        print('No test images to plot')
        return

    plt.figure(figsize=(12, 12))
    plt.suptitle('Test Images')
    for idx, image_path in enumerate(np.random.choice(image_paths, 48)):
        plt.subplot(8, 8, idx + 1)
        # resize() returns a loaded copy, so the file handle can be released.
        with Image.open(image_path) as img:
            thumb = img.resize((224, 224))
        plt.imshow(thumb)
        plt.axis('off')
    plt.tight_layout()
    plt.show()

# plot_testimages(t_df)
# del t_df

### Observations regarding handpicked images
* There are some abnormal images in both the train and test datasets.
* Some training images contain people, boats, birds, penguins, etc.
* Many training images are cropped, but some are not.
* The uncropped images must be taken care of.
* There are some images taken from under water.

## Class Distribution Analysis
In this section we will be analyzing the number of training and test samples in each class. It will give us a better understanding of our dataset and provide us the necessary information to preprocess our dataset before the training phase.

In [None]:
# Bar chart of rows per 'class' value, each bar annotated with its count.
plot = sns.countplot(x=train_df['class'], color='#2596be')
sns.despine()
plot.set_title('Class Distribution\n', font='serif', x=0.1, y=1, fontsize=16)
plot.set_ylabel("Count", x=0.02, font='serif', fontsize=12)
plot.set_xlabel("Specie", fontsize=12, font='serif')

for bar in plot.patches:
    bar_top = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2
    # Place the label 20 points below the top edge, i.e. inside the bar.
    plot.annotate(
        f'{bar_top:.0f}',
        (bar_center, bar_top),
        ha='center', va='center',
        xytext=(0, -20), font='serif',
        textcoords='offset points', size=15,
    )

**Percentage of images of whales and dolphins in the dataset**

In [None]:
plt.figure(figsize=(5, 5))
# Two-column frame ('class', 'counts') with the number of rows per class.
class_cnt = train_df.groupby(['class']).size().rename('counts').reset_index()
colors = sns.color_palette('Paired')[0:9]
plt.pie(
    class_cnt['counts'],
    labels=class_cnt['class'],
    colors=colors,
    autopct='%1.1f%%',
)
plt.legend(loc='upper left')
plt.show()

**Number of training images of each species**

In [None]:
plt.figure(figsize=(8,8))
sns.countplot(data=train_df, y = 'species',  palette='crest', dodge=False)
plt.show()

**Number of training images of each species of whale and dolphin**

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(10, 5))

# Split the rows into class == 'whale' and everything else.
is_whale = train_df['class'] == 'whale'
whales = train_df[is_whale]
dolphins = train_df[~is_whale]

# One horizontal count plot per panel, species ordered by frequency.
panels = (
    (ax[0], whales, 'Most frequent whales', "#0077b6"),
    (ax[1], dolphins, 'Most frequent dolphins', "#90e0ef"),
)
for axis, subset, panel_title, bar_color in panels:
    by_frequency = subset["species"].value_counts().index
    sns.countplot(y="species", data=subset, order=by_frequency, ax=axis, color=bar_color)
    axis.set_title(panel_title)
    axis.set_ylabel(None)

plt.tight_layout()
plt.show()

**Number of training images of top 10 individuals**

In [None]:
plt.figure(figsize=(12, 4))
# Ten most frequent individuals. This previously took head(24) although the
# variable name, the chart title and the section heading all say "top 10".
top_ten_ids = train_df.individual_id.value_counts().head(10)
top_ten_ids = pd.DataFrame({'individual_id': top_ten_ids.index, 'frequency': top_ten_ids.values})

# zorder=4 on the bars vs zorder=0 on the grid keeps the grid behind the bars.
plt.bar(top_ten_ids['individual_id'], top_ten_ids['frequency'], width=0.8, color='c', zorder=4)
plt.xticks(rotation=90)
plt.ylabel("frequency")
plt.xlabel("Individual Ids")
plt.title("Top 10 Individual Ids used by frequency")
plt.grid(visible=True, color='grey', linestyle='-', linewidth=0.9, alpha=0.2, zorder=0)
plt.show()

**Plot the value count graph of each individual**

In [None]:
train_df['individual_id'].value_counts().plot()
plt.xticks(rotation=90)
plt.show()

**Number of unique individuals in the dataset**

In [None]:
len(train_df.individual_id.unique())

**Image count of individuals**

In [None]:
# For every row, the number of training rows sharing its individual_id.
per_individual = train_df.groupby('individual_id', as_index=False)['individual_id']
train_df['count'] = per_individual.transform('count')
train_df.head()

**Individuals with only one training image**

In [None]:
train_df[train_df['count']==1]

**Fraction of images that belong to individuals with fewer than 5 images**

In [None]:
# Fraction of training rows whose individual has at most four images.
has_few_images = train_df['count'] <= 4
tmp = train_df[has_few_images]
len(tmp) / len(train_df)

**Fraction of images that belong to individuals with more than 20 images**

In [None]:
# Fraction of training rows whose individual has more than 20 images.
# The threshold used to be `> 21`, which left out individuals with exactly
# 21 images and disagreed with the "5-20" / ">20" buckets in the write-up.
count = int((train_df['count'] > 20).sum())
print(count/len(train_df))

### Observation Regarding Class Distribution
There is a huge imbalance in the data. There are many classes with only one or a few samples:

* The total number of individuals is 15587
* 9258 individuals have just one image
* The individual with the most images has 400 of them
* Image distribution:
    * almost 40% of the images come from individuals with 4 or fewer images.
    * almost 23% come from individuals with 5-20 images.
    * the remaining 37% come from individuals with more than 20 images.

## Image Resolutions

In [None]:
widths, heights = [], []

for path in tqdm(train_df["path"]):
    # Close each file as soon as its size has been read; previously every
    # one of the opened images was left for the garbage collector to close.
    with Image.open(path) as img:
        width, height = img.size
    widths.append(width)
    heights.append(height)

train_df["width"] = widths
train_df["height"] = heights
# Pixel count (width * height) of each image.
train_df["dimension"] = train_df["width"] * train_df["height"]

**Lets see some small images**

In [None]:
train_df.sort_values('width').head(84)

## Color Analysis
We need to do some color analysis to get an idea of the augmentation techniques needed for this problem.

In [None]:
def is_grey_scale(givenImage):
    """Return True when every pixel has identical R, G and B values.

    givenImage must expose `.size` as (width, height) and
    `.getpixel((x, y))` returning an (r, g, b) triple, as the callers
    guarantee by converting to 'RGB' first.
    """
    width, height = givenImage.size
    for x in range(width):
        for y in range(height):
            r, g, b = givenImage.getpixel((x, y))
            # `r != g != b` is a chained comparison meaning
            # (r != g) and (g != b), so a pixel such as (10, 10, 20)
            # was wrongly accepted as grey. Test equality of all three.
            if not (r == g == b):
                return False
    return True

**Check color scale of Train images**

In [None]:
sampleFrac = 0.1
# Grey-scale flag for each image in a random sample of the training paths.
isGreyList = [
    is_grey_scale(Image.open(imageName).convert('RGB'))
    for imageName in train_df['path'].sample(frac=sampleFrac)
]
# Fraction of the sampled images that are grey-scale.
print(np.sum(isGreyList) / len(isGreyList))
del isGreyList

**Check color scale of Test images**

In [None]:
sampleFrac = 0.1
# Grey-scale flag for each image in a random sample of the test paths.
isGreyList_test = [
    is_grey_scale(Image.open(imageName).convert('RGB'))
    for imageName in pred_df['path'].sample(frac=sampleFrac)
]
# Fraction of the sampled images that are grey-scale.
print(np.sum(isGreyList_test) / len(isGreyList_test))
del isGreyList_test

**Get mean intensity for each channel RGB**

In [None]:
def get_rgb_men(row):
    """Return the mean (R, G, B) intensity of the image at row['path'].

    The previous implementation returned per-channel sums, which grow with
    the image size and contradict the "mean intensity" purpose; ratios
    between the three channels of one image are the same either way.

    Raises FileNotFoundError when OpenCV cannot read the file.
    """
    img = cv2.imread(row['path'])
    if img is None:
        # cv2.imread reports an unreadable or missing file by returning None.
        raise FileNotFoundError(f"Could not read image: {row['path']}")
    # OpenCV loads channels in B, G, R order; average over all pixels and
    # hand the result back in R, G, B order.
    blue, green, red = img.reshape(-1, 3).mean(axis=0)
    return red, green, blue

# Register progress_apply on pandas objects, then compute one (R, G, B)
# triple per row and spread the triples over three new columns.
tqdm.pandas()
channel_stats = train_df.progress_apply(get_rgb_men, axis=1)
train_df['R'], train_df['G'], train_df['B'] = zip(*channel_stats)

In [None]:
def show_color_dist(df, count):
    """Plot `count` randomly drawn images next to their RGB histograms.

    df must provide a 'path' column of image files; images are drawn with
    replacement. Each row of the figure shows the image on the left and the
    per-channel intensity histogram on the right, titled with the channel
    means. Prints a message and returns without plotting when df is empty.
    """
    if len(df) == 0:
        # A colour filter can match nothing; np.random.choice would raise.
        print('No images to show')
        return

    # squeeze=False keeps axr two-dimensional even when count == 1.
    fig, axr = plt.subplots(count, 2, figsize=(15, 15), squeeze=False)
    channels = (('R', 'red'), ('G', 'green'), ('B', 'blue'))
    for idx, image_path in enumerate(np.random.choice(df['path'], count)):
        img = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
        axr[idx, 0].imshow(img)
        axr[idx, 0].axis('off')

        hist_ax = axr[idx, 1]
        hist_ax.set_title('R={:.0f}, G={:.0f}, B={:.0f} '.format(*img.mean(axis=(0, 1))))
        for channel, (label, bar_color) in enumerate(channels):
            # One bin per 8-bit intensity value on a fixed 0..255 axis, so
            # the three channels share identical bin edges (the previous
            # 255 data-ranged bins did not line up between channels).
            counts, edges = np.histogram(img[:, :, channel], bins=256, range=(0, 256))
            hist_ax.bar(edges[:-1], counts, label=label, alpha=0.8, color=bar_color)
        hist_ax.legend()
        hist_ax.axis('off')

**Red images and their color distribution**<br>
Since we are picking random images, some image may appear multiple times

In [None]:
# Rows whose 'R' value exceeds both 'B' and 'G' by more than 5%.
redder_than_blue = train_df['R'].gt(train_df['B'] * 1.05)
redder_than_green = train_df['R'].gt(train_df['G'] * 1.05)
df = train_df[redder_than_blue & redder_than_green]
show_color_dist(df, 8)

**Blue images and their color distribution**

In [None]:
# Rows whose 'B' value exceeds both 'R' and 'G' by more than 30%.
bluer_than_red = train_df['B'].gt(1.3 * train_df['R'])
bluer_than_green = train_df['B'].gt(1.3 * train_df['G'])
df = train_df[bluer_than_red & bluer_than_green]
show_color_dist(df, 8)

**Green images and their color distribution**

In [None]:
# Rows whose 'G' value exceeds both 'R' and 'B' by more than 5%.
greener_than_red = train_df['G'].gt(1.05 * train_df['R'])
greener_than_blue = train_df['G'].gt(1.05 * train_df['B'])
df = train_df[greener_than_red & greener_than_blue]
show_color_dist(df, 8)

# Observation Regarding Color Distribution
1. We see that around 3% of the images in the training set are greyscale, while about 1% of the test set is greyscale.
2. Some whales have yellow spots and some images are reddish. This can happen due to sunset.
3. This suggests that we need to create image transformations that are largely agnostic to the RGB spectrum (e.g. bump up the number of greyscale images in the smaller classes).

# Data for Analysis

In [None]:
def image_individual(df, individual_id):
    """Return the 'path' values of the rows matching individual_id.

    The returned Series is re-indexed from 0, so `[0]` is the first match.
    """
    matching_paths = df.loc[df['individual_id'] == individual_id, 'path']
    return matching_paths.reset_index(drop=True)

In [None]:
# First ten entries of the individual_id frequency table.
top_10_ids = train_df['individual_id'].value_counts().iloc[:10]

In [None]:
# Last ten entries of the individual_id frequency table.
bottom_10_ids = train_df['individual_id'].value_counts().iloc[-10:]

# Analyzing Edges

A Sobel filter is one means of getting a basic edge magnitude/gradient image. Can be useful to threshold and find prominent linear features, etc. Several other similar filters in skimage.filters are also good edge detectors: roberts, scharr, etc. and you can control direction, i.e. use an anisotropic version.

In [None]:
# Sobel edge maps for the first image of each id in bottom_10_ids:
# grey-scale edges plus one edge map per colour plane.
for individual in bottom_10_ids.index:
    # Named `individual` / `bgr` rather than `id` / `image`, which would
    # overwrite the `id` builtin and the keras `image` import for the rest
    # of the notebook.
    bgr = cv2.imread(image_individual(train_df, individual)[0])
    gray_edges = sobel(cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY))
    fig = plt.figure(figsize=(8, 8))
    plt.suptitle(individual)
    plt.subplot(2, 2, 1)
    plt.imshow(gray_edges)
    # Filter each colour plane on its own: passing the whole (H, W, 3) array
    # to sobel treats the channel axis as one more spatial axis and mixes
    # the channels into every edge map.
    for channel in range(3):
        plt.subplot(2, 2, channel + 2)
        plt.imshow(sobel(bgr[:, :, channel]), cmap="gray")
plt.show()

In [None]:
# Same edge maps as for the grey colormap, rendered with 'BuGn', for the
# first image of each id in bottom_10_ids.
for individual in bottom_10_ids.index:
    # Named `individual` / `bgr` rather than `id` / `image`, which would
    # overwrite the `id` builtin and the keras `image` import for the rest
    # of the notebook.
    bgr = cv2.imread(image_individual(train_df, individual)[0])
    gray_edges = sobel(cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY))
    fig = plt.figure(figsize=(8, 8))
    plt.suptitle(individual)
    plt.subplot(2, 2, 1)
    plt.imshow(gray_edges)
    # Filter each colour plane on its own: passing the whole (H, W, 3) array
    # to sobel treats the channel axis as one more spatial axis and mixes
    # the channels into every edge map.
    for channel in range(3):
        plt.subplot(2, 2, channel + 2)
        plt.imshow(sobel(bgr[:, :, channel]), cmap="BuGn")
plt.show()

In [None]:
# Original image plus one Sobel edge map per colour plane, for the first
# image of each id in top_10_ids.
for individual in top_10_ids.index:
    # Named `individual` / `bgr` rather than `id` / `image`, which would
    # overwrite the `id` builtin and the keras `image` import for the rest
    # of the notebook.
    bgr = cv2.imread(image_individual(train_df, individual)[0])
    fig = plt.figure(figsize=(8, 8))
    plt.suptitle(individual)
    plt.subplot(2, 2, 1)
    # The first panel used to draw the grey edge map and then the raw
    # cv2 (B, G, R) image on top of it, so only a colour-swapped photo was
    # visible. Show the photo once, converted to R, G, B.
    plt.imshow(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))
    # Filter each colour plane on its own: passing the whole (H, W, 3) array
    # to sobel treats the channel axis as one more spatial axis and mixes
    # the channels into every edge map.
    for channel in range(3):
        plt.subplot(2, 2, channel + 2)
        plt.imshow(sobel(bgr[:, :, channel]), cmap="gray")
plt.show()

In [None]:
# Sobel edge maps rendered with 'BuGn' for the first image of each id in
# top_10_ids: grey-scale edges plus one edge map per colour plane.
for individual in top_10_ids.index:
    # Named `individual` / `bgr` rather than `id` / `image`, which would
    # overwrite the `id` builtin and the keras `image` import for the rest
    # of the notebook.
    bgr = cv2.imread(image_individual(train_df, individual)[0])
    gray_edges = sobel(cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY))
    fig = plt.figure(figsize=(8, 8))
    plt.suptitle(individual)
    plt.subplot(2, 2, 1)
    plt.imshow(gray_edges)
    # Filter each colour plane on its own: passing the whole (H, W, 3) array
    # to sobel treats the channel axis as one more spatial axis and mixes
    # the channels into every edge map.
    for channel in range(3):
        plt.subplot(2, 2, channel + 2)
        plt.imshow(sobel(bgr[:, :, channel]), cmap="BuGn")
plt.show()

## Analysis

We need to train on these images with a pre-trained model that already knows the features of dolphin fins, or else the head of our model must itself be a good feature extractor for the dolphin fins in this dataset.

# HSV Transform
Although this contest is about identifying individuals rather than ordering images in time, I think it's possible there may be useful information in a transform to HSV color space. HSV is useful for identifying shadows and illumination, and it gives us a means to identify similar objects that differ in color between scenes (hue), though there's no guarantee the hue will be stable.

In [None]:
# Original image plus its H, S and V planes for the first image of each id
# in bottom_10_ids.
for individual in bottom_10_ids.index:
    # Named `individual` / `bgr` rather than `id` / `image`, which would
    # overwrite the `id` builtin and the keras `image` import for the rest
    # of the notebook.
    bgr = cv2.imread(image_individual(train_df, individual)[0])
    # cv2.imread yields channels in B, G, R order, while rgb2hsv and imshow
    # both expect R, G, B; without this conversion the hue plane and the
    # displayed colours are wrong.
    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    hsv = color.rgb2hsv(rgb)
    fig = plt.figure(figsize=(8, 8))
    plt.suptitle(individual)
    plt.subplot(2, 2, 1)
    plt.imshow(rgb)
    # Panels 2-4: hue, saturation, value.
    for plane, cmap_name in enumerate(("PuBuGn", 'bone', 'bone')):
        plt.subplot(2, 2, plane + 2)
        plt.imshow(hsv[:, :, plane], cmap=cmap_name)
plt.show()

In [None]:
# Original image plus its H, S and V planes for the first image of each id
# in top_10_ids.
for individual in top_10_ids.index:
    # Named `individual` / `bgr` rather than `id` / `image`, which would
    # overwrite the `id` builtin and the keras `image` import for the rest
    # of the notebook.
    bgr = cv2.imread(image_individual(train_df, individual)[0])
    # cv2.imread yields channels in B, G, R order, while rgb2hsv and imshow
    # both expect R, G, B; without this conversion the hue plane and the
    # displayed colours are wrong.
    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    hsv = color.rgb2hsv(rgb)
    fig = plt.figure(figsize=(8, 8))
    plt.suptitle(individual)
    plt.subplot(2, 2, 1)
    plt.imshow(rgb)
    # Panels 2-4: hue, saturation, value.
    for plane, cmap_name in enumerate(("PuBuGn", 'bone', 'bone')):
        plt.subplot(2, 2, plane + 2)
        plt.imshow(hsv[:, :, plane], cmap=cmap_name)
plt.show()

# Augmentations

In [None]:
def plot_augimages(paths, datagen):
    """Show each image in `paths` followed by four augmented variants.

    paths   -- iterable of image file paths; one figure row per path.
    datagen -- object whose `.flow(samples, batch_size=1)` returns an
               iterator of augmented batches (here an ImageDataGenerator).

    Returns without plotting when `paths` is empty.
    """
    paths = list(paths)
    if not paths:
        return

    n_augmented = 4
    n_cols = n_augmented + 1
    # Keep the original 10-row layout, but grow the grid when more than ten
    # paths are passed instead of failing in plt.subplot.
    n_rows = max(10, len(paths))

    plt.figure(figsize=(14, 28))
    plt.suptitle('Augmented Images')

    for row, path in enumerate(paths):
        with Image.open(path) as source:
            # Force three channels: a grey-scale file would otherwise give a
            # rank-3 batch, which datagen.flow rejects.
            data = source.convert('RGB').resize((224, 224))
        samples = expand_dims(data, 0)
        it = datagen.flow(samples, batch_size=1)

        # Show Original Image
        plt.subplot(n_rows, n_cols, row * n_cols + 1)
        plt.imshow(data)
        plt.axis('off')

        # Show Augmented Images
        for col in range(1, n_cols):
            plt.subplot(n_rows, n_cols, row * n_cols + col + 1)
            # The builtin next() works on every iterator; the `.next()`
            # method is not available on newer Keras releases.
            batch = next(it)
            plt.imshow(batch[0].astype('uint8'))
            plt.axis('off')

    plt.tight_layout()
    plt.show()

    
# Augmentation settings, previewed right away on ten random training images.
datagen = ImageDataGenerator(
    # geometric transforms
    rotation_range=20,
    shear_range=0.15,
    zoom_range=0.10,
    width_shift_range=0.15,
    height_shift_range=0.15,
    horizontal_flip=True,
    # colour transforms
    brightness_range=[0.6,1.4],
    channel_shift_range=0.7,
    # pixels exposed by a transform are filled from the nearest edge
    fill_mode='nearest',
)
plot_augimages(np.random.choice(train_df['path'], 10), datagen)

# Load data in batches using data generators

In [None]:
val_datagen = ImageDataGenerator()

In [None]:
# Stream augmented training images in batches of 32 using the `datagen` generator
train_generator = datagen.flow_from_dataframe(
        train_df,  # dataframe whose 'path' column locates the training images
        x_col='path',
        y_col='individual_id',
        target_size=(32, 32),  # All images will be resized to 32x32. NOTE(review): the commented-out VGG16 model below declares a 224x224 input - confirm the intended size
        batch_size=32,
        class_mode="categorical",
        shuffle=True,
)

# Preprocessing
Encoding Labels

In [None]:
# Select the label column by name. The positional `iloc[:, 2]` presumably
# pointed at individual_id (third column of train.csv) but would silently
# pick another column if the column order ever changed.
y = train_df['individual_id'].values

# Step 1: individual_id strings -> integer class ids.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
# Step 2: integer ids -> dense one-hot rows.
# NOTE(review): newer scikit-learn releases spell this option
# `sparse_output=False`; adjust when upgrading.
onehot_encoder = OneHotEncoder(sparse=False)
y = y.reshape(-1, 1)  # the encoder expects a 2-D column
y = onehot_encoder.fit_transform(y)

In [None]:
y.shape

In [None]:
# gc.collect()

In [None]:
# # include_top = False means that we doesnt include fully connected top layer we will add them accordingly
# vgg16 = VGG16(include_top = False, input_shape = (224,224,3), weights = 'imagenet')

# # training of all the convolution is set to false
# for layer in vgg16.layers:
#     layer.trainable = False

# x = GlobalAveragePooling2D()(vgg16.output)
# predictions = Dense(y.shape[1], activation='softmax')(x)

# model = Model(inputs = vgg16.input, outputs = predictions)

In [None]:
# model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])
# model.summary()

In [None]:
# history = model.fit_generator(
#       train_generator,
#       steps_per_epoch=100,
#       epochs=200,
#       verbose=2)
# model.save('./last.h5')

In [None]:
# plt.figure(figsize=(15,5))
# plt.plot(history.history['accuracy'])
# plt.title('Model accuracy')
# plt.ylabel('Accuracy')
# plt.xlabel('Epoch')
# plt.show()

In [None]:
# plt.figure(figsize=(15,5))
# plt.plot(history.history['loss'])
# plt.title('Model loss')
# plt.ylabel('loss')
# plt.xlabel('Epoch')
# plt.show()