# Importing Modules

In [None]:
# Standard imports
import os
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import trange
from colorama import Fore
from glob import glob
import json
from pprint import pprint
import time
import cv2
from enum import Enum
from IPython.display import display
import random
import inspect

# For Data preparation
from sklearn.preprocessing import *
from sklearn.model_selection import *
from sklearn.metrics import *

# Regression Models
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, VotingRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.neighbors import KNeighborsRegressor

from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# For building models
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms, models

# Tensorflow modules
import tensorflow as tf
from tensorflow.keras.applications import *
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import *
from tensorflow.keras.losses import *
from tensorflow.keras.callbacks import *
from tensorflow.keras.metrics import *
from tensorflow.keras.utils import plot_model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import *


# For Transformer
import transformers
from transformers import AutoTokenizer, BertModel
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup


import warnings
# Silence every Python-level warning for the rest of the notebook.
warnings.filterwarnings("ignore")
# To ignore tensorflow warnings ('3' is the most restrictive log level).
# NOTE(review): this is set after `import tensorflow`; if TensorFlow reads the
# variable at import time it must be moved above that import — confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'


# `tf.test.is_gpu_available` is deprecated; listing the physical GPU devices is
# the replacement and yields the same True/False answer.
print(
    f"GPU is available : {bool(tf.config.list_physical_devices('GPU'))}")

# Config

In [None]:
class Config(Enum):
    """
    Locations of the dataset files and image directories used in this notebook.

    Every member's value is a path string, and ``str(member)`` returns that
    same string.
    """

    TRAIN_CSV = "../input/happy-whale-and-dolphin/train.csv"
    TEST_CSV = "../input/happy-whale-and-dolphin/sample_submission.csv"
    TRAIN_DIR = "../input/happy-whale-and-dolphin/train_images"
    TEST_DIR = "../input/happy-whale-and-dolphin/test_images"

    def __str__(self):
        # Render a member as its raw path instead of "Config.<NAME>".
        return self.value


def setSeed(seed):
    """
    Seed the random number generators used here to maintain reproducibility.

    Seeds NumPy, Python's ``random`` module and TensorFlow, and sets the
    ``PYTHONHASHSEED`` and ``TF_CUDNN_DETERMINISTIC`` environment variables.

    Parameters
    ----------
    seed : int
        Value passed to every generator.
    """
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    # TF_CUDNN_DETERMINISTIC is an on/off switch, not a seed: it has to be
    # '1' (or 'true'). Writing the seed value (e.g. '42') does not enable it.
    os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
    tf.random.set_seed(seed)
    print('SEEDITIZATION DONE !')

# Fix the seeds once, up front, so the random sampling done further down is repeatable.
setSeed(42)

# Helper functions

In [None]:
def giveHistogram(df: pd.DataFrame, col_name: str, bins=None, dark=False):
    """
    Show a Plotly histogram of one column, coloured by that same column.

    Parameters
    ----------
    df : pd.DataFrame
        Frame that holds the data to plot.
    col_name : str
        Column used for both the x axis and the colour grouping.
    bins : int, optional
        Number of bins. When omitted, ``1 + int(log2(len(df)))`` is used.
    dark : bool, optional
        Use the "plotly_dark" template when truthy, "ggplot2" otherwise.
    """
    if bins is None:
        # max(..., 1) keeps log2 finite when the frame is empty.
        bins = 1 + int(np.log2(max(len(df), 1)))

    fig = px.histogram(
        df,
        x=col_name,
        color=col_name,
        template="plotly_dark" if dark else "ggplot2",
        nbins=bins,
    )
    fig.update_layout(
        title_text=f"Distribution of {col_name}",
        title_x=0.5,
    )
    # Order the categories from most to least frequent.
    fig.update_xaxes(categoryorder='total descending')
    fig.show()


def widthAndHeightDist(df: pd.DataFrame, col_name: str, dark=False):
    """
    Give Histogram distribution of image width and height.

    Every path in ``df[col_name]`` is read with ``cv2.imread`` and its width
    and height are collected; one histogram is shown for each dimension.
    Files that cannot be read are skipped and reported once at the end.

    Parameters
    ----------
    df : pd.DataFrame
        Frame that holds the image paths.
    col_name : str
        Name of the column that contains the image path.
    dark : bool, optional
        Use the "plotly_dark" template when truthy, "ggplot2" otherwise.
    """
    total_images = list(df[col_name].values)
    # max(..., 1) keeps log2 finite when the frame is empty.
    bins = 1 + int(np.log2(max(len(total_images), 1)))
    template = "plotly_dark" if dark else "ggplot2"

    widths = []
    heights = []
    unreadable = []
    for idx in trange(len(total_images), desc="Collecting widths and heights...", bar_format="{l_bar}%s{bar:50}%s{r_bar}" % (Fore.CYAN, Fore.RESET), position=0, leave=True):
        cur_path = total_images[idx]
        img = cv2.imread(cur_path)
        if img is None:
            # cv2.imread signals a missing/corrupt file by returning None
            # rather than raising; skip it instead of aborting the scan.
            unreadable.append(cur_path)
            continue
        h, w = img.shape[:2]
        widths.append(w)
        heights.append(h)

    if unreadable:
        print(f"Skipped {len(unreadable)} unreadable image(s), e.g. {unreadable[0]}")
    if not widths:
        print("No readable images found; nothing to plot.")
        return

    figW = px.histogram(widths, nbins=bins, template=template)
    figW.update_layout(title='Distribution of Image Widths', title_x=0.5)
    figW.show()

    figH = px.histogram(heights, nbins=bins, template=template)
    figH.update_layout(title='Distribution of Image Heights', title_x=0.5)
    figH.show()


def buildGridImages(df: pd.DataFrame, img_path_col_name: str, label_col_name: str, nrows=5, ncols=4, img_size=512):
    """
    To build an image grid from randomly sampled rows.

    Parameters
    ----------
    df : pd.DataFrame
        Frame that holds the image paths and labels.
    img_path_col_name : str
        Column that contains the image path.
    label_col_name : str
        Column whose value is used as the title of each image.
    nrows, ncols : int, optional
        Grid shape; at most ``nrows * ncols`` images are drawn.
    img_size : int, optional
        Every image is resized to ``img_size x img_size`` before display.
    """
    # DataFrame.sample raises when asked for more rows than exist, so cap the
    # request; small frames then simply leave the trailing cells empty.
    n_images = min(nrows * ncols, len(df))
    if n_images == 0:
        print("No rows to display.")
        return

    sampled = df.sample(n_images)
    paths = sampled[img_path_col_name].values
    labels = sampled[label_col_name].values

    plt.figure(figsize=(20, 12))
    for cell, (path, label) in enumerate(zip(paths, labels), start=1):
        plt.subplot(nrows, ncols, cell)
        plt.axis("off")

        img = cv2.imread(path)
        if img is None:
            # cv2.imread returns None for a missing/corrupt file; keep the
            # cell, but flag it instead of crashing in cv2.resize.
            plt.title(f"{label} (unreadable)")
            continue

        img = cv2.resize(img, (img_size, img_size))
        # OpenCV loads BGR; matplotlib expects RGB.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        plt.title(str(label))
        plt.imshow(img)

    plt.tight_layout()
    plt.show()


def create_folds(data, target="label", regression=True, num_splits=5):
    """
    Helper function to create stratified folds.

    The rows are shuffled, then every row is assigned a validation fold
    number in a new ``kfold`` column using ``StratifiedKFold``.

    Parameters
    ----------
    data : pd.DataFrame
        Input frame. It is not modified; a shuffled copy is returned.
    target : str, optional
        Column to stratify on.
    regression : bool, optional
        When true, the target is first cut into equal-width bins and the
        stratification is done on the bin index.
    num_splits : int, optional
        Number of folds.

    Returns
    -------
    pd.DataFrame
        Shuffled copy of ``data`` with a fresh 0..n-1 index and a ``kfold``
        column holding the fold in which each row is a validation row.
    """
    # Shuffle first: this yields a new frame, so the caller's frame is untouched.
    data = data.sample(frac=1).reset_index(drop=True)
    data["kfold"] = -1
    kf = StratifiedKFold(n_splits=num_splits)

    if regression:
        # Applying Sturges' rule to calculate the no. of bins for target
        num_bins = int(1 + np.log2(len(data)))
        # Kept as a local array so no temporary "bins" column is added to
        # (or an existing one dropped from) the frame.
        strat_labels = pd.cut(data[target], bins=num_bins, labels=False).values
    else:
        strat_labels = data[target].values

    for fold, (_, valid_idx) in enumerate(kf.split(X=data, y=strat_labels)):
        # The index was reset above, so positional indices are valid labels.
        data.loc[valid_idx, "kfold"] = fold

    return data

# Looking inside the data

In [None]:
# Load the training labels and the sample-submission file that lists the test images.
data_df = pd.read_csv(Config.TRAIN_CSV.value)
test_df = pd.read_csv(Config.TEST_CSV.value)

# Give every row the location of its image inside the matching image directory.
data_df['path'] = [f"{Config.TRAIN_DIR.value}/{name}" for name in data_df['image']]
test_df['path'] = [f"{Config.TEST_DIR.value}/{name}" for name in test_df['image']]

data_df.head()

In [None]:
# Animal kind = whatever follows the last "_" in the species name
# (the whole name when it contains no "_").
data_df['animal_kind'] = data_df['species'].map(
    lambda species: species.rpartition("_")[2]
)
data_df.head()

In [None]:
# Preview the first rows of the test frame (built from the sample submission file).
test_df.head()

### Distribution of whales and dolphins in the dataset

In [None]:
# Number of images per animal kind, drawn with the dark template.
giveHistogram(
    df=data_df,
    col_name='animal_kind',
    bins=None,
    dark=True,
)

### Distribution of the species

In [None]:
# Number of images per species, drawn with the dark template.
giveHistogram(
    df=data_df,
    col_name='species',
    bins=None,
    dark=True,
)

### Let's look at some of the images

In [None]:
# One sub-frame per animal kind. 'dolpin' is matched on purpose: it is the
# misspelling present in the data (see the note further down).
whales = data_df[data_df['animal_kind'].eq('whale')]
dolphin = data_df[data_df['animal_kind'].eq('dolphin')]
beluga = data_df[data_df['animal_kind'].eq('beluga')]
dolpin = data_df[data_df['animal_kind'].eq('dolpin')]
globis = data_df[data_df['animal_kind'].eq('globis')]

#### Whale 🐳

In [None]:
# 5 x 4 grid of randomly sampled whale images, titled by species.
buildGridImages(
    df=whales,
    img_path_col_name='path',
    label_col_name='species',
    nrows=5,
    ncols=4,
    img_size=512,
)

#### Dolphin 🐬

In [None]:
# 5 x 4 grid of randomly sampled dolphin images, titled by species.
buildGridImages(
    df=dolphin,
    img_path_col_name='path',
    label_col_name='species',
    nrows=5,
    ncols=4,
    img_size=512,
)

#### Beluga

In [None]:
# 5 x 4 grid of randomly sampled beluga images, titled by species.
buildGridImages(
    df=beluga,
    img_path_col_name='path',
    label_col_name='species',
    nrows=5,
    ncols=4,
    img_size=512,
)

#### Dolpin 🐬❓

In [None]:
# 5 x 4 grid of randomly sampled images whose animal kind is 'dolpin'.
buildGridImages(
    df=dolpin,
    img_path_col_name='path',
    label_col_name='species',
    nrows=5,
    ncols=4,
    img_size=512,
)

#### Globis

In [None]:
# 5 x 4 grid of randomly sampled images whose animal kind is 'globis'.
buildGridImages(
    df=globis,
    img_path_col_name='path',
    label_col_name='species',
    nrows=5,
    ncols=4,
    img_size=512,
)

##### *OMG 😝 Spelling mistake dolphin -> dolpin 🤣😂*

### Distribution of Widths and Heights

In [None]:
# Read every training image and plot the width and height distributions (dark template).
widthAndHeightDist(
    df=data_df,
    col_name='path',
    dark=True,
)