# Happywhale - CS795 Project 1
***
Old Dominion University

2/26/2022
#### Authors: Raphael J. Sandor, Xiangrui Xu


# Imports

In [None]:
import efficientnet.tfkeras as efn
import glob
import json
import keras
import math
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import PIL
import pickle
import pathlib
import seaborn as sns
import warnings
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow_hub as tfhub
import re
from scipy import stats
from scipy.stats import norm
from sklearn import metrics
from sklearn.model_selection import KFold, train_test_split
from PIL import Image
from pathlib import Path
from datetime import datetime
from keras.optimizers import adam_v2
from tensorflow.keras import backend as K
from tqdm.auto import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')
%matplotlib inline

# Global Data

In [None]:
# Training metadata table.
train_df = pd.read_csv('train.csv')

# The sample submission lists the images that still need predictions.
test_df = pd.read_csv('sample_submission.csv')

# Image directories and a listing of everything inside the training one.
TRAIN_IMAGES_DIR = Path("./train_images")
TEST_IMAGES_DIR = Path("./test_images")
train_images = list(TRAIN_IMAGES_DIR.glob('./*'))

# Every image file name: training images first, then test images.
total_image_df = pd.concat([train_df['image'], test_df['image']])

# Exploratory Data Analysis

In [None]:
# Preview one training image (entry 1000 of the directory listing).
# Opening the image in a `with` block closes the underlying file handle
# once the figure has been drawn instead of leaving it open.
with PIL.Image.open(str(train_images[1000])) as img:
    plt.figure(figsize=(10, 10))
    imgplot = plt.imshow(img)
    plt.axis('off')
    plt.show()

In [None]:
# Count the .jpg files found in each image directory.
trainImgCnt = sum(1 for _ in TRAIN_IMAGES_DIR.glob('*.jpg'))
testImgCnt = sum(1 for _ in TEST_IMAGES_DIR.glob('*.jpg'))

In [None]:
# Report how many images each directory holds.
print(f"Train image count: {trainImgCnt}")
print(f"Test image count: {testImgCnt}")

In [None]:
train_df.info()

In [None]:
# Data in train
train_df.head()

## Unique Species 

In [None]:
species = train_df['species']

In [None]:
train_df.describe()

In [None]:
# Distinct species labels in order of first appearance (a numpy.ndarray).
unique_species = species.unique()
print("Unique Species", "-------------------", sep='\n')
print(*unique_species, sep='\n')

# Whales and Dolphins

In [None]:
# The raw labels contain misspellings and aliases; map each one to its
# canonical species name. The result is assigned back to the column because
# an in-place replace on a column selection, or chained indexing such as
# train_df['species'][mask] = ..., may act on a temporary copy and leave
# train_df unchanged (always the case under pandas copy-on-write).
species_fixes = {
    'bottlenose_dolpin': 'bottlenose_dolphin',
    'kiler_whale': 'killer_whale',
    'pilot_whale': 'short_finned_pilot_whale',
    'globis': 'short_finned_pilot_whale',
}
train_df['species'] = train_df['species'].replace(species_fixes)

# Species treated as whales by this notebook.
whales = ['humpback_whale','beluga','minke_whale', 'fin_whale', 'blue_whale', 'gray_whale',
          'southern_right_whale','sei_whale', 'cuviers_beaked_whale', 'brydes_whale']

# Species treated as dolphins by this notebook.
dolphins = ['melon_headed_whale','false_killer_whale', 'bottlenose_dolphin', 'common_dolphin', 
            'dusky_dolphin', 'killer_whale', 'long_finned_pilot_whale', 'spinner_dolphin', 
            'spotted_dolphin','commersons_dolphin', 'white_sided_dolphin', 'short_finned_pilot_whale',
            'rough_toothed_dolphin', 'pantropic_spotted_dolphin', 'frasiers_dolphin', 'pygmy_killer_whale']

In [None]:
# Split the rows by exact species membership. isin() compares whole names,
# so list entries are never interpreted as regular expressions or matched as
# substrings of a longer species name, and missing values simply yield False.
whale_df = train_df[train_df['species'].isin(whales)]
dolphin_df = train_df[train_df['species'].isin(dolphins)]
print("Total species after :",len(train_df.species.unique()))

In [None]:
# Show the first rows of each subset, separated by a rule.
print(
    whale_df.head(),
    "---------------------------------------------",
    dolphin_df.head(),
    sep='\n',
)

In [None]:
# Visual check of how many images each species has.
# Nikhil Jothi Prakash
# Works cited: https://www.kaggle.com/nikhiljothiprakash/happy-whale-and-dolphin
# Per-species image counts, sorted from least to most frequent.
species_counts = train_df["species"].value_counts().sort_values(ascending=True)

plt.figure(figsize=(16, 12))
plt.rcParams["font.size"] = 16
plt.barh(species_counts.index, species_counts, tick_label=species_counts.index)
plt.show()

In [None]:
# Number of species names in each hand-written list.
print(f"Species of whales: {len(whales)}")
print(f"Species of dolphins: {len(dolphins)}")

In [None]:
# Number of training rows that fall in each subset.
print(f"Number of whales: {whale_df.shape[0]}")
print(f"Number of dolphins: {dolphin_df.shape[0]}")

In [None]:
# Binary classification target: 1 when the row's species occurs among the
# whale rows, 0 otherwise.
train_df['isWhale'] = train_df['species'].isin(whale_df['species']).astype(int)
train_data = []

In [None]:
train_df.describe()

In [None]:
# Borrowed from
# https://www.kaggle.com/samir95/species-classification
# Keep a 10% subset of train_df as `dev`; the rest is discarded here.
# random_state pins the shuffle so reruns select the same rows (42 matches
# the seed used for the fastai splitter and config.SEED in this notebook).
_, dev = train_test_split(train_df, test_size=0.1, random_state=42)

dev.shape

# CNN Dataset
<ol>
  <li>Load the data.</li>
  <li>Resize images to be normalized</li>
</ol>
<i> If I had more time and knowledge I would use the TFRecords from the next step </i>

In [None]:
import fastai
from fastai.vision.all import *
from fastai.basics import *
from fastai.data.all import *

from fastai.vision.core import *
import fastbook
fastbook.setup_book() 
pd.options.mode.chained_assignment = None  # default='warn'

def _train_image_path(file_name):
    """Return the location of *file_name* inside TRAIN_IMAGES_DIR."""
    return TRAIN_IMAGES_DIR/file_name


def _is_whale_label(row):
    """Return the row's isWhale flag, used as the classification target."""
    return row['isWhale']


# Attach the on-disk path of every image to both frames.
dev['imagePath'] = dev['image'].apply(_train_image_path)
train_df['imagePath'] = train_df['image'].apply(_train_image_path)

# Image -> category pipeline: inputs come from the imagePath column, targets
# from isWhale; items are resized to 460 and batches augmented at size 224.
dblock = DataBlock(
    blocks=(ImageBlock, CategoryBlock),
    get_x=ColReader('imagePath'),
    get_y=_is_whale_label,
    splitter=RandomSplitter(seed=42),
    item_tfms=Resize(460),
    batch_tfms=aug_transforms(size=224),
)
#dblock.summary(dev)
dsets = dblock.datasets(dev)
dls = dblock.dataloaders(dev)

# TFRecords For Faster Performance 
##### Creating TFRecords will provide better performance than manual image manipulations according to Keras.
https://keras.io/examples/keras_recipes/creating_tfrecords/

<i> We'll use this when we do individual predictions </i>

## Setup Data

In [None]:
# Works cited:
# https://www.kaggle.com/nikhiljothiprakash/happy-whale-and-dolphin/notebook
concat_df = pd.concat([train_df['image'], test_df['image']])

### Dictionaries

In [None]:
# Species name -> integer id, numbered in order of first appearance.
spid_dict = {name: idx for idx, name in enumerate(train_df.species.unique())}
# Reverse lookup: integer id -> species name. Built as a real dict so it can
# be indexed by id (the earlier set-of-tuples form could not be).
spid_dict_inverse = {idx: name for name, idx in spid_dict.items()}

In [None]:
image_name_to_image_id = dict((image_name, index) for index, image_name in enumerate(concat_df.unique()))

In [None]:
# Image file name -> integer id, numbered in order of first appearance,
# plus the reverse lookup from id back to file name.
img_dict = {image: index for index, image in enumerate(concat_df.unique())}
img_dict_inverse = dict(zip(img_dict.values(), img_dict.keys()))

In [None]:
# individual_id -> integer label, numbered in order of first appearance.
id_dict = {ind_id: idx for idx, ind_id in enumerate(train_df.individual_id.unique())}
# Reverse lookup: integer label -> individual_id. Built as a real dict so a
# predicted label can be indexed back to its id (the earlier set-of-tuples
# form could not be).
id_dict_inverse = {idx: ind_id for ind_id, idx in id_dict.items()}

In [None]:
# Translate the string identifiers into the integer ids built above.
train_df["label"] = train_df.individual_id.map(id_dict)
train_df["image_id"] = train_df['image'].map(img_dict)
train_df.head(20)

### Helper Functions
Borrowed from Keras: https://keras.io/examples/keras_recipes/creating_tfrecords/

In [None]:
def image_feature(value):
    """Return a bytes_list Feature holding the JPEG encoding of an image.

    `value` is passed to tf.io.encode_jpeg, so it must be something that
    function accepts (an image tensor).
    """
    jpeg_bytes = tf.io.encode_jpeg(value).numpy()
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[jpeg_bytes]))


def bytes_feature(value):
    """Return a bytes_list Feature holding the encoded form of a string."""
    encoded = value.encode()
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[encoded]))


def float_feature(value):
    """Return a float_list Feature holding a single float / double."""
    single_value = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=single_value)


def int64_feature(value):
    """Return an int64_list Feature holding a single bool / enum / int."""
    single_value = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=single_value)


def float_feature_list(value):
    """Return a float_list Feature holding every float in `value`."""
    all_values = tf.train.FloatList(value=value)
    return tf.train.Feature(float_list=all_values)


## Create TFRecords
Faster image manipulation in the future.

In [None]:
# Borrowed example of making records from 
# Gaurav Chopra
# https://www.kaggle.com/gauravchopracg/understanding-tfrecord-format
# Create a function to apply entire process to each element of dataset.
# process the two images into 'tf.Example' messages.
def create_example(image_id, image, label):
    """Build the tf.train.Example message for one image.

    The message carries three features: "image_id" and "label" as int64
    values and "image" as the JPEG-encoded bytes of `image`.
    """
    features = tf.train.Features(
        feature={
            "image_id": int64_feature(image_id),
            "image": image_feature(image),
            "label": int64_feature(label),
        }
    )
    return tf.train.Example(features=features)


def write_records():
    """Serialize every row of the module-level `new` frame into a TFRecord file.

    Reads the globals `new` (columns image_id, imagePath, label) and
    `train_records` (output file name); both must be defined before calling.
    """
    with tf.io.TFRecordWriter(train_records) as writer:
        columns = (new['image_id'], new['imagePath'], new['label'])
        for image_id, image_path, label in zip(*columns):
            # Load the JPEG from disk and decode it to pixels.
            pixels = tf.io.decode_jpeg(tf.io.read_file(str(image_path)))
            # Pack id, pixels and label into one message and append it.
            example = create_example(image_id, pixels, label)
            writer.write(example.SerializeToString())

In [None]:
# File names for the preprocessed image records.
train_records = 'trainImages.tfrecords'
test_records = 'testImages.tfrecords'
# Independent copy of just the columns write_records() reads.
new = train_df.loc[:, ['image_id','imagePath', 'label']].copy()

# Generate the training records only when the file is not already present.
if not os.path.exists(train_records):
    write_records()
    

# Start CNN Learning

In [None]:
# Preview up to 20 samples from the DataLoaders built from `dev`.
dls.show_batch(max_n=20)
# Create a resnet34-based learner that reports accuracy and error rate.
# NOTE(review): newer fastai releases name this factory vision_learner;
# confirm against the installed fastai version.
learn = cnn_learner(dls, resnet34, metrics=[accuracy, error_rate])
# Run the learning-rate finder before training.
learn.lr_find()

# Fine Tune and Improve

In [None]:
learn.fine_tune(2, base_lr=3e-3)

In [None]:
learn.freeze()
learn.fit_one_cycle(3, 3e-3)

In [None]:
learn.unfreeze()
learn.lr_find()

In [None]:
learn.fit_one_cycle(6, lr_max=slice(1e-6, 1e-4))

# Classification based on Individual 
#### using research by Andre C. Ferreira as a starting point on this venture
##### https://github.com/AndreCFerreira/Bird_individualID/blob/master/Train_CNN/TRAIN_CNN.ipynb

Let's explore how many individuals we are working with in this dataset.

In [None]:
from sklearn import model_selection

In [None]:
def create_folds(data,target,num_splits):
    """Return a shuffled copy of `data` with a stratified "kfold" column.

    The continuous `target` column is cut into bins (Sturges' rule) and the
    bins, not the raw targets, are used to stratify the folds.

    Side effect: a "kfold" column filled with -1 is added to the caller's
    frame before the shuffled copy is made.

    Args:
        data: DataFrame to split.
        target: name of the numeric column to stratify on.
        num_splits: number of folds.

    Returns:
        The shuffled frame, re-indexed from 0, where "kfold" holds the index
        of the fold in which each row is a validation row.
    """
    # Placeholder fold id for every row.
    data["kfold"] = -1

    # Shuffle the rows and renumber them from 0.
    shuffled = data.sample(frac=1).reset_index(drop=True)

    # Sturges' rule, taking the floor of the value.
    n_bins = int(np.floor(1 + np.log2(len(shuffled))))

    # Temporary column with the bin index of each target value.
    shuffled.loc[:, "bins"] = pd.cut(shuffled[target], bins=n_bins, labels=False)

    splitter = model_selection.StratifiedKFold(n_splits=num_splits)
    fold_indices = splitter.split(X=shuffled, y=shuffled.bins.values)

    # Stamp each validation row with the number of its fold.
    for fold_no, (_, valid_idx) in enumerate(fold_indices):
        shuffled.loc[valid_idx, 'kfold'] = fold_no

    # The helper column is not part of the result.
    return shuffled.drop("bins", axis=1)

In [None]:
# image to array
def decode_image(image_raw):
    """Decode JPEG bytes into a 3-channel float32 image divided by 255.

    The image is resized to IMAGE_SIZE x IMAGE_SIZE, a name read from module
    scope.
    NOTE(review): IMAGE_SIZE is not defined in the cells before this one
    (only config.IMAGE_SIZE appears later) - confirm it exists at call time.
    """
    pixels = tf.image.decode_jpeg(image_raw, channels=3)
    target_size = [IMAGE_SIZE, IMAGE_SIZE]
    resized = tf.image.resize(pixels, target_size)
    return tf.cast(resized, tf.float32) / 255.0

In [None]:
def read_tfrecord(raw_image_dataset):
    """Parse one serialized example into an (image_id, image, label) tuple.

    The ids and labels are cast to int32 and the image is produced by
    decode_image().
    NOTE(review): this schema reads the JPEG from "image_raw", whereas
    create_example() in this notebook stores it under "image" - confirm which
    record files this reader is meant for.
    """
    schema = {
        "image_id": tf.io.FixedLenFeature([], tf.int64),
        "image_raw": tf.io.FixedLenFeature([], tf.string),
        "label": tf.io.FixedLenFeature([], tf.int64),
    }
    example = tf.io.parse_single_example(raw_image_dataset, schema)

    return (
        tf.cast(example['image_id'], tf.int32),
        decode_image(example['image_raw']),
        tf.cast(example['label'], tf.int32),
    )

In [None]:
save_dir = '.'
EXPERIMENT = 0
run_ts = datetime.now().strftime('%Y%m%d-%H%M%S')
print(run_ts)

In [None]:
# Distribution strategy; its replica count scales config.BATCH_SIZE below.
# Uses the stable class rather than the deprecated
# tf.distribute.experimental.MultiWorkerMirroredStrategy alias.
strategy = tf.distribute.MultiWorkerMirroredStrategy()


# Run-wide settings kept as plain class attributes (the class is never
# instantiated in the visible cells). A later cell attaches MODEL_NAME and
# dumps every attribute whose name does not start with "_" to config.json.
class config:
    
    
    SEED = 42
    FOLD_TO_RUN = 0
    FOLDS = 5
    DEBUG = False
    EVALUATE = True
    RESUME = False
    RESUME_EPOCH = None
    
    
    ### Dataset
    # 32 samples per replica, scaled by the replica count of `strategy`.
    BATCH_SIZE = 32 * strategy.num_replicas_in_sync
    IMAGE_SIZE = 512
    # NOTE(review): presumably the number of distinct individual_id values - verify.
    N_CLASSES = 15587
    
    ### Model
    # model_type selects between the 'effnetv1' and 'effnetv2' naming branches
    # used when MODEL_NAME is derived in a later cell.
    model_type = 'effnetv1'  
    EFF_NET = 5
    EFF_NETV2 = 's-21k-ft1k'
    FREEZE_BATCH_NORM = False
    head = 'arcface' 
    EPOCHS = 20
    LR = 0.001
    message='baseline'
    
    ### Augmentations
    CUTOUT = False
    
    ### Save-Directory
    # Copies the module-level save_dir (defined in an earlier cell) into the class.
    save_dir = save_dir
    
    ### Inference
    KNN = 50
    
def count_data_items(filenames):
    """Return the sum of the item counts embedded in the given file names.

    Each name must contain a "-<digits>." segment; the digits of the first
    such segment are taken as that file's item count.
    """
    count_pattern = re.compile(r"-([0-9]*)\.")
    matches = map(count_pattern.search, filenames)
    return np.sum([int(match.group(1)) for match in matches])

# Function to seed everything
def seed_everything(seed):
    """Seed Python's `random`, NumPy and TensorFlow, and set PYTHONHASHSEED.

    Args:
        seed: integer seed applied to every generator.
    """
    # Imported here because the notebook's import cell never imports `random`
    # explicitly; without this the call depends on a wildcard import.
    import random

    random.seed(seed)
    np.random.seed(seed)
    # Recorded for child processes; the running interpreter's hash seed was
    # fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)
    
def is_interactive():
    """Return True when the kernel's connection file path contains 'runtime'."""
    connection_file = get_ipython().config.IPKernelApp.connection_file
    return 'runtime' in connection_file
IS_INTERACTIVE = is_interactive()
print(IS_INTERACTIVE)



In [None]:
# Derive the model name from the configured architecture family; any other
# model_type leaves it as None.
MODEL_NAME = {
    'effnetv1': f'effnetv1_b{config.EFF_NET}',
    'effnetv2': f'effnetv2_{config.EFF_NETV2}',
}.get(config.model_type)

# Store the name on the config class as well and show it.
config.MODEL_NAME = MODEL_NAME
print(MODEL_NAME)

In [None]:
# Persist every config attribute whose name does not start with "_".
with open(config.save_dir+'/config.json', 'w') as fp:
    public_settings = {
        key: value
        for key, value in vars(config).items()
        if not key.startswith('_')
    }
    json.dump(public_settings, fp)

In [None]:
# NOTE(review): KaggleDatasets is not imported in the cells shown above -
# confirm it is provided by the runtime before this cell runs.
GCS_PATH = KaggleDatasets().get_gcs_path('happywhale-tfrecords-v1')

# Sorted lists of the train and test record files under GCS_PATH.
train_pattern = GCS_PATH + '/happywhale-2022-train*.tfrec'
test_pattern = GCS_PATH + '/happywhale-2022-test*.tfrec'
train_files = np.sort(np.array(tf.io.gfile.glob(train_pattern)))
test_files = np.sort(np.array(tf.io.gfile.glob(test_pattern)))

print(GCS_PATH)
# File counts, then the item counts parsed from the file names.
print(
    len(train_files),
    len(test_files),
    count_data_items(train_files),
    count_data_items(test_files),
)
