<center>
    <h1>Basic EDA Starter for Everyone!</h1>
    <h2>Upvote is Free 😊</h2>
</center>

<center>
    <img src="https://i.imgur.com/2LVg0HB.jpeg" alt="coral" />
</center>

#### References
- https://www.kaggle.com/diegoalejogm/great-barrier-reefs-eda-with-animations
- https://www.kaggle.com/debarshichanda/w-b-tables-great-barrier-reef-eda

- Special thanks to Diego Gomez! I borrowed a lot of code from his notebook.

<br>
<h1 style = "font-size:45px; font-family:Garamond ; font-weight : normal; background-color: #f6f5f5 ; color : #fe346e; text-align: center; border-radius: 100px 100px;">① Load Train Data</h1>

In [None]:
import pandas as pd

# Root folder of the competition data. The trailing slash matters because
# file paths are built from this value by plain string concatenation.
DATA_PATH = '/kaggle/input/tensorflow-great-barrier-reef/'
# Read the training table and show it as the cell output.
train = pd.read_csv(DATA_PATH + 'train.csv')
train

<br>
<h1 style = "font-size:45px; font-family:Garamond ; font-weight : normal; background-color: #f6f5f5 ; color : #fe346e; text-align: center; border-radius: 100px 100px;">② Take A Look At Data</h1>

## Basic information about train data

In [None]:
# Print each column's dtype and non-null count, plus the frame's memory usage
train.info()

## Check duplicated data

In [None]:
# Count the rows that are exact duplicates of an earlier row
train.duplicated().sum()

There are no duplicated rows in the training data.

## Feature summary

In [None]:
def resumetable(df):
    '''Build a per-column summary of a DataFrame.

    Prints the shape of ``df`` and returns a new DataFrame with one row per
    column of ``df`` and the following columns:

    - ``Features``: the column name
    - ``Data Type``: the column dtype
    - ``Num of Null Value``: number of missing values in the column
    - ``Num of Unique Value``: number of distinct values in the column
    - ``1st Value`` / ``2nd Value`` / ``3rd Value``: the values stored in the
      first three rows of ``df``

    The sample rows are selected by position, not by index label, so the
    function also works when the index is not the default 0..n-1 range
    (for example after filtering or sorting). When ``df`` has fewer than
    three rows, the sample columns that cannot be filled contain ``None``.
    '''
    print(f'Shape: {df.shape}')
    summary = df.dtypes.to_frame(name='Data Type')
    summary = summary.reset_index()
    summary = summary.rename(columns={'index': 'Features'})
    summary['Num of Null Value'] = df.isnull().sum().values
    summary['Num of Unique Value'] = df.nunique().values

    n_rows = len(df)
    for position, label in enumerate(('1st Value', '2nd Value', '3rd Value')):
        if position < n_rows:
            # iloc is positional: row labels 0, 1, 2 are not guaranteed to exist
            summary[label] = df.iloc[position].values
        else:
            summary[label] = None
    return summary

In [None]:
# Display the per-feature summary (dtype, null count, unique count, sample values)
resumetable(train)

<h1 style = "font-size:45px; font-family:Garamond ; font-weight : normal; background-color: #f6f5f5 ; color : #fe346e; text-align: center; border-radius: 100px 100px;">③ Basic Engineering</h1>

## Downcast

In [None]:
def downcast(df, verbose=True):
    '''Shrink the numeric columns of ``df`` to the smallest dtype that fits.

    Conversion rules, applied column by column:

    - ``bool`` columns become ``int8``.
    - Integer columns, and float columns whose values are all whole numbers,
      are downcast with ``pd.to_numeric(..., downcast='integer')``.
    - Remaining numeric columns are downcast with
      ``pd.to_numeric(..., downcast='float')``.
    - Non-numeric columns (object/string, category, datetime, ...) are left
      untouched.

    Args:
        df: DataFrame to compress. It is modified IN PLACE.
        verbose: when True, print the percentage of memory saved.

    Returns:
        The same ``df`` object that was passed in, after conversion.
    '''
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        column = df[col]
        if column.dtype.name == 'bool':
            df[col] = column.astype('int8')
        elif not pd.api.types.is_numeric_dtype(column):
            # round() / to_numeric are only meaningful for numbers; applying
            # them to dates, categories or strings either raises or silently
            # turns the values into integers, so such columns are skipped.
            continue
        elif pd.api.types.is_integer_dtype(column) or (column.round() == column).all():
            df[col] = pd.to_numeric(column, downcast='integer')
        else:
            df[col] = pd.to_numeric(column, downcast='float')
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero baseline so the report can never divide by zero
        saved = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('{:.1f}% Compressed'.format(saved))

    return df

In [None]:
# downcast() converts the columns of `train` in place and returns the same frame
train = downcast(train)

## Feature Engineering

In [None]:
import ast

# Convert String to List Type.
# Only values that are still strings are parsed: ast.literal_eval raises on an
# already-parsed list, which would make this cell fail when it is run twice.
train['annotations'] = train['annotations'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

# Get the number of bounding boxes for each image
train['num_bboxes'] = train['annotations'].apply(len)

In [None]:
# Preview the first rows, now including the parsed annotations and num_bboxes
train.head()

## Check number of frames with bounding boxes

In [None]:
# Keep only the rows (frames) that have at least one bounding box
train[train['num_bboxes'] > 0]

<h1 style = "font-size:45px; font-family:Garamond ; font-weight : normal; background-color: #f6f5f5 ; color : #fe346e; text-align: center; border-radius: 100px 100px;">④ Verify if there is corrupted data</h1>

In [None]:
from os import listdir
from PIL import Image

def verify_images(video_id):
    '''Check every ``.jpg`` file of one training video for corruption.

    Each file in ``train_images/video_<video_id>/`` under ``DATA_PATH`` is
    opened with PIL and passed through ``Image.verify()``. The name of every
    file that fails is printed, followed by a one-line verdict for the video.

    Args:
        video_id: value used to build the ``video_<video_id>`` folder name.

    Returns:
        list: names of the files that failed verification (empty when every
        image is valid).
    '''
    path = DATA_PATH + f'train_images/video_{video_id}/'
    bad_files = []
    for filename in listdir(path):
        if not filename.endswith('.jpg'):
            continue
        try:
            # The context manager closes the file handle even if verify() fails
            with Image.open(path + filename) as img:
                img.verify() # Verify it is in fact an image
        except (IOError, SyntaxError):
            print('Bad file:', filename) # Print out the names of corrupt files
            bad_files.append(filename)

    # Only claim success when nothing failed
    if bad_files:
        print(f'Video {video_id} has {len(bad_files)} corrupted image(s)!')
    else:
        print(f'Video {video_id} has all valid images. Verified!')
    return bad_files

# Verify the folders video_0, video_1 and video_2
for video_id in range(3):
    verify_images(video_id)

<h1 style = "font-size:45px; font-family:Garamond ; font-weight : normal; background-color: #f6f5f5 ; color : #fe346e; text-align: center; border-radius: 100px 100px;">⑤ Plot Frame Image with Bounding Boxes</h1>

## Load sequence of images with annotations

In [None]:
import numpy as np
from PIL import ImageDraw

def fetch_image(df, video_id, frame_id):
    '''Open one frame image and draw its bounding boxes on it.

    Args:
        df: DataFrame with ``video_id``, ``video_frame`` and ``annotations``
            columns. Each ``annotations`` value must be an iterable of dicts
            with the keys ``x``, ``y``, ``width`` and ``height``.
        video_id: video to look up; also used to build the image folder name.
        frame_id: frame to look up; also used as the image file name.

    Returns:
        The PIL image with one rectangle drawn per bounding box.

    Raises:
        IndexError: if ``df`` has no row for the requested video and frame.
    '''
    # get frame
    matches = df[(df['video_id'] == video_id) & (df['video_frame'] == frame_id)]
    if matches.empty:
        raise IndexError(
            f'No row found for video_id={video_id}, video_frame={frame_id}'
        )
    frame = matches.iloc[0]
    # get bounding_boxes
    bounding_boxes = frame['annotations']
    # open image
    img = Image.open(DATA_PATH + f'train_images/video_{video_id}/{frame_id}.jpg')

    if bounding_boxes:
        # One drawing context is enough for all boxes of the image
        draw = ImageDraw.Draw(img)
        for box in bounding_boxes:
            x0, y0 = box['x'], box['y']
            x1, y1 = x0 + box['width'], y0 + box['height']
            draw.rectangle((x0, y0, x1, y1), outline=180, width=5)
    return img

def fetch_image_list(df, video_id, num_images, start_frame_idx):
    '''Load consecutive annotated frames of one video as NumPy arrays.

    Frames ``start_frame_idx`` to ``start_frame_idx + num_images - 1`` are
    fetched with ``fetch_image`` and each resulting image is converted with
    ``np.array``.

    Returns:
        list of arrays, one per frame, in frame order.
    '''
    frames = []
    for offset in range(num_images):
        annotated = fetch_image(df, video_id, start_frame_idx + offset)
        frames.append(np.array(annotated))
    return frames

In [None]:
# Load 80 consecutive frames of video 0, starting at frame 25, with their boxes drawn
images = fetch_image_list(train, video_id=0, num_images=80, start_frame_idx=25)

print(f'Number of images: {len(images)}')

In [None]:
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec 

# 4 rows x 2 columns: one cell per sampled frame
grid = gridspec.GridSpec(4, 2)
fig = plt.figure(figsize=(18, 20))

# Positions inside `images` to display (every 5th loaded frame)
idx_list = [0, 5, 10, 15, 20, 25, 30, 35]

for i, idx in enumerate(idx_list):
    ax = fig.add_subplot(grid[i])
    ax.imshow(images[idx], interpolation='nearest')
    ax.set_title(f'frame index {idx}')
    ax.axis('off')

<h1 style = "font-size:45px; font-family:Garamond ; font-weight : normal; background-color: #f6f5f5 ; color : #fe346e; text-align: center; border-radius: 100px 100px;">⑥ Image Animation</h1>

In [None]:
from matplotlib import animation, rc
# Render animations as interactive JavaScript/HTML players in the cell output
rc('animation', html='jshtml')

def create_animation(imgs, frame_interval=130):
    '''Build a matplotlib animation that plays ``imgs`` one after another.

    Args:
        imgs: non-empty sequence of image arrays; ``imgs[0]`` is drawn first
            and every element is shown once per loop.
        frame_interval: delay between frames in milliseconds (passed to
            ``FuncAnimation`` as ``interval``).

    Returns:
        The ``matplotlib.animation.FuncAnimation`` object.
    '''
    fig = plt.figure(figsize=(7, 4))
    plt.axis('off')
    img = plt.imshow(imgs[0])

    def animate(i):
        # Swap the pixel data of the existing image artist instead of re-plotting
        img.set_array(imgs[i])
        return [img]

    anim = animation.FuncAnimation(fig, animate, frames=len(imgs), interval=frame_interval)
    # Drop the figure from pyplot's registry: otherwise the notebook also
    # shows a static copy of the first frame below the player, and every call
    # leaves another open figure in memory. The animation keeps its own
    # reference to the figure and can still be rendered.
    plt.close(fig)
    return anim

In [None]:
frame_interval = 130 # delay between frames in milliseconds: smaller plays faster, larger plays slower

# The returned animation is the last expression, so it is rendered as the cell output
create_animation(images, frame_interval=frame_interval)