# Let's go on a small adventure looking for starfish

This notebook houses some basic EDA, which will be updated frequently as the competition goes on. The aim is simple: get to the depths of the data and extract any key insights that could shape the solution.


<img src="https://media.giphy.com/media/QvSkfOVGFEH7Nydll2/giphy.gif">

## Time to gear up: let's import

In [None]:
import numpy as np
import pandas as pd 
import os
import pathlib
import PIL
from pathlib import Path
from PIL import Image, ImageDraw
from math import sqrt
import ast

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the train, test and sample-submission CSVs in one pass.
train, test, sub = (
    pd.read_csv(csv_file)
    for csv_file in (
        "../input/tensorflow-great-barrier-reef/train.csv",
        "../input/tensorflow-great-barrier-reef/test.csv",
        "../input/tensorflow-great-barrier-reef/example_sample_submission.csv",
    )
)

# Recursively collect every .jpg path below the training image directory.
path = Path('../input/tensorflow-great-barrier-reef/train_images')
filepaths = [*path.glob(r'**/*.jpg')]

## Let us Begin!!!

In [None]:
# Row counts of the train and test tables (shape[0] is the number of rows)
print("Number of training samples: ", train.shape[0])
print("Number of testing samples: ", test.shape[0])

In [None]:
# Show the first 150 rows (instead of the default 5) for a longer look at the table
train.head(150)

Looks like a lot of these frames don't have any starfish.


<img src="https://media.giphy.com/media/jsN192JGdyWvS1gqTb/giphy.gif">

In [None]:
# Keep only the frames whose annotation string is not the empty list "[]".
# .copy() makes train_clean an independent frame, so the columns assigned to it
# later in the notebook are not written onto a slice of `train`
# (which is what triggers pandas' SettingWithCopyWarning).
train_clean = train.loc[train["annotations"] != "[]"].copy()
print(f"No starfishes in {len(train)-len(train_clean)} samples.")
print(f"The clean train set has {len(train_clean)} images for us to work with.")

In [None]:
# Preview the first rows of the filtered frame
train_clean.head()

## Distribution of Sequences

In [None]:
# Count the distinct sequence ids that remain after filtering
train_clean["sequence"].nunique()

In [None]:
# Number of annotated rows belonging to each sequence id
print("Sequence Samples")
print(train_clean.sequence.value_counts())

In [None]:
# Bar chart of how many annotated rows each sequence contributes.
seq_df = train_clean.sequence.value_counts().to_frame()
plt.figure(figsize=(16, 9))
# Read the counts by position rather than by column name: pandas >= 2.0 names the
# value_counts() column "count" instead of "sequence", so `seq_df.sequence`
# raises AttributeError there. Positional access works on both old and new pandas.
sns.barplot(x=seq_df.index, y=list(seq_df.iloc[:, 0]), palette="Greens_d")
plt.title("Distribution of Sequences")
plt.xlabel("Sequence Id")
plt.ylabel("Frequency")
plt.show()

## Number Of Boxes

In [None]:
# Parse every annotation string into a Python list and record how many boxes it holds.
# Values that are no longer strings are kept as they are, so this cell can be re-run
# after the annotations column has been replaced by the parsed lists
# (ast.literal_eval would raise on an already-parsed list).
annotations_clean = [
    ast.literal_eval(elem) if isinstance(elem, str) else elem
    for elem in train_clean.annotations
]
num_boxes = [len(ann) for ann in annotations_clean]

In [None]:
# Add the per-row box count and replace the raw annotation strings with the parsed
# Python lists (each a list of dictionaries). Both lists were built by iterating
# train_clean.annotations in order, so they line up with the rows positionally.
train_clean["num_boxes"] = num_boxes
train_clean["annotations"] = annotations_clean

In [None]:
# Preview the frame with the new num_boxes column and the parsed annotations
train_clean.head()

In [None]:
# How many rows carry each bounding-box count
print("#box Frequency")
print(train_clean.num_boxes.value_counts())

In [None]:
# Total bounding boxes in the clean train set (sum of the per-row counts)
print(f"Number of Bounding Boxes in the dataset: {train_clean['num_boxes'].sum()}")

## Distribution of number of bounding boxes

In [None]:
# Frequency of each boxes-per-row value, as a one-column DataFrame indexed by box count
box_count = train_clean.num_boxes.value_counts().to_frame()

In [None]:
# Bar chart of how many rows have each number of bounding boxes.
plt.figure(figsize=(16, 9))
# Read the frequencies by position rather than by column name: pandas >= 2.0 names
# the value_counts() column "count" instead of "num_boxes", so `box_count.num_boxes`
# raises AttributeError there. Positional access works on both old and new pandas.
sns.barplot(x=box_count.index, y=list(box_count.iloc[:, 0]), palette="Greens_d")
plt.title("Distribution of Num_boxes")
plt.xlabel("# of Boxes")
plt.ylabel("Frequency")
plt.show()

So it looks like 57% of the data points have only one bounding box, followed by 19.1% with two bounding boxes.


## Looking at the boxes

In [None]:
# Structure of one annotation: the parsed entry of the first row
train_clean["annotations"].iloc[0]

In [None]:
# Build the image path for every row: the part of image_id before the first '-'
# selects the video folder, the part after it is the file name (without extension).
src = '../input/tensorflow-great-barrier-reef/train_images'
paths = []
for image_id in train_clean.image_id:
    id_parts = image_id.split('-')
    paths.append(os.path.join(src, f'video_{id_parts[0]}', id_parts[1] + '.jpg'))


In [None]:
# Attach the generated image paths as a new column; `paths` was built by iterating
# train_clean.image_id in order, so it lines up with the rows positionally.
train_clean['paths'] = paths

In [None]:
# classic way of iterating through and drawing the bounding boxes on an image
def vis_boxes(img_path, bboxes, outline='red', width=7):
    """Return a copy of an image with its bounding boxes drawn on it.

    Args:
        img_path: Path of the image file to open.
        bboxes: Iterable of dicts with 'x', 'y', 'width' and 'height' keys;
            (x, y) is the corner that width and height are added to.
        outline: Colour of the rectangle outline (default 'red').
        width: Thickness of the rectangle outline (default 7).

    Returns:
        A new PIL image with one rectangle per box; the drawing is done on an
        in-memory copy, not on the opened image object.
    """
    # ImageDraw's rectangle() takes two corners, so convert (x, y, width, height).
    coords = [
        [box['x'], box['y'], box['x'] + box['width'], box['y'] + box['height']]
        for box in bboxes
    ]

    # The context manager releases the file as soon as the pixel data has been
    # copied; everything below works on the independent copy.
    with Image.open(img_path) as img:
        img1 = img.copy()

    draw = ImageDraw.Draw(img1)
    for corners in coords:
        draw.rectangle(corners, outline=outline, width=width)

    return img1

In [None]:
# Preview the frame now that the paths column has been added
train_clean.head()

### Sequences with max bounding boxes

In [None]:
# Total bounding boxes in each sequence, as a one-column frame indexed by sequence
train_clean.groupby('sequence')[['num_boxes']].sum()

## Let's look at some samples

In [None]:
# lets plot a few
# some inspiration from https://www.kaggle.com/sjyangkevin/eda-bounding-box-analysis-annotated-videos

plt.figure(figsize=(16, 9))
n_images = 9
# Smallest grid that holds n_images: round the square root up for the columns,
# then use as many rows as those columns require (3 x 3 for n_images = 9).
c = max(int(sqrt(n_images)), 1)
if c * c < n_images:
    c += 1
r = -(-n_images // c)  # ceiling division
# Random draw of annotated frames; no seed is set, so each run shows different images.
train_plot = train_clean.sample(n = n_images)

for count, (_, row) in enumerate(train_plot.iterrows()):
    plt.subplot(r, c, count + 1)
    plt.imshow(vis_boxes(row['paths'], row['annotations']))

# tight_layout() has to run before show(): once show() has rendered the figure,
# a later tight_layout() no longer affects what was displayed.
plt.tight_layout()
plt.show()

## What Next?
- Some Advanced EDA
- Baseline model
- Error Analysis of Baseline
- Advanced model

**If you like it so far, consider upvoting 😄** 

<img src="https://media.giphy.com/media/eunrMjB8lBUKeL1fqD/giphy-downsized.gif">