<center><h1>Help Protect the Great Barrier Reef with Amazon SageMaker Object Detection</h1></center>

![TensorFlow - Help Protect the Great Barrier Reef](https://storage.googleapis.com/kaggle-competitions/kaggle/31703/logos/header.png?t=2021-10-29-00-30-04)
    
Data Source: https://www.kaggle.com/c/tensorflow-great-barrier-reef

In [None]:
# !pip install kaggle imagesize

In [None]:
import kaggle
# Download the competition archive into the current working directory via the Kaggle CLI.
# NOTE(review): presumably requires Kaggle API credentials to be configured beforehand — confirm.
!kaggle competitions download -c tensorflow-great-barrier-reef

In [None]:
import pandas as pd
import numpy as np
import imagesize
import json
from sklearn.model_selection import train_test_split
from joblib import Parallel, delayed

In [None]:
from itertools import groupby
import numpy as np
from tqdm.notebook import tqdm
tqdm.pandas()
import pandas as pd
import os
import pickle
import cv2
from multiprocessing import Pool
import matplotlib.pyplot as plt
import ast
import glob

import shutil
import sys

In [None]:
%%time
import boto3
import re
import sagemaker
from sagemaker import get_execution_role
from sagemaker.image_uris import retrieve

# IAM role and SageMaker session used by the estimator further down.
role = get_execution_role()
sess = sagemaker.Session()

# All dataset artifacts are uploaded under s3://<default bucket>/<prefix>/...
bucket = sess.default_bucket()
prefix = 'dataset/great-barrier-reef'

# Container image URI of the built-in 'object-detection' algorithm for the current region.
training_image = retrieve('object-detection', boto3.Session().region_name)

In [None]:
# !unzip tensorflow-great-barrier-reef.zip

## Data Preparation
1. Filter out images without annotations
2. Create directories for images and annotations
3. Generate annotation files with the same name as the corresponding image


In [None]:
# Notebook-wide configuration.
# NOTE(review): FOLD and ROOT_DIR are not referenced by any later cell visible in this
# notebook — confirm whether they are still needed.
FOLD = 2 # which fold to train
REMOVE_NOBBOX = True # remove images with no bbox
ROOT_DIR  = 'tensorflow-great-barrier-reef'
IMAGE_DIR = 'images' # directory to save images
LABEL_DIR = 'annotation' # directory to save labels

In [None]:
!mkdir -p {IMAGE_DIR}
!mkdir -p {LABEL_DIR}

In [None]:
def get_path(row):
    """
    Attach source and destination paths to one row of the training frame.

    Reads ``row.video_id`` and ``row.video_frame`` and adds three entries:
    ``old_image_path`` (where the frame is read from), ``image_path`` (flattened
    name under IMAGE_DIR) and ``label_path`` (matching .json under LABEL_DIR).
    The row is modified and returned.
    """
    derived_paths = {
        'old_image_path': f'train_images/video_{row.video_id}/{row.video_frame}.jpg',
        'image_path': f'{IMAGE_DIR}/video_{row.video_id}_{row.video_frame}.jpg',
        'label_path': f'{LABEL_DIR}/video_{row.video_id}_{row.video_frame}.json',
    }
    for column, value in derived_paths.items():
        row[column] = value
    return row

In [None]:
# Train Data
df = pd.read_csv('train.csv')
df = df.apply(get_path, axis=1)

In [None]:
# The CSV stores each annotation list as a string literal; turn it into Python objects.
df['annotations'] = df['annotations'].apply(ast.literal_eval)

In [None]:
# Number of boxes per frame, and the share of frames with / without any box.
df['num_bbox'] = df['annotations'].apply(lambda x: len(x))
data = (df.num_bbox>0).value_counts()/len(df)*100
# value_counts() is labelled by False/True and ordered by frequency, so look the
# percentages up by label (not by position) and default to 0 when a class is absent.
print(f"No BBox: {data.get(False, 0.0):0.2f}% | With BBox: {data.get(True, 0.0):0.2f}%")

## 🧹 Clean Data

In [None]:
# Drop frames that carry no bounding box when the config flag asks for it.
df = df.query("num_bbox>0") if REMOVE_NOBBOX else df

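We need to generate the content of each annotation file following the example below. This is the JSON layout that the annotation-writing functions in the next cells produce for the SageMaker Object Detection algorithm.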

```json
{
   "file": "your_image_directory/sample_image1.jpg",
   "image_size": [
      {
         "width": 50,
         "height": 32,
         "depth": 3
      }
   ],
   "annotations": [
      {
         "class_id": 0,
         "left": 559,
         "top": 213,
         "width": 50,
         "height": 32
      }
   ],
   "categories": [
      {
         "class_id": 0,
         "name": "starfish"
      }
   ]
}
```

In [None]:
def generate_sm_annotation(row):
    """
    Build the SageMaker Object Detection annotation dict for one image.

    Parameters
    ----------
    row : pandas.Series
        One row of the training frame. Reads ``old_image_path`` (image file on
        disk, used to obtain the pixel size), ``image_path`` (stored as the
        ``file`` entry) and ``annotations`` (iterable of dicts with ``x``, ``y``,
        ``width`` and ``height`` keys).

    Returns
    -------
    dict
        Keys ``file``, ``image_size``, ``categories`` and ``annotations`` in the
        layout of the JSON example shown above; every box gets ``class_id`` 0
        ("starfish").

    Raises
    ------
    ValueError
        If the image dimensions cannot be determined.
    """
    # `imagesize` is imported at the top of the notebook; the previous code used
    # `Image.open`, but `Image` is never imported and raised NameError.
    width, height = imagesize.get(row.old_image_path)
    if width <= 0 or height <= 0:
        raise ValueError(f'Could not determine image size for {row.old_image_path}')

    annotation_object = {
        'file': row.image_path,
        'image_size': [
            {'width': width, 'height': height, 'depth': 3}
        ],
        'categories': [
            {'class_id': 0, 'name': 'starfish'}
        ]
    }
    # Source boxes use x/y for the top-left corner; SageMaker expects left/top.
    annotation_object['annotations'] = [
        {'class_id': 0, 'width': box['width'], 'height': box['height'], 'left': box['x'], 'top': box['y']}
        for box in row.annotations
    ]
    return annotation_object
    

In [None]:
# Build one SageMaker annotation dict per row (opens every image file to read its size).
df['sm_annotations'] = df.apply(generate_sm_annotation, axis=1)

In [None]:
# 70/30 random split; the fixed seed keeps the split (and therefore the files copied
# into images/train and images/validation) reproducible across kernel restarts.
# NOTE(review): rows are individual video frames, so a random row-level split may put
# near-identical neighbouring frames on both sides — consider splitting by video.
train, validation = train_test_split(df, test_size=0.3, random_state=42)

In [None]:
# Peek at the first rows of the training split.
train.head()

In [None]:
def make_copy(path, class_):
    """
    Copy one source frame into the flat image directory of a split.

    Parameters
    ----------
    path : str
        Slash-separated source path whose last two components are the video
        directory and the frame file, e.g. ``train_images/video_0/123.jpg``.
    class_ : str
        Split name; the file is copied to ``images/<class_>/``.

    Returns
    -------
    None
        The copy is written to ``images/<class_>/<video dir>_<file name>``.
    """
    parts = path.split('/')
    filename = parts[-1]
    video_id = parts[-2]
    target_dir = os.path.join('images', class_)
    # The split directory is not created anywhere else; exist_ok keeps this safe
    # when many worker threads reach this line at the same time.
    os.makedirs(target_dir, exist_ok=True)
    shutil.copy(path, os.path.join(target_dir, f'{video_id}_{filename}'))
    return

In [None]:
def write_sm_annotations(annotation, class_):
    """
    Write one SageMaker Object Detection annotation dict to a JSON file.

    Parameters
    ----------
    annotation : dict
        Annotation dict whose ``file`` entry has the form
        ``<top dir>/<image file name>`` (e.g. ``images/video_0_123.jpg``).
        The dict itself is left unchanged.
    class_ : str
        Split name; output goes to ``<top dir>/<class_>_annotation/``.

    Returns
    -------
    None
        The JSON is written to ``<top dir>/<class_>_annotation/<image stem>.json``
        with its ``file`` entry reduced to the bare image file name.

    Raises
    ------
    ValueError
        If ``annotation['file']`` has no directory component.
    """
    parts = annotation['file'].split('/')
    if len(parts) < 2:
        raise ValueError(f"Expected '<dir>/<image>' in annotation['file'], got {annotation['file']!r}")
    top_path = parts[0]
    image_name = parts[1]

    # Swap only the extension; a blanket str.replace('jpg', 'json') would also
    # rewrite any 'jpg' occurring elsewhere in the path.
    stem, _ = os.path.splitext(image_name)
    output_dir = os.path.join(top_path, f'{class_}_annotation')
    os.makedirs(output_dir, exist_ok=True)

    # Work on a shallow copy so the caller's dict keeps its original 'file'
    # value and this function can be re-run on the same data.
    record = dict(annotation, file=image_name)
    with open(os.path.join(output_dir, f'{stem}.json'), 'w') as annotation_output_file:
        json.dump(record, annotation_output_file, indent=0)
    return

### Copy train images and annotation files to the respective paths

In [None]:
# Source paths of every frame assigned to the training split.
train_image_paths = train['old_image_path'].tolist()

In [None]:
# Copy the training frames into images/train using a thread pool (the work is I/O bound).
train_copy_jobs = (delayed(make_copy)(src, 'train') for src in tqdm(train_image_paths))
_ = Parallel(n_jobs=-1, backend='threading')(train_copy_jobs)

In [None]:
# Annotation dicts of every frame assigned to the training split.
train_image_sm_annotations = train['sm_annotations'].tolist()

In [None]:
# Inspect one training annotation dict before it is written to disk.
train_image_sm_annotations[0]

In [None]:
# Write one JSON annotation file per training frame using a thread pool.
train_write_jobs = (
    delayed(write_sm_annotations)(record, 'train')
    for record in tqdm(train_image_sm_annotations)
)
_ = Parallel(n_jobs=-1, backend='threading')(train_write_jobs)



### Copy validation images and annotation files to the respective paths

In [None]:
# Source paths of every frame assigned to the validation split.
validation_image_paths = validation['old_image_path'].tolist()

In [None]:
# Copy the validation frames into images/validation using a thread pool.
validation_copy_jobs = (delayed(make_copy)(src, 'validation') for src in tqdm(validation_image_paths))
_ = Parallel(n_jobs=-1, backend='threading')(validation_copy_jobs)

In [None]:
# Annotation dicts of every frame assigned to the validation split.
validation_image_sm_annotations = validation['sm_annotations'].tolist()

In [None]:
# Inspect one validation annotation dict before it is written to disk.
validation_image_sm_annotations[0]

In [None]:
# Write one JSON annotation file per validation frame using a thread pool.
validation_write_jobs = (
    delayed(write_sm_annotations)(record, 'validation')
    for record in tqdm(validation_image_sm_annotations)
)
_ = Parallel(n_jobs=-1, backend='threading')(validation_write_jobs)



Let's inspect some images for fun

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches

In [None]:
# Hard-coded sample frame to visualize.
# NOTE(review): images/validation/ only receives the frames picked by the train/validation
# split, so this particular file may not exist on a given run — confirm, or take a path
# from the `validation` frame instead.
image_path = 'images/validation/video_0_11923.jpg'

In [None]:
# Load the pixels with matplotlib (already imported); the previous `Image.open` call
# raised NameError because `Image` (PIL) is never imported in this notebook.
im = plt.imread(image_path)

# Create figure and axes
fig, ax = plt.subplots()

# Display the image
ax.imshow(im)

# Create a Rectangle patch: (left, top), width, height in pixels.
# NOTE(review): the box is hard-coded for the sample frame above — confirm it matches
# that frame's annotation.
rect = patches.Rectangle((554, 360), 42, 34, linewidth=1, edgecolor='r', facecolor='none')

# Add the patch to the Axes
ax.add_patch(rect)

plt.show()

### Upload data to S3 for model training

In [None]:
# Four channels: train, validation, train_annotation, and validation_annotation.
# Each URI is s3://<bucket>/<prefix>/<channel>/ and is the upload target used below.
s3_train = 's3://{}/{}/train/'.format(bucket, prefix)
s3_validation = 's3://{}/{}/validation/'.format(bucket, prefix)
s3_train_annotation = 's3://{}/{}/train_annotation/'.format(bucket, prefix)
s3_validation_annotation = 's3://{}/{}/validation_annotation/'.format(bucket, prefix)

In [None]:
# Show the resolved S3 URI of the train channel as a sanity check.
s3_train

In [None]:
# !aws s3 cp images/train/ $s3_train --recursive --dryrun

In [None]:
# upload the image files to train and validation channels
!aws s3 cp images/train/ $s3_train --recursive --quiet
!aws s3 cp images/validation/ $s3_validation --recursive --quiet

!aws s3 cp images/train_annotation/ $s3_train_annotation --recursive --quiet
!aws s3 cp images/validation_annotation/ $s3_validation_annotation --recursive --quiet

### Fine-tuning the Object Detection Model

Once we have the data available in the correct format for training, the next step is to actually train the model using the data. Before training the model, we need to setup the training parameters. The next section will explain the parameters in detail.


#### Training parameters
There are two kinds of parameters that need to be set for training. The first kind is the parameters for the training job. These include:

- Training instance count: This is the number of instances on which to run the training. When the number of instances is greater than one, the object detection algorithm will run in a distributed setting.
- Training instance type: This indicates the type of machine on which to run the training. Typically, we use GPU instances for this kind of training.
- Output path: This is the S3 folder in which the training output is stored.

In [None]:
# Training-job time limits and spot-instance switch used when building the estimator.
# NOTE(review): the limits are presumably in seconds (SageMaker Estimator convention) — confirm
# that 1300 is enough for the configured number of epochs.
train_use_spot_instances = True
train_max_run=1300
# Only meaningful for spot training; None when spot instances are disabled.
train_max_wait = 2400 if train_use_spot_instances else None

In [None]:
# Model artifacts and training output are written under this prefix.
s3_output_location = 's3://{}/{}/output'.format(bucket, prefix)

# Estimator for the built-in object-detection image: one GPU instance, File input mode.
object_detector_model = sagemaker.estimator.Estimator(
    training_image,
    role,
    instance_count=1,
    instance_type='ml.p2.xlarge',
    volume_size=50,
    input_mode='File',
    use_spot_instances=train_use_spot_instances,
    max_run=train_max_run,
    # Use the dedicated wait budget (None when spot is disabled). The previous code
    # passed train_max_run here, which ignored train_max_wait and still sent a
    # max_wait value when spot training was turned off.
    max_wait=train_max_wait,
    output_path=s3_output_location,
    sagemaker_session=sess
)

In [None]:
# One object class; the sample count is the number of rows in the training split.
num_classes = 1
num_training_samples = len(train.index)
print("num classes: {}, num training images: {}".format(num_classes, num_training_samples))

In [None]:
# Passed below as the `epochs` and `lr_scheduler_step` hyperparameters.
# NOTE(review): presumably the learning rate is scaled by lr_scheduler_factor at
# epochs 33 and 67 — confirm against the algorithm's hyperparameter reference.
num_epochs = 100
lr_steps = "33,67"

In [None]:
# Collect the algorithm hyperparameters in one mapping, then hand them to the estimator.
hyperparameters = dict(
    # network
    base_network="resnet-50",
    use_pretrained_model=1,
    num_classes=num_classes,
    # optimisation schedule
    mini_batch_size=16,
    epochs=num_epochs,
    learning_rate=0.001,
    lr_scheduler_step=lr_steps,
    lr_scheduler_factor=0.1,
    optimizer="sgd",
    momentum=0.9,
    weight_decay=0.0005,
    # detection settings
    overlap_threshold=0.5,
    nms_threshold=0.45,
    image_shape=512,
    label_width=350,
    num_training_samples=num_training_samples,
)
object_detector_model.set_hyperparameters(**hyperparameters)

In [None]:
# Image channels: both splits share the same input settings, only the S3 prefix differs.
# NOTE(review): confirm 'application/x-image' is the content type the Object Detection
# algorithm expects for image-format channels (its docs may call for image/jpeg).
image_channel_options = dict(
    distribution='FullyReplicated',
    content_type='application/x-image',
    s3_data_type='S3Prefix',
)
train_data = sagemaker.inputs.TrainingInput(s3_train, **image_channel_options)
validation_data = sagemaker.inputs.TrainingInput(s3_validation, **image_channel_options)

In [None]:
# Annotation channels: same input settings as the image channels, pointing at the JSON prefixes.
annotation_channel_options = dict(
    distribution='FullyReplicated',
    content_type='application/x-image',
    s3_data_type='S3Prefix',
)
train_data_annotation = sagemaker.inputs.TrainingInput(
    s3_train_annotation, **annotation_channel_options
)
validation_data_annotation = sagemaker.inputs.TrainingInput(
    s3_validation_annotation, **annotation_channel_options
)

# Channel name -> input definition, as passed to fit().
data_channels = {
    'train': train_data,
    'validation': validation_data,
    'train_annotation': train_data_annotation,
    'validation_annotation': validation_data_annotation,
}

In [None]:
# Display the channel mapping that will be passed to fit().
data_channels

## Submit training job


In [None]:
%%time
# Start the training job on the four data channels and stream its logs into the notebook.
object_detector_model.fit(inputs=data_channels, logs=True)