## Imports

In [2]:
import os
import pathlib
import urllib.request
import tarfile
import pandas as pd
import cv2
import sklearn.model_selection
import sagemaker

## Clients

In [3]:
sagemaker_session = sagemaker.Session()

## Constants

* DataSet url
* ML pathlib directory

In [4]:
BIRDS_200_DIR = "CUB_200_2011"
BIRDS_200_DATASET_S3_URL = f"https://s3.amazonaws.com/fast-ai-imageclas/{BIRDS_200_DIR}.tgz"
IM2REC_GITHUB_URL = "https://raw.githubusercontent.com/apache/mxnet/master/tools/im2rec.py"

ML_DIR = pathlib.Path("/opt/ml/")
ML_DATA_DIR = ML_DIR / "data"
ML_DATA_BIRDS_200_DIR = ML_DATA_DIR / BIRDS_200_DIR
ML_PROC_DIR = ML_DIR / "proc"
ML_PROC_TRAIN_DIR = ML_PROC_DIR / "train"
ML_PROC_TEST_DIR = ML_PROC_DIR / "test"
ML_PROC_VALIDATION_DIR = ML_PROC_DIR / "validation"

IMAGE_ID = "image_id"
CLASS_ID = "class_id"

CLASSES_TXT = ML_DATA_BIRDS_200_DIR / "classes.txt"
BOUNDING_BOX_TXT = ML_DATA_BIRDS_200_DIR / "bounding_boxes.txt"
IMAGES_TXT = ML_DATA_BIRDS_200_DIR / "images.txt"
IMAGE_CLASS_LABELS_TXT = ML_DATA_BIRDS_200_DIR / "image_class_labels.txt"
SIZES_TXT = ML_DATA_BIRDS_200_DIR / "sizes.txt"
TRAIN_TEST_SPLIT_TXT = ML_DATA_BIRDS_200_DIR / "train_test_split.txt"
IMAGES_DIR = ML_DATA_BIRDS_200_DIR / "images"

RESIZE_SIZE = 256
PACK_LABEL = "bird_200"

TRAIN_LST = f"{PACK_LABEL}_train.lst"
VAL_LST = f"{PACK_LABEL}_val.lst"
TEST_LST = f"{PACK_LABEL}_test.lst"

TRAIN_REC = f"{PACK_LABEL}_train.rec"
VAL_REC = f"{PACK_LABEL}_val.rec"
TEST_REC = f"{PACK_LABEL}_test.rec"

LST_COLS = ["A", "B", "box_x0", "box_y0", "box_x1", "box_y1", "class_id", "relative_image_path"]
TEST_SIZE_FRACTION = 0.2
BUCKET = "sagemaker-us-east-1-180797159824"

## Create directories

In [5]:
ML_DIR.mkdir(exist_ok=True)
ML_DATA_DIR.mkdir(exist_ok=True)
ML_PROC_DIR.mkdir(exist_ok=True)
ML_PROC_TRAIN_DIR.mkdir(exist_ok=True)
ML_PROC_TEST_DIR.mkdir(exist_ok=True)
ML_PROC_VALIDATION_DIR.mkdir(exist_ok=True)

## Download the dataset

In [6]:
def download(url, download_dir, force=False):
    filename = url.split("/")[-1]
    filepath = download_dir / filename
    if force or not filepath.exists():
        urllib.request.urlretrieve(url, filepath)
    return filepath

In [7]:
dataset_path = download(BIRDS_200_DATASET_S3_URL, download_dir=ML_DATA_DIR)
print(dataset_path)

/opt/ml/data/CUB_200_2011.tgz


## Extract the dataset

In [8]:
compressed = tarfile.open(dataset_path)
compressed.extractall(ML_DATA_DIR)

## Load the dataset

In [9]:
classes_df = pd.read_csv(CLASSES_TXT, sep=" ", names=[CLASS_ID, "class_name"], header=None)
bounding_box_df = pd.read_csv(BOUNDING_BOX_TXT, sep=" ", names=[IMAGE_ID, "box_x0", "box_y0", "box_x1", "box_y1"], header=None)
images_df = pd.read_csv(IMAGES_TXT, sep=" ", names=[IMAGE_ID, "image_file_name"], header=None)
image_class_labels_df = pd.read_csv(IMAGE_CLASS_LABELS_TXT, sep=" ", names=[IMAGE_ID, CLASS_ID], header=None)
train_test_split_df = pd.read_csv(TRAIN_TEST_SPLIT_TXT, sep=" ", names=[IMAGE_ID, "is_train"], header=None)

full_df = bounding_box_df.merge(images_df, how="outer", on=IMAGE_ID)
full_df = full_df.merge(image_class_labels_df, how="outer", on=IMAGE_ID)
full_df = full_df.merge(train_test_split_df, how="outer", on=IMAGE_ID)
full_df = full_df.merge(classes_df, how="outer", on=CLASS_ID)
full_df.head()

Unnamed: 0,image_id,box_x0,box_y0,box_x1,box_y1,image_file_name,class_id,is_train,class_name
0,1,60.0,27.0,325.0,304.0,001.Black_footed_Albatross/Black_Footed_Albatr...,1,0,001.Black_footed_Albatross
1,2,139.0,30.0,153.0,264.0,001.Black_footed_Albatross/Black_Footed_Albatr...,1,1,001.Black_footed_Albatross
2,3,14.0,112.0,388.0,186.0,001.Black_footed_Albatross/Black_Footed_Albatr...,1,0,001.Black_footed_Albatross
3,4,112.0,90.0,255.0,242.0,001.Black_footed_Albatross/Black_Footed_Albatr...,1,1,001.Black_footed_Albatross
4,5,70.0,50.0,134.0,303.0,001.Black_footed_Albatross/Black_Footed_Albatr...,1,1,001.Black_footed_Albatross


## Create annotated .lst files

To create annotated .lst files for detection, please refer to the instructions provided in this README - Creating Your Own .lst Files for Detection.
The format of the .lst file is as follows:

```
RowIdx  A  B  [Extra Header]   [(Object0), (Object1), ... (ObjectN)] <Filename>
```

Please note that [Extra Header] and (ObjectX) are separate groups of columns that consist of multiple columns each.

* `RowIdx` indicates the index of the row, starting from 0.
* `A` represents the total number of columns in [Extra Header] plus 2.
* `B` denotes the number of columns that each ObjectX contains.
* `[Extra Header]` is an optional section for additional header information, such as image width/height.
* `[(Object0), (Object1), ..., (ObjectN)]` is a list of objects, where each object is composed of multiple columns.
* `<Filename>` is the relative filepath of the image

The separator `\t` must be used.

In [10]:
(train_val_entries, test_entries) = ([], [])
for i, row in full_df.iterrows():
    img_path = IMAGES_DIR / row["image_file_name"]
    img = cv2.imread(str(img_path))
    (height, width, _) = img.shape
    (box_x0, box_x1, box_y0, box_y1) = (row["box_x0"], row["box_x1"], row["box_y0"], row["box_y1"])
    (box_x0, box_x1, box_y0, box_y1) = (box_x0/height, box_x1/height, box_y0/width, box_y1/width)
    entry = {
        "A": 2,
        "B": 5,
        "relative_image_path": row["image_file_name"],
        "box_x0": box_x0,
        "box_x1": box_x1,
        "box_y0": box_y0,
        "box_y1": box_y1,
        "class_id": float(row["class_id"] - 1), # 1, 2, ... -> 0, 1, ....
    }
    if row["is_train"]:
        train_val_entries.append(entry)
    else:
        test_entries.append(entry)

(train_val_df, test_df) = (pd.DataFrame(train_val_entries).reindex(LST_COLS, axis=1), pd.DataFrame(test_entries).reindex(LST_COLS, axis=1))
(train_df, val_df) = sklearn.model_selection.train_test_split(train_val_df, test_size=TEST_SIZE_FRACTION)

In [11]:
train_df.head(2)

Unnamed: 0,A,B,box_x0,box_y0,box_x1,box_y1,class_id,relative_image_path
631,2,5,0.127226,0.196,0.826972,0.476,21.0,022.Chuck_will_Widow/Chuck_Will_Widow_0012_796...
5967,2,5,0.420054,0.202,0.869919,0.484,199.0,200.Common_Yellowthroat/Common_Yellowthroat_00...


In [12]:
val_df.head(2)

Unnamed: 0,A,B,box_x0,box_y0,box_x1,box_y1,class_id,relative_image_path
3824,2,5,0.324895,0.252,0.318565,0.428,127.0,128.Seaside_Sparrow/Seaside_Sparrow_0045_12069...
2321,2,5,0.550898,0.252,0.305389,0.224,77.0,078.Gray_Kingbird/Gray_Kingbird_0016_70288.jpg


In [13]:
test_df.head(2)

Unnamed: 0,A,B,box_x0,box_y0,box_x1,box_y1,class_id,relative_image_path
0,2,5,0.179104,0.054,0.970149,0.608,0.0,001.Black_footed_Albatross/Black_Footed_Albatr...
1,2,5,0.040346,0.224,1.118156,0.372,0.0,001.Black_footed_Albatross/Black_Footed_Albatr...


## Prepare directories for RecordIO creation

In [14]:
train_df.to_csv(TRAIN_LST, sep="\t", float_format="%.4f", header=False)
val_df.to_csv(VAL_LST, sep="\t", float_format="%.4f", header=False)
test_df.to_csv(TEST_LST, sep="\t", float_format="%.4f", header=False)

In [15]:
!tail -n 3 $TRAIN_LST

3785	2	5	0.4850	0.0560	0.5240	0.5520	126.0000	127.Savannah_Sparrow/Savannah_Sparrow_0066_119949.jpg
527	2	5	0.3086	0.0160	0.7062	0.6400	17.0000	018.Spotted_Catbird/Spotted_Catbird_0005_19411.jpg
4774	2	5	0.3084	0.1920	1.1856	0.4740	159.0000	160.Black_throated_Blue_Warbler/Black_Throated_Blue_Warbler_0106_161523.jpg


## Create RecordIO files -- model inputs

The model expects RecordIO files as input. This is accomplished by [im2rec.py](https://github.com/apache/mxnet/blob/master/tools/im2rec.py) as follows:
1. Create train.lst and test.lst in the same directory as images
1. Run the `im2rec.py` to produce the .rec file

In [16]:
download(IM2REC_GITHUB_URL, download_dir=pathlib.Path("."))
os.system(f"python3 im2rec.py --resize {RESIZE_SIZE} --pack-label {PACK_LABEL} {IMAGES_DIR}")

0

## Upload and Register RecordIO files

In [17]:
train_channel = "train"
validation_channel = "validation"

sagemaker_session.upload_data(path=TRAIN_REC, bucket=BUCKET, key_prefix=train_channel)
sagemaker_session.upload_data(path=VAL_REC, bucket=BUCKET, key_prefix=validation_channel)

's3://sagemaker-us-east-1-180797159824/validation/bird_200_val.rec'

The files are now in s3, ready to use for training.