# Files can be viewed here: https://drive.google.com/drive/folders/1bzg25F-sFlvD00n48j-YW_pRadQfYArk?usp=sharing

## Load the competition data from Kaggle

In [None]:
! pip install kaggle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
! mkdir ~/.kaggle

In [None]:
# Copy kaggle.json from the current working directory into ~/.kaggle/
# (the file must already be present there, e.g. uploaded to the session).
! cp kaggle.json ~/.kaggle/

In [None]:
# Restrict kaggle.json to owner read/write only (mode 600).
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the tensorflow-great-barrier-reef competition archive
# (the recorded run saved a ~14.2G zip to /content).
!kaggle competitions download -c tensorflow-great-barrier-reef

Downloading tensorflow-great-barrier-reef.zip to /content
100% 14.2G/14.2G [12:01<00:00, 20.2MB/s]
100% 14.2G/14.2G [12:01<00:00, 21.1MB/s]


In [None]:
! unzip /content/tensorflow-great-barrier-reef.zip

In [None]:
import pandas as pd
import numpy as np
import os
import ast
import yaml
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from shutil import copyfile
import yaml
import random
# Seed Python's built-in `random` module.
# NOTE(review): this does not seed NumPy/pandas random number generators.
random.seed(2022)

## Mount Google Drive to save the data for later use

In [None]:
# Mount Google Drive at /content/drive so the dataset can be saved there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import shutil

## Save the test and train csv files to the drive for later use

In [None]:
# Persist the competition CSVs to Drive so later sessions can skip the Kaggle download.
csv_backup_dir = "/content/drive/MyDrive/cots-data"

# Create the folder first: if it is missing, shutil.copy would treat the
# destination as a file name and write a single file called "cots-data".
os.makedirs(csv_backup_dir, exist_ok=True)

for csv_name in ("test.csv", "train.csv"):
    shutil.copy(f"/content/{csv_name}", csv_backup_dir)

'/content/drive/MyDrive/cots-data/train.csv'

## Transfer all video images to train images folder

In [None]:
import os
import shutil

# Copy the extracted frames of every video (video_0, video_1, video_2) to Drive
# in one pass, instead of editing the folder name and re-running the cell.
src_root = '/content/train_images'
dest_root = '/content/drive/MyDrive/cots-data/train_images'

for video_name in ('video_0', 'video_1', 'video_2'):
    src = os.path.join(src_root, video_name)
    dest = os.path.join(dest_root, video_name)

    # The destination must exist as a directory, otherwise shutil.copy would
    # write every frame to a single file named after the video.
    os.makedirs(dest, exist_ok=True)

    for file_name in os.listdir(src):
        full_file_name = os.path.join(src, file_name)
        # Skip anything that is not a regular file (e.g. nested folders).
        if os.path.isfile(full_file_name):
            shutil.copy(full_file_name, dest)

In [None]:
# Load the training metadata from the copy saved on Drive and preview the first rows.
df = pd.read_csv('/content/drive/MyDrive/cots-data/train.csv')
df.head()

Unnamed: 0,video_id,sequence,video_frame,sequence_frame,image_id,annotations
0,0,40258,0,0,0-0,[]
1,0,40258,1,1,0-1,[]
2,0,40258,2,2,0-2,[]
3,0,40258,3,3,0-3,[]
4,0,40258,4,4,0-4,[]


## Parse the annotations and keep only frames with bounding boxes

In [None]:
# Parse the stringified annotation lists into Python lists of dicts.
# Values that are already parsed are left untouched so the cell can be re-run
# without ast.literal_eval failing on a list.
df['annotations'] = df['annotations'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

# Number of bounding boxes annotated on each frame.
df['no_of_bboxes'] = df['annotations'].apply(len)

# Keep only frames with at least one bounding box and renumber the rows 0..n-1.
df = df.loc[df['no_of_bboxes'] > 0].reset_index(drop=True)
df.head()

Unnamed: 0,video_id,sequence,video_frame,sequence_frame,image_id,annotations,no_of_bboxes
0,0,40258,16,16,0-16,"[{'x': 559, 'y': 213, 'width': 50, 'height': 32}]",1
1,0,40258,17,17,0-17,"[{'x': 558, 'y': 213, 'width': 50, 'height': 32}]",1
2,0,40258,18,18,0-18,"[{'x': 557, 'y': 213, 'width': 50, 'height': 32}]",1
3,0,40258,19,19,0-19,"[{'x': 556, 'y': 214, 'width': 50, 'height': 32}]",1
4,0,40258,20,20,0-20,"[{'x': 555, 'y': 214, 'width': 50, 'height': 32}]",1


In [None]:
# Largest number of bounding boxes found on a single frame.
print(df['no_of_bboxes'].max())

18


## Define bounding boxes

In [None]:
def retrieve_bounding_box(list_of_annotations, img_width=1280, img_height=720):
    '''
    Convert a frame's annotations into YOLO label rows.

    Parameters
    ----------
    list_of_annotations : list of dict
        Each dict has the keys 'x', 'y', 'width' and 'height' in pixels.
        'x'/'y' are treated as the top-left corner of the box.
    img_width, img_height : int, optional
        Image size in pixels used for normalisation (defaults: 1280 x 720).

    Returns
    -------
    list of list
        One row per annotation in the order YOLO expects:
        [class_id, x_centre, y_centre, box_width, box_height], where class_id
        is always 0 and the other four values are fractions of the image size.
        An empty input yields an empty list.
    '''
    bboxes = []
    for bbox in list_of_annotations:
        width = bbox['width']
        height = bbox['height']

        # Box centre in pixels = top-left corner + half the box size.
        # The vertical centre must start from 'y' (not 'x'), and the half-size
        # is kept as a float so odd-sized boxes are not shifted by truncation.
        xc = bbox['x'] + width / 2
        yc = bbox['y'] + height / 2

        bboxes.append([
            0,
            xc / img_width,
            yc / img_height,
            width / img_width,
            height / img_height,
        ])
    return bboxes

## Map each image (image_id) to its path in the train_images folder

In [None]:
# YOLO-formatted rows for every frame (one list of boxes per row of df).
df['bbox'] = [retrieve_bounding_box(annotations) for annotations in df['annotations']]

# Build every image path up front and assign the whole column at once.
# Writing element by element through `df.image_path[i] = ...` is chained
# indexing, which raises SettingWithCopyWarning and may not update df.
df['image_path'] = [
    f"/content/drive/MyDrive/cots-data/train_images/video_{video_id}/{video_frame}.jpg"
    for video_id, video_frame in zip(df['video_id'], df['video_frame'])
]
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.image_path[i] = path


Unnamed: 0,video_id,sequence,video_frame,sequence_frame,image_id,annotations,no_of_bboxes,bbox,image_path
0,0,40258,16,16,0-16,"[{'x': 559, 'y': 213, 'width': 50, 'height': 32}]",1,"[[0, 0.45625, 0.7986111111111112, 0.0390625, 0...",/content/drive/MyDrive/cots-data/train_images/...
1,0,40258,17,17,0-17,"[{'x': 558, 'y': 213, 'width': 50, 'height': 32}]",1,"[[0, 0.45546875, 0.7972222222222223, 0.0390625...",/content/drive/MyDrive/cots-data/train_images/...
2,0,40258,18,18,0-18,"[{'x': 557, 'y': 213, 'width': 50, 'height': 32}]",1,"[[0, 0.4546875, 0.7958333333333333, 0.0390625,...",/content/drive/MyDrive/cots-data/train_images/...
3,0,40258,19,19,0-19,"[{'x': 556, 'y': 214, 'width': 50, 'height': 32}]",1,"[[0, 0.45390625, 0.7944444444444444, 0.0390625...",/content/drive/MyDrive/cots-data/train_images/...
4,0,40258,20,20,0-20,"[{'x': 555, 'y': 214, 'width': 50, 'height': 32}]",1,"[[0, 0.453125, 0.7930555555555555, 0.0390625, ...",/content/drive/MyDrive/cots-data/train_images/...


# Train, test, validate split

In [None]:
# Shuffle once with a fixed seed so the 60/20/20 split is reproducible.
# pandas does not use Python's `random` module, so the seed is passed explicitly.
shuffled_df = df.sample(frac=1, random_state=2022).reset_index(drop=True)

# Split boundaries: first 60% -> train, next 20% -> validate, remainder -> test.
train_end = int(.6 * len(shuffled_df))
valid_end = int(.8 * len(shuffled_df))

train_df = shuffled_df.iloc[:train_end].reset_index(drop=True)
valid_df = shuffled_df.iloc[train_end:valid_end].reset_index(drop=True)
test_df = shuffled_df.iloc[valid_end:].reset_index(drop=True)

In [None]:
# Display the training split.
train_df

Unnamed: 0,video_id,sequence,video_frame,sequence_frame,image_id,annotations,no_of_bboxes,bbox,image_path
0,1,18048,6716,7,1-6716,"[{'x': 676, 'y': 361, 'width': 37, 'height': 36}]",1,"[[0, 0.5421875, 0.9638888888888889, 0.02890625...",/content/drive/MyDrive/cots-data/train_images/...
1,0,40258,103,103,0-103,"[{'x': 642, 'y': 49, 'width': 20, 'height': 21}]",1,"[[0, 0.509375, 0.9055555555555556, 0.015625, 0...",/content/drive/MyDrive/cots-data/train_images/...
2,0,53708,9769,884,0-9769,"[{'x': 918, 'y': 380, 'width': 42, 'height': 35}]",1,"[[0, 0.73359375, 1.2986111111111112, 0.0328125...",/content/drive/MyDrive/cots-data/train_images/...
3,2,22643,5748,385,2-5748,"[{'x': 500, 'y': 304, 'width': 50, 'height': 3...",15,"[[0, 0.41015625, 0.7194444444444444, 0.0390625...",/content/drive/MyDrive/cots-data/train_images/...
4,0,59337,1888,37,0-1888,"[{'x': 441, 'y': 243, 'width': 41, 'height': 31}]",1,"[[0, 0.36015625, 0.6333333333333333, 0.0320312...",/content/drive/MyDrive/cots-data/train_images/...
...,...,...,...,...,...,...,...,...,...
2946,1,17665,6923,78,1-6923,"[{'x': 434, 'y': 629, 'width': 71, 'height': 6...",3,"[[0, 0.36640625, 0.6472222222222223, 0.0554687...",/content/drive/MyDrive/cots-data/train_images/...
2947,0,53708,9691,806,0-9691,"[{'x': 57, 'y': 535, 'width': 72, 'height': 70...",3,"[[0, 0.07265625, 0.12777777777777777, 0.05625,...",/content/drive/MyDrive/cots-data/train_images/...
2948,2,22643,5845,482,2-5845,"[{'x': 1087, 'y': 655, 'width': 95, 'height': ...",2,"[[0, 0.8859375, 1.5541666666666667, 0.07421875...",/content/drive/MyDrive/cots-data/train_images/...
2949,2,29859,8007,235,2-8007,"[{'x': 343, 'y': 700, 'width': 38, 'height': 18}]",1,"[[0, 0.2828125, 0.4888888888888889, 0.0296875,...",/content/drive/MyDrive/cots-data/train_images/...


In [None]:
# Report the (rows, columns) shape of each split, in train / validate / test order.
for split_df in (train_df, valid_df, test_df):
    print(split_df.shape)

(2951, 9)
(984, 9)
(984, 9)


## Map the train annotations with the images and create separate folders for each train, test and validate. 

In [None]:
# Copy every frame into images/<split>/<image_id>.jpg on Drive.
images_root = '/content/drive/MyDrive/cots-data/images'

for split_name, split_df in (('train', train_df), ('validate', valid_df), ('test', test_df)):
    split_dir = f'{images_root}/{split_name}'
    # copyfile does not create missing folders, so make sure the target exists.
    os.makedirs(split_dir, exist_ok=True)

    for row in tqdm(split_df.itertuples(index=False), total=len(split_df)):
        copyfile(row.image_path, f'{split_dir}/{row.image_id}.jpg')


100%|██████████| 2951/2951 [00:48<00:00, 61.21it/s]
100%|██████████| 984/984 [00:13<00:00, 71.74it/s]
100%|██████████| 984/984 [00:13<00:00, 74.27it/s]


# Get labels in txt files

In [None]:
# Write one YOLO label file per frame: labels/<split>/<image_id>.txt,
# one line per box formatted as "class x_centre y_centre width height".
labels_root = "/content/drive/MyDrive/cots-data/labels"

for split_name, split_df in (("train", train_df), ("validate", valid_df), ("test", test_df)):
    labels_dir = f"{labels_root}/{split_name}"
    # np.savetxt cannot create missing folders, so make sure the target exists.
    os.makedirs(labels_dir, exist_ok=True)

    for row in tqdm(split_df.itertuples(index=False), total=len(split_df)):
        np.savetxt(
            f"{labels_dir}/{row.image_id}.txt",
            np.array(row.bbox),
            # The class id is written as an integer, the four box values as floats.
            fmt=["%d", "%f", "%f", "%f", "%f"],
        )

100%|██████████| 2951/2951 [00:27<00:00, 107.11it/s]
100%|██████████| 984/984 [00:08<00:00, 115.02it/s]
100%|██████████| 984/984 [00:08<00:00, 121.92it/s]


In [None]:
# Recursively zip the validation label folder into /content/valid_labels.zip.
!zip -r /content/valid_labels.zip /content/drive/MyDrive/cots-data/labels/validate

In [None]:
# Download the zipped validation labels using Colab's files helper.
from google.colab import files
files.download("/content/valid_labels.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>