# Data Cleaning
I got the data with the following structure:
```
├── crowd4access-images
│   ├── crowd4access-images
│   │   ├── test
│   │   │   ├── image
│   │   │   │   ├── *.jpg
│   │   │   ├── label
│   │   │   │   ├── *.txt
│   │   ├── trainval
│   │   │   ├── image
│   │   │   │   ├── *.jpg
│   │   │   ├── label
│   │   │   │   ├── *.txt
```

where each image file had a text file of the same name with annotations in the form:

```
class_name top_left_x top_left_y bottom_right_x bottom_right_y
```

in absolute pixel coordinates (not relative). The DETR model I am using expects the data in COCO format, which describes the whole dataset in a single `.json` file of the form:

```
{
    "info": {...},
    "licenses": [...],
    "images": [...],
    "annotations": [...],
    "categories": [...], <-- Not in Captions annotations
    "segment_info": [...] <-- Only in Panoptic annotations
}
# from https://www.immersivelimit.com/tutorials/create-coco-annotations-from-scratch
```

where each bounding box (placed in the `annotations` list) is given as `[top_left_x, top_left_y, width, height]`, still in absolute pixel coordinates.

To convert, I read in each text file and converted the bounding box coordinates to the COCO format and put it all into a custom Python dictionary. I then dumped it as a `.json` file to complete the conversion to COCO format.

In [None]:
import os
import json
from PIL import Image

In [None]:
# Image directories for a local checkout of the dataset.
# NOTE(review): none of the cells visible below read these two variables; the
# conversion calls pass their own paths, and `convert2coco` appends "/image"
# to whatever it is given. Confirm whether these are still needed.
path_to_train_data = "../data/trainval/image" # for local
path_to_test_data = "../data/test/image"

# Convert Data into COCO formatted Dataset

In [None]:
def convert_bbox_definition(left: int, top: int, right: int, bottom: int) -> list:
    '''
    Turns a corner-based bounding box into the COCO box layout.

    The annotation files describe a box by its two corners:

    `class_name top_left_x top_left_y bottom_right_x bottom_right_y`

    COCO instead stores the top-left corner followed by the box extents:

    `top_left_x top_left_y width height`

    The top-left corner is passed through unchanged; the width and height are
    the differences between the opposite edges. Returns the four values as a
    list in COCO order.
    '''
    return [left, top, right - left, bottom - top]

In [None]:
# File extensions (lower-case) that are treated as images when scanning
# `<path>/image`. Anything else in that directory is skipped, e.g. a COCO
# `.json` file that was previously written next to the images.
_IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tif", ".tiff", ".webp")


def convert2coco(path: str) -> dict:
    '''
    Builds a COCO-style object detection dictionary for one data split.

    `path` must contain an `image` directory and a `label` directory. Every
    image `image/<name>.<ext>` is paired with `label/<name>.txt`, whose
    non-blank lines have the form

    `class_name top_left_x top_left_y bottom_right_x bottom_right_y`

    Returns a dictionary with the keys `info`, `licenses`, `images`,
    `annotations` and `categories`:

    - `images` holds one entry per image (`id`, `file_name`, `width`,
      `height`, `license`).
    - `annotations` holds one entry per label line, linked to its image via
      `image_id`, with the box converted to `[x, y, width, height]` and `area`
      set to `width * height` of that box.
    - Every annotation gets `category_id` 0 (`tactile_paving`); the class name
      in the label file is not used.

    Images are processed in sorted file-name order so that the generated ids
    are the same on every run.

    Raises `FileNotFoundError` if an image has no matching label file and
    `ValueError` if a label line has fewer than five fields or non-integer
    coordinates.
    '''

    d = {
        "info": {
            "description": "Tactile Paving Dataset from Crowd4Access",
            "year": 2022
        },

        "licenses": [
            {
                "id": 0,
                "name": "Attribution-ShareAlike 4.0 International",
                "url": "https://creativecommons.org/licenses/by-sa/4.0/legalcode"
            }
        ],

        "images": [],

        "annotations": [],

        "categories": [
            {
                "id": 0,
                "name": "tactile_paving",
                "supercategory": "tactile_paving"
            }
        ]
    }

    image_dir = os.path.join(path, "image")
    label_dir = os.path.join(path, "label")

    # os.listdir returns entries in arbitrary order and includes non-image
    # files, so filter by extension and sort to get reproducible ids.
    image_names = sorted(
        name for name in os.listdir(image_dir)
        if name.lower().endswith(_IMAGE_EXTENSIONS)
    )

    annotation_id = 0  # running id, unique across all images

    for image_id, image_name in enumerate(image_names):
        # splitext removes only the final extension, so names that contain
        # extra dots ("img.01.jpg") still map to the right label file.
        stem = os.path.splitext(image_name)[0]
        label_path = os.path.join(label_dir, stem + ".txt")

        # ANNOTATIONS
        with open(label_path, "r") as label_file:
            for line_number, line in enumerate(label_file, start=1):
                # split() without arguments tolerates repeated spaces, tabs
                # and the trailing newline.
                fields = line.split()
                if not fields:
                    continue  # blank line, typically at the end of the file
                if len(fields) < 5:
                    raise ValueError(
                        f"{label_path}:{line_number}: expected "
                        f"'class left top right bottom', got {line!r}"
                    )

                # fields[0] is the class name; the dataset has one category.
                left, top, right, bottom = (int(value) for value in fields[1:5])
                new_bbox = convert_bbox_definition(left, top, right, bottom)

                d["annotations"].append({
                    "image_id": image_id,
                    "bbox": new_bbox,
                    "id": annotation_id,
                    "category_id": 0,
                    "segmentation": [],
                    # No masks are available, so use the box area. COCO
                    # tooling reads `area` to bucket objects by size.
                    "area": new_bbox[2] * new_bbox[3],
                    "iscrowd": 0
                })
                annotation_id += 1

        # IMAGES
        # The context manager closes the file handle once the size is read.
        with Image.open(os.path.join(image_dir, image_name)) as image:
            width, height = image.size

        d["images"].append({
            "id": image_id,
            "file_name": image_name,
            "width": width,
            "height": height,
            "license": 0
        })

    return d

In [None]:
# Convert each split into a COCO dictionary.
trainval_split_dir = "crowd4access-images/crowd4access-images/trainval"
# NOTE(review): "test_images" does not match the directory tree described at
# the top of this notebook, nor the test output path used when dumping the
# JSON below. Confirm this is the intended test directory.
test_split_dir = "test_images"

data_as_coco = convert2coco(trainval_split_dir)
test_data_as_coco = convert2coco(test_split_dir)

## Dump to a `.json` file

In [None]:
# Write each COCO dictionary to its JSON file; both files are placed inside
# the corresponding split's image directory.
for output_path, coco_payload in (
    ("crowd4access-images/crowd4access-images/trainval/image/custom_train.json", data_as_coco),
    ("crowd4access-images/crowd4access-images/test/image/custom_test.json", test_data_as_coco),
):
    with open(output_path, "w") as f:
        json.dump(coco_payload, f)