# File description
Combine the images and bounding-box annotations from the robust dataset with those listed in `info.csv` (the ones created in `annotation2_read_cvat_backup.ipynb`). The process was relatively straightforward, as the dataset could be exported in the correct format directly from `CVAT`. The only real preprocessing necessary was to remove all images that didn't contain any bounding boxes.

In [None]:
import dutils as U
U.jupyter_ipython.adjust_screen_width()
from dutils.jupyter_ipython import show_image as show
import seaborn; seaborn.set_style("whitegrid")

from tqdm.notebook import tqdm
from natsort import natsorted
import shutil
import zipfile
import shutil
from glob import glob
import random
import pandas as pd
import json
import cv2
import os

# Setup

In [None]:
# The CVAT export and the folder it gets unpacked into (same path without ".zip")
zip_folder_path = "../cvat/robust.zip"
unzip_folder_path = os.path.splitext(zip_folder_path)[0]

is_valid_backup = zip_folder_path.endswith(".zip") and os.path.isfile(zip_folder_path)
assert is_valid_backup, "Backup file is invalid"

with zipfile.ZipFile(zip_folder_path) as zip_file:
    zip_file.extractall(unzip_folder_path)

# Input folder (CVAT YOLO export) and output locations (merged dataset + its CSV)
cvat_dataset_folder_path = os.path.join(unzip_folder_path, "obj_train_data")
clean_dataset_path = "../dataset/data_final/data"
csv_path = "../dataset/data_final/info.csv"

# Only .jpg and .png images are legal. glob is case insensitive on Windows but
# not on Linux, so the upper-case extensions are globbed as well; the duplicates
# this creates on Windows are removed by going through a set.
image_paths = set()
for extension in ("png", "jpg", "PNG", "JPG"):
    image_paths.update(glob(cvat_dataset_folder_path + "/*." + extension))
image_paths = natsorted(image_paths)

# Load the yolo annotations
anno_paths = natsorted(glob(cvat_dataset_folder_path + "/*.txt"))

# Make sure each image has a corresponding annotation: same amount of files and,
# pair by pair, the same path once the extension is dropped
assert len(anno_paths) == len(image_paths)
for anno_path, image_path in zip(anno_paths, image_paths):
    same_stem = os.path.splitext(anno_path)[0] == os.path.splitext(image_path)[0]
    assert same_stem, "Mismatch between the location of image_paths and anno_paths"

# Existing per-image info; the robust images are added to this table further down
df_annotations = pd.read_csv(csv_path)
df_annotations.head()

# Helpers

In [None]:
# Maps every class name of the project to its label id.
# The id of a class is simply its position in the tuple below.
project_labels_map = {
    label_name: label_id
    for label_id, label_name in enumerate((
        'cycle_helmet',
        'cycle_nohelmet',
        'cycle_blurred',
        'cycle_covered',
        'escooter_helmet',
        'escooter_nohelmet',
        'escooter_blurred',
        'escooter_covered',
        'headphones',
        'earbuds',
        'phone',
        'hovding',
        'cycle_light',
        'escooter_light',
        'scooter',
    ))
}

# The backup file has flipped annotation labels, i.e. 0=14, 1=13 ...
# so the robust labels are reversed as well (this is admittedly an ugly solution).
reverse_label_map = {label: 14 - label for label in range(15)}


def get_reversed_label(anno_path):
    """Return the content of a YOLO annotation file with every class label reversed.

    Each non-blank line of the file is expected to start with an integer class
    label followed by the bounding box values. The label is replaced through
    `reverse_label_map`; the remaining fields are kept as they are (separated by
    single spaces in the output).

    Parameters
    ----------
    anno_path : str
        Path to the annotation text file.

    Returns
    -------
    str
        The rewritten annotations, one bounding box per line, without a trailing
        newline. An empty string if the file holds no annotations.

    Raises
    ------
    ValueError
        If a line does not start with an integer.
    KeyError
        If a label is not present in `reverse_label_map`.
    """
    with open(anno_path) as f:
        lines = f.read().splitlines()

    reversed_lines = []
    for line in lines:
        fields = line.split()
        # Blank lines (including a completely empty file) carry no bounding box;
        # skipping them avoids calling int() on an empty string.
        if not fields:
            continue
        fields[0] = str(reverse_label_map[int(fields[0])])
        reversed_lines.append(" ".join(fields))

    return "\n".join(reversed_lines)

# Copy images with at least one bounding box

In [None]:
# Every new row starts out as a copy of the first row of the existing table, so
# it has exactly the same columns; the relevant cells are overwritten below.
template_row = df_annotations.iloc[0:1].copy()
template_label = template_row.index[0]

# The new rows are collected here and concatenated once after the loop.
# `DataFrame.append` was removed in pandas 2.0, and growing the dataframe inside
# the loop copies the whole table on every iteration.
new_rows = []

i = -1
for image_path, anno_path in tqdm(list(zip(image_paths, anno_paths))):
    with open(anno_path) as f:
        anno = f.readlines()

    # If annotation file is empty (or only whitespace) -> don't save the image
    if not any(line.strip() for line in anno):
        continue

    # Fail with a readable message instead of an obscure error inside cv2.imwrite
    image = cv2.imread(image_path)
    if image is None:
        raise IOError(f"Could not read image: {image_path}")

    # Reverse the labels before any file is created, so a malformed annotation
    # does not leave a half-written pair behind
    reversed_annotation = get_reversed_label(anno_path)

    # Setup
    i += 1
    name_general = "robust_" + str(i)
    save_path = os.path.join(clean_dataset_path, name_general)

    # Save to disk ('x' refuses to overwrite an already existing annotation file)
    cv2.imwrite(save_path + ".png", image)
    with open(save_path + ".txt", 'x') as f:
        f.write(reversed_annotation)

    # Build the dataframe row describing this image
    to_append = template_row.copy()
    to_append.loc[template_label, "location"] = "internet"
    to_append.loc[template_label, "week_day":"date_minut"] = pd.NA
    to_append.loc[template_label, "annotation_name"] = name_general + ".txt"
    to_append.loc[template_label, "frame_name"] = name_general + ".png"
    to_append.loc[template_label, "cycle_helmet":] = 0

    # Count the bounding boxes per class, based on the labels that were just written
    labels = [int(bb.split(" ")[0]) for bb in reversed_annotation.split("\n")]
    for (label_name, label_i) in project_labels_map.items():
        to_append[label_name] = labels.count(label_i)

    new_rows.append(to_append)

# ignore_index gives the combined table a unique 0..n-1 index instead of
# repeating the template row's index label for every added row
df_annotations = pd.concat([df_annotations, *new_rows], ignore_index=True)

In [None]:
# Write the combined table back to the CSV it was loaded from
df_annotations.to_csv(csv_path, index=False)

# Bounding boxes per class over the rows whose location is "internet"
is_internet = df_annotations["location"] == "internet"
df_annotations[is_internet].loc[:, "cycle_helmet":].sum()

# Testing - only robust

In [None]:
# Visual sanity check: draw the bounding boxes on 10 randomly picked robust images.

# The highest robust index on disk does not change between draws, so it is
# looked up once instead of globbing the folder on every iteration.
# File names look like "robust_<index>.png" / "robust_<index>.txt".
max_robust_index = max(
    int(os.path.basename(path)[len("robust_"):-4])
    for path in glob(clean_dataset_path + "/robust_*")
)

images_with_bb = []
for _ in range(10):
    random_name = random.randint(0, max_robust_index)
    image_path = f"{clean_dataset_path}/robust_{random_name}.png"
    anno_path = f"{clean_dataset_path}/robust_{random_name}.txt"
    image_drawn = U.pytorch.yolo_draw_bbs_path(image_path, anno_path)
    images_with_bb.append(image_drawn)

show(images_with_bb)

# Testing - all

In [None]:
# Visual sanity check: draw the bounding boxes on 10 randomly picked images from
# the whole dataset (every row of df_annotations, not only the robust ones).

# The list of frame names is the same for every draw, so build it once
frame_names = df_annotations["frame_name"].to_list()

images_with_bb = []
for _ in range(10):
    # Drop the extension to get the name shared by the image and its annotation
    random_name = os.path.splitext(random.choice(frame_names))[0]
    image_path = f"{clean_dataset_path}/{random_name}.png"
    anno_path = f"{clean_dataset_path}/{random_name}.txt"
    image_drawn = U.pytorch.yolo_draw_bbs_path(image_path, anno_path)
    images_with_bb.append(image_drawn)

show(images_with_bb)

In [None]:
# Total amount of bounding boxes per class over the whole dataset:
# shown as a bar chart and as the raw numbers
label_totals = df_annotations.loc[:, "cycle_helmet":].sum()
label_totals.plot.bar(figsize=(20,8), rot=45)
label_totals