# File description
Make the final adjustments to the dataset. This includes:

    1.) Creating a dataframe that contains one row for each annotation
    2.) Adding extra information to these rows, such as YOLO and cartesian coordinates
    3.) Combining certain labels, e.g. `cycle_covered` becomes `cycle_nohelmet` and `earbuds` becomes `headphones`
    4.) Displaying the final label distribution
<br>
<br>
The resulting dataframe looks something like this (some columns are omitted from the illustration):
<img src="../illustration_images/df_final_dataset_example.png" width="800" /> 


In [None]:
import dutils as U
U.jupyter_ipython.adjust_screen_width()
from dutils.jupyter_ipython import show_image as show
import seaborn; seaborn.set_style("whitegrid")

from typing import List
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from natsort import natsorted
import shutil
from glob import glob
import random
import pandas as pd
import json
import cv2
import os

# Helpers

In [None]:
def read_yolo_from_file(label_path:str) -> List[list]:
    """
    Load YOLO labels from a text file. Each non-blank line is expected to be in the format:
        class_label x_center y_center bb_width bb_height

    Fields may be separated by any run of whitespace (spaces or tabs), and
    blank lines (e.g. a trailing empty line at the end of the file) are skipped.

    :param label_path: Path to an existing `.txt` label file
    :return: list of lists with BBs (every value is a float) in the format:
            [
            [class_label, x_center, y_center, bb_width, bb_height],
            [class_label, x_center, y_center, bb_width, bb_height],
            ...
            ]
    :raises ValueError: If `label_path` does not exist, does not end with `.txt`,
                        or a field cannot be converted to float
    :raises RuntimeError: If a line does not contain exactly 5 fields, or one of
                          x, y, w, h lies outside the range [0, 1]
    """
    # Checks
    if not os.path.exists(label_path): raise ValueError("Received a bad path")
    if not label_path.endswith(".txt"): raise ValueError(f"Expected .txt file but received {os.path.basename(label_path)}")

    # Read labels
    boxes = []
    with open(label_path) as f:
        for line in f:

            # `split()` without arguments ignores leading/trailing whitespace and
            # collapses repeated separators, so "0 0.5 0.5 0.2 0.2 \n" is still valid
            label_split = line.split()
            if not label_split:
                continue  # blank line, nothing to parse

            # Extract label info + label checks
            if len(label_split) != 5:
                raise RuntimeError("Received a bad label")
            class_label, x, y, w, h = [float(value) for value in label_split]
            if not all(0<=number<=1 for number in [x,y,w,h]):
                raise RuntimeError("one or more of [x,y,w,h] is outside the accepted range [0,1]")

            boxes.append([class_label, x, y, w, h])

    return boxes

def xywhn2xyxy(label, x, y, w, h, dh, dw):
    """
    Convert a normalized YOLO box (center x/y, width, height) to pixel corner coordinates.

    The corners are scaled by the image size, truncated with `int()` and then
    clamped: left/top are never below 0, right/bottom never above `dw - 1` / `dh - 1`.

    :param label: Class name placed first in the result (must be a string)
    :param x: Normalized x coordinate of the box center
    :param y: Normalized y coordinate of the box center
    :param w: Normalized box width
    :param h: Normalized box height
    :param dh: Image height in pixels (note: height comes before width)
    :param dw: Image width in pixels
    :return: String in the format "label left top right bottom"
    """
    half_w = w / 2
    half_h = h / 2

    # Scale to pixels, truncate, then clamp each edge to the image
    left = max(int((x - half_w) * dw), 0)
    right = min(int((x + half_w) * dw), dw - 1)
    top = max(int((y - half_h) * dh), 0)
    bottom = min(int((y + half_h) * dh), dh - 1)

    return " ".join([label, *map(str, (left, top, right, bottom))])

# Setup

In [None]:
# Maps class id -> class name. The id of a class is its position in this tuple (starting at 0).
label_map = dict(enumerate((
    'cycle_helmet',       # 0
    'cycle_nohelmet',     # 1
    'cycle_blurred',      # 2
    'cycle_covered',      # 3
    'escooter_helmet',    # 4
    'escooter_nohelmet',  # 5
    'escooter_blurred',   # 6
    'escooter_covered',   # 7
    'headphones',         # 8
    'earbuds',            # 9
    'phone',              # 10
    'hovding',            # 11
    'cycle_light',        # 12
    'escooter_light',     # 13
    'scooter',            # 14
)))

In [None]:
# Load the info table of the final dataset
df = pd.read_csv("../dataset/data_final/info.csv")

# `.loc[0:-1]` is a label slice; with the default integer index from `read_csv` it
# selects no rows, so this is an empty frame keeping the columns up to and including `frame_name`
df_clean = df.loc[0:-1, :"frame_name"].copy()

# Per-annotation columns and the placeholder value each one is initialised with
placeholder_columns = {
    "class_label": -1,
    "class_label_name": "",
    "label_yolo": "",
    "label_cartesian": "",
    "color_channels": -1,
    "image_height": -1,
    "image_width": -1,
}
for column_name, placeholder in placeholder_columns.items():
    df_clean[column_name] = placeholder

# Add a row for each annotation
NOTE: Bounding boxes with `hovding`, `cycle_light`, `escooter_light` and `scooter` will be removed

In [None]:
path = "../dataset/data_final/data"

# Class ids that are dropped from the final dataset:
# 11 hovding, 12 cycle_light, 13 escooter_light, 14 scooter
removed_labels = {11, 12, 13, 14}

# Collect the new rows in a list and build the dataframe once at the end.
# `DataFrame.append` was removed in pandas 2.0 and copied the whole frame on every call.
annotation_rows = []
for i, row in tqdm(df.iterrows(), total=len(df)):
    # Setup
    anno_path = path + "/" + row["annotation_name"]
    image_path = path + "/" + row["frame_name"]
    annotations = read_yolo_from_file(anno_path)

    # get image info (`cv2.imread` returns None instead of raising when it cannot load the file)
    image = cv2.imread(image_path)
    if image is None:
        raise RuntimeError(f"cv2.imread could not load image: {image_path}")
    h, w, c = image.shape

    # Columns shared by every annotation of this frame (everything up to and including `frame_name`)
    base_row = row[:"frame_name"]

    # Add one row per annotation
    for anno in annotations:
        label = int(anno[0])

        # remove: hovding, cycle_light, escooter_light and scooter
        if label in removed_labels:
            continue

        to_append = base_row.copy()
        to_append["class_label"] = label
        to_append["class_label_name"] = label_map[label]
        to_append["label_yolo"] = " ".join(map(str, anno))
        # `xywhn2xyxy` expects the image height before the width
        to_append["label_cartesian"] = xywhn2xyxy(label_map[label], *anno[1:], h, w)
        to_append["color_channels"] = c
        to_append["image_height"] = h
        to_append["image_width"] = w

        annotation_rows.append(to_append)

# One row per kept annotation; `columns=` keeps the column order defined for `df_clean`.
# Each row keeps the index label of the frame it came from, as with the former per-row append.
df_clean = pd.DataFrame(annotation_rows, columns=df_clean.columns)

df_clean

# Make alternative class labels

In [None]:
# Start the combined label as a copy of the original label name;
# `change_class_label` (defined below) overwrites it for the classes that get merged.
df_clean["class_label_combined_name"] = df_clean["class_label_name"]
def change_class_label(df, from_label:str, to_label:str, folder_path:str=None) -> None:
    """
    Relabel every annotation of class `from_label` as `to_label` in the
    `class_label_combined_name` column of `df`. The dataframe is modified in place
    and `class_label_name` is left untouched.

    Prints the number of affected rows and, when `folder_path` is given, passes the
    paths of the affected frames to `show`.

    :param df: Dataframe with the columns `class_label_name` and `class_label_combined_name`
               (and `frame_name` when `folder_path` is given)
    :param from_label: Class name (as found in `class_label_name`) to replace
    :param to_label: Class name written to `class_label_combined_name`
    :param folder_path: Folder containing the frames. If None, no images are shown
    :raises AssertionError: If `from_label` is still present in `class_label_combined_name` afterwards
    """
    cond = df["class_label_name"] == from_label
    df.loc[cond, "class_label_combined_name"] = to_label
    print(f"Found: {sum(cond)}")

    # Use the function's own arguments here (not the notebook globals `path` / `df_clean`),
    # so the function also works for other dataframes and when no folder is given
    if folder_path is not None:
        show(df.loc[cond, "frame_name"].apply(lambda x: folder_path+"/"+x).to_list())

    # Sanity check: the old label must be gone from the combined column
    assert all(df["class_label_combined_name"] != from_label)

## Combine `cycle_blurred` with `cycle_nohelmet`

In [None]:
# Relabel `cycle_blurred` as `cycle_nohelmet` in `class_label_combined_name` and show the affected frames
change_class_label(df_clean, "cycle_blurred", "cycle_nohelmet", path)

## Combine `cycle_covered` with `cycle_nohelmet`

In [None]:
# Relabel `cycle_covered` as `cycle_nohelmet` in `class_label_combined_name` and show the affected frames
change_class_label(df_clean, "cycle_covered", "cycle_nohelmet", path)

## Combine `escooter_blurred` with `escooter_nohelmet`

In [None]:
# Relabel `escooter_blurred` as `escooter_nohelmet` in `class_label_combined_name` and show the affected frames
change_class_label(df_clean, "escooter_blurred", "escooter_nohelmet", path)

## Combine `escooter_covered` with `escooter_nohelmet`

In [None]:
# Relabel `escooter_covered` as `escooter_nohelmet` in `class_label_combined_name` and show the affected frames
change_class_label(df_clean, "escooter_covered", "escooter_nohelmet", path)

## Combine `earbuds` with `headphones`

In [None]:
# Relabel `earbuds` as `headphones` in `class_label_combined_name` and show the affected frames
change_class_label(df_clean, "earbuds", "headphones", path)

# Final class distribution

In [None]:
sns.set(rc={'figure.figsize':(20,7)}, style="whitegrid")

# Count the annotations per combined label once and reuse the result below
label_counts = df_clean["class_label_combined_name"].value_counts()
with_names = label_counts.index.tolist()
counts = label_counts.values.tolist()
total_count = sum(counts)
counts_percentage = [round(count/total_count*100, 2) for count in counts]

# Bar height is the share in percent; the absolute count is written just above each bar
barplot = sns.barplot(x=with_names, y=counts_percentage)
for position, (percentage, count) in enumerate(zip(counts_percentage, counts)):
    barplot.text(position, percentage+0.5, str(count), horizontalalignment="center")
barplot.set_xlabel("Labels")
barplot.set_ylabel("Count (%)");
plt.title(f"Label distribution - Total: {label_counts.sum()}")

display(label_counts)
"SUM: ", label_counts.sum()

# Make annotations for the combined labels

In [None]:
# Invert the id -> name mapping so that class names can be translated back to ids
reversed_label_map = dict(zip(label_map.values(), label_map.keys()))


def _with_new_prefix(prefix, annotation):
    """Return `annotation` with its first space-separated token replaced by `prefix`."""
    _, _, remainder = annotation.partition(" ")
    return prefix + " " + remainder


# Class id of the combined label
df_clean["class_label_combined"] = df_clean["class_label_combined_name"].apply(
    lambda name: reversed_label_map[name]
)

# YOLO annotation: same box, but starting with the combined class id
df_clean["label_yolo_combined"] = df_clean.apply(
    lambda record: _with_new_prefix(str(record["class_label_combined"]), record["label_yolo"]),
    axis=1,
)

# Cartesian annotation: same box, but starting with the combined class name
df_clean["label_cartesian_combined"] = df_clean.apply(
    lambda record: _with_new_prefix(record["class_label_combined_name"], record["label_cartesian"]),
    axis=1,
)

# Save the full annotation file

In [None]:
# Add the relative path of each annotation file and frame next to its file name
for name_column, path_column in (("annotation_name", "annotation_path"), ("frame_name", "frame_path")):
    df_clean[path_column] = df_clean[name_column].apply(
        lambda file_name: "../dataset/data_final/data/" + file_name
    )

# Running id (0, 1, 2, ...) in the current row order
df_clean["row_id"] = [row_number for row_number in range(len(df_clean))]

# Write the full annotation table without the dataframe index
df_clean.to_csv("../dataset/data_final/annotations_all.csv", index=False)