## Data Preparation

The goal is to prepare the data in the videos, together with the labels in `labels_dataframe.csv`, so that it is ready for fine-tuning Faster R-CNN.
We first take a look at how Faster R-CNN fine-tuning data is structured.

We will use the COCO JSON annotation format:
```
dataset/
├── train/
│   ├── images/
│   │   ├── image1.jpg
│   │   ├── image2.jpg
│   └── annotations/
│       └── instances_train.json
├── val/
│   ├── images/
│   │   ├── image1.jpg
│   │   ├── image2.jpg
│   └── annotations/
│       └── instances_val.json
```

Sample COCO JSON annotation for one image:
```json
{
    "images": [
        {"id": 1, "file_name": "image1.jpg", "height": 480, "width": 640}
    ],
    "annotations": [
        {"id": 1, "image_id": 1, "category_id": 1, "bbox": [100, 200, 50, 50], "area": 2500, "iscrowd": 0}
    ],
    "categories": [
        {"id": 1, "name": "object_class"}
    ]
}
```


In [None]:
# imports
import pandas as pd
import os
import cv2
import random
import matplotlib.pyplot as plt
import json
from tqdm import tqdm

In [None]:
# Dataset splits, each paired with the fraction of the annotated data it
# should receive.
distribution = [
    ("train", 0.8),
    ("test", 0.1),
    ("val", 0.1),
]
len(distribution)

In [None]:
# Folder holding the label CSV, and the folder holding the source videos.
path = "./data"
video_directory = "/deepstore/datasets/dmb/ComputerVision/ProRail/Ivg/Videos"
print(f"Path to dataset files: {path}")
print(f"Path to video files: {video_directory}")

In [None]:
# Load the label table and list the distinct values of its "Source" column.
df = pd.read_csv(f"{path}/labels_dataframe.csv")
videos = df["Source"].unique()
videos

In [None]:
# Reload the label table and preview its first rows. The file is located via
# `path` so the CSV location is defined in a single place.
df = pd.read_csv(f"{path}/labels_dataframe.csv")
df.head(20)

The columns we need for COCO are:

images:
- ID
- File name
- height/width of the picture

annotations:
- ID
- Image_id
- category ID
- bounding box (coordinates)
- area (surface area of the bounding box)
- iscrowd (used to indicate whether an object is part of a "crowd" or a group of objects that cannot be easily separated)

categories:
- ID
- Category class (string)

In [None]:
# Keep only the .mp4 files in the video folder that also appear in the labels.
available_videos = [
    entry
    for entry in os.listdir(video_directory)
    if entry.endswith(".mp4") and entry in videos
]
available_videos

In [None]:
# Count the non-null "Absolute Frame" entries of all label rows whose source
# video is available on disk.
rows_for_available_videos = df["Source"].isin(available_videos)
total_frames = df.loc[rows_for_available_videos, "Absolute Frame"].count()
total_frames

In [None]:
# Root folder under which the per-split images and annotation files are written.
frames_path = "./data/data_faster_rcnn"

In [None]:
def save_frame(frame, video, frame_number, path, subpath, overwrite=True):
    """
    Write one frame as ``<path>/<subpath>/images/<video>_<NNNNN>.jpg``.

    The target ``images`` folder is created when it does not exist yet.

    Args:
        frame (numpy.ndarray): The frame to save.
        video (str): The name of the video file (without extension).
        frame_number (int): The frame number; zero-padded to five digits in
            the file name.
        path (str): The root directory of the dataset.
        subpath (str): The sub-folder of ``path`` (the split name) that
            receives the frame.
        overwrite (bool): When False, an already existing file is left alone.
    """
    images_dir = os.path.join(path, subpath, "images")
    os.makedirs(images_dir, exist_ok=True)
    target = f"{images_dir}/{video}_{frame_number:05d}.jpg"
    if overwrite or not os.path.exists(target):
        cv2.imwrite(target, frame)


# The single object class used by every annotation.
coco_categories = [
    {
        "id": 1,
        "name": "hazmat",
    }
]
# Per-split accumulators for the COCO "images" and "annotations" lists.
coco_images = {"train": [], "test": [], "val": []}
coco_annotations = {"train": [], "test": [], "val": []}


def _to_builtin(value):
    """Return NumPy scalars as plain Python numbers; pass other values through."""
    return value.item() if hasattr(value, "item") else value


def add_annotation(video, dist, frame_number, video_h, video_w, annotations):
    """
    Register one frame and its bounding boxes in the COCO accumulators.

    Appends an image entry to ``coco_images[dist]`` and one annotation entry
    per row of ``annotations`` to ``coco_annotations[dist]``. Ids are 1-based
    and count up independently within each split.

    Args:
        video (str): The name of the video file (without extension).
        dist (str): The split that receives the frame; created on first use.
        frame_number (int): The frame number, used in the image file name.
        video_h (int): Frame height in pixels.
        video_w (int): Frame width in pixels.
        annotations (pandas.DataFrame): Rows with the corner columns
            ``XTL``, ``YTL``, ``XBR`` and ``YBR``.
    """
    # Create the split's lists *before* reading their length, so an unknown
    # split starts empty instead of raising KeyError.
    split_images = coco_images.setdefault(dist, [])
    split_annotations = coco_annotations.setdefault(dist, [])

    image_id = len(split_images) + 1
    split_images.append(
        {
            "id": image_id,
            "file_name": f"{video}_{frame_number:05d}.jpg",
            "width": video_w,
            "height": video_h,
        }
    )
    for _, annotation in annotations.iterrows():
        # Rows of an all-numeric frame yield NumPy scalars, which json.dump
        # rejects; convert them to plain Python numbers up front.
        x_left_top = _to_builtin(annotation["XTL"])
        y_left_top = _to_builtin(annotation["YTL"])
        x_right_bottom = _to_builtin(annotation["XBR"])
        y_right_bottom = _to_builtin(annotation["YBR"])
        # COCO boxes are [x, y, width, height] measured from the top-left corner.
        width = x_right_bottom - x_left_top
        height = y_right_bottom - y_left_top
        split_annotations.append(
            {
                "id": len(split_annotations) + 1,
                "image_id": image_id,
                "category_id": 1,
                "bbox": [x_left_top, y_left_top, width, height],
                "area": width * height,
                "iscrowd": 0,
            }
        )


def get_rnd_distribution():
    """
    Pick a random split whose annotation count has not exceeded its quota.

    A split's quota is ``int(total_frames * fraction)``. Candidates are drawn
    at random from ``distribution``; a candidate holding more annotations than
    its quota is discarded and another one is drawn. A split whose count
    equals its quota is still accepted. When every split is over quota, the
    first entry of ``distribution`` is returned.

    Returns:
        tuple: The chosen ``(name, fraction)`` entry of ``distribution``.
    """
    candidates = list(distribution)
    while candidates:
        choice = random.choice(candidates)
        split_name, fraction = choice
        quota = int(total_frames * fraction)
        collected = coco_annotations.setdefault(split_name, [])
        if len(collected) <= quota:
            return choice
        candidates.remove(choice)

    return distribution[0]

In [None]:
# Walk every available video frame by frame. Each frame that has label rows is
# saved into a randomly chosen split and registered in the COCO accumulators.
with tqdm(total=total_frames) as pbar:
    for video in available_videos:
        video_path = f"{video_directory}/{video}"
        # splitext drops only the final extension, so a dot inside the file
        # name no longer truncates the prefix used for the frame file names.
        video_name = os.path.splitext(video)[0]
        # Index this video's label rows by frame once, instead of filtering
        # the whole dataframe again for every decoded frame.
        video_rows = df[df["Source"] == video]
        rows_by_frame = {
            frame: rows for frame, rows in video_rows.groupby("Relative Frame")
        }
        cap = cv2.VideoCapture(video_path)
        try:
            video_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            video_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            number_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            frame_number = 0
            while frame_number < number_frames:
                ret, frame = cap.read()
                pbar.set_description(f"Processing {video}, frame {frame_number}/{number_frames}")
                if not ret:
                    # Decoding stopped before the reported frame count.
                    break
                annotations = rows_by_frame.get(frame_number)
                if annotations is not None:
                    # Take a random split that is not full yet and store the frame in it.
                    sub_path = get_rnd_distribution()[0]
                    save_frame(frame, video_name, frame_number, frames_path, sub_path)
                    add_annotation(video_name, sub_path, frame_number, video_h, video_w, annotations)
                    # Progress is counted in label rows, matching total_frames.
                    pbar.update(annotations.shape[0])
                frame_number += 1
        finally:
            # Release the capture even when saving or annotating fails.
            cap.release()

In [None]:
# Number of annotations collected for the "train" split.
print(len(coco_annotations['train']))

In [None]:
# Write one COCO annotation file per split.
for name, _ in distribution:
    # A dedicated variable is used here so the notebook-level `path` (the
    # dataset folder used to locate the label CSV) is not overwritten.
    annotations_dir = f"{frames_path}/{name}/annotations"
    os.makedirs(annotations_dir, exist_ok=True)
    with open(f"{annotations_dir}/instances_{name}.json", "w") as f:
        json.dump(
            {
                "images": coco_images[name],
                "annotations": coco_annotations[name],
                "categories": coco_categories,
            },
            f,
        )

In [None]:
def draw_annotations(annotation, dist_name):
    """
    Show the image an annotation belongs to with its bounding box drawn on it.

    Prints a message and returns without plotting when the annotation's image
    is not registered in ``coco_images[dist_name]`` or cannot be read from disk.

    Args:
        annotation (dict): A COCO annotation with ``image_id`` and ``bbox``
            (``[x, y, width, height]``).
        dist_name (str): The split the annotation belongs to.
    """
    # coco_images[dist_name] is a list of dicts with keys: id, file_name, width, height
    image_entry = next(
        (
            entry
            for entry in coco_images[dist_name]
            if entry["id"] == annotation["image_id"]
        ),
        None,
    )
    if image_entry is None:
        print(f"Image not found for annotation: {annotation}")
        return

    # Frames are stored under <frames_path>/<split>/images/.
    image_path = f"{frames_path}/{dist_name}/images/{image_entry['file_name']}"
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals an unreadable file by returning None.
        print(f"Image not found: {image_path}")
        return

    # cv2.rectangle needs integer pixel coordinates.
    x, y, w, h = (int(value) for value in annotation["bbox"])
    cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 0), 2)
    # OpenCV images are BGR; matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.axis("off")
    plt.show()


def get_random_annotation():
    """
    Draw a random annotation from a randomly chosen split.

    Returns:
        tuple: ``(annotation, dist_name)`` - the annotation dict and the name
        of the split it was taken from.
    """
    dist_name, _ = random.choice(distribution)
    annotation = random.choice(coco_annotations[dist_name])
    return annotation, dist_name
# check if all the images are in the right folder


In [None]:

# read the json files
def check_images_in_annotations(destination_dir, destination_dir_for_images, max_images=10):
    """
    Plot a random sample of annotations from a COCO file on top of their images.

    Up to ``max_images`` annotations are picked after shuffling. For each one
    the matching image entry is looked up by ``image_id``, the image file is
    read from ``destination_dir_for_images`` and shown with the bounding box
    outlined in green. An annotation without an image entry, or an image file
    that cannot be read, is reported with a printed message and skipped.

    Args:
        destination_dir (str): Path of the COCO JSON annotation file.
        destination_dir_for_images (str): Folder containing the image files.
        max_images (int): Maximum number of annotations to display.
    """
    with open(destination_dir) as handle:
        coco = json.load(handle)

    sampled = coco["annotations"]
    random.shuffle(sampled)
    for entry in sampled[:max_images]:
        wanted_id = entry["image_id"]
        record = next(
            (candidate for candidate in coco["images"] if candidate["id"] == wanted_id),
            None,
        )
        if record is None:
            print(f"Image not found for annotation: {entry}")
            continue

        path_im = destination_dir_for_images + "/" + record["file_name"]
        pixels = cv2.imread(path_im)
        if pixels is None:
            print(f"Image not found: {path_im}")
            continue

        # OpenCV loads BGR; matplotlib displays RGB.
        pixels = cv2.cvtColor(pixels, cv2.COLOR_BGR2RGB)

        # Bounding box as integer pixel values: x, y, width, height.
        x, y, w, h = (int(value) for value in entry["bbox"])

        plt.figure(figsize=(8, 8))
        plt.imshow(pixels)
        box = plt.Rectangle((x, y), w, h, edgecolor='green', facecolor='none', linewidth=2)
        plt.gca().add_patch(box)
        plt.title(f"Image ID: {wanted_id}")
        plt.axis("off")
        plt.show()

# Run the visual spot-check on the annotation file of every split.
for name, dist in distribution:
    print(f"Checking {name}")
    check_images_in_annotations(
        f"{frames_path}/{name}/annotations/instances_{name}.json",
        f"{frames_path}/{name}/images",
    )


In [None]:


def count_files_in_directory(directory_path):
    """
    Count the regular files directly inside a directory.

    Sub-directories are not counted and not descended into. When the
    directory cannot be listed (missing, not a directory, no permission),
    the error is printed and 0 is returned.

    Args:
        directory_path (str): The directory to inspect.

    Returns:
        int: The number of files, or 0 when the directory cannot be read.
    """
    try:
        # scandir yields the entry type along with the name, avoiding a
        # separate os.path.isfile lookup per item.
        with os.scandir(directory_path) as entries:
            return sum(1 for entry in entries if entry.is_file())
    except OSError as e:
        # Only filesystem errors are handled here; anything else is a
        # programming error and should surface.
        print(f"Error: {e}")
        return 0

# Example usage
img_path = "./data/data_faster_rcnn"
# Print the number of image files of each split: train, val, then test.
for split_name in ("train", "val", "test"):
    print(count_files_in_directory(f"{img_path}/{split_name}/images"))

In [None]:
# Display the (name, fraction) entry the split sampler currently returns.
get_rnd_distribution()