## Notebook to remove unannotated frames from a dataset that has been partially annotated. Unannotated frames can optionally be moved into a new subfolder called 'unannotated', and a new, filtered JSON file is saved.

In [None]:
import json
import os
import numpy as np
import pandas as pd
from collections import defaultdict

# import sys
# sys.path.insert(0,r'C:\tr-dev\JARVIS-msi\JARVIS-HybridNet\jarvis\dataset')
# import dataset3D

### Functions to load and save datasets of the class dataset3D

In [None]:
def load_dataset(json_path):
    """Load a dataset description from a JSON file.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The decoded JSON content, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8; relying on the platform default encoding
    # (e.g. cp1252 on Windows) would garble non-ASCII characters.
    with open(json_path, encoding='utf-8') as f:
        return json.load(f)

def save_dataset(dataset, json_path):
    """Write ``dataset`` to ``json_path`` as JSON, replacing any existing file.

    Args:
        dataset: JSON-serialisable object to store.
        json_path: Path of the file to write.

    Raises:
        TypeError: If ``dataset`` contains objects that cannot be encoded as
            JSON. This is raised before the target file is opened.
        OSError: If the file cannot be written.
    """
    # Serialise before opening: mode 'w' truncates the target immediately, so
    # encoding inside the ``with`` block would leave a partial, corrupt file
    # behind whenever serialisation fails halfway through.
    serialized = json.dumps(dataset)
    with open(json_path, 'w', encoding='utf-8') as f:
        f.write(serialized)

## Function to create a subfolder and move the selected file into that subfolder

In [None]:
def move_file_to_subdirectory(file_path, subdirectory):
    """Move a file into a subdirectory of the folder it currently lives in.

    The subdirectory is created next to the file if it does not exist yet.
    The file keeps its name.

    Args:
        file_path: Path of the file to move.
        subdirectory: Name of the folder (relative to the file's own
            directory) that receives the file.

    Raises:
        OSError: If the subdirectory cannot be created or the rename fails
            (for example because ``file_path`` does not exist).
    """
    parent_dir, file_name = os.path.split(file_path)
    target_dir = os.path.join(parent_dir, subdirectory)
    # exist_ok=True replaces the separate "exists?" check and so cannot fail
    # when the directory appears between the check and the creation.
    os.makedirs(target_dir, exist_ok=True)
    os.rename(file_path, os.path.join(target_dir, file_name))

# Filter out frames without annotations. Optionally, the image files of those frames are also moved out of their folder into an 'unannotated' subfolder.

In [None]:
def filter_frames(dataset, recording_dir, moveFiles=True):
    """Remove images without annotations from ``dataset`` (modified in place).

    An image counts as annotated when its ``'id'`` appears as the
    ``'image_id'`` of at least one entry in ``dataset['annotations']``.

    NOTE: this file defines ``filter_frames`` a second time further down; when
    the cells are run in order, that later definition replaces this one.

    Args:
        dataset: Dict with an ``'images'`` list (entries with ``'id'`` and
            ``'file_name'``) and an ``'annotations'`` list (entries with
            ``'image_id'``).
        recording_dir: Directory that the images' ``'file_name'`` values are
            joined onto.
        moveFiles: If true, image files of unannotated frames that exist on
            disk are moved into an ``'unannotated'`` folder next to the file.

    Returns:
        A tuple ``(dataset, [total_count, filter_count],
        [filtered_annotations, filtered_images])`` where ``total_count`` is
        the number of images before filtering and ``filter_count`` is the
        number of images that have at least one annotation.
    """
    annotations = dataset['annotations']
    annotated_frame_ids = {ann['image_id'] for ann in annotations}

    all_images = dataset['images']
    total_count = len(all_images)
    filtered_images = []
    for img in all_images:
        if img['id'] in annotated_frame_ids:
            filtered_images.append(img)
            continue
        if not moveFiles:
            continue
        file_path = os.path.join(recording_dir, img['file_name'])
        # Only files that are present can be moved. The previous condition was
        # inverted and attempted to move exactly the files that were missing.
        if os.path.exists(file_path):
            destination = os.path.join(os.path.dirname(file_path), 'unannotated')
            print('Sending ' + file_path + ' to ' + destination)
            move_file_to_subdirectory(file_path, 'unannotated')

    # Every annotation's image_id is in annotated_frame_ids by construction,
    # so all annotations are kept; a copy is returned as before.
    filtered_annotations = list(annotations)
    # Count frames, not annotations: several annotations may share one image.
    filter_count = len(filtered_images)

    # NOTE(review): replacing the image list shifts list positions; if any
    # consumer refers to images by position rather than by 'id', it will be
    # affected -- confirm before relying on this variant.
    dataset['images'] = filtered_images
    dataset['annotations'] = filtered_annotations

    return dataset, [total_count, filter_count], [filtered_annotations, filtered_images]

In [None]:
def filter_frames(dataset, recording_dir, moveFiles=True):
    """Identify annotated frames and optionally move unannotated image files.

    An image counts as annotated when its ``'id'`` appears as the
    ``'image_id'`` of at least one entry in ``dataset['annotations']``.
    ``dataset['images']`` is deliberately left untouched (see the comment in
    the body); the annotated subset is only returned separately.

    Args:
        dataset: Dict with an ``'images'`` list (entries with ``'id'`` and
            ``'file_name'``) and an ``'annotations'`` list (entries with
            ``'image_id'``). ``dataset['annotations']`` is rebound to a copy
            of the list.
        recording_dir: Directory that the images' ``'file_name'`` values are
            joined onto.
        moveFiles: If true, image files of unannotated frames that exist on
            disk are moved into an ``'unannotated'`` folder next to the file.

    Returns:
        A tuple ``(dataset, [total_count, filter_count],
        [filtered_annotations, filtered_images])`` where ``total_count`` is
        the number of images in the dataset and ``filter_count`` is the
        number of images that have at least one annotation.
    """
    annotations = dataset['annotations']
    annotated_frame_ids = {ann['image_id'] for ann in annotations}

    all_images = dataset['images']
    total_count = len(all_images)
    filtered_images = []
    for img in all_images:
        if img['id'] in annotated_frame_ids:
            filtered_images.append(img)
            continue
        if not moveFiles:
            continue
        file_path = os.path.join(recording_dir, img['file_name'])
        if os.path.exists(file_path):
            # Report the real target: the helper creates 'unannotated' next to
            # the file itself, which differs from recording_dir whenever
            # file_name contains sub-folders.
            destination = os.path.join(os.path.dirname(file_path), 'unannotated')
            print(f'Moving {file_path} to {destination}')
            move_file_to_subdirectory(file_path, 'unannotated')

    # Every annotation's image_id is in annotated_frame_ids by construction,
    # so all annotations are kept; a copy is returned as before.
    filtered_annotations = list(annotations)
    # Count frames, not annotations: several annotations may share one image.
    filter_count = len(filtered_images)

    # dataset['images'] = filtered_images
    # Intentionally NOT applied: per the original author this makes creating a
    # project fail because it changes the index reference used by the
    # annotations. It may be possible to remap the references instead.
    dataset['annotations'] = filtered_annotations

    return dataset, [total_count, filter_count], [filtered_annotations, filtered_images]

## Paths

In [None]:
# Root folder of the training set to filter (machine-specific; adjust before running).
data_dir = r'C:\tr-dev\JARVIS-msi\trainingSet\TR_manyframes_trainingset_tofilter'
annotations_dir = os.path.join(data_dir, 'annotations')

# Image folders of the two splits.
train_recording_dir, val_recording_dir = (
    os.path.join(data_dir, split) for split in ('train', 'val')
)

# Annotation files that are read as input.
train_json_path, val_json_path = (
    os.path.join(annotations_dir, f'instances_{split}.json')
    for split in ('train', 'val')
)

# Output files, written alongside the inputs.
filtered_train_json_path, filtered_val_json_path = (
    os.path.join(annotations_dir, f'filtered_instances_{split}.json')
    for split in ('train', 'val')
)

## Filter the datasets

In [None]:
# Read the train and validation annotation files.
train_dataset = load_dataset(train_json_path)
val_dataset = load_dataset(val_json_path)

# Run the filter on each split. moveFiles=False: image files on disk are left
# where they are; only the in-memory datasets are processed.
(
    filtered_train_dataset,
    (train_total_count, train_filter_count),
    (filtered_annotations, filtered_images),
) = filter_frames(train_dataset, train_recording_dir, moveFiles=False)
(
    filtered_val_dataset,
    (val_total_count, val_filter_count),
    # Same two names as for the training split: after this call they hold the
    # validation lists.
    (filtered_annotations, filtered_images),
) = filter_frames(val_dataset, val_recording_dir, moveFiles=False)

# Write the results to the 'filtered_' JSON paths.
save_dataset(filtered_train_dataset, filtered_train_json_path)
save_dataset(filtered_val_dataset, filtered_val_json_path)

# Summary for the user.
print("Filtered datasets saved successfully.")
print(f'Total training frames = {train_total_count} of which {train_filter_count} are annotated. The rest have been removed in the filtered json')
print(f'Total validation frames = {val_total_count} of which {val_filter_count} are annotated. The rest have been removed in the filtered json')