In [None]:
import json 
import cv2 
from pa3_demo import load_obj_each_frame
# Path handed to load_obj_each_frame to obtain the per-frame bounding boxes.
BBOX_FILE = "part_2_frame_dict_base.json"
# Path of the input video that the tracked boxes are drawn onto.
VIDEO_FILE = "commonwealth.mp4"

#### Greedy Data Association (with observation history)
Reference: [PSU CSE598C](https://www.cse.psu.edu/~rtc12/CSE598C/datassocPart1.pdf)

The goal of this task is to assign a unique ID to every bounding box in the frames of a video, so that the same object keeps the same ID from frame to frame. We use a greedy nearest-neighbor algorithm that favors objects with long observation histories. Based on the reference above, we interpret the Global Nearest Neighbor (GNN) algorithm as follows:
1. For each frame, iterate over its bounding boxes and record each one in a history tracker.
2. The tracker is a dictionary with the bounding box ID as the key and that ID's observation history as the value; it is sorted in descending order of how many times each ID has been observed.
3. For each observed bounding box, the IoU (Intersection over Union) is computed against the bounding box with the same ID in the previous frame. If the IoU is greater than a threshold, the bounding box is assigned that ID; otherwise, it is assigned a new ID.
4. Finally, the algorithm returns a dictionary keyed by bounding box ID, where each ID maps to the bounding boxes assigned to it in each frame.



In [None]:
from collections import defaultdict

def calculate_iou(box1, box2):
    """Return the Intersection over Union (IoU) of two axis-aligned boxes.

    Args:
        box1: Mapping with "x_min", "y_min", "width" and "height" entries.
        box2: Mapping with the same entries as ``box1``.

    Returns:
        The intersection area divided by the union area. When the union
        area is not positive (for example, both boxes have zero area) the
        ratio is undefined and 0.0 is returned instead of raising
        ZeroDivisionError.
    """
    x_min1, y_min1, w1, h1 = box1["x_min"], box1["y_min"], box1["width"], box1["height"]
    x_min2, y_min2, w2, h2 = box2["x_min"], box2["y_min"], box2["width"], box2["height"]

    x_max1, y_max1 = x_min1 + w1, y_min1 + h1
    x_max2, y_max2 = x_min2 + w2, y_min2 + h2

    # Corners of the overlap rectangle (may be inverted when the boxes are disjoint).
    xA = max(x_min1, x_min2)
    yA = max(y_min1, y_min2)
    xB = min(x_max1, x_max2)
    yB = min(y_max1, y_max2)

    # Clamp at 0 so disjoint boxes contribute no intersection area.
    inter_area = max(0, xB - xA) * max(0, yB - yA)
    union_area = w1 * h1 + w2 * h2 - inter_area

    # Degenerate boxes cannot overlap anything in a meaningful way.
    if union_area <= 0:
        return 0.0

    return inter_area / float(union_area)

def assign_objects(frames, similarity_threshold=0.5):
    """Assign track IDs to the boxes of every frame with a greedy IoU match.

    Frames are processed in the iteration order of ``frames``. Within a
    frame, the tracks that existed before the frame are visited from the
    longest observation history to the shortest, so long-lived tracks get
    first pick. Each track claims the still-unassigned box that has the
    highest IoU with the track's most recent box, provided that IoU reaches
    ``similarity_threshold``. A track claims at most one box per frame and
    a box joins at most one track. Boxes left unclaimed start new tracks.

    Args:
        frames: Dictionary where keys are frame numbers and values are lists
            of bounding boxes (mappings accepted by ``calculate_iou``).
        similarity_threshold: Minimum IoU score for considering a match.

    Returns:
        tracks: Dictionary where keys are object IDs (1, 2, ...) and values
            are lists of [frame_number, bounding_box]. The stored boxes are
            the same objects found in ``frames``; they are not copied.
    """
    tracks = defaultdict(list)

    for frame_number, boxes in frames.items():
        # Work on a copy: boxes are popped as they get claimed, and the
        # caller's list must not be modified.
        unassigned = list(boxes)

        # Snapshot the pre-existing tracks so tracks created for this frame
        # can never match boxes from the same frame. sorted() is stable, so
        # tracks with equal history keep their creation order.
        candidates = sorted(
            tracks.items(), key=lambda item: len(item[1]), reverse=True
        )

        for _track_id, track in candidates:
            if not unassigned:
                break

            last_box = track[-1][1]
            best_index, best_iou = None, 0
            for index, box in enumerate(unassigned):
                iou = calculate_iou(last_box, box)
                if iou > best_iou:
                    best_index, best_iou = index, iou

            # best_index stays None when nothing overlaps at all, which also
            # protects against thresholds <= 0 matching a non-overlapping box.
            if best_index is not None and best_iou >= similarity_threshold:
                track.append([frame_number, unassigned.pop(best_index)])

        # Every box no track claimed becomes the first observation of a new
        # track. Tracks are never removed, so len(tracks) + 1 is always unused.
        for box in unassigned:
            tracks[len(tracks) + 1].append([frame_number, box])

    return tracks

import copy

# Load the per-frame boxes and group them into tracks.
frame_dict = load_obj_each_frame(BBOX_FILE)

output_track = assign_objects(frame_dict)

# Deep-copy before writing IDs: dict.copy() is shallow, so the per-frame lists
# and box dicts would be shared and frame_dict would be modified as well.
new_frame_dict = copy.deepcopy(frame_dict)

# The tracks hold the very same box objects as frame_dict, so each box is
# located by identity. This avoids an O(n) equality search per box and stays
# unambiguous when two boxes in a frame have identical contents.
box_positions = {}
for frame_id, boxes in frame_dict.items():
    for position, box in enumerate(boxes):
        box_positions.setdefault((frame_id, id(box)), position)

# Write each track's ID into the "id" key of the copied boxes.
for track_id, track in output_track.items():
    for frame_id, box in track:
        position = box_positions[(frame_id, id(box))]
        new_frame_dict[frame_id][position]["id"] = track_id

In [None]:
# describe the output_track
# for track_id, track in output_track.items():
#     print(f"Track {track_id}:")
#     for observation in track:
#         print(f"  - Frame {observation[0]}: {observation[1]}")


def draw_object(object_dict, image):
    """Draw one object's bounding box and its "ID: <id>" label on ``image``.

    Args:
        object_dict: Mapping with "x_min", "y_min", "width", "height" and
            "id" entries describing the object.
        image: Image passed to the OpenCV drawing calls.

    Returns:
        The image returned by the final OpenCV drawing call.
    """
    left, top = object_dict["x_min"], object_dict["y_min"]
    right = left + object_dict["width"]
    bottom = top + object_dict["height"]
    label = f"ID: {object_dict['id']}"
    green = (0, 255, 0)

    image = cv2.rectangle(image, (left, top), (right, bottom), green, 2)
    # The label is anchored 10 pixels above the box's top-left corner.
    return cv2.putText(
        image, label, (left, top - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, green, 2
    )
def draw_objects_in_video(video_file, frame_dict, output_file="part_2_demo.mp4",
                          fps=30, frame_size=(700, 500)):
    """Render the objects of ``frame_dict`` onto a video and save the result.

    Every frame of ``video_file`` is resized to ``frame_size``, annotated
    with ``draw_object`` for each object listed under that frame's number,
    and written to ``output_file``. The frame number and each drawn object
    are printed as they are processed.

    Args:
        video_file: Path of the input video.
        frame_dict: Dictionary keyed by the frame number as a string, with a
            list of object dictionaries as each value. Frames without an
            entry are written without annotations.
        output_file: Path of the video to write.
        fps: Frame rate of the output video.
        frame_size: (width, height) that frames are resized to and that the
            output video uses. Must match the resolution the coordinates in
            ``frame_dict`` refer to (700x500 for the provided data file).
    """
    cap = cv2.VideoCapture(video_file)
    vidwrite = None
    # try/finally so the capture and the writer are released even when
    # drawing fails part-way through.
    try:
        ok, image = cap.read()
        vidwrite = cv2.VideoWriter(
            output_file, cv2.VideoWriter_fourcc(*'MP4V'), fps, frame_size
        )
        while ok:
            ######!!!!#######
            # make sure your video is resized to this size, otherwise the
            # coords in the data file won't work !!!
            image = cv2.resize(image, frame_size)
            ######!!!!#######
            # NOTE(review): read after cap.read(); per the OpenCV docs this
            # is the index of the next frame to decode, so the first frame
            # presumably reports 1 - confirm the data file uses the same keys.
            frame_number = int(cap.get(cv2.CAP_PROP_POS_FRAMES))
            print(frame_number)
            objects = frame_dict.get(str(frame_number), [])
            for obj in objects:
                print(obj)
                image = draw_object(obj, image)
            vidwrite.write(image)
            ok, image = cap.read()
    finally:
        if vidwrite is not None:
            vidwrite.release()
        cap.release()
# Render the annotated video from the boxes carrying their track IDs.
draw_objects_in_video(
    VIDEO_FILE,
    new_frame_dict,
    output_file="output.mp4",
)

# Serialize first, then write, so the output file is only opened once the
# dictionary has been converted to JSON.
dumped = json.dumps(new_frame_dict)
with open("part_2_frame_dict.json", "w") as out_file:
    out_file.write(dumped)