In [575]:
import os
from pathlib import Path
import cv2
import numpy as np
import kagglehub

path = kagglehub.dataset_download("trainingdatapro/cars-video-object-tracking")
print("Path to dataset files:", path)

DATASET_DIR = Path(path)

Path to dataset files: /Users/Marta/.cache/kagglehub/datasets/trainingdatapro/cars-video-object-tracking/versions/3


In [576]:
IMAGE_DIR = DATASET_DIR / "images"

assert IMAGE_DIR.exists(), f"Missing {IMAGE_DIR}"
def sorted_images(folder: Path):
    exts = (".png", ".jpg", ".jpeg", ".bmp")
    files = [p for p in folder.iterdir() if p.suffix.lower() in exts]
    files.sort(key=lambda p: p.name)
    return files

IMAGE_FILES = sorted_images(IMAGE_DIR)

print("Frames:", len(IMAGE_FILES))


Frames: 301


## Introduction

TODO: Introduce problem 

how are we going to solve it

why background substraction
why kalman filter

brief 

## Detection

This section extracts moving objects from video frames using background subtraction and morphological cleaning. We use the MOG2 algorithm to separate foreground (moving objects) from the static background. To remove noise, we apply morphological operations such as opening, closing, and dilation. We then use connected components to extract individual objects from the mask.

Because the mask is imperfect due to various factors like vehicle color, distance to camera, and lighting conditions, a single car sometimes appears divided into multiple disconnected blobs. That's why we cluster and group the blobs that are likely to be part of the same vehicle, using proximity-based clustering to merge fragments while avoiding over-merging distant vehicles.

### Functions

Each function is explained in its code comments.

In [577]:
def clean_mask(mask: np.ndarray) -> np.ndarray:
    """ 
     First we apply opening to remove some noise, and then closing to fill small gaps.
    We tested different configurations and this combination gave the most stable results.
    
    """
    
    _, mask = cv2.threshold(mask, 200, 255, cv2.THRESH_BINARY)
    k_open = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
    k_close = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (2, 2)) # note: this is a small value but works better for us than bigger values

    mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, k_open, iterations=2)
    mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, k_close, iterations=2)
    
    return mask

def cluster_blobs_by_proximity(dets: list, distance_threshold: float =100.0) -> list:
    """
    Cluster detections that are close together and merge them into a single detection
    TODO: explanation context for markdown
        Why this is needed:
    Background subtraction often produces multiple small blobs for a single car
    (broken mask, gaps, reflections). If we keep them separated, the tracker may
    create duplicated IDs or jittery centroids.

    Clustering policy:
    - We group detections based on the distance between their centroids.
    - If two detections are closer than distance_threshold, we connect them.
    - The clustering is transitive (A close to B and B close to C -> all become one cluster).
      This is implemented with a Union-Find (Disjoint Set) structure.

    How we merge a cluster:
    - Area: sum of areas (represents total blob size).
    - Centroid: area-weighted average (bigger fragments contribute more).
    - Bounding box: the tight bounding box that contains all member boxes.

    Practical note / trade-off:
    A larger distance_threshold fixes more fragmentation, but it can also merge
    two different cars when they are very close (traffic or adjacent lanes).
    So this parameter controls a balance between "repairing broken blobs" and
    "accidentally merging nearby vehicles".
    """
    if len(dets) <= 1:
        return dets
    n = len(dets)
    clusters = list(range(n))
    def find_root(i):
        if clusters[i] != i:
            clusters[i] = find_root(clusters[i])
        return clusters[i]
    def union(i, j):
        root_i, root_j = find_root(i), find_root(j)
        if root_i != root_j:
            clusters[root_j] = root_i
    for i in range(n):
        for j in range(i + 1, n):
            cx_i, cy_i = dets[i]["centroid"]
            cx_j, cy_j = dets[j]["centroid"]
            d = ((cx_j - cx_i)**2 + (cy_j - cy_i)**2) ** 0.5
            if d < distance_threshold:
                union(i, j)
    # Merge clusters
    cluster_map = {}
    for i in range(n):
        root = find_root(i)
        if root not in cluster_map:
            cluster_map[root] = []
        cluster_map[root].append(i)
    merged = []
    for cluster_indices in cluster_map.values():
        cluster_dets = [dets[i] for i in cluster_indices]
        if len(cluster_dets) == 1:
            merged.append(cluster_dets[0])
        else:
            total_area = sum(d["area"] for d in cluster_dets)
            merged_cx = sum(d["centroid"][0] * d["area"] for d in cluster_dets) / total_area
            merged_cy = sum(d["centroid"][1] * d["area"] for d in cluster_dets) / total_area
            all_xs = [d["bbox"][0] for d in cluster_dets] + [d["bbox"][0] + d["bbox"][2] for d in cluster_dets]
            all_ys = [d["bbox"][1] for d in cluster_dets] + [d["bbox"][1] + d["bbox"][3] for d in cluster_dets]
            x_min, x_max = min(all_xs), max(all_xs)
            y_min, y_max = min(all_ys), max(all_ys)
            merged.append({
                "centroid": (int(merged_cx), int(merged_cy)),
                "bbox": (x_min, y_min, x_max - x_min, y_max - y_min),
                "area": total_area
            })
    return merged

def allowed_area_range(y_bottom, img_h):
    """
    TODO: make this expalanation shorter here, its just for context when explaining in the markdown (this is an important part of the detector)
    Vehicles appear larger when they are closer to the bottom of the image and
    smaller when they are far away. To model this, we analysed the bounding box
    annotations of the original video and studied how their pixel area grows
    depending on their vertical position.

    From this analysis we obtained two curves that estimate the expected minimum
    and maximum area of a vehicle at each height of the image.

    However, real detections from background subtraction are imperfect
    (fragmented blobs, merged cars, lighting effects), and the raw formulas were
    too strict and rejected valid vehicles. To make the detector more tolerant,
    we apply scaling factors (kmin and kmax) that widen the allowed range so
    slightly smaller or larger blobs are still accepted.
   
    """
    t = y_bottom / img_h
    kmin = 0.3  # Adjusted to allow more variance, bigger c
    kmax = 1.3
    min_area = (2000.00 + 44749.12 * t * t) * kmin
    max_area = (2000.00 + 108157.55 * t * t) * kmax
    return min_area, max_area

def detect_blobs(mask: np.ndarray):
    H, W = mask.shape[:2]
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    dets = []
    for c in contours:
        area = cv2.contourArea(c)
        if area <= 0:
            continue
        x, y, w, h = cv2.boundingRect(c)
        y_bottom = y + h  # Use bottom of bbox for perspective model
        minA, maxA = allowed_area_range(y_bottom, H)
        if area <  minA or area > maxA:
            continue
        M = cv2.moments(c)
        if M["m00"] == 0:
            continue
        cx = int(M["m10"] / M["m00"])
        cy = int(M["m01"] / M["m00"])
        dets.append({"centroid": (cx, cy), "bbox": (x, y, w, h), "area": area})
    # Cluster and merge nearby fragmented blobs
    dets = cluster_blobs_by_proximity(dets, distance_threshold=40.0)
    return dets

## Tracking

Next, we need to track each detected car. We chose the Kalman filter because it is fast, supports multiple object tracking, and predicts the probable next position of each car based on its own motion model. Different cars move at different speeds, and the Kalman filter learns and adapts to these individual motion patterns.

The Kalman filter maintains a state vector [x, y, vx, vy] representing position and velocity. The transition matrix defines how the state evolves between frames (it predicts where the car will be based on its current velocity). The measurement matrix maps the detected position (which we observe) to the state space. The process noise covariance controls how much we trust the motion model (lower values = trust motion more), while the measurement noise covariance controls how much we trust the detections (lower values = trust detections more). By balancing these, the filter smooths noisy detections while allowing the object to change speed.

Each detection is attached to a Track object, which aims to maintain a persistent identity throughout the video. Every track has a unique ID and its own Kalman filter. The predict function estimates where the vehicle should be in the current frame using the motion model, before we see any new detections. The update function corrects the prediction using the actual detected position, allowing the filter to learn and adjust if the vehicle's motion changes.


In [578]:
def init_kalman(dt: float = 1.0, process_var=None, meas_var=None) -> cv2.KalmanFilter:
    """
    State: [x, y, vx, vy]^T
    Measurement: [x, y]^T
    process_var, meas_var: empirical, in pixel units.
    """
    kf = cv2.KalmanFilter(4, 2)

    kf.transitionMatrix = np.array([
        [1, 0, dt, 0 ],
        [0, 1, 0 , dt],
        [0, 0, 1 , 0 ],
        [0, 0, 0 , 1 ],
    ], dtype=np.float32)

    kf.measurementMatrix = np.array([
        [1, 0, 0, 0],
        [0, 1, 0, 0],
    ], dtype=np.float32)

    # These are the two main tuning knobs.
    kf.processNoiseCov = np.eye(4, dtype=np.float32) * process_var
    kf.measurementNoiseCov = np.eye(2, dtype=np.float32) * meas_var

    # Start uncertain so it can adapt quickly.
    kf.errorCovPost = np.eye(4, dtype=np.float32) * 500.0
    kf.statePost = np.zeros((4, 1), dtype=np.float32)
    return kf

def color_from_id(track_id: int) -> tuple[int, int, int]:

    rng = np.random.default_rng(track_id)  # stable seed per id
    # Keep colors away from extremes: [40..220]
    c = rng.integers(40, 220, size=3, dtype=np.int32)
    return (int(c[0]), int(c[1]), int(c[2]))  # B, G, R

class Track:
    """
    One vehicle hypothesis + identity.
    We keep:
      - Kalman filter
      - hits: how many times we matched a detection (confidence)
      - missed: how many consecutive frames we failed to match (death timer)
      - confirmed: whether track has been stable for MIN_HITS frames
    """
    def __init__(self, track_id: int, init_xy: tuple[int,int], init_bbox, dt=1.0, process_var=None, meas_var=None):
        self.id = track_id
        self.kf = init_kalman(dt=dt, process_var=process_var, meas_var=meas_var)
        self.color = color_from_id(track_id)

        x, y = init_xy
        self.kf.statePost = np.array([[x], [y], [0], [0]], dtype=np.float32)

        self.hits = 1
        self.missed = 0
        self.bbox = init_bbox
        self.history = [init_xy]
        self.consecutive_hits = 1  # Count consecutive frames with detections
        self.last_pred = (x, y)  # Initialize with first position

    def predict(self) -> tuple[float,float]:
        """
        Predict where the vehicle should be in the current frame (before seeing detections).
        Stores prediction in self.last_pred for association gating.
        """
        pred = self.kf.predict()
        self.last_pred = (float(pred[0]), float(pred[1]))
        return self.last_pred

    def update(self, xy: tuple[int,int], bbox):
        """
        Correct the predicted state using the detection measurement.
        """
        cx, cy = xy
        z = np.array([[cx], [cy]], dtype=np.float32)
        self.kf.correct(z)
        self.hits += 1
        self.consecutive_hits += 1  # Increment consecutive hits
        self.missed = 0
        self.bbox = bbox
        self.history.append(xy)

    def mark_missed(self):
        """
        No detection matched this track this frame
        """
        self.missed += 1
        self.consecutive_hits = 0  # Reset consecutive hits

    def is_confirmed(self, min_hits: int = 3) -> bool:
        """Check if track is stable and should be displayed"""
        return self.consecutive_hits >= min_hits


In [579]:
# adaptive association with measurement validation

def match_distance_for_bbox(bbox):
    """
    Larger objects can move more pixels between frames.
    bbox height ≈ distance to camera (perspective effect).
    Adaptive gate: closer objects (larger) can move more.
    """
    _, _, w, h = bbox
    return 20 + 0.8 * h   # slightly more permissive than before

def euclidean_distance(track, detection):
    """
    Compute Euclidean distance between track prediction and detection.
    Uses predicted state (last_pred) for gating.
    """
    pred_x, pred_y = track.last_pred
    det_x, det_y = detection["centroid"]
    d = ((pred_x - det_x)**2 + (pred_y - det_y)**2) ** 0.5
    return d


def associate_detections_to_tracks(dets, tracks, min_confirmed_hits=3):
    """
    For each track, pick the nearest detection,
    but only if it lies within a size-dependent distance.
    
    Prioritize confirmed tracks to prevent ghost IDs from stealing detections.
    Validate measurements: reject detections that would cause large corrections
    to avoid mask artifacts corrupting track state.
    
    OPTIMIZED: Use sets instead of list.remove() to avoid O(n) operations.
    """

    matches = []
    unmatched_dets = set(range(len(dets)))
    unmatched_tracks = set(range(len(tracks)))

    # First pass: match CONFIRMED tracks (higher priority)
    for ti, t in enumerate(tracks):
        if not t.is_confirmed(min_confirmed_hits) or not unmatched_dets:
            continue

        #tx, ty = t.history[-1]
        tx, ty = getattr(t, "last_pred", t.history[-1])
        best_di = None
        best_d = float("inf")

        for di in unmatched_dets:
            cx, cy = dets[di]["centroid"]
            d = ((tx - cx)**2 + (ty - cy)**2) ** 0.5
            allowed = match_distance_for_bbox(dets[di]["bbox"])
            

            if d < allowed and d < best_d:
                best_d = d
                best_di = di

        if best_di is not None:
            # Additional validation: check that this detection is plausible
            # using Kalman filter's predicted state 
            euclid_d = euclidean_distance(t, dets[best_di])
            pred_allowed = match_distance_for_bbox(dets[best_di]["bbox"]) * 1.2  # 20% relaxation for KF
            if euclid_d < pred_allowed:
                matches.append((ti, best_di))
                unmatched_dets.discard(best_di)
                unmatched_tracks.discard(ti)

    # Second pass: match UNCONFIRMED tracks (lower priority, stricter matching)
    for ti, t in enumerate(tracks):
        if t.is_confirmed(min_confirmed_hits) or ti not in unmatched_tracks or not unmatched_dets:
            continue

        #tx, ty = t.history[-1]
        tx, ty = getattr(t, "last_pred", t.history[-1])
        best_di = None
        best_d = float("inf")

        for di in unmatched_dets:
            cx, cy = dets[di]["centroid"]
            d = ((tx - cx)**2 + (ty - cy)**2) ** 0.5
            # Stricter gate for unconfirmed tracks (avoid spurious matches)
            allowed = match_distance_for_bbox(dets[di]["bbox"]) * 0.7
            

            if d < allowed and d < best_d:
                best_d = d
                best_di = di

        if best_di is not None:
            matches.append((ti, best_di))
            unmatched_dets.discard(best_di)
            unmatched_tracks.discard(ti)

    return matches, list(unmatched_tracks), list(unmatched_dets)


In [580]:
#helper for visualization 
def draw_bbox(img, bbox, label, color, thickness=2):
    x, y, w, h = bbox
    x2, y2 = x + w, y + h

    # optional "shadow" outline for contrast
    cv2.rectangle(img, (x, y), (x2, y2), (0, 0, 0), thickness + 2, lineType=cv2.LINE_AA)
    cv2.rectangle(img, (x, y), (x2, y2), color, thickness, lineType=cv2.LINE_AA)

    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.55
    text_thickness = 2
    (tw, th), baseline = cv2.getTextSize(label, font, font_scale, text_thickness)

    y_text_top = y - (th + baseline + 6)
    if y_text_top < 0:
        y_text_top = y + 2

    x_text = x
    y_text = y_text_top + th + 3

    cv2.rectangle(img, (x_text, y_text_top), (x_text + tw + 8, y_text_top + th + baseline + 6),
                  (0, 0, 0), -1, lineType=cv2.LINE_AA)
    cv2.rectangle(img, (x_text + 1, y_text_top + 1), (x_text + tw + 7, y_text_top + th + baseline + 5),
                  color, -1, lineType=cv2.LINE_AA)

    cv2.putText(img, label, (x_text + 4, y_text),
                font, font_scale, (0, 0, 0), text_thickness, lineType=cv2.LINE_AA)

## Algorithm

The system follows a tracking by detection approach: first it detects moving vehicles in each frame, and then it keeps a persistent identity for each vehicle over time using a Kalman filter motion model.

The tracker behaviour is controlled by a few parameters that determine how tolerant the system is to missing detections and motion uncertainty.

In [581]:
SAVE_VIDEO = True
VIDEO_NAME = "vehicle_tracking_debug.mp4"
VIDEO_FPS = 20
video_writer = None
SHOW_EVERY = 1  
SCALE = 0.4      # scale for display screen

DT = 1.0
MAX_AGE  = 30       # frames allowed to miss before deleting (allows recovery from mask gaps)
MIN_HITS = 3         # show track after this many consecutive matches
PROCESS_VAR = 0.2   
MEAS_VAR    = 1.0
# bacgkground subtractor object 
bg = cv2.createBackgroundSubtractorMOG2(history=400, varThreshold=15, detectShadows=True)

tracks = []
next_id = 1


`DT`  
Represents the time step between frames in the motion model. In this project everything is measured per frame, so it is set to 1.0. It does not change the behaviour, it only keeps the motion equations consistent.

`MAX_AGE`  
Maximum number of consecutive frames a track can remain unmatched before being deleted. This allows a vehicle to temporarily disappear (for example due to glare or segmentation errors) and still keep its identity when it reappears. Larger values make the tracker more tolerant but may keep dead tracks alive longer.

`MIN_HITS`  
Number of consecutive successful matches required before a track is considered reliable. This prevents unstable short detections from immediately becoming tracked vehicles and reduces ID flickering.

`PROCESS_VAR`  
Indicates how uncertain the vehicle motion is assumed to be.  
If it is large, the tracker assumes vehicles may change speed or direction and therefore relies more on the new detections.  
If it is small, the tracker assumes motion is smooth and relies more on its predicted trajectory.

`MEAS_VAR`  
Indicates how noisy the detections are expected to be.  
If it is large, the tracker considers the detections unreliable and follows the predicted trajectory more closely.  
If it is small, the tracker follows the detections more strictly.

In this project the detections come from background subtraction, which is noisy (shadows, glare and fragmentation).  
Therefore the tracker is configured to trust the Kalman prediction more than the raw detections


# Main loop explanation 

For every frame of the video, the algorithm starts by extracting motion using a background subtraction model (MOG2). This produces a binary mask of moving regions. Because this raw mask contains noise, shadows and fragmented shapes, it is cleaned with morphological filtering so that each vehicle ideally becomes a single blob.

From this cleaned mask, blobs are extracted and converted into detections. Each detection contains a centroid (the measurement used by the tracker) and a bounding box (used for validation and visualization). At this stage, the algorithm does not yet know which vehicle is which — it only knows where motion exists in the current frame.

Next comes the prediction stage. Every tracked vehicle already has an associated Kalman filter that stores its estimated position and velocity. Before looking at the new detections, the tracker predicts where each vehicle should appear in the current frame. This prediction allows the system to bridge short detection failures (for example glare, shadows or imperfect segmentation).

After prediction, detections are matched to tracks. The association is done in two passes: confirmed tracks are matched first (to protect stable identities), and unconfirmed tracks are matched afterwards with stricter conditions. A detection is only assigned to a track if it is spatially close enough to the predicted position. This distance gating prevents a vehicle from suddenly jumping to another lane or swapping identity with another car.

When a match is found, the Kalman filter is corrected using the detected centroid. This step combines the prediction and the measurement to obtain a smoother and more stable estimate of the vehicle trajectory. If a track does not receive a detection in the current frame, it is not immediately deleted; instead, it is marked as “missed”. This allows the tracker to survive short occlusions or difficult lighting conditions.

If a detection cannot be matched to any existing track, the system decides whether it represents a new vehicle or a temporarily lost one. To avoid creating duplicate identities, the detection is compared against all predicted track positions. The longer a track has been missing, the larger the allowed distance becomes. This adaptive suppression lets a vehicle disappear for several frames and still recover its original ID when it reappears.

Only detections that are sufficiently far from all existing tracks create a new track with a new identifier. A track is considered reliable only after it has been successfully matched for several consecutive frames. Finally, tracks that remain unmatched for too long are removed from the system.

Overall, the algorithm maintains stable vehicle identities by combining three ideas: motion detection to obtain measurements, a Kalman filter to predict motion over time, and adaptive association rules that tolerate temporary detection failures while avoiding duplicated IDs.



In [582]:
# Main loop TODO: explanation below can be deleted if not needed
#For every frame, the Kalman filter predicts where the car should be (the predicted position).
# When new detections (blobs) are found, the code tries to match them to existing tracks using both the last known position and the Kalman filter's prediction.
#If a detection matches a track, the Kalman filter updates (corrects) its state using the detected position.


for i, img_path in enumerate(IMAGE_FILES):
    frame = cv2.imread(str(img_path))
    if frame is None:
        continue

    H, W = frame.shape[:2]

    # 1) Foreground mask (motion)
    fg = bg.apply(frame)

    # 2) Clean the mask
    fg_clean = clean_mask(fg)

    # 3) Detections = blobs (with merging of fragments)
    dets = detect_blobs(fg_clean)

    # 4) Predict all tracks (KF motion model)
    for t in tracks:
        t.predict()

    # 5) Associate with two pass approach (confirmed tracks first)
    matches, unmatched_tracks, unmatched_dets = associate_detections_to_tracks(
        dets, tracks, min_confirmed_hits=MIN_HITS
    )

    # 6) Update matched tracks (KF correction step)
    for ti, di in matches:
        cx, cy = dets[di]["centroid"]
        bbox = dets[di]["bbox"]
        tracks[ti].update((cx, cy), bbox)

    # 7) Mark unmatched tracks: we collect them in order to delete them later
    for ti in unmatched_tracks:
        tracks[ti].mark_missed()

    # 8) Create new tracks for unmatched detections
    for di in unmatched_dets:
        cx, cy = dets[di]["centroid"]
        bbox = dets[di]["bbox"]

        # duplicate suppression: don't create new ID if near any existing track 
        duplicate = False
        for t in tracks:
            #tx, ty = t.history[-1]
            tx, ty = getattr(t, "last_pred", t.history[-1]) # use prediction for better gating
            dist = ((tx - cx)**2 + (ty - cy)**2) ** 0.5
            base = 40 if not t.is_confirmed(MIN_HITS) else 50
            # If the track has missed frames, enlarge the suppression radius
            # Linear growth with a cap to avoid suppressing truly new vehicles
            threshold = min(120, base + 20 * t.missed)
            if dist < threshold:
                duplicate = True
                break
        if duplicate:
            continue

        # Allow new track creation 

        tracks.append(Track(
            track_id=next_id,
            init_xy=(cx, cy),
            init_bbox=bbox,
            dt=DT,
            process_var=PROCESS_VAR,
            meas_var=MEAS_VAR
        ))
        next_id += 1

    # 9) Delete only truly stale tracks (high MAX_AGE tolerance)
    tracks = [t for t in tracks if t.missed <= MAX_AGE]

    # 10) Visualize + record 
    if i % SHOW_EVERY == 0:
        vis = frame.copy()

        # Draw only confirmed tracks
        for t in tracks:
            if not t.is_confirmed(MIN_HITS):
                continue
            if t.missed > 1:
                continue
            label = f"vehicle_{t.id}"
            draw_bbox(vis, t.bbox, label, t.color, thickness=2)

        mask_vis = cv2.cvtColor(fg_clean, cv2.COLOR_GRAY2BGR)

        vis_small  = cv2.resize(vis, (int(W * SCALE), int(H * SCALE)))
        mask_small = cv2.resize(mask_vis, (int(W * SCALE), int(H * SCALE)))

        stacked = np.vstack([mask_small, vis_small])

        # init writer 
        if SAVE_VIDEO and video_writer is None:
            hh, ww = stacked.shape[:2]
            fourcc = cv2.VideoWriter_fourcc(*"mp4v")
            video_writer = cv2.VideoWriter(VIDEO_NAME, fourcc, VIDEO_FPS, (ww, hh))
            print("Recording video to:", VIDEO_NAME)

        if SAVE_VIDEO and video_writer is not None:
            video_writer.write(stacked)

        cv2.imshow("vehicle tracking", stacked)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

# Cleanup
if video_writer is not None:
    video_writer.release()
    print("Saved:", VIDEO_NAME)

cv2.destroyAllWindows()



Recording video to: vehicle_tracking_debug.mp4


  self.last_pred = (float(pred[0]), float(pred[1]))


Saved: vehicle_tracking_debug.mp4


### challenges

* multiple ids per car (duplicates)
* id shifting/jumping 
* misses, we tried to find the perfect balance between reducin duplicates at the cost of misses and the opposite
* a disadvantage of this method is that it is strongly tuned to the specific camera perspective
* in our video theres a light area that causes a lot of missed trackings 
* car color affects the detection. theres one car that mixes with the lane in the first frames
* also mog2 learns online so it needs a few frames to stabilize, this is a problem for our video cause its short