In [27]:
import os
import pandas as pd
import json
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np



In [28]:
# Root of the local MABe dataset checkout; both training trees sit directly under it.
base_path = "/home/rahuladhi/Desktop/AI/MABe/MABe-mouse-behavior-detection"
train_tracking_path, train_annotation_path = (
    os.path.join(base_path, subdir)
    for subdir in ("train_tracking", "train_annotation")
)

# Fail-fast sanity check: confirm both trees are present before listing them.
tracking_path_exists = os.path.exists(train_tracking_path)
annotation_path_exists = os.path.exists(train_annotation_path)
print("Tracking path exists: ", tracking_path_exists)
print("Annotation path exists: ", annotation_path_exists)


Tracking path exists:  True
Annotation path exists:  True


In [29]:
# One sub-folder per dataset on each side; sort in place so the listing is stable.
tracking_folders = os.listdir(train_tracking_path)
tracking_folders.sort()
annotation_folders = os.listdir(train_annotation_path)
annotation_folders.sort()

print("Number of tracking folders: ", len(tracking_folders))
print("NUmber of annotation folders: ", len(annotation_folders))

# Show only the first few names as a preview.
print("Example of tracking folders: ", tracking_folders[0:5])
print("Example of annotation folders: ", annotation_folders[0:5])

Number of tracking folders:  21
NUmber of annotation folders:  19
Example of tracking folders:  ['AdaptableSnail', 'BoisterousParrot', 'CRIM13', 'CalMS21_supplemental', 'CalMS21_task1']
Example of annotation folders:  ['AdaptableSnail', 'BoisterousParrot', 'CRIM13', 'CalMS21_supplemental', 'CalMS21_task1']


In [30]:
# Compare folder names on both sides to spot datasets that exist on only one of them.
tracking_set, annotation_set = set(tracking_folders), set(annotation_folders)

only_in_tracking = tracking_set.difference(annotation_set)
only_in_annotation = annotation_set.difference(tracking_set)

print("There in tracking but not in annotation: ", only_in_tracking)
print("There in annotation but not in tracking: ", only_in_annotation)

There in tracking but not in annotation:  {'MABe22_keypoints', 'MABe22_movies'}
There in annotation but not in tracking:  set()


In [31]:
# Only folders present on both sides can be paired; keep them in sorted order.
valid_folders_set = tracking_set.intersection(annotation_set)
valid_folders = list(valid_folders_set)
valid_folders.sort()
len(valid_folders)


19

In [32]:
# Folders whose tracking and annotation directories do not contain exactly the
# same set of .parquet file names.
mismatch_folders = []

for folder in valid_folders:
    tracking_files_path = os.path.join(train_tracking_path, folder)
    annotation_files_path = os.path.join(train_annotation_path, folder)

    tracking_files = [f for f in os.listdir(tracking_files_path) if f.endswith(".parquet")]
    annotation_files = [f for f in os.listdir(annotation_files_path) if f.endswith(".parquet")]

    tracking_files_set = set(tracking_files)
    annotation_files_set = set(annotation_files)

    # Compare the names, not just the counts: two folders can hold the same
    # number of files and still fail to pair up one-to-one.
    if tracking_files_set != annotation_files_set:
        print("Not matching - ", folder)
        mismatch_folders.append(folder)


Not matching -  PleasantMeerkat
Not matching -  SparklingTapir


In [33]:
# For every mismatching folder, report which .parquet files are missing on each side.
for folder in mismatch_folders:
    print(folder)
    tracking_mismatch_file_path = os.path.join(train_tracking_path, folder)
    annotation_mismatch_file_path = os.path.join(train_annotation_path, folder)

    # Restrict to .parquet files, matching the filter used when the mismatch
    # was detected, so stray files in a directory do not skew the report.
    tracking_mismatch_files = sorted(
        f for f in os.listdir(tracking_mismatch_file_path) if f.endswith(".parquet")
    )
    annotation_mismatch_files = sorted(
        f for f in os.listdir(annotation_mismatch_file_path) if f.endswith(".parquet")
    )

    print("Number of files in tracking: ", len(tracking_mismatch_files))
    print("Number of files in annotation: ", len(annotation_mismatch_files))

    tracking_mismatch_set = set(tracking_mismatch_files)
    annotation_mismatch_set = set(annotation_mismatch_files)

    # Sets print in arbitrary order; sort the differences so the output is
    # identical from run to run.
    print("Missing in tracking: ", sorted(annotation_mismatch_set - tracking_mismatch_set))
    print("Missing in annotation: ", sorted(tracking_mismatch_set - annotation_mismatch_set))




PleasantMeerkat
Number of files in tracking:  36
Number of files in annotation:  35
Missing in tracking:  set()
Missing in annotation:  {'1375833299.parquet'}
SparklingTapir
Number of files in tracking:  69
Number of files in annotation:  54
Missing in tracking:  set()
Missing in annotation:  {'834408298.parquet', '484405601.parquet', '610412175.parquet', '139713291.parquet', '167444193.parquet', '1772737271.parquet', '1366115611.parquet', '687999061.parquet', '1430299100.parquet', '1543851393.parquet', '329031399.parquet', '361341393.parquet', '1588709555.parquet', '801328824.parquet', '1085312517.parquet'}


In [34]:
def _list_parquet_names(directory):
    """Return the names of the entries in ``directory`` that end with ``.parquet``."""
    return [name for name in os.listdir(directory) if name.endswith(".parquet")]


# Map each paired folder to the sorted .parquet names present on BOTH sides.
valid_files_per_folder = {}

for folder in valid_folders:
    tracking_files_path = os.path.join(train_tracking_path, folder)
    annotation_files_path = os.path.join(train_annotation_path, folder)

    tracking_files = _list_parquet_names(tracking_files_path)
    annotation_files = _list_parquet_names(annotation_files_path)

    tracking_files_set = set(tracking_files)
    annotation_files_set = set(annotation_files)

    # Only files with a counterpart on the other side are usable.
    valid_files_set = tracking_files_set.intersection(annotation_files_set)
    valid_files = sorted(valid_files_set)

    valid_files_per_folder[folder] = valid_files
    

In [35]:
# Build one summary row per paired folder: file counts on each side, the number
# of usable (paired) files, and whether the two sides agree.
summary_data = []

for folder in valid_folders:
    tracking_count = len([f for f in os.listdir(os.path.join(train_tracking_path, folder)) if f.endswith('.parquet')])
    annotation_count = len([f for f in os.listdir(os.path.join(train_annotation_path, folder)) if f.endswith('.parquet')])
    valid_count = len(valid_files_per_folder.get(folder, []))
    # Equal counts alone do not prove the file names pair up; requiring the
    # paired count to equal both sides does (|A| == |B| == |A & B|  =>  A == B).
    match_status = "✅" if tracking_count == annotation_count == valid_count else "⚠️"

    summary_data.append({
        "folder": folder,
        "tracking_count": tracking_count,
        "annotation_count": annotation_count,
        "valid_count": valid_count,
        "match_status": match_status
    })


# Explicit columns keep the frame well-formed (and sortable) even when there are
# no rows; chaining instead of inplace=True makes the cell safe to re-run.
summary_df = (
    pd.DataFrame(
        summary_data,
        columns=["folder", "tracking_count", "annotation_count", "valid_count", "match_status"],
    )
    .sort_values("folder")
    .reset_index(drop=True)
)


In [36]:
# Bare expression on the cell's last line: shows the per-folder summary table as the cell output.
summary_df

Unnamed: 0,folder,tracking_count,annotation_count,valid_count,match_status
0,AdaptableSnail,17,17,17,✅
1,BoisterousParrot,8,8,8,✅
2,CRIM13,21,21,21,✅
3,CalMS21_supplemental,297,297,297,✅
4,CalMS21_task1,101,101,101,✅
5,CalMS21_task2,76,76,76,✅
6,CautiousGiraffe,10,10,10,✅
7,DeliriousFly,6,6,6,✅
8,ElegantMink,19,19,19,✅
9,GroovyShrew,17,17,17,✅


In [37]:
# Expand "~" explicitly so the parent directory can be created first; without
# this the write fails on a machine where the data folder does not exist yet.
summary_csv_path = os.path.expanduser("~/Desktop/AI/MABe/data/folder_summary.csv")
os.makedirs(os.path.dirname(summary_csv_path), exist_ok=True)
summary_df.to_csv(summary_csv_path, index=False)
print("Summary of the folders has been saved successfully")

Summary of the folders has been saved successfully


In [38]:
import os, json
# Persist the folder -> paired-file-names mapping so later steps can reload it
# without re-scanning the dataset directories.
json_path = os.path.expanduser("~/Desktop/AI/MABe/data/valid_files_per_folder.json")
# Create the output directory if needed so the write does not fail on a fresh machine.
os.makedirs(os.path.dirname(json_path), exist_ok=True)
with open(json_path, "w") as f:
    json.dump(valid_files_per_folder, f, indent=4)

print(f"✅ Valid files mapping saved to {json_path}")



✅ Valid files mapping saved to /home/rahuladhi/Desktop/AI/MABe/data/valid_files_per_folder.json


In [39]:
# Reload the saved folder -> file-names mapping from disk.
data_dir = os.path.join("/home/rahuladhi/Desktop/AI/MABe/", "data")
valid_files_path = os.path.join(data_dir, "valid_files_per_folder.json")

with open(valid_files_path, "r") as f:
    valid_files_per_folder = json.loads(f.read())

# Quick per-folder count to confirm the mapping was read back as expected.
for folder, files in valid_files_per_folder.items():
    file_count = len(files)
    print(f"{folder}: {file_count} files")

AdaptableSnail: 17 files
BoisterousParrot: 8 files
CRIM13: 21 files
CalMS21_supplemental: 297 files
CalMS21_task1: 101 files
CalMS21_task2: 76 files
CautiousGiraffe: 10 files
DeliriousFly: 6 files
ElegantMink: 19 files
GroovyShrew: 17 files
InvincibleJellyfish: 33 files
JovialSwallow: 52 files
LyricalHare: 24 files
NiftyGoldfinch: 11 files
PleasantMeerkat: 35 files
ReflectiveManatee: 20 files
SparklingTapir: 54 files
TranquilPanther: 25 files
UppityFerret: 21 files


In [40]:
# Pick the first paired file of one folder as a worked example.
example_folder = "AdaptableSnail"
example_file = valid_files_per_folder[example_folder][0]

# The same file name is used on both sides; only the top-level tree differs.
train_tracking_example = os.path.join(base_path, "train_tracking", example_folder)
train_annotation_example = os.path.join(base_path, "train_annotation", example_folder)

example_tracking = os.path.join(train_tracking_example, example_file)
example_annotation = os.path.join(train_annotation_example, example_file)

print("Tracking file: ", example_tracking)
print("Annotation file: ", example_annotation)


Tracking file:  /home/rahuladhi/Desktop/AI/MABe/MABe-mouse-behavior-detection/train_tracking/AdaptableSnail/1212811043.parquet
Annotation file:  /home/rahuladhi/Desktop/AI/MABe/MABe-mouse-behavior-detection/train_annotation/AdaptableSnail/1212811043.parquet


In [41]:
import fastparquet
# Load the example tracking/annotation pair (tracking first, then annotation).
tracking_df, annotation_df = (
    pd.read_parquet(parquet_path, engine='fastparquet')
    for parquet_path in (example_tracking, example_annotation)
)

# Column overview: at most the first ten tracking columns, all annotation columns.
tracking_columns_preview = tracking_df.columns.tolist()[:10]
annotation_columns = annotation_df.columns.tolist()
print("Tracking columns: ", tracking_columns_preview)
print("Annotation columns: ", annotation_columns)

# Frame extents on each side: tracking is per-frame, annotation is start/stop intervals.
tracking_frames = tracking_df["video_frame"]
print("Tracking frame range: ", tracking_frames.min(), "-", tracking_frames.max())
print("Annotation frame range: ", annotation_df["start_frame"].min(), "-", annotation_df["stop_frame"].max())

Tracking columns:  ['video_frame', 'mouse_id', 'bodypart', 'x', 'y']
Annotation columns:  ['agent_id', 'target_id', 'action', 'start_frame', 'stop_frame']
Tracking frame range:  0 - 89975
Annotation frame range:  2 - 89467


In [42]:
def check_frame_alignment(tracking_df, annotation_df):
    """
    Compare frame ranges between tracking and annotation dataframes.

    Frame bounds are treated as inclusive, so a range ``(lo, hi)`` spans
    ``hi - lo + 1`` frames. The overlap ratio is the share of the tracking
    range that is also covered by the annotation range, clamped to ``[0, 1]``.

    Parameters
    ----------
    tracking_df : DataFrame
        Must contain a ``video_frame`` column.
    annotation_df : DataFrame
        Must contain ``start_frame`` and ``stop_frame`` columns.

    Returns
    -------
    dict
        ``tracking_range`` and ``annotation_range`` as ``(min, max)`` tuples,
        ``overlap_ratio`` rounded to 3 decimals, and ``status`` which is
        ``"aligned"`` when the ratio is at least 0.95, else ``"trim required"``.
        If either table has no usable frame bounds the ratio is ``0.0``.
    """
    t_min, t_max = tracking_df["video_frame"].min(), tracking_df["video_frame"].max()
    a_min, a_max = annotation_df["start_frame"].min(), annotation_df["stop_frame"].max()

    if any(pd.isna(bound) for bound in (t_min, t_max, a_min, a_max)):
        # min()/max() of an empty or all-NaN column is NaN. Python's max/min
        # silently ignore a NaN second argument, which would make an empty
        # annotation table look perfectly aligned - treat it as no overlap.
        overlap_ratio = 0.0
    else:
        overlap_start = max(t_min, a_min)
        overlap_end = min(t_max, a_max)
        # Inclusive lengths: never negative for disjoint ranges, and the
        # denominator is at least 1 even when tracking holds a single frame.
        overlap_len = max(0, overlap_end - overlap_start + 1)
        tracking_len = t_max - t_min + 1
        overlap_ratio = overlap_len / tracking_len

    status = "aligned" if overlap_ratio >= 0.95 else "trim required"

    return {
        "tracking_range": (t_min, t_max),
        "annotation_range": (a_min, a_max),
        "overlap_ratio": round(overlap_ratio, 3),
        "status": status
    }

# Sanity-check the helper on the example tracking/annotation pair.
alignment_info = check_frame_alignment(
    tracking_df=tracking_df,
    annotation_df=annotation_df,
)
print(alignment_info)


{'tracking_range': (0, 89975), 'annotation_range': (2, 89467), 'overlap_ratio': 0.994, 'status': 'aligned'}


In [43]:
# Project root; alignment outputs are written beneath its "data" folder.
root_path = "/home/rahuladhi/Desktop/AI/MABe"

# Tracking-utilization cut-offs used when classifying a file pair:
# at or above STRICT is treated as aligned, at or above SOFT as auto-trimmable.
STRICT_THRESHOLD, SOFT_THRESHOLD = 0.99, 0.95

# Per-file alignment metadata goes here; create the folder up front.
metadata_dir = os.path.join(root_path, "data", "alignment_metadata")
os.makedirs(metadata_dir, exist_ok=True)

# Accumulates one result dict per processed file pair.
alignment_records = list()

def to_serializable(obj):
    """Recursively convert NumPy data types to native Python types.

    Handles dicts (values, plus keys that are NumPy scalars), lists and tuples
    (both returned as lists), NumPy arrays (returned as nested lists) and any
    NumPy scalar, including ``np.bool_``, which ``json`` cannot encode.
    Anything else is returned unchanged.
    """
    if isinstance(obj, dict):
        # json only accepts native str/int/float/bool/None keys, so unwrap
        # NumPy scalar keys as well as the values.
        return {
            (k.item() if isinstance(k, np.generic) else k): to_serializable(v)
            for k, v in obj.items()
        }
    if isinstance(obj, (list, tuple)):
        return [to_serializable(v) for v in obj]
    if isinstance(obj, np.ndarray):
        # tolist() already yields native Python scalars; recurse anyway to
        # cover object arrays that hold nested NumPy values.
        return to_serializable(obj.tolist())
    if isinstance(obj, np.generic):
        # np.generic is the base class of every NumPy scalar (ints, floats,
        # bools, ...); item() returns the equivalent native Python object.
        return obj.item()
    return obj


def check_frame_alignment(tracking_df, annotation_df, strict_threshold=None, soft_threshold=None):
    """Measure how well the annotated frame range overlaps the tracked frame range.

    Frame bounds are inclusive, so a range ``(lo, hi)`` spans ``hi - lo + 1``
    frames. This definition replaces the simpler function of the same name
    defined earlier in the notebook.

    Parameters
    ----------
    tracking_df : DataFrame
        Must contain a ``video_frame`` column.
    annotation_df : DataFrame
        Must contain ``start_frame`` and ``stop_frame`` columns.
    strict_threshold, soft_threshold : float, optional
        Tracking-utilization cut-offs. When omitted, the module-level
        ``STRICT_THRESHOLD`` and ``SOFT_THRESHOLD`` are used.

    Returns
    -------
    dict
        ``tracking_range``, ``annotation_range``, ``tracking_frames``,
        ``annotation_frames``, ``overlap_frames``,
        ``tracking_utilization_ratio`` (overlap / tracking length),
        ``annotation_coverage_ratio`` (overlap / annotation length),
        ``status``, ``action`` and ``final_range`` (the intersection bounds).
        ``final_range`` is ``None`` when either table has no usable frame
        bounds (empty or all-NaN); such pairs are always ``manual_review``.
    """
    if strict_threshold is None:
        strict_threshold = STRICT_THRESHOLD
    if soft_threshold is None:
        soft_threshold = SOFT_THRESHOLD

    t_min, t_max = tracking_df["video_frame"].min(), tracking_df["video_frame"].max()
    a_min, a_max = annotation_df["start_frame"].min(), annotation_df["stop_frame"].max()

    # min()/max() of an empty or all-NaN column is NaN. Python's max/min skip a
    # NaN second argument, so without this guard an empty annotation table would
    # yield the full tracking range as "overlap" and be classified as aligned.
    tracking_ok = not (pd.isna(t_min) or pd.isna(t_max))
    annotation_ok = not (pd.isna(a_min) or pd.isna(a_max))

    tracking_len = (t_max - t_min) + 1 if tracking_ok else 0
    annotation_len = (a_max - a_min) + 1 if annotation_ok else 0

    if tracking_ok and annotation_ok:
        overlap_start = max(t_min, a_min)
        overlap_end = min(t_max, a_max)
        # Clamp at zero so disjoint ranges report no overlap instead of a negative one.
        overlap_len = max(0, (overlap_end - overlap_start + 1))
        final_range = (overlap_start, overlap_end)
    else:
        overlap_len = 0
        final_range = None

    tracking_utilization = overlap_len / tracking_len if tracking_len > 0 else 0.0
    annotation_coverage = overlap_len / annotation_len if annotation_len > 0 else 0.0

    if tracking_utilization >= strict_threshold:
        status, action = "aligned", "none"
    elif tracking_utilization >= soft_threshold:
        status, action = "auto_trim", "trim_to_intersection"
    else:
        status, action = "manual_review", "check_alignment"

    return {
        "tracking_range": (t_min, t_max),
        "annotation_range": (a_min, a_max),
        "tracking_frames": tracking_len,
        "annotation_frames": annotation_len,
        "overlap_frames": overlap_len,
        "tracking_utilization_ratio": round(tracking_utilization, 6),
        "annotation_coverage_ratio": round(annotation_coverage, 6),
        "status": status,
        "action": action,
        "final_range": final_range
    }


# Files that could not be processed, kept so failures stay visible after the
# progress output has scrolled away and can be inspected or retried later.
alignment_errors = []

for folder, file_list in tqdm(valid_files_per_folder.items(), desc = "Checking frame alignment"):
    # Per-file JSON metadata is grouped in one sub-folder per dataset folder.
    folder_meta_dir = os.path.join(metadata_dir, folder)
    os.makedirs(folder_meta_dir, exist_ok = True)

    for file_name in file_list:
        tracking_path = os.path.join(base_path, "train_tracking", folder, file_name)
        annotation_path = os.path.join(base_path, "train_annotation", folder, file_name)

        try:
            tracking_df = pd.read_parquet(tracking_path, engine="fastparquet")
            annotation_df = pd.read_parquet(annotation_path, engine="fastparquet")

            info = check_frame_alignment(tracking_df, annotation_df)
            info.update({"folder": folder, "file": file_name})
            alignment_records.append(info)

            json_path = os.path.join(folder_meta_dir, f"{file_name}.json")
            with open(json_path, "w") as f:
                # NumPy scalars in the result are not JSON-encodable as-is.
                json.dump(to_serializable(info), f, indent=4)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole scan,
            # but record it so the failure is not lost.
            print(f"Error in {folder}/{file_name}: {e}")
            alignment_errors.append({"folder": folder, "file": file_name, "error": str(e)})


alignment_df = pd.DataFrame(alignment_records)
summary_path = os.path.join(root_path, "data", "frame_alignment_summary.csv")
alignment_df.to_csv(summary_path, index=False)

print(f"Frame alignment summary saved to: {summary_path}")
print(f"Per-file metadata saved under: {metadata_dir}")
if alignment_errors:
    print(f"{len(alignment_errors)} file(s) failed and may be missing from the outputs; see alignment_errors")


Checking frame alignment: 100%|██████████| 19/19 [00:10<00:00,  1.83it/s]

Frame alignment summary saved to: /home/rahuladhi/Desktop/AI/MABe/data/frame_alignment_summary.csv
Per-file metadata saved under: /home/rahuladhi/Desktop/AI/MABe/data/alignment_metadata



