In [2]:
import os
import pandas as pd
import json
from tqdm import tqdm
import matplotlib.pyplot as plt



In [3]:
base_path = "/home/rahuladhi/Desktop/AI/MABe/MABe-mouse-behavior-detection"
train_tracking_path = os.path.join(base_path, 'train_tracking')
train_annotation_path = os.path.join(base_path, 'train_annotation')

print("Tracking path exists: ", os.path.exists(train_tracking_path))
print("Annotation path exists: ", os.path.exists(train_annotation_path))


Tracking path exists:  True
Annotation path exists:  True


In [13]:
tracking_folders = sorted(os.listdir(train_tracking_path))
annotation_folders = sorted(os.listdir(train_annotation_path))

print("Number of tracking folders: ", len(tracking_folders))
print("NUmber of annotation folders: ", len(annotation_folders))

print("Example of tracking folders: ", tracking_folders[:5])
print("Example of annotation folders: ", annotation_folders[:5])

Number of tracking folders:  21
NUmber of annotation folders:  19
Example of tracking folders:  ['AdaptableSnail', 'BoisterousParrot', 'CRIM13', 'CalMS21_supplemental', 'CalMS21_task1']
Example of annotation folders:  ['AdaptableSnail', 'BoisterousParrot', 'CRIM13', 'CalMS21_supplemental', 'CalMS21_task1']


In [14]:
tracking_set = set(tracking_folders)
annotation_set = set(annotation_folders)

print("There in tracking but not in annotation: ", tracking_set - annotation_set)
print("There in annotation but not in tracking: ", annotation_set - tracking_set)

There in tracking but not in annotation:  {'MABe22_movies', 'MABe22_keypoints'}
There in annotation but not in tracking:  set()


In [15]:
valid_folders_set = tracking_set & annotation_set
valid_folders = sorted(valid_folders_set)
len(valid_folders)


19

In [16]:
mismatch_folders = []


for folder in valid_folders:
    tracking_files_path = os.path.join(train_tracking_path, folder)
    annotation_files_path = os.path.join(train_annotation_path, folder)
    tracking_files = []
    annotation_files = []

    for f in os.listdir(tracking_files_path):
        if f.endswith(".parquet"):
            tracking_files.append(f)
    for f in os.listdir(annotation_files_path):
        if f.endswith(".parquet"):
            annotation_files.append(f)

    tracking_files_set = set(tracking_files)
    annotation_files_set = set(annotation_files)

    #print("Number of tracking files: ", len(tracking_files))
    #print("Number of annotation files: ", len(annotation_files))


    if len(tracking_files)!=len(annotation_files):
        print("Not matching - ", folder)
        mismatch_folders.append(folder)


Not matching -  PleasantMeerkat
Not matching -  SparklingTapir


In [17]:
for folder in mismatch_folders:
    print(folder)
    tracking_mismatch_file_path = os.path.join(train_tracking_path, folder)
    annotation_mismatch_file_path = os.path.join(train_annotation_path, folder)

    tracking_mismatch_files = sorted(os.listdir(tracking_mismatch_file_path))
    annotation_mismatch_files = sorted(os.listdir(annotation_mismatch_file_path))

    print("Number of files in tracking: ", len(tracking_mismatch_files))
    print("Number of files in annotation: ", len(annotation_mismatch_files))

    tracking_mismatch_set = set(tracking_mismatch_files)
    annotation_mismatch_set = set(annotation_mismatch_files)

    print("Missing in tracking: ", annotation_mismatch_set-tracking_mismatch_set)
    print("Missing in annotation: ", tracking_mismatch_set-annotation_mismatch_set)




PleasantMeerkat
Number of files in tracking:  36
Number of files in annotation:  35
Missing in tracking:  set()
Missing in annotation:  {'1375833299.parquet'}
SparklingTapir
Number of files in tracking:  69
Number of files in annotation:  54
Missing in tracking:  set()
Missing in annotation:  {'1430299100.parquet', '801328824.parquet', '1543851393.parquet', '329031399.parquet', '1772737271.parquet', '1588709555.parquet', '167444193.parquet', '834408298.parquet', '1085312517.parquet', '1366115611.parquet', '610412175.parquet', '361341393.parquet', '484405601.parquet', '139713291.parquet', '687999061.parquet'}


In [18]:
valid_files_per_folder = {}

for folder in valid_folders:
    tracking_files_path = os.path.join(train_tracking_path, folder)
    annotation_files_path = os.path.join(train_annotation_path, folder)
    tracking_files = []
    annotation_files = []

    for f in os.listdir(tracking_files_path):
        if f.endswith(".parquet"):
            tracking_files.append(f)
    for f in os.listdir(annotation_files_path):
        if f.endswith(".parquet"):
            annotation_files.append(f)

    tracking_files_set = set(tracking_files)
    annotation_files_set = set(annotation_files)

    valid_files_set = tracking_files_set & annotation_files_set
    valid_files = sorted(valid_files_set)

    valid_files_per_folder[folder] = valid_files
    

In [19]:
summary_data = []

for folder in valid_folders:
    tracking_count = len([f for f in os.listdir(os.path.join(train_tracking_path, folder)) if f.endswith('.parquet')])
    annotation_count = len([f for f in os.listdir(os.path.join(train_annotation_path, folder)) if f.endswith('.parquet')])
    valid_count = len(valid_files_per_folder.get(folder, []))
    match_status = "✅" if tracking_count == annotation_count else "⚠️"

    summary_data.append({
        "folder": folder,
        "tracking_count": tracking_count,
        "annotation_count": annotation_count,
        "valid_count": valid_count,
        "match_status": match_status
    })


summary_df = pd.DataFrame(summary_data)
summary_df.sort_values("folder", inplace=True)
summary_df.reset_index(drop=True, inplace=True)


In [20]:
summary_df

Unnamed: 0,folder,tracking_count,annotation_count,valid_count,match_status
0,AdaptableSnail,17,17,17,✅
1,BoisterousParrot,8,8,8,✅
2,CRIM13,21,21,21,✅
3,CalMS21_supplemental,297,297,297,✅
4,CalMS21_task1,101,101,101,✅
5,CalMS21_task2,76,76,76,✅
6,CautiousGiraffe,10,10,10,✅
7,DeliriousFly,6,6,6,✅
8,ElegantMink,19,19,19,✅
9,GroovyShrew,17,17,17,✅


In [None]:
summary_df.to_csv("~/Desktop/AI/MABe/data/folder_summary.csv", index=False)
print("Summary of the folders has been saved successfully")

Summary of the folders has been saved successfully


In [None]:
import os, json
json_path = os.path.expanduser("~/Desktop/AI/MABe/data/valid_files_per_folder.json")
with open(json_path, "w") as f:
    json.dump(valid_files_per_folder, f, indent=4)

print(f"✅ Valid files mapping saved to {json_path}")



✅ Valid files mapping saved to /home/rahuladhi/Desktop/AI/MABe/data/valid_files_per_folder.json
