In [3]:
import json
import pandas as pd
from pathlib import Path

# Load each labeler's annotation file into `data`, keyed by labeler name
file_paths = [
    "Ali.json",
    "Dr.Lory.json",
    "Krystal.json"
]

data = {}
for path in file_paths:
    # The file stem doubles as the labeler name, e.g. "Dr.Lory.json" -> "Dr.Lory"
    labeler_name = Path(path).stem
    # JSON text is UTF-8 by specification; pass the encoding explicitly so the
    # result does not depend on the platform's locale default
    with open(path, encoding="utf-8") as f:
        data[labeler_name] = json.load(f)

# Flatten the nested mapping
#   {labeler: {image_id: {"images": {subimage_id: label}}}}
# into one record per (labeler, image, sub-image) for tabular analysis
records = [
    {
        "labeler": labeler,
        "image_id": image_id,
        "subimage_id": subimage_id,
        "label": label,
    }
    for labeler, annotations in data.items()
    for image_id, details in annotations.items()
    for subimage_id, label in details["images"].items()
]

df = pd.DataFrame(records)

# Per-labeler coverage: distinct images and number of sub-image labels
by_labeler = df.groupby("labeler")
stats = {
    "images_per_labeler": by_labeler["image_id"].nunique(),
    "subimages_per_labeler": by_labeler["subimage_id"].count(),
}

# How often each labeler used each label value (rows: labeler, columns: label).
# Computed here but not part of this cell's displayed output.
label_distribution = (
    df.groupby(["labeler", "label"])
    .size()
    .unstack(fill_value=0)
)

# Agreement analysis: one row per (image, sub-image) pair, one column per labeler,
# then the standard deviation across labelers; a std of 0 means the labels agree
pivot = df.pivot_table(
    index=["image_id", "subimage_id"],
    columns="labeler",
    values="label",
)
pivot_std = pivot.std(axis=1)
agreement_summary = pivot_std.describe()

stats, agreement_summary


({'images_per_labeler': labeler
  Ali        50
  Dr.Lory    50
  Krystal    50
  Name: image_id, dtype: int64,
  'subimages_per_labeler': labeler
  Ali        400
  Dr.Lory    400
  Krystal    400
  Name: subimage_id, dtype: int64},
 count    400.000000
 mean       0.652379
 std        0.662444
 min        0.000000
 25%        0.000000
 50%        0.577350
 75%        1.154701
 max        2.309401
 dtype: float64)

In [5]:
# Updated group assignment function with new NA condition
def assign_group_updated(tile_labels, count_threshold=4):
    """Map the tile labels of one image to a single group code.

    Rules are checked in order and the first match wins:

    1. any tile labelled 4                             -> "DG"
    2. every tile labelled 0 or 1                      -> "DNG"
    3. more than ``count_threshold`` tiles labelled 3  -> "SG"
    4. more than ``count_threshold`` tiles labelled 2  -> "NA"
    5. anything else                                   -> "SNG"

    Parameters
    ----------
    tile_labels : iterable
        Labels of the tiles belonging to one image (e.g. a pandas Series
        handed over by ``groupby(...).apply``). Consumed once.
    count_threshold : int, default 4
        A label value must occur strictly more than this many times for
        rules 3 and 4 to fire. The default reproduces the previously
        hard-coded value.

    Returns
    -------
    str
        One of "DG", "DNG", "SG", "NA", "SNG". Note that "NA" is the literal
        string, not a missing value.
    """
    # Materialise once so that one-shot iterables can be scanned several times
    labels = list(tile_labels)

    if labels.count(4) > 0:
        return "DG"
    # NOTE: all() over an empty sequence is True, so an empty input yields "DNG"
    if all(label in (0, 1) for label in labels):
        return "DNG"
    if labels.count(3) > count_threshold:
        return "SG"
    if labels.count(2) > count_threshold:
        return "NA"
    return "SNG"

# Reapply the updated heuristic: one group code per (labeler, image) pair,
# derived from the labels of that image's tiles
grouped_updated = (
    df.groupby(["labeler", "image_id"])["label"]
    .apply(assign_group_updated)
    .rename("group")  # name the result column in the chain instead of overwriting .columns
    .reset_index()
)

# Updated group distribution per labeler (rows: labeler, columns: group)
group_distribution_updated = (
    grouped_updated.groupby(["labeler", "group"])
    .size()
    .unstack(fill_value=0)
)

# Preview the assignments; the recorded output of this cell is this head(),
# so the expression must be live for a fresh run to reproduce it
grouped_updated.head()


Unnamed: 0,labeler,image_id,group
0,Ali,1,DG
1,Ali,102,DG
2,Ali,108,SNG
3,Ali,118,SNG
4,Ali,124,SNG


In [6]:
# Display the per-labeler counts of each heuristic group (rows: labeler, columns: group)
group_distribution_updated

group,DG,DNG,NA,SG,SNG
labeler,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Ali,14,15,10,3,8
Dr.Lory,18,25,0,2,5
Krystal,14,23,0,6,7
