In [8]:
from pathlib import Path
import json, shutil, os, datetime
import pandas as pd
from collections import Counter

In [9]:
BASE = Path("../datasets/korean_emotion_complex_vision_10_percent_split")  # 필요 시 _SI 등으로 교체
LABEL_DIRS = [
    BASE / "train" / "labels",
    BASE / "val" / "labels",
]

In [10]:
# 이미지가 들어있는 루트 (샘플링 결과 구조: train/<emotion>/*.jpg, val/<emotion>/*.jpg)
IMAGE_ROOTS = [BASE / "train", BASE / "val"]

In [11]:
# === 유틸 ===
def load_label_items(label_dir: Path):
    rows = []
    for jf in sorted(label_dir.glob("*.json")):
        emotion_from_file = jf.stem.replace("_sampled", "")  # '기쁨_sampled.json' -> '기쁨'
        data = json.load(open(jf, encoding="utf-8"))
        for it in data:
            rows.append({
                "split": label_dir.parents[0].name,                # train/val
                "json_emotion": emotion_from_file,                 # JSON 파일명(=폴더) 기준 감정
                "filename": it.get("filename"),
                "uploader": it.get("faceExp_uploader"),
                "a": (it.get("annot_A") or {}).get("faceExp"),
                "b": (it.get("annot_B") or {}).get("faceExp"),
                "c": (it.get("annot_C") or {}).get("faceExp"),
            })
    return rows

In [12]:
def find_image_path(filename: str):
    # 샘플링 결과 구조 특성상 감정 폴더 안에 이미지가 있음 → 전체에서 검색
    for root in IMAGE_ROOTS:
        cand = list(root.glob(f"**/{filename}"))
        if cand:
            return cand[0]
    return None

In [13]:
def majority_vote(annots):
    cnt = Counter([x for x in annots if x is not None])
    if not cnt:
        return None, 0
    top_label, top_count = cnt.most_common(1)[0]
    return top_label, top_count

In [14]:
# === 1) JSON 적재 → DataFrame ===
all_rows = []
for ld in LABEL_DIRS:
    if ld.exists():
        all_rows += load_label_items(ld)
df = pd.DataFrame(all_rows)
if df.empty:
    raise SystemExit("라벨 JSON을 찾지 못했습니다. LABEL_DIRS 경로를 확인하세요.")

In [15]:
# === 2) 합의/불일치 파생 컬럼 ===
annots = df[["a","b","c"]].values.tolist()
maj_label, maj_count = [], []
for row in annots:
    m, c = majority_vote(row)
    maj_label.append(m)
    maj_count.append(c)

In [16]:
df["annot_majority"] = maj_label             # 어노테이터 다수결 라벨
df["annot_agree_n"]  = maj_count             # 최다 동의자 수(3=전원 일치, 2=2:1, 1=전원 다름)
df["annots_all_diff"] = (df["annot_agree_n"] == 1)
df["uploader_vs_maj_mismatch"] = (df["uploader"] != df["annot_majority"])
df["four_way_all_diff"] = (
    df.apply(lambda r: len(set([r["uploader"], r["a"], r["b"], r["c"]])) == 4, axis=1)
)

In [17]:
# === 3) 요약 통계 출력 ===
summary = {
    "total": len(df),
    "annot_full_agree(3/3)": int((df["annot_agree_n"] == 3).sum()),
    "annot_two_one(2/1)":    int((df["annot_agree_n"] == 2).sum()),
    "annot_all_diff(1/1/1)": int(df["annots_all_diff"].sum()),
    "uploader_vs_majority_mismatch": int(df["uploader_vs_maj_mismatch"].sum()),
    "four_way_all_diff": int(df["four_way_all_diff"].sum())
}

In [18]:
print("=== 라벨 EDA 요약 ===")
for k,v in summary.items():
    print(f"{k:>30s}: {v}")

=== 라벨 EDA 요약 ===
                         total: 40550
         annot_full_agree(3/3): 20535
            annot_two_one(2/1): 14299
         annot_all_diff(1/1/1): 5716
 uploader_vs_majority_mismatch: 12176
             four_way_all_diff: 1210


In [19]:
# 클래스별 업로더-다수결 불일치율
by_emotion = (df.groupby("json_emotion")["uploader_vs_maj_mismatch"]
              .mean().sort_values(ascending=False).rename("mismatch_rate(json_emotion)"))
print("\n=== (json_emotion 기준) 업로더 vs 다수결 불일치율 ===")
print((by_emotion*100).round(1).astype(str) + "%")


=== (json_emotion 기준) 업로더 vs 다수결 불일치율 ===
json_emotion
상처    72.0%
불안    56.1%
슬픔    25.1%
분노    24.8%
당황    21.1%
중립     4.5%
기쁨     2.0%
Name: mismatch_rate(json_emotion), dtype: object


In [25]:
df

Unnamed: 0,split,json_emotion,filename,uploader,a,b,c,annot_majority,annot_agree_n,annots_all_diff,uploader_vs_maj_mismatch,four_way_all_diff,majority_label,status
0,train,기쁨,d86990fc5ea58f8c7ec7985cdf9e594da0d7edb8711ac8...,기쁨,기쁨,기쁨,기쁨,기쁨,3,False,False,False,기쁨,6
1,train,기쁨,b22df22cf3fefe3d4ed4d8f72346ff553ac2f0b103bbb5...,기쁨,기쁨,중립,기쁨,기쁨,2,False,False,False,기쁨,5
2,train,기쁨,fd5b33b64b9a1b0550c89f27bf0cc7e499ef1a91819706...,기쁨,기쁨,기쁨,기쁨,기쁨,3,False,False,False,기쁨,6
3,train,기쁨,e3935007de4bf8daaf7c580ee8ab896fd10d16537cee1d...,기쁨,기쁨,기쁨,기쁨,기쁨,3,False,False,False,기쁨,6
4,train,기쁨,08a32c1632b7dff63981271350669d8b39d567a5db78da...,기쁨,기쁨,기쁨,기쁨,기쁨,3,False,False,False,기쁨,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40545,val,중립,645f3935355171c19acc6c8a310efa081a4d1195adf6c8...,중립,중립,중립,중립,중립,3,False,False,False,중립,6
40546,val,중립,71e36b10749933d8df6114b777f1c41658047fa08724de...,중립,중립,중립,중립,중립,3,False,False,False,중립,6
40547,val,중립,c6ed5e481a845069537851e83227395fe2bf23221a0dce...,중립,중립,중립,중립,중립,3,False,False,False,중립,6
40548,val,중립,2aafb545c9a93c39542e483186fe25fe1cb3709a9b8d02...,중립,중립,중립,기쁨,중립,2,False,False,False,중립,5


In [24]:
# 다수결 라벨과 상태 추가
df[['majority_label', 'status']] = df.apply(lambda r: pd.Series(majority_vote(r)), axis=1)

# 감정별 분포 집계
label_counts = df['majority_label'].value_counts(dropna=False)

print("=== 감정별 라벨 분포 ===")
print(label_counts)

# 상태별 감정 분포
print("\n=== 상태별 감정 분포 ===")
status_dist = df.groupby(['status', 'majority_label']).size().unstack(fill_value=0)
print(status_dist)

=== 감정별 라벨 분포 ===
majority_label
슬픔    6421
당황    6202
기쁨    6168
중립    5880
분노    5847
불안    5008
상처    3814
1     1210
Name: count, dtype: int64

=== 상태별 감정 분포 ===
majority_label     1    기쁨    당황    분노    불안    상처    슬픔    중립
status                                                        
3                  0   150  1131  1190  2296  2084  1492   896
4               1210   143   533   345   421   448   670   481
5                  0   327  1615  1429  1389   854  1591  1047
6                  0  5548  2923  2883   902   428  2668  3456


In [26]:
def summarize_group(g: pd.DataFrame) -> pd.Series:
    total = len(g)
    full_agree = int((g["annot_agree_n"] == 3).sum())
    two_one = int((g["annot_agree_n"] == 2).sum())
    all_diff = int((g["annot_agree_n"] == 1).sum())  # == g['annots_all_diff'].sum()와 동일
    uploader_vs_maj_mismatch = int(g["uploader_vs_maj_mismatch"].sum())
    four_way_all_diff = int(g["four_way_all_diff"].sum())
    return pd.Series({
        "total": total,
        "annot_full_agree(3/3)": full_agree,
        "annot_two_one(2/1)": two_one,
        "annot_all_diff(1/1/1)": all_diff,
        "uploader_vs_majority_mismatch": uploader_vs_maj_mismatch,
        "four_way_all_diff": four_way_all_diff,
        # 비율 컬럼(%)도 같이 보고 싶을 때
        "full_agree_%": round(full_agree / total * 100, 1) if total else 0.0,
        "two_one_%":    round(two_one / total * 100, 1) if total else 0.0,
        "all_diff_%":   round(all_diff / total * 100, 1) if total else 0.0,
        "uploader_vs_maj_mismatch_%": round(uploader_vs_maj_mismatch / total * 100, 1) if total else 0.0,
        "four_way_all_diff_%":        round(four_way_all_diff / total * 100, 1) if total else 0.0,
    })

# 1) 감정별 요약 (폴더명 기준)
per_emotion = df.groupby("json_emotion").apply(summarize_group).sort_values(
    by="uploader_vs_maj_mismatch_%", ascending=False
)
print("=== 감정별(폴더명 기준) 라벨 EDA 요약 ===")
print(per_emotion)

# 2) (옵션) train/val까지 분할해서 보고 싶다면:
per_split_emotion = df.groupby(["split","json_emotion"]).apply(summarize_group)
print("\n=== split x 감정별 라벨 EDA 요약 ===")
print(per_split_emotion)

# 3) (옵션) CSV 저장
per_emotion.to_csv("./runs/label_eda_per_emotion.csv", encoding="utf-8-sig")
per_split_emotion.to_csv("./runs/label_eda_per_split_emotion.csv", encoding="utf-8-sig")


=== 감정별(폴더명 기준) 라벨 EDA 요약 ===
               total  annot_full_agree(3/3)  annot_two_one(2/1)  \
json_emotion                                                      
상처            5938.0                 1176.0              3076.0   
불안            5926.0                 1329.0              3007.0   
슬픔            5984.0                 2863.0              2362.0   
분노            5969.0                 3019.0              2131.0   
당황            5964.0                 3085.0              2212.0   
중립            4769.0                 3486.0              1136.0   
기쁨            6000.0                 5577.0               375.0   

              annot_all_diff(1/1/1)  uploader_vs_majority_mismatch  \
json_emotion                                                         
상처                           1686.0                         4278.0   
불안                           1590.0                         3325.0   
슬픔                            759.0                         1500.0   
분노              

  per_emotion = df.groupby("json_emotion").apply(summarize_group).sort_values(
  per_split_emotion = df.groupby(["split","json_emotion"]).apply(summarize_group)


In [20]:
# === 4) 검수 세트(이미지) 준비: 조건별로 폴더에 복사 ===
ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
REVIEW_ROOT = Path(f"./runs/label_eda_{ts}")
(REVIEW_ROOT).mkdir(parents=True, exist_ok=True)

In [None]:
# 필터 정의
filters = {
    "annot_all_diff": df["annots_all_diff"],                      # 라벨러 전원 불일치(1/1/1)
    "uploader_vs_majority_mismatch": df["uploader_vs_maj_mismatch"],  # 업로더 vs 다수결 불일치
    "four_way_all_diff": df["four_way_all_diff"],                 # 업로더+a+b+c 모두 다름(4-way)
}

In [22]:
# 이미지 복사 및 HTML 갤러리
html_parts = ["<html><body><h1>Label EDA Review</h1>"]
for name, mask in filters.items():
    sub = df[mask].copy()
    out_dir = REVIEW_ROOT / name
    out_dir.mkdir(parents=True, exist_ok=True)

    # CSV 저장(메타)
    sub.to_csv(out_dir / f"{name}.csv", index=False, encoding="utf-8-sig")

    html_parts.append(f"<h2>{name} (n={len(sub)})</h2>")
    html_parts.append("<div style='display:flex;flex-wrap:wrap;gap:8px'>")

    for _, r in sub.iterrows():
        p = find_image_path(r["filename"])
        if p is None:
            continue
        # 파일명에 메타 반영하여 복사
        dst = out_dir / f"{p.stem}__json-{r['json_emotion']}__uploader-{r['uploader']}__A-{r['a']}__B-{r['b']}__C-{r['c']}{p.suffix}"
        try:
            shutil.copy2(p, dst)
        except Exception as e:
            print("copy fail:", p, "->", dst, e)
            continue
        # HTML 타일
        rel = os.path.relpath(dst, REVIEW_ROOT)
        cap = f"{r['filename']}<br>json:{r['json_emotion']} | uploader:{r['uploader']} | A:{r['a']} B:{r['b']} C:{r['c']}"
        html_parts.append(f"""
        <figure style="width:224px;margin:0">
          <img src="{rel}" style="width:224px;height:auto;display:block;border:1px solid #444" />
          <figcaption style="font:12px/1.3 monospace">{cap}</figcaption>
        </figure>
        """)

    html_parts.append("</div>")

copy fail: ..\datasets\korean_emotion_complex_vision_10_percent_split\train\기쁨\a421bef2d349ab4513dbb0b107abc9fac9c2551e422376f9706a639139645b8b_여_20_기쁨_교통&이동수단(엘리베이터 포함)_20210119171653-002-008.jpg -> runs\label_eda_20250818_151706\uploader_vs_majority_mismatch\a421bef2d349ab4513dbb0b107abc9fac9c2551e422376f9706a639139645b8b_여_20_기쁨_교통&이동수단(엘리베이터 포함)_20210119171653-002-008__json-기쁨__uploader-기쁨__A-중립__B-기쁨__C-당황.jpg [Errno 2] No such file or directory: 'runs\\label_eda_20250818_151706\\uploader_vs_majority_mismatch\\a421bef2d349ab4513dbb0b107abc9fac9c2551e422376f9706a639139645b8b_여_20_기쁨_교통&이동수단(엘리베이터 포함)_20210119171653-002-008__json-기쁨__uploader-기쁨__A-중립__B-기쁨__C-당황.jpg'
copy fail: ..\datasets\korean_emotion_complex_vision_10_percent_split\train\기쁨\0d384e9d6ad1ad9521f43213dff99506c204d3b5f1ca42a5bc41a4462141156d_여_20_기쁨_교통&이동수단(엘리베이터 포함)_20210208185801-002-002.jpg -> runs\label_eda_20250818_151706\uploader_vs_majority_mismatch\0d384e9d6ad1ad9521f43213dff99506c204d3b5f1ca42a5bc41a446214

KeyboardInterrupt: 

In [23]:
# 메인 인덱스 HTML
html_parts.append("</body></html>")
(REVIEW_ROOT / "index.html").write_text("\n".join(html_parts), encoding="utf-8")
print(f"\n검수 세트 준비 완료 → {REVIEW_ROOT}")
print(f"  - HTML 갤러리 열기: {REVIEW_ROOT/'index.html'}")


검수 세트 준비 완료 → runs\label_eda_20250818_151706
  - HTML 갤러리 열기: runs\label_eda_20250818_151706\index.html
