In [1]:
from pathlib import Path
import pandas as pd

# Project layout: everything is resolved relative to the current working directory.
ROOT = Path.cwd()
ANON = ROOT.joinpath("data", "interim", "anonymized")

# Each fine-grained (sub) class belongs to exactly one coarse (main) class.
# Insertion order matters: it fixes the integer sub ids derived below.
SUB_TO_MAIN = {
    "Pothole": "Road issues",
    "Damaged Road Surface": "Road issues",
    "Illegal Parking": "Road issues",
    "Broken/Damaged Road Sign": "Road issues",
    "Littering / Garbage": "Environmental issues",
    "Vandalism / Graffiti": "Environmental issues",
}

# Label spaces and their integer encodings:
#   sub ids follow the dict order above, main ids follow alphabetical order.
SUB_CLASSES = [*SUB_TO_MAIN]
MAIN_CLASSES = sorted({*SUB_TO_MAIN.values()})
MAIN_TO_ID = dict(zip(MAIN_CLASSES, range(len(MAIN_CLASSES))))
SUB_TO_ID = dict(zip(SUB_CLASSES, range(len(SUB_CLASSES))))

# Image folders of the three auto-labeled sources (edit if your layout differs).
# Every image found in a folder receives that folder's single sub label.
TACO_DIR = ANON.joinpath("taco", "images")                         # → Littering / Garbage
POTHOLE_DIR = ANON.joinpath("kaggle_potholes", "images")           # → Pothole
PARKING_DIR = ANON.joinpath("roboflow_illegal_parking", "images")  # → Illegal Parking


In [2]:
import glob

def df_single_label(folder: Path, sub_label: str) -> pd.DataFrame:
    """Build a labeled frame for a folder whose images all share one sub label.

    Recursively collects the image files under ``folder`` and tags each one
    with ``sub_label``, its parent main label, and both integer ids.

    Args:
        folder: Directory searched recursively for image files.
        sub_label: Sub-class name; must be a key of ``SUB_TO_MAIN``.

    Returns:
        DataFrame with columns ``path``, ``sub_name``, ``sub_id``,
        ``main_name`` and ``main_id``, one row per image, sorted by path.
        The columns are present even when no image is found.

    Raises:
        KeyError: If images are found and ``sub_label`` is not a known sub class.
    """
    columns = ["path", "sub_name", "sub_id", "main_name", "main_id"]
    image_suffixes = {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".tiff"}

    # One recursive walk instead of one per extension. The suffix is compared
    # case-insensitively because glob patterns such as "*.jpg" are
    # case-sensitive on POSIX and would silently drop files like "IMG_1.JPG".
    # Sorting makes the row order independent of the filesystem listing order,
    # so positional, seeded operations on the result are reproducible.
    files = sorted(
        f
        for f in glob.glob(str(folder / "**" / "*"), recursive=True)
        if Path(f).suffix.lower() in image_suffixes and Path(f).is_file()
    )
    if not files:
        # Keep the schema so callers can concat / index columns safely.
        return pd.DataFrame(columns=columns)

    main = SUB_TO_MAIN[sub_label]
    sub_id = SUB_TO_ID[sub_label]
    main_id = MAIN_TO_ID[main]
    rows = [
        {
            "path": f,
            "sub_name": sub_label,
            "sub_id": sub_id,
            "main_name": main,
            "main_id": main_id,
        }
        for f in files
    ]
    return pd.DataFrame(rows, columns=columns)

# One frame per auto-labeled source; each folder maps to exactly one sub label.
df_taco, df_pothole, df_parking = (
    df_single_label(source_dir, source_label)
    for source_dir, source_label in (
        (TACO_DIR, "Littering / Garbage"),
        (POTHOLE_DIR, "Pothole"),
        (PARKING_DIR, "Illegal Parking"),
    )
)

# Row counts per source, displayed as the cell output.
tuple(map(len, (df_taco, df_pothole, df_parking)))


(833, 665, 103)

In [3]:
import json, os
from urllib.parse import urlparse, parse_qs, unquote

def from_ls_url(p: str) -> str:
    """Translate a Label Studio local-files URL into a filesystem path.

    URLs containing ``/data/local-files/`` carry the file location in their
    ``d`` query parameter. A relative ``d`` is resolved under
    ``ROOT / "data" / "interim"``; an absolute ``d`` is used as is. Any other
    input is returned unchanged (as a string).

    Args:
        p: Image reference taken from a Label Studio task; may be ``None``
            when the task has no image.

    Returns:
        The resolved path as a string, the input string when it is not a
        local-files URL (or has no usable ``d`` parameter), or ``""`` when
        ``p`` is ``None``.
    """
    # str(None) would be the truthy string "None" and slip past callers'
    # "is there an image?" checks, so map a missing value to a falsy string.
    if p is None:
        return ""

    s = str(p)
    if "/data/local-files/" not in s:
        return s

    # parse_qs already percent-decodes the value; blank values are dropped,
    # so a bare "?d=" falls through to returning the original string.
    d = parse_qs(urlparse(s).query).get("d", [None])[0]
    if not d:
        return s

    pth = Path(d)
    if not pth.is_absolute():
        # d might be relative; make it absolute under data/interim/
        pth = ROOT / "data" / "interim" / d
    return str(pth)

def rows_from_ls_export(path: Path):
    """Turn one Label Studio JSON export into single-label dataset rows.

    For every task, the known sub labels (keys of ``SUB_TO_MAIN``) are
    collected from the ``choices`` of all annotation results. A row is
    produced only when the task has an image and exactly one distinct known
    sub label; tasks with zero or several sub labels are skipped to keep
    single-sub training consistent.

    Args:
        path: Location of the export file (a JSON list of task dicts).

    Returns:
        List of dicts with keys ``path``, ``sub_name``, ``sub_id``,
        ``main_name`` and ``main_id``.
    """
    # JSON is UTF-8; do not depend on the platform's locale encoding.
    tasks = json.loads(path.read_text(encoding="utf-8"))
    rows = []
    for t in tasks:
        raw_img = (t.get("data") or {}).get("image")
        if not raw_img:
            # Check BEFORE converting: from_ls_url stringifies its input, so a
            # missing image would otherwise come back as the path "None".
            continue
        img = from_ls_url(raw_img)
        if not img:
            continue

        # collect chosen sub labels (Choices); "or []" tolerates explicit nulls
        chosen = set()
        for ann in t.get("annotations") or []:
            for r in ann.get("result") or []:
                val = r.get("value")
                if not isinstance(val, dict):
                    continue
                # only known subs
                chosen.update(v for v in (val.get("choices") or []) if v in SUB_TO_MAIN)

        if len(chosen) != 1:
            # NOTE: images with multiple (or no) subs are skipped here
            continue

        sub = next(iter(chosen))
        main = SUB_TO_MAIN[sub]
        rows.append({
            "path": img,
            "sub_name": sub,
            "sub_id":  SUB_TO_ID[sub],
            "main_name": main,
            "main_id":  MAIN_TO_ID[main],
        })
    return rows

# Collect the rows of every Label Studio export found under labels/,
# visiting the files in sorted (name) order.
ls_exports = sorted((ROOT / "labels").glob("export*.json"))
rows = [
    row
    for export_path in ls_exports
    for row in rows_from_ls_export(export_path)
]

# Same schema whether or not any manual label was found.
ls_columns = ["path", "sub_name", "sub_id", "main_name", "main_id"]
if rows:
    df_ls = pd.DataFrame(rows, columns=ls_columns)
else:
    df_ls = pd.DataFrame(columns=ls_columns)

len(df_ls)


272

In [4]:
# Stack the auto-labeled sources, then append the manual Label Studio rows.
# Because the manual rows come last, keep="last" lets a manual label override
# the auto label whenever both refer to the same path.
auto_frames = [df_taco, df_pothole, df_parking]
df_auto = pd.concat(auto_frames, ignore_index=True)
df_all = (
    pd.concat([df_auto, df_ls], ignore_index=True)
    .drop_duplicates(subset=["path"], keep="last")
    .reset_index(drop=True)
)

# Persist the merged dataset next to the Label Studio exports.
OUT = ROOT / "labels" / "dataset_main_sub.csv"
OUT.parent.mkdir(parents=True, exist_ok=True)
df_all.to_csv(OUT, index=False)

main_counts = df_all["main_name"].value_counts().to_dict()
sub_counts = df_all["sub_name"].value_counts().to_dict()
print("Wrote", len(df_all), "rows →", OUT)
print("Main counts:", main_counts)
print("Sub counts :", sub_counts)


Wrote 1601 rows → /Users/nithilathawalampitiya/Documents/Projects/urban-issue-classifier/labels/dataset_main_sub.csv
Main counts: {'Environmental issues': 833, 'Road issues': 768}
Sub counts : {'Littering / Garbage': 833, 'Pothole': 643, 'Illegal Parking': 103, 'Damaged Road Surface': 22}


In [5]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from pathlib import Path

# Input dataset and output folder for the hierarchical splits.
CSV = ROOT / "labels" / "dataset_main_sub.csv"
SPL = ROOT / "labels" / "splits_hier"
SPL.mkdir(parents=True, exist_ok=True)

df = pd.read_csv(CSV)

# Rows without a sub id cannot be stratified; drop them (none are expected,
# since single-sub labeling is already enforced upstream).
df = df.dropna(subset=["sub_id"]).copy()

# Stage 1: 80% train / 20% held out, stratified on the sub label.
holdout_splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
train_pos, holdout_pos = next(
    holdout_splitter.split(df["path"].values, df["sub_id"].values)
)
df_tr = df.iloc[train_pos]
df_tmp = df.iloc[holdout_pos]

# Stage 2: halve the held-out part into val / test (10% / 10% of the total).
halving_splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.50, random_state=43)
val_pos, test_pos = next(
    halving_splitter.split(df_tmp["path"].values, df_tmp["sub_id"].values)
)
df_val = df_tmp.iloc[val_pos]
df_test = df_tmp.iloc[test_pos]

# Write one CSV per split.
split_files = {"train.csv": df_tr, "val.csv": df_val, "test.csv": df_test}
for file_name, split_frame in split_files.items():
    split_frame.to_csv(SPL / file_name, index=False)

print("train/val/test sizes:", len(df_tr), len(df_val), len(df_test))
print("train sub dist:", df_tr["sub_name"].value_counts().to_dict())


train/val/test sizes: 1280 160 161
train sub dist: {'Littering / Garbage': 666, 'Pothole': 514, 'Illegal Parking': 82, 'Damaged Road Surface': 18}
