In [15]:
import numpy as np
import pandas as pd
from pathlib import Path

# -----------------------------
# CONFIG (EDIT THESE PATHS)
# -----------------------------
# Destination of the Likert sheet produced by this cell; its parent directory
# is created on write if it does not exist yet.
OUT_CSV = Path(r"F:\New Dissertation - Image Generation\POC\outputs\form_likert.csv")

# Shape-like behavior:
# - AUTOFILL_LIKERT=True  -> auto-fill Likert ratings (heuristic from computed indices)
# - AUTOFILL_LIKERT=False -> leave Likert blank for human rating
AUTOFILL_LIKERT = True

# Optional: keep computed indices alongside Likert (helps quick joins/inspection)
INCLUDE_COMPUTED_INDICES = True

# SEED initialises the shared random generator `rng` created at the end of
# this section.
SEED  = 42
# Standard deviation of the Gaussian jitter added to each [0,1] score before
# it is binned into 1..5. Only used when AUTOFILL_LIKERT=True; set to 0.0 to
# disable the jitter entirely.
NOISE = 0.30

# Form step CSVs (image-level). Each file is expected to carry `split` and
# `image_id` columns, which are the merge keys. A path that does not exist is
# skipped when loading, so the indices it would contribute are simply absent.
# NOTE(review): the inputs do not share one directory layout (form1/form3 sit
# under outputs\form\..., form2 directly under outputs\, form4-6 under
# outputs\formN\) - confirm each path points at the intended file.
FORM1_IMAGE_CSV = Path(r"F:\New Dissertation - Image Generation\POC\outputs\form\form1_orientation\form1_image_orientation.csv")
FORM2_IMAGE_CSV = Path(r"F:\New Dissertation - Image Generation\POC\outputs\form2_image.csv")
FORM3_IMAGE_CSV = Path(r"F:\New Dissertation - Image Generation\POC\outputs\form\form3\form3_image.csv")  # adjust if different
FORM4_IMAGE_CSV = Path(r"F:\New Dissertation - Image Generation\POC\outputs\form4\form4_image_indices.csv")
FORM5_IMAGE_CSV = Path(r"F:\New Dissertation - Image Generation\POC\outputs\form5\form5_image_indices.csv")
FORM6_IMAGE_CSV = Path(r"F:\New Dissertation - Image Generation\POC\outputs\form6\form6_image_indices.csv")

# -----------------------------
# QUESTIONS (6 lenses -> 6 Likert columns)
# -----------------------------
# Output column names, one per question, in the order they appear in the sheet.
LIKERT_COLS = [
    "Q1_orientation_crossfire",
    "Q2_aspect_best_view_tension",
    "Q3_egyptian_vs_projective_realism",
    "Q4_overlap_hierarchy_meaning",
    "Q5_cubist_contradiction",
    "Q6_ornament_vs_art",
]

# -----------------------------
# HELPERS (same style as Shape)
# -----------------------------
# Shared generator consumed by to_likert_1to5() for the optional jitter.
# Because it is shared, the random draws depend on the order of those calls.
rng = np.random.default_rng(SEED)

def clamp01(x):
    """Return `x` as a float array with every value limited to [0, 1].

    NaN entries pass through unchanged.
    """
    return np.asarray(x, dtype=float).clip(min=0.0, max=1.0)

def robust_norm01(x, lo_q=5, hi_q=95):
    """Robustly rescale values to [0, 1] using percentile bounds.

    The `lo_q`-th and `hi_q`-th percentiles of the finite entries become the
    0 and 1 anchors; values outside that band are clipped.

    Parameters
    ----------
    x : array-like
        Raw index values; may contain NaN.
    lo_q, hi_q : float
        Percentiles (0-100) used as the lower and upper anchors.

    Returns
    -------
    numpy.ndarray of float, same shape as `x`
        - all NaN when `x` has no finite entry;
        - 0.0 for every non-NaN entry when the percentile band is degenerate
          (narrower than 1e-9), with NaN entries kept as NaN;
        - otherwise the clipped, rescaled values (NaN stays NaN).
    """
    values = np.asarray(x, dtype=float)
    finite = np.isfinite(values)
    if not finite.any():
        return np.full_like(values, np.nan, dtype=float)

    lo = np.nanpercentile(values[finite], lo_q)
    hi = np.nanpercentile(values[finite], hi_q)
    if not np.isfinite(lo) or not np.isfinite(hi) or (hi - lo) < 1e-9:
        # No usable spread: score everything at the bottom of the scale, but
        # keep missing inputs missing so they are not turned into a rating.
        return np.where(np.isnan(values), np.nan, 0.0)

    # The small epsilon keeps the division well-defined for very narrow bands.
    return clamp01((values - lo) / (hi - lo + 1e-9))

def to_likert_1to5(score01, noise=0.0):
    """Bin scores in [0, 1] into Likert ratings 1..5.

    Non-finite scores are treated as missing and come back as NaN. When
    `noise` is positive, Gaussian jitter with that standard deviation is
    drawn from the module-level `rng` and added before clipping to [0, 1].
    Five equal-width bins are used; a score of exactly 1.0 lands in bin 5.

    Returns a float array with the same shape as `score01`.
    """
    scores = np.asarray(score01, dtype=float)
    # Normalise +/-inf to NaN so they are handled as missing below.
    scores = np.where(np.isfinite(scores), scores, np.nan)

    if noise and noise > 0:
        jitter = rng.normal(0.0, noise, size=scores.shape)
        scores = scores + jitter

    scores = clamp01(scores)
    valid = np.isfinite(scores)

    likert = np.full_like(scores, np.nan, dtype=float)
    # floor(s * 5) + 1 gives 1..6; the clip folds s == 1.0 back into 5.
    likert[valid] = np.clip(np.floor(scores[valid] * 5.0) + 1.0, 1.0, 5.0)
    return likert.astype(float)

def safe_read(p: Path):
    """Load a CSV into a DataFrame, or return None when there is nothing to load.

    None is returned when `p` is None or when the path does not exist; no
    error is raised in either case.
    """
    if p is None:
        return None
    if Path(p).exists():
        return pd.read_csv(p)
    return None

def merge_on_key(dfs):
    """Left-merge several image-level frames on (`split`, `image_id`).

    The first non-empty frame is the base and defines the rows of the result.
    Every other usable frame contributes only columns the result does not
    already have. Frames that are None, empty, or lack a key column are
    skipped.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame or None

    Returns
    -------
    pandas.DataFrame
        A new frame (inputs are not modified). If no frame is usable, an
        empty frame with just the two key columns.

    Raises
    ------
    KeyError
        If the base frame is missing `split` or `image_id`.
    """
    keys = ["split", "image_id"]
    frames = list(dfs)  # materialise once; the input is walked twice

    base_src = next((d for d in frames if d is not None and len(d) > 0), None)
    if base_src is None:
        return pd.DataFrame(columns=keys)

    for col in keys:
        if col not in base_src.columns:
            raise KeyError(f"Missing '{col}' in base dataframe.")

    out = base_src.copy()
    for d in frames:
        # Skip the base by comparing against the *source* object. Comparing
        # against the copy never matches, which would merge the base onto
        # itself and multiply rows whenever its keys repeat.
        if d is None or d is base_src or len(d) == 0:
            continue
        if any(k not in d.columns for k in keys):
            continue

        # First frame to provide a column wins; later duplicates are dropped.
        dup = [c for c in d.columns if c in out.columns and c not in keys]
        out = out.merge(d.drop(columns=dup), on=keys, how="left")
    return out

def col_as_float_array(df, col, default=np.nan):
    """Extract column `col` of `df` as a 1-D float array.

    Values that cannot be parsed as numbers become NaN. If the column is
    absent, an array of `default` with one entry per row is returned.
    """
    if col in df.columns:
        numeric = pd.to_numeric(df[col], errors="coerce")
        return numeric.to_numpy(dtype=float)
    return np.full(len(df), default, dtype=float)

# -----------------------------
# LOAD + MERGE FORM IMAGE CSVs
# -----------------------------
def normalize_ids(d):
    """Return a copy of `d` whose `image_id` values use forward slashes.

    `image_id` is cast to string and every backslash is replaced with "/".
    The input is returned unchanged when it is None, empty, or has no
    `image_id` column; such a frame is left for the merge step to skip or
    reject rather than failing here with a KeyError.
    """
    if d is None or len(d) == 0 or "image_id" not in d.columns:
        return d
    d = d.copy()
    d["image_id"] = d["image_id"].astype(str).str.replace("\\", "/", regex=False)
    return d

# Read each form CSV (None when the file is missing) and normalise its ids.
# The individual df1..df6 names are kept because later cells inspect them.
form_csv_paths = (
    FORM1_IMAGE_CSV,
    FORM2_IMAGE_CSV,
    FORM3_IMAGE_CSV,
    FORM4_IMAGE_CSV,
    FORM5_IMAGE_CSV,
    FORM6_IMAGE_CSV,
)
df1, df2, df3, df4, df5, df6 = (
    normalize_ids(safe_read(csv_path)) for csv_path in form_csv_paths
)

# One row per base image, with the other forms' indices joined on.
df = merge_on_key([df1, df2, df3, df4, df5, df6]).copy()

# Stable ordering so the output sheet is laid out the same way on every run.
if {"split", "image_id"}.issubset(df.columns):
    df = df.sort_values(["split", "image_id"], kind="stable").reset_index(drop=True)

# -----------------------------
# BUILD OUTPUT SHEET (SHAPE-LIKE)
# -----------------------------
# Start the sheet from the key columns and add every Likert column as NaN.
# They stay blank for human rating unless the autofill step overwrites them.
df_out = df[["split", "image_id"]].assign(
    **{likert_col: np.nan for likert_col in LIKERT_COLS}
)

if AUTOFILL_LIKERT:
    # Heuristic pre-fill: each question is driven by one or two computed
    # indices, robustly rescaled to [0, 1] and then binned into 1..5.

    # ---- Q1: Orientation crossfire (higher conflict => higher)
    x1 = col_as_float_array(df, "form1_orientation_conflict_index_01")
    q1 = robust_norm01(x1)

    # ---- Q2: Best-view tension (viewpoint ambiguity + symmetry degradation)
    x2a = col_as_float_array(df, "form2_viewpoint_ambiguity_index_01")
    x2b = col_as_float_array(df, "form2_symmetry_degradation_index_01")

    a = robust_norm01(x2a)
    b = robust_norm01(x2b)

    # Weighted mean (0.7 / 0.3) over whichever components are present.
    # A missing component must be zeroed in the numerator as well as in the
    # weight: 0.0 * NaN is still NaN, so zeroing only the weight would leave
    # the row NaN even when the other component is available.
    a_ok = np.isfinite(a)
    b_ok = np.isfinite(b)
    w_a = np.where(a_ok, 0.7, 0.0)
    w_b = np.where(b_ok, 0.3, 0.0)
    ws = w_a + w_b
    weighted_sum = w_a * np.where(a_ok, a, 0.0) + w_b * np.where(b_ok, b, 0.0)

    # Divide only where some weight is present; other rows keep the NaN fill
    # and no divide-by-zero warning is triggered.
    q2 = np.full(ws.shape, np.nan, dtype=float)
    np.divide(weighted_sum, ws, out=q2, where=ws > 0)

    # ---- Q3: Egyptian vs projective realism (affine distortion)
    x3 = col_as_float_array(df, "form2_affine_distortion_index_01")
    q3 = robust_norm01(x3)

    # ---- Q4: Overlap hierarchy meaning (occlusion structure)
    x4 = col_as_float_array(df, "form3_dominance_subservience_index_01")
    q4 = robust_norm01(x4)

    # ---- Q5: Cubist contradiction (crossfire + ambiguity)
    # Reuses the normalised arrays computed above instead of recomputing them.
    # NOTE(review): unlike Q2 this stays NaN when either component is
    # missing - confirm that is the intended behaviour.
    q5 = 0.5 * q1 + 0.5 * a

    # ---- Q6: Ornament vs art (ornamentality high => more ornament-pull)
    x6 = col_as_float_array(df, "form5_ornamentality_index_01")
    q6 = robust_norm01(x6)

    # Order matters: to_likert_1to5 draws jitter from the shared generator,
    # so Q1..Q6 are processed in this fixed sequence to keep a seeded run
    # reproducible.
    scores_by_question = {
        "Q1_orientation_crossfire": q1,
        "Q2_aspect_best_view_tension": q2,
        "Q3_egyptian_vs_projective_realism": q3,
        "Q4_overlap_hierarchy_meaning": q4,
        "Q5_cubist_contradiction": q5,
        "Q6_ornament_vs_art": q6,
    }
    for question_col, score01 in scores_by_question.items():
        df_out[question_col] = to_likert_1to5(score01, noise=NOISE)

# Summary scores (as specified in your handoff):
# - structural tension = row-wise mean of Q1..Q5, ignoring blank ratings
# - ornament           = Q6 on its own
structural_tension_items = [
    "Q1_orientation_crossfire",
    "Q2_aspect_best_view_tension",
    "Q3_egyptian_vs_projective_realism",
    "Q4_overlap_hierarchy_meaning",
    "Q5_cubist_contradiction",
]
# Nullable Float64 so blank ratings are carried as <NA>.
structural_tension_ratings = df_out[structural_tension_items].astype("Float64")
df_out["form_structural_tension_mean"] = structural_tension_ratings.mean(axis=1, skipna=True)

df_out["form_ornament_mean"] = df_out["Q6_ornament_vs_art"].astype("Float64")

# Optional: keep computed indices alongside (helps quick joins/inspection)
if INCLUDE_COMPUTED_INDICES:
    # Candidate index columns, grouped by the form step that produces them.
    computed_index_groups = {
        "Form-1": [
            "form1_global_frame_alignment_01",
            "form1_local_frame_influence_abs_01",
            "form1_orientation_conflict_index_01",
        ],
        "Form-2": [
            "form2_affine_distortion_index_01",
            "form2_symmetry_degradation_index_01",
            "form2_viewpoint_ambiguity_index_01",
            "form2_angle_spread_01",
        ],
        "Form-3": [
            "form3_dominance_subservience_index_01",
            "overlap_pairs",
            "occlusion_edges",
        ],
        "Form-4": [
            "form4_good_continuation_01",
            "form4_completion_plausibility_01",
            "form4_amputation_risk_01",
        ],
        "Form-5": [
            "form5_reflection_symmetry_whole_01",
            "form5_repetition_density_01",
            "form5_spacing_regularity_01",
            "form5_structural_surprise_01",
            "form5_ornamentality_index_01",
        ],
        "Form-6": [
            "form6_geometricization_01",
            "form6_structural_economy_01",
            "form6_skeletal_complexity_01",
            "form6_clutter_complexity_01",
        ],
    }
    # Only columns that actually made it into the merged frame are copied.
    keep_cols = [
        index_col
        for group_cols in computed_index_groups.values()
        for index_col in group_cols
        if index_col in df.columns
    ]
    # Copied by position: df_out was built from df, so the rows line up.
    for index_col in keep_cols:
        df_out[index_col] = df[index_col].values

# -----------------------------
# WRITE
# -----------------------------
# Create the destination folder if needed, then write the sheet.
output_dir = OUT_CSV.parent
output_dir.mkdir(parents=True, exist_ok=True)
df_out.to_csv(OUT_CSV, index=False)

# Short report: where the file went, how many rows, and a 10-row preview.
row_count = len(df_out)
preview = df_out.head(10)
print("Wrote:", OUT_CSV)
print("Rows:", row_count)
print(preview)

Wrote: F:\New Dissertation - Image Generation\POC\outputs\form_likert.csv
Rows: 128
       split                      image_id  Q1_orientation_crossfire  \
0  generated  generated/ComfyUI_00082_.png                       1.0   
1  generated  generated/ComfyUI_00083_.png                       1.0   
2  generated  generated/ComfyUI_00084_.png                       2.0   
3  generated  generated/ComfyUI_00085_.png                       5.0   
4  generated  generated/ComfyUI_00086_.png                       1.0   
5  generated  generated/ComfyUI_00087_.png                       1.0   
6  generated  generated/ComfyUI_00088_.png                       1.0   
7  generated  generated/ComfyUI_00089_.png                       4.0   
8  generated  generated/ComfyUI_00090_.png                       4.0   
9  generated  generated/ComfyUI_00091_.png                       1.0   

   Q2_aspect_best_view_tension  Q3_egyptian_vs_projective_realism  \
0                          3.0                        

In [16]:
# Diagnostic: share of missing values in two of the Form-6 indices
# (1.0 means the column is entirely NaN).
form6_probe_cols = ["form6_structural_economy_01", "form6_skeletal_complexity_01"]
form6_nan_share = df6[form6_probe_cols].isna().mean()
print(form6_nan_share)

form6_structural_economy_01     1.0
form6_skeletal_complexity_01    1.0
dtype: float64
