# In [6]:
import json
import re
import string
from pathlib import Path

import numpy as np
import pandas as pd


# ----------------------------
# Utils
# ----------------------------
# Translation table that deletes every ASCII punctuation character
# (use with ``str.translate``).
PUNCT_TABLE = str.maketrans("", "", string.punctuation)

# Cue phrases signalling a false belief, e.g. "mistakenly thinks",
# "incorrectly believed", "wrongly assumes", "false belief".
# There is no trailing word boundary, so "think" also matches "thinks".
FB_PAT = re.compile(
    r"(mistaken(ly)?\s+(think|believ(e|es|ed))|"
    r"incorrect(ly)?\s+(think|believ(e|es|ed))|"
    r"wrong(ly)?\s+(think|assum(e|es|ed))|"
    r"false\s+belief)",
    flags=re.IGNORECASE,
)

# Whole-word preference verbs. Both patterns list the base, third-person and
# past forms of each verb; "loves"/"loved" were previously missing, so
# "Sam loves X" was scored as neutral while "Sam hates X" was not.
# The \b anchors keep "dislike" from matching LIKE_PAT.
LIKE_PAT = re.compile(r"\b(love|loves|loved|like|likes|liked)\b", re.IGNORECASE)
DISLIKE_PAT = re.compile(r"\b(hate|hates|hated|dislike|dislikes|disliked)\b", re.IGNORECASE)


def load_json(path: str) -> list[dict]:
    """Read a UTF-8 JSON file and return its records as a list.

    Accepted top-level shapes:
      * a list                -> returned as-is
      * a dict with "data"    -> the value stored under "data"
      * any other dict        -> its values, in insertion order

    Raises:
        AssertionError: if the payload cannot be reduced to a list.
    """
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if isinstance(data, dict):
        # {"data": [...]} wrapper, otherwise a keyed dict whose values are the records.
        data = data["data"] if "data" in data else list(data.values())

    if not isinstance(data, list):
        # Raised explicitly rather than via `assert` so the check survives
        # `python -O`; the exception type and message are unchanged.
        raise AssertionError("Input must be a list of records or {'data': [...]}")
    return data


def clean_text(x: str) -> str:
    """Return *x* as lowercase text with whitespace runs collapsed to one space.

    Missing values (as judged by ``pd.isna``) become the empty string;
    anything else is converted with ``str()`` first.
    """
    if pd.isna(x):
        return ""
    collapsed = re.sub(r"\s+", " ", str(x))
    return collapsed.strip().lower()


def to_bool(x):
    """Coerce a loosely-typed flag to ``True``/``False``, or NaN if unrecognised.

    Real ``bool`` values pass through unchanged. Missing values give NaN.
    Everything else is stringified, trimmed and lowercased, then matched
    against "true"/"1"/"yes" and "false"/"0"/"no".
    """
    if isinstance(x, bool):
        return x
    if pd.isna(x):
        return np.nan

    token = str(x).strip().lower()
    recognised = {
        "true": True, "1": True, "yes": True,
        "false": False, "0": False, "no": False,
    }
    if token in recognised:
        return recognised[token]
    return np.nan


# Alias -> canonical sentiment label. Built once at import time instead of on
# every call, since the function is applied row by row.
_SENTIMENT_ALIASES = {
    "the most negative": "most negative",
    "very negative": "most negative",
    "most negative": "most negative",
    "negative": "negative",
    "neutral": "neutral",
    "positive": "positive",
    # "the most positive" mirrors "the most negative"; it was previously
    # unmapped, so the two extremes were normalised inconsistently.
    "the most positive": "most positive",
    "very positive": "most positive",
    "most positive": "most positive",
}


def standardize_sentiment(s):
    """Normalize sentiment labels into a few consistent categories.

    Missing values give NaN. Other inputs are stringified, trimmed and
    lowercased, then mapped through the alias table. Labels not in the table
    are returned in their trimmed, lowercased form.
    """
    if pd.isna(s):
        return np.nan
    s = str(s).strip().lower()
    return _SENTIMENT_ALIASES.get(s, s)



def pref_polarity(text: str) -> int:
    """Score a preference statement: +1 like/love, -1 hate/dislike, else 0.

    Non-string input scores 0. The "like" pattern is checked first, so a text
    matching both patterns scores +1.
    """
    if isinstance(text, str):
        if LIKE_PAT.search(text) is not None:
            return 1
        if DISLIKE_PAT.search(text) is not None:
            return -1
    return 0


def false_belief_flag(preferences: dict | str) -> int:
    """Return 1 if a false-belief cue phrase (see ``FB_PAT``) appears, else 0.

    For a dict, only its string values are searched, joined with single
    spaces. Any other input is searched as ``str(preferences or "")``.
    """
    if isinstance(preferences, dict):
        string_values = [str(v) for v in preferences.values() if isinstance(v, str)]
        haystack = " ".join(string_values)
    else:
        haystack = str(preferences or "")
    return 1 if FB_PAT.search(haystack) else 0


# ----------------------------
# Main preprocessing
# ----------------------------
def preprocess_opentom(in_path="opentom.json", out_csv="opentom_clean.csv") -> pd.DataFrame:
    """Flatten, clean and feature-engineer the records in *in_path*, then save a CSV.

    Steps, in order:
      1. Load records with ``load_json`` and flatten nested dicts into
         dot-separated columns.
      2. Rename selected ``plot_info.*`` / ``question.*`` columns to short names.
      3. Keep a ``<col>_orig`` copy of each long text column, then normalise
         the column with ``clean_text``.
      4. Normalise ``observed`` (``to_bool``), ``true_sentiment``
         (``standardize_sentiment``) and the short label columns
         (trimmed, lowercased; missing values stay missing).
      5. Derive preference features: per-role text, polarity, conflict flag
         and false-belief cue.
      6. Add word-count features and a same-mover/observer flag.
      7. Drop duplicate rows and rows whose ``plot`` and ``narrative`` are
         both empty, reorder columns, and write the CSV.

    Every step tolerates absent columns.

    Args:
        in_path: path of the input JSON file.
        out_csv: path of the CSV to write; parent directories are created.

    Returns:
        The cleaned DataFrame that was written to *out_csv*.
    """
    raw = load_json(in_path)

    # Flatten nested fields into dotted column names, e.g. "plot_info.mover".
    df = pd.json_normalize(raw, sep=".", max_level=None)

    # --- rename core nested fields to simple names
    rename = {
        "plot_info.mover": "mover",
        "plot_info.observer": "observer",
        "plot_info.eoi": "eoi",
        "plot_info.original_place": "orig_place",
        "plot_info.move_to_place": "move_to",
        "question.type": "q_type",
        "question.answer": "q_answer",
        "question.question": "q_text",
    }
    df = df.rename(columns={k: v for k, v in rename.items() if k in df.columns})

    # --- long texts: keep the original, then normalise in place
    # (the *_orig columns are appended in list order; the normalised column
    # keeps its original position)
    long_text_cols = ["plot", "narrative", "intention", "personality", "sentiment_statement", "q_text"]
    for col in long_text_cols:
        if col in df.columns:
            df[f"{col}_orig"] = df[col]
            df[col] = df[col].map(clean_text)

    # --- low-cardinality fields
    if "observed" in df.columns:
        df["observed"] = df["observed"].map(to_bool)

    if "true_sentiment" in df.columns:
        df["true_sentiment"] = df["true_sentiment"].map(standardize_sentiment)

    # --- simple string normalizations
    for col in ["mover", "observer", "eoi", "orig_place", "move_to", "new_location", "q_type", "q_answer"]:
        if col in df.columns:
            values = df[col]
            normalized = values.astype(str).str.strip().str.lower()
            # astype(str) turns missing values into the literal "nan"/"None";
            # restore them as missing so they are not mistaken for real labels
            # (and so two missing roles do not compare equal below).
            df[col] = normalized.where(values.notna())

    # --- preferences parsing
    # If preferences stayed nested (dict), use it; else reassemble from flattened columns
    if "preferences" in df.columns:
        prefs_series = df["preferences"]
    else:
        pref_cols = [c for c in df.columns if c.startswith("preferences.")]
        if pref_cols:
            records = df[pref_cols].to_dict("records")
        else:
            records = [{} for _ in range(len(df))]
        # One dict per row keyed by the part after "preferences.", aligned to
        # df's index so later column assignments line up row by row.
        prefs_series = pd.Series(
            [{k.split(".", 1)[1]: v for k, v in rec.items()} for rec in records],
            index=df.index,
            dtype=object,
        )

    df["false_belief_cue"] = prefs_series.apply(false_belief_flag)

    # mover/observer stated polarity (about eoi) if present
    def _get_pref(role):
        def f(p):
            if isinstance(p, dict):
                return p.get(role, "")
            return ""
        return f

    df["pref_mover_text"] = prefs_series.apply(_get_pref("mover"))
    df["pref_observer_text"] = prefs_series.apply(_get_pref("observer"))
    df["pref_mover_polarity"] = df["pref_mover_text"].apply(pref_polarity)
    df["pref_observer_polarity"] = df["pref_observer_text"].apply(pref_polarity)
    # Product is -1 only when one role is +1 and the other is -1.
    df["preference_conflict"] = (df["pref_mover_polarity"] * df["pref_observer_polarity"] == -1).astype(int)

    # --- light numeric features
    def _column_or_blank(name):
        # `df.get(name, "")` returns a plain str when the column is absent,
        # which has no `.apply`/`.astype`. Use an all-"" Series instead so a
        # missing column behaves like empty text.
        if name in df.columns:
            return df[name]
        return pd.Series("", index=df.index, dtype=object)

    for name in ("plot", "narrative", "intention"):
        # Whitespace-delimited word count.
        df[f"len_{name}"] = _column_or_blank(name).apply(lambda s: len(str(s).split()))
    df["same_mover_observer"] = (_column_or_blank("mover") == _column_or_blank("observer")).astype(int)

    # --- housekeeping
    df = df.drop_duplicates()
    if {"plot", "narrative"}.issubset(df.columns):
        # Drop rows that carry no story text at all.
        df = df[~(df["plot"].eq("") & df["narrative"].eq(""))]

    # order columns for readability
    ordered = [
        # IDs / roles / places
        "mover", "observer", "eoi", "orig_place", "move_to", "new_location",
        # question
        "q_type", "q_answer", "q_text",
        # labels
        "true_sentiment", "observed",
        # preferences
        "pref_mover_text", "pref_observer_text", "pref_mover_polarity",
        "pref_observer_polarity", "preference_conflict", "false_belief_cue",
        # features
        "len_plot", "len_narrative", "len_intention", "same_mover_observer",
        # normalized texts
        "plot", "narrative", "intention", "personality", "sentiment_statement",
        # originals (kept for reference)
        "plot_orig", "narrative_orig", "intention_orig", "personality_orig",
        "sentiment_statement_orig", "q_text_orig",
    ]
    # Known columns first (in the order above), then everything else as-is.
    cols = [c for c in ordered if c in df.columns] + [c for c in df.columns if c not in ordered]
    df = df[cols]

    # save
    Path(out_csv).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_csv, index=False)
    print(f"✅ Saved: {out_csv} | rows={len(df)} cols={len(df.columns)}")
    return df


# Run directly
if __name__ == "__main__":
    # Script entry point: clean opentom.json and write opentom_clean.csv.
    preprocess_opentom(in_path="opentom.json", out_csv="opentom_clean.csv")


# Output: ✅ Saved: opentom_clean.csv | rows=13708 cols=36
