In [None]:
#######CURRENTLY USING DAN'S OPENAI API KEY. REPLACE WITH APPROPRIATE KEY.
# import pandas as pd
import json
import os
import re
import time
import warnings

import numpy as np
import openai
import pandas as pd
import scipy.stats as stats
from openai import OpenAI
from openpyxl import load_workbook
from openpyxl.formatting.rule import CellIsRule
from openpyxl.styles import PatternFill

# from scipy.stats import spearmanr, norm
from openpyxl.utils import get_column_letter
from scipy.stats import f, norm, pearsonr, ttest_rel

# Study identifiers and the input file names derived from them
study_name = "quantitative intuition"
specification_name = "default persona"
human_file = f"{study_name} human data labels anonymized.csv"
twin_file = f"{study_name} twins data labels anonymized.csv"
# df_human = pd.read_csv(human_file, header=0,skiprows=[1,2])
# df_twin  = pd.read_csv(twin_file,  header=0,skiprows=[1,2])


##############################from Daniel:

HUMAN_FILE = "quantitative intuition human data labels anonymized.csv"
TWIN_FILE = "quantitative intuition twins data labels anonymized.csv"


def _item_cols(stem, first, last):
    """Return the column names ``{stem}_{first}`` through ``{stem}_{last}`` (inclusive)."""
    return [f"{stem}_{i}" for i in range(first, last + 1)]


# Item columns that feed each of the four scale means.
# Q20_9 is left out of the Q_indiv items; it is used as an attention check below.
Q_INDIV_COLS = [c for c in _item_cols("Q20", 1, 19) if c != "Q20_9"]
I_INDIV_COLS = _item_cols("Q1", 1, 19)
Q_ORG_COLS = _item_cols("Q22", 1, 7)
I_ORG_COLS = _item_cols("Q21", 1, 9)  # stops at Q21_9, so Q21_10 is not part of the scale

# Attention checks: column -> answer a respondent must give (applied to humans)
ATTN_CHECKS_REQUIRED = {"Q20_9": "Somewhat disagree", "Q32": "-3"}
ATTN_CHECKS_SKIPPED = ["Q21_10"]

# Columns inspected for straight-lining, and the duration screen
STRAIGHT_LINE_COLS = _item_cols("Q20", 13, 19)
DURATION_COL = "Duration (in seconds)"
MIN_DURATION = 150

# Likert answer text -> numeric score 1..5, in ascending order of agreement
LIKERT_MAP = {
    answer: score
    for score, answer in enumerate(
        (
            "Strongly disagree",
            "Somewhat disagree",
            "Neither agree nor disagree",
            "Somewhat agree",
            "Strongly agree",
        ),
        start=1,
    )
}


def clean_tid(df):
    """Return a copy of ``df`` restricted to rows with a numeric ``TWIN_ID``.

    ``TWIN_ID`` is parsed with ``pd.to_numeric(errors="coerce")``; rows whose value
    cannot be parsed are dropped and the remaining IDs are stored as nullable
    ``Int64``. The input frame is left unmodified (no scratch column is written
    into it).

    Args:
        df: DataFrame containing a ``TWIN_ID`` column.

    Returns:
        A new DataFrame with the same columns as ``df``.
    """
    tid = pd.to_numeric(df["TWIN_ID"], errors="coerce")
    keep = tid.notna()
    cleaned = df.loc[keep].copy()
    cleaned["TWIN_ID"] = tid[keep].astype("Int64")
    return cleaned


def item_group(col):
    """Map a survey column name to its scale label by prefix.

    Returns ``"Q_indiv"``, ``"I_indiv"``, ``"Q_org"`` or ``"I_org"`` for columns
    starting with ``Q20_``, ``Q1_``, ``Q22_`` or ``Q21_`` respectively, and
    ``"other"`` for anything else.
    """
    prefix_table = (
        ("Q20_", "Q_indiv"),
        ("Q1_", "I_indiv"),
        ("Q22_", "Q_org"),
        ("Q21_", "I_org"),
    )
    return next(
        (group for prefix, group in prefix_table if col.startswith(prefix)),
        "other",
    )


def load_qi(path: str, label: str, apply_filters: bool):
    """Load one survey export, optionally screen respondents, and add scale means.

    Args:
        path: CSV file to read. The two lines directly after the header line are
            skipped (presumably extra label rows of the export — confirm against
            the data files).
        label: Name stored under ``"dataset"`` in the returned counts.
        apply_filters: When true, drop rows that (1) fail any attention check in
            ``ATTN_CHECKS_REQUIRED``, (2) give a single distinct answer across
            ``STRAIGHT_LINE_COLS``, or (3) have a known duration below
            ``MIN_DURATION``. Rows with a missing duration are kept.

    Returns:
        ``(df, counts)`` where ``df`` has the four ``*_mean`` scale columns added
        and ``counts`` holds ``dataset``, ``initial_n``, ``removed_attention``,
        ``removed_straight``, ``removed_duration`` and ``final_n``.
    """
    df = clean_tid(pd.read_csv(path, header=0, skiprows=[1, 2], low_memory=False))
    # Named ``counts`` (not ``stats``) so the module's ``scipy.stats`` alias is not shadowed.
    counts = {
        "dataset": label,
        "initial_n": len(df),
        "removed_attention": 0,
        "removed_straight": 0,
        "removed_duration": 0,
    }

    if DURATION_COL in df.columns:
        df["_duration"] = pd.to_numeric(df[DURATION_COL], errors="coerce")
    else:
        df["_duration"] = np.nan

    # Every Likert-style column present in this file, de-duplicated in first-seen order.
    likert = list(
        dict.fromkeys(
            Q_INDIV_COLS
            + I_INDIV_COLS
            + Q_ORG_COLS
            + I_ORG_COLS
            + (list(ATTN_CHECKS_REQUIRED) if apply_filters else [])
            + ATTN_CHECKS_SKIPPED
        )
    )
    likert = [c for c in likert if c in df.columns]
    with warnings.catch_warnings():
        # ``replace`` emits a FutureWarning about downcasting on some pandas versions.
        warnings.filterwarnings("ignore", category=FutureWarning)
        if likert:
            df[likert] = df[likert].apply(lambda s: s.astype(str).str.strip()).replace(LIKERT_MAP)

    if apply_filters:
        before = len(df)
        for col, val in ATTN_CHECKS_REQUIRED.items():
            if col not in df.columns:
                continue
            # Likert answers were already recoded above, so compare against the code.
            expected = str(LIKERT_MAP.get(val, val))
            observed = df[col].astype(str).str.strip()
            if col == "Q32":
                # Q32 values may arrive wrapped in literal double quotes.
                observed = observed.str.strip('"')
            df = df[observed == expected]
        counts["removed_attention"] = before - len(df)

        before = len(df)
        if all(c in df.columns for c in STRAIGHT_LINE_COLS):
            # Column-wise nunique yields a boolean mask even when no rows remain,
            # unlike a row-wise apply, which breaks the indexing on an empty frame.
            df = df[df[STRAIGHT_LINE_COLS].nunique(axis=1) > 1]
        counts["removed_straight"] = before - len(df)

        before = len(df)
        if df["_duration"].notna().any():
            df = df[(df["_duration"] >= MIN_DURATION) | df["_duration"].isna()]
        counts["removed_duration"] = before - len(df)

    # Work on an independent frame so the assignments below do not target a slice.
    df = df.copy()

    scales = (
        ("Q_indiv_mean", Q_INDIV_COLS),
        ("I_indiv_mean", I_INDIV_COLS),
        ("Q_org_mean", Q_ORG_COLS),
        ("I_org_mean", I_ORG_COLS),
    )
    for mean_col, item_cols in scales:
        present = [c for c in item_cols if c in df.columns]
        # Missing answers became the text "nan" above and unmapped text stays text;
        # coerce both to NaN so the row mean averages the answered items only.
        numeric_items = df[present].apply(pd.to_numeric, errors="coerce")
        df[mean_col] = numeric_items.mean(axis=1)

    counts["final_n"] = len(df)
    return df, counts


# Humans are screened; twins are loaded unfiltered and then paired to the retained humans.
df_human, stats_human = load_qi(HUMAN_FILE, "human", apply_filters=True)
df_twin, stats_twin = load_qi(TWIN_FILE, "twin", apply_filters=False)

before_pair = len(df_twin)
# Keep only twins whose human counterpart survived screening. Take an explicit copy:
# new columns are assigned to df_twin later, and assigning into a filtered slice
# triggers pandas' SettingWithCopyWarning.
df_twin = df_twin[df_twin["TWIN_ID"].isin(df_human["TWIN_ID"])].copy()
stats_twin.update(
    {
        "dataset": "twin (paired)",
        "initial_n": before_pair,
        "removed_attention": 0,
        "removed_straight": 0,
        "removed_duration": 0,
        "removed_nonpaired": before_pair - len(df_twin),
        "final_n": len(df_twin),
    }
)


print("FILTER SUMMARY:")
print(
    f"  human: start={stats_human['initial_n']} - attn={stats_human['removed_attention']} "
    f"- straight={stats_human['removed_straight']} - duration={stats_human['removed_duration']} "
    f"= final={stats_human['final_n']}"
)
print(
    f"  twin : start={stats_twin['initial_n']} - nonpaired_drop={stats_twin['removed_nonpaired']} "
    f"= final={stats_twin['final_n']}"
)

##################
# DV for the closed-ended scenario (column Q33)
prefix = "Break the problem into parts"

# In both frames, scenario_close_QI is 1 when the Q33 answer begins with `prefix`, else 0.
for df in (df_human, df_twin):
    # Missing answers become "" so they simply fail the prefix test.
    q33_text = df["Q33"].fillna("").astype(str)
    starts_with_prefix = q33_text.str.startswith(prefix)
    df["scenario_close_QI"] = starts_with_prefix.astype(int)


# add DV from open-ended scenario:
FREE_RESP_COL = "Q34"  # column holding the free-text answer to classify
CLASSIFICATION_MODEL = "gpt-4o-mini"
MODEL = CLASSIFICATION_MODEL
# Classifications are cached on disk, one cache per response column and persona specification.
CACHE_FILE = f"{FREE_RESP_COL}_{specification_name}_open_classification_cache.csv"

# `{ans}` is filled with the subject's answer before the prompt is sent.
prompt_template = (
    "In a research study, a subject was shown quantitative facts about a company, including that "
    "sales are down, call center complaints are up, unresolved complaints are up, and personnel turnover is up. "
    "They were asked to explain what is going on with the company in up to 100 words.\n\n"
    "Classify the SUBJECT ANSWER as either 'summary' (one that just restates or describes the facts) or 'synthesis' "
    "(one that uses a fact to try to explain another, e.g., 'low sales are due to poor customer service', "
    "or provides an alternative explanation altogether, e.g., 'low sales and high complaints are both probably because the product is bad'). "
    'Return JSON with keys classification (summary|synthesis) and confidence (0-100 integer). SUBJECT ANSWER:\n"{ans}"'
)

# Read the key from the environment so no secret has to be written into the notebook.
# Falls back to "" (the previously hard-coded value) when OPENAI_API_KEY is unset.
# The key is only needed once, to build the cache.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")


def classify_scenario_open_QI():
    """Classify each free-text answer as summary (0) or synthesis (1).

    If ``CACHE_FILE`` exists it is read and returned without calling the API.
    Otherwise every non-blank ``FREE_RESP_COL`` answer in ``df_human`` and
    ``df_twin`` is sent to the chat model, the results are written to
    ``CACHE_FILE`` and returned.

    Returns:
        DataFrame with columns ``TWIN_ID``, ``dataset`` ("human"/"twin"),
        ``scenario_open_QI`` (0, 1, or missing when classification failed) and
        ``confidence``. Empty, and not cached, when no API key is configured.
    """
    result_columns = ["TWIN_ID", "dataset", "scenario_open_QI", "confidence"]

    # 1) check cache
    if os.path.exists(CACHE_FILE):
        return pd.read_csv(CACHE_FILE)

    # 2) sanity check API key (an empty key is as unusable as the placeholder)
    if not OPENAI_API_KEY or OPENAI_API_KEY.startswith("YOUR_"):
        print("API key not set; skipping scenario_open_QI classification.")
        return pd.DataFrame(columns=result_columns)

    # The key is deliberately not echoed: printed output is saved with the notebook.
    client = OpenAI(api_key=OPENAI_API_KEY)

    label_codes = {"summary": 0, "synthesis": 1}
    max_attempts = 4
    rows = []

    def _classify(text):
        """Return ``(code, confidence)`` for one answer; raise if the reply is unusable."""
        prompt = prompt_template.format(ans=text.replace('"', '\\"'))
        resp = client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": "You are a careful research assistant."},
                {"role": "user", "content": prompt},
            ],
            temperature=0,
        )
        reply = resp.choices[0].message.content or ""
        m = re.search(r"\{.*\}", reply, re.DOTALL)
        if m is None:
            raise ValueError(f"no JSON object in reply: {reply!r}")
        data = json.loads(m.group(0))
        cls = str(data.get("classification", "")).strip().lower()
        if cls not in label_codes:
            # Never fall back to "summary": an unknown label is a failure, not a 0.
            raise ValueError(f"unrecognised classification: {cls!r}")
        return label_codes[cls], data.get("confidence", None)

    def _process(df, label):
        """Classify every non-blank answer in ``df`` and append the results to ``rows``."""
        if FREE_RESP_COL not in df.columns:
            return
        subset = df[["TWIN_ID", FREE_RESP_COL]].dropna()
        total = len(subset)
        for i, (tid, ans) in enumerate(subset.itertuples(index=False), 1):
            text = str(ans).strip()
            if not text:
                continue

            backoff = 1
            for attempt in range(1, max_attempts + 1):
                try:
                    code, confidence = _classify(text)
                except Exception as e:  # API boundary: any failure is retried, then recorded
                    if attempt == max_attempts:
                        rows.append((tid, label, None, None))
                        print(f"[{label}] ✖ permanent failure at {i}/{total} (TWIN_ID={tid}): {e}")
                    else:
                        print(f"[{label}]  attempt {attempt} error at {i}/{total}: {e} — retrying")
                        # Sleep once per retry, doubling the wait each time.
                        time.sleep(backoff)
                        backoff *= 2
                else:
                    rows.append((tid, label, code, confidence))
                    break

            if i % 100 == 0:
                print(f"[{label}] {i}/{total} classified…")

        print(f"[{label}] done ({total} items)")

    _process(df_human, "human")
    _process(df_twin, "twin")

    # NOTE: rows recorded as permanent failures are cached too; delete CACHE_FILE to retry them.
    out = pd.DataFrame(rows, columns=result_columns)
    out.to_csv(CACHE_FILE, index=False)
    print(f"Wrote cache → {CACHE_FILE}")
    return out


# ── run the classifier (or load its cache) ───────────────────────────────────
sc_df = classify_scenario_open_QI()

# ── reshape to one row per TWIN_ID, one column per dataset label ─────────────
pivot = sc_df.pivot(index="TWIN_ID", columns="dataset", values="scenario_open_QI")

# ── copy the labels into each dataframe, keyed by TWIN_ID ────────────────────
for dataset_label, frame in (("human", df_human), ("twin", df_twin)):
    if dataset_label not in pivot.columns:
        # nothing was classified for this dataset
        frame["scenario_open_QI"] = np.nan
        continue
    frame["scenario_open_QI"] = frame["TWIN_ID"].map(pivot[dataset_label])


#####################################################################

# Save both processed frames, human first and then twins.
processed_outputs = (
    (df_human, f"{study_name} {specification_name} human data processed.csv"),
    (df_twin, f"{study_name} {specification_name} twins data processed.csv"),
)
for frame, out_file in processed_outputs:
    frame.to_csv(out_file, index=False)


# define relevant columns:
# condition variable names (a blank first entry means the study has no condition):
condition_vars = [""]
cond_exists = bool(condition_vars) and bool(condition_vars[0].strip())
if cond_exists:
    # names of the condition column on each side of the human/twin merge
    cond = condition_vars[0]
    cond_h = f"{cond}_human"
    cond_t = f"{cond}_twin"


# raw responses:
raw_vars = [""]
# raw_vars_min = []
# raw_vars_max = []
# #raw responses: domain=social?
# raw_vars_social=[]
# raw_vars_social_map = dict(zip(raw_vars, raw_vars_social))
# #raw responses: domain=cognitive?
# raw_vars_cognitive=[]
# raw_vars_cognitive_map = dict(zip(raw_vars, raw_vars_cognitive))
# #raw responses: replicating know human bias?
# raw_vars_known=[]
# raw_vars_known_map = dict(zip(raw_vars, raw_vars_known))
# #raw responses: preference measure?
# raw_vars_pref=[]
# raw_vars_pref_map = dict(zip(raw_vars, raw_vars_pref))
# #raw responses: stimuli dependent?
# raw_vars_stim=[]
# raw_vars_stim_map = dict(zip(raw_vars, raw_vars_stim))

# DVs, with the lower and upper bound of each one's scale (same order as DV_vars):
DV_vars = [
    "Q_indiv_mean",
    "I_indiv_mean",
    "Q_org_mean",
    "I_org_mean",
    "scenario_close_QI",
    "scenario_open_QI",
]
DV_vars_min = [1, 1, 1, 1, 0, 0]
DV_vars_max = [5, 5, 5, 5, 1, 1]

# One 0/1 flag per DV for each descriptive attribute, plus a name -> flag lookup.
# DVs: domain=social?
DV_vars_social = [0 for _ in DV_vars]
DV_vars_social_map = {name: flag for name, flag in zip(DV_vars, DV_vars_social)}
# DVs: domain=cognitive?
DV_vars_cognitive = [1 for _ in DV_vars]
DV_vars_cognitive_map = {name: flag for name, flag in zip(DV_vars, DV_vars_cognitive)}
# DVs: replicating know human bias?
DV_vars_known = [0 for _ in DV_vars]
DV_vars_known_map = {name: flag for name, flag in zip(DV_vars, DV_vars_known)}
# DVs: preference measure?
DV_vars_pref = [0 for _ in DV_vars]
DV_vars_pref_map = {name: flag for name, flag in zip(DV_vars, DV_vars_pref)}
# DVs: stimuli dependent?
DV_vars_stim = [0 for _ in DV_vars]
DV_vars_stim_map = {name: flag for name, flag in zip(DV_vars, DV_vars_stim)}
# DVs: knowledge question?
DV_vars_know = [0 for _ in DV_vars]
DV_vars_know_map = {name: flag for name, flag in zip(DV_vars, DV_vars_know)}
# DVs: political question?
DV_vars_politics = [0 for _ in DV_vars]
DV_vars_politics_map = {name: flag for name, flag in zip(DV_vars, DV_vars_politics)}


# merging key
merge_key = ["TWIN_ID"]

# One row per human/twin pair; shared column names get _human / _twin suffixes
df = df_human.merge(df_twin, on=merge_key, suffixes=("_human", "_twin"))

# Coerce both sides of every DV to numeric (anything unparseable becomes NaN)
for var in DV_vars:
    for col in (f"{var}_human", f"{var}_twin"):
        df[col] = pd.to_numeric(df[col], errors="coerce")

# scale bounds per DV
min_map = dict(zip(DV_vars, DV_vars_min))
# min_map = {v: mn for v, mn in zip(raw_vars,      raw_vars_min)}
# min_map.update({v: mn for v, mn in zip(DV_vars,   DV_vars_min)})

max_map = dict(zip(DV_vars, DV_vars_max))
# max_map = {v: mx for v, mx in zip(raw_vars,      raw_vars_max)}
# max_map.update({v: mx for v, mx in zip(DV_vars,   DV_vars_max)})

# broadcast each bound into constant <var>_min / <var>_max columns
for var, lower in min_map.items():
    df[f"{var}_min"] = lower
    df[f"{var}_max"] = max_map[var]

# Compute results
results = []
# for var in raw_vars:
#     ##############################
#     #07/18/26: condition assignment different for each DV.
# #     col_h = f"{var}_human"
# #     col_t = f"{var}_twin"
# #     min_col = f"{var}_min"
# #     max_col = f"{var}_max"
# #     if cond_exists:
# #         cols = [col_h, col_t, cond_h, cond_t,min_col,max_col]
# #     else:
# #         cols = [col_h, col_t,min_col,max_col]
# #     pair = (
# #     df[cols]
# #       .dropna(subset=[col_h, col_t])
# #     )
# # look up the right condition for this DV
#     cond    = dv_to_cond[var]           # e.g. 'condition_green'
#     cond_h  = f"{cond}_human"           # 'condition_green_human'
#     cond_t  = f"{cond}_twin"            # 'condition_green_twin'
#     col_h   = f"{var}_human"
#     col_t   = f"{var}_twin"
#     min_col = f"{var}_min"
#     max_col = f"{var}_max"
#     # always include the matching condition columns
#     cols = [col_h, col_t, cond_h, cond_t, min_col, max_col]
#     pair = df[cols].dropna(subset=[col_h, col_t, cond_h, cond_t])
# ####################################

#     min_val = pair[min_col].iloc[0]
#     max_val = pair[max_col].iloc[0]
#     n    = len(pair)
#     if n >= 4:
#         r, _    = pearsonr(pair[col_h], pair[col_t])
#         z_f     = np.arctanh(r)
#         se      = 1 / np.sqrt(n - 3)
#         z_crit  = norm.ppf(0.975)
#         lo_z, hi_z = z_f - z_crit*se, z_f + z_crit*se
#         lo_r, hi_r = np.tanh(lo_z), np.tanh(hi_z)
#         z_score    = z_f / se
#         # Accuracy = mean absolute diff / range
#         if pd.isna(min_val) or pd.isna(max_val) or max_val == min_val:
#             accuracy = np.nan
#         else:
#             # compute mean absolute difference
#             abs_diff      = np.abs(pair[col_h] - pair[col_t])
#             mean_abs_diff = abs_diff.mean()
#             accuracy      = 1 - mean_abs_diff / (max_val - min_val)

#         mean_h = pair[col_h].mean()
#         mean_t = pair[col_t].mean()

#         # Paired t‐test
#         t_stat, p_val = ttest_rel(pair[col_h], pair[col_t])

#         std_h = pair[col_h].std(ddof=1)
#         std_t = pair[col_t].std(ddof=1)

#          # F‐test for equal variances
#         df1 = df2 = n - 1
#         f_stat = (std_h**2 / std_t**2) if std_t>0 else np.nan

#         # two‐tailed p‐value:
#         if not np.isnan(f_stat):
#             p_f = 2 * min(f.cdf(f_stat, df1, df2),
#                           1 - f.cdf(f_stat, df1, df2))
#         else:
#             p_f = np.nan

#         # Effect sizes (Cohen's d) across conditions
#         #    For humans:
#         if cond_exists and len(pair)>3:
#             levels_h = pair[cond_h].unique()
#             if len(levels_h) == 2:
#                 g1 = pair.loc[pair[cond_h]==levels_h[0], col_h]
#                 g2 = pair.loc[pair[cond_h]==levels_h[1], col_h]
#                 n1, n2 = len(g1), len(g2)
#                 # pooled sd
#                 s_pool = np.sqrt(((n1-1)*g1.var(ddof=1)+(n2-1)*g2.var(ddof=1)) / (n1+n2-2))
#                 d_human = (g1.mean() - g2.mean()) / s_pool if s_pool>0 else np.nan
#             else:
#                 d_human = np.nan
#         else:
#             d_human = np.nan

#         #    For twins:
#         if cond_exists and len(pair)>3:
#             levels_t = pair[cond_t].unique()
#             if cond_exists and len(levels_t) == 2:
#                 g1 = pair.loc[pair[cond_t]==levels_t[0], col_t]
#                 g2 = pair.loc[pair[cond_t]==levels_t[1], col_t]
#                 n1, n2 = len(g1), len(g2)
#                 s_pool = np.sqrt(((n1-1)*g1.var(ddof=1)+(n2-1)*g2.var(ddof=1)) / (n1+n2-2))
#                 d_twin = (g1.mean() - g2.mean()) / s_pool if s_pool>0 else np.nan
#             else:
#                 d_twin = np.nan
#         else:
#             d_twin = np.nan
#     else:
#         r = lo_r = hi_r = z_score = accuracy = mean_h = mean_t = t_stat = p_val = std_h = std_t = f_stat = p_f = np.nan
#         d_human = d_twin = np.nan


#     results.append({
#         'study name': study_name,
#         'variable name': var,
#         'variable type (raw response/DV)':     'raw',
#         'correlation between the responses from humans vs. their twins':        r,
#         'CI_lower': lo_r,
#         'CI_upper': hi_r,
#         'z-score for correlation between humans vs. their twins':  z_score,
#         'accuracy between humans vs. their twins': accuracy,
#         'mean_human': mean_h,
#         'mean_twin': mean_t,
#         'paired t-test t-stat': t_stat,
#         'paired t-test p-value': p_val,
#         'std_human': std_h,
#         'std_twin': std_t,
#         'variance test F-stat': f_stat,
#         'variance test p-value': p_f,
#         'effect size based on human': d_human,
#         'effect size based on twin': d_twin,
#         'domain=social?':raw_vars_social_map.get(var, np.nan),
#         'domain=cognitive?':raw_vars_cognitive_map.get(var, np.nan),
#         'replicating know human bias?':raw_vars_known_map.get(var, np.nan),
#         'preference measure?':raw_vars_pref_map.get(var, np.nan),
#         'stimuli dependent?':raw_vars_stim_map.get(var, np.nan),
#         'sample size':        n
#     })

def _cohens_d(pair, cond_col, value_col):
    """Cohen's d of ``value_col`` between the two levels of ``cond_col``.

    Returns NaN unless ``cond_col`` has exactly two distinct values, or when the
    pooled standard deviation is not positive.
    """
    levels = pair[cond_col].unique()
    if len(levels) != 2:
        return np.nan
    g1 = pair.loc[pair[cond_col] == levels[0], value_col]
    g2 = pair.loc[pair[cond_col] == levels[1], value_col]
    n1, n2 = len(g1), len(g2)
    # pooled sd
    s_pool = np.sqrt(((n1 - 1) * g1.var(ddof=1) + (n2 - 1) * g2.var(ddof=1)) / (n1 + n2 - 2))
    return (g1.mean() - g2.mean()) / s_pool if s_pool > 0 else np.nan


# two-sided 95% critical value for the Fisher-z confidence interval (same for every DV)
z_crit = norm.ppf(0.975)

# One result row per DV, computed on pairs where both human and twin values are present.
for var in DV_vars:
    col_h = f"{var}_human"
    col_t = f"{var}_twin"
    min_col = f"{var}_min"
    max_col = f"{var}_max"

    if cond_exists:
        cols = [col_h, col_t, cond_h, cond_t, min_col, max_col]
    else:
        cols = [col_h, col_t, min_col, max_col]

    dropped_ids = df.loc[df[col_h].isna() | df[col_t].isna(), "TWIN_ID"].unique()
    if len(dropped_ids):
        print(f"{var}: dropping {len(dropped_ids)} pairs → TWIN_IDs: {dropped_ids.tolist()}")
    else:
        print(f"{var}: no observation dropped ")

    pair = df[cols].dropna(subset=[col_h, col_t])
    n = len(pair)
    # The bounds are constant within a DV. Guard the lookup: when no complete pair
    # exists (e.g. every open-ended classification failed) iloc[0] would raise.
    min_val = pair[min_col].iloc[0] if n else np.nan
    max_val = pair[max_col].iloc[0] if n else np.nan

    # Everything stays NaN unless there are enough pairs (n - 3 must be positive below).
    r = lo_r = hi_r = z_score = accuracy = np.nan
    mean_h = mean_t = t_stat = p_val = np.nan
    std_h = std_t = f_stat = p_f = np.nan
    d_human = d_twin = np.nan

    if n >= 4:
        # Pearson correlation with a Fisher-z confidence interval
        r, _ = pearsonr(pair[col_h], pair[col_t])
        z_f = np.arctanh(r)
        se = 1 / np.sqrt(n - 3)
        lo_z, hi_z = z_f - z_crit * se, z_f + z_crit * se
        lo_r, hi_r = np.tanh(lo_z), np.tanh(hi_z)
        z_score = z_f / se

        # Accuracy = 1 - (mean absolute human/twin difference) / (scale range)
        if not (pd.isna(min_val) or pd.isna(max_val) or max_val == min_val):
            mean_abs_diff = np.abs(pair[col_h] - pair[col_t]).mean()
            accuracy = 1 - mean_abs_diff / (max_val - min_val)

        mean_h = pair[col_h].mean()
        mean_t = pair[col_t].mean()

        # Paired t‐test
        t_stat, p_val = ttest_rel(pair[col_h], pair[col_t])

        std_h = pair[col_h].std(ddof=1)
        std_t = pair[col_t].std(ddof=1)

        # F‐test for equal variances (human variance over twin variance)
        # NOTE(review): the F-test assumes independent samples, but these are pairs — confirm intended.
        df1 = df2 = n - 1
        f_stat = (std_h**2 / std_t**2) if std_t > 0 else np.nan
        # two‐tailed p‐value:
        if not np.isnan(f_stat):
            p_f = 2 * min(f.cdf(f_stat, df1, df2), 1 - f.cdf(f_stat, df1, df2))

        # Effect sizes (Cohen's d) across conditions, separately for humans and twins
        if cond_exists:
            d_human = _cohens_d(pair, cond_h, col_h)
            d_twin = _cohens_d(pair, cond_t, col_t)

    results.append(
        {
            "study name": study_name,
            "persona specification": specification_name,
            "variable name": var,
            #        'variable type (raw response/DV)':     'DV',
            "correlation between the responses from humans vs. their twins": r,
            "CI_lower": lo_r,
            "CI_upper": hi_r,
            "z-score for correlation between humans vs. their twins": z_score,
            "accuracy between humans vs. their twins": accuracy,
            "mean_human": mean_h,
            "mean_twin": mean_t,
            "paired t-test t-stat": t_stat,
            "paired t-test p-value": p_val,
            "std_human": std_h,
            "std_twin": std_t,
            "variance test F-stat": f_stat,
            "variance test p-value": p_f,
            "effect size based on human": d_human,
            "effect size based on twin": d_twin,
            "domain=social?": DV_vars_social_map.get(var, np.nan),
            "domain=cognitive?": DV_vars_cognitive_map.get(var, np.nan),
            "replicating know human bias?": DV_vars_known_map.get(var, np.nan),
            "preference measure?": DV_vars_pref_map.get(var, np.nan),
            "stimuli dependent?": DV_vars_stim_map.get(var, np.nan),
            "knowledge question?": DV_vars_know_map.get(var, np.nan),
            "political question?": DV_vars_politics_map.get(var, np.nan),
            "sample size": n,
        }
    )

# results DataFrame
corr_df = pd.DataFrame(results)
print(corr_df)

# save output as csv - unit of observation is comparison between humans and twins:
out_file = f"{study_name} {specification_name} meta analysis.csv"
corr_df.to_csv(out_file, index=False)


#####participant-level data:
def make_long(df, respondent_type):
    """Reshape the DV columns of ``df`` into long format, one row per (TWIN_ID, DV).

    Rows whose value is missing are dropped. Each remaining row is tagged with
    ``respondent_type`` and with the module-level ``study_name`` and
    ``specification_name``.
    """
    wide = df[["TWIN_ID"] + DV_vars]
    melted = pd.melt(
        wide,
        id_vars="TWIN_ID",
        value_vars=DV_vars,
        var_name="variable_name",
        value_name="value",
    )
    observed = melted.dropna(subset=["value"])
    return observed.assign(
        respondent_type=respondent_type,
        study_name=study_name,
        specification_name=specification_name,
    )


# long-format halves for humans and twins, then stacked with a fresh index
long_h, long_t = (
    make_long(frame, who) for frame, who in ((df_human, "human"), (df_twin, "twin"))
)
df_long = pd.concat([long_h, long_t], ignore_index=True)

print(df_long.head())

# save output as csv - unit of observation is TWIN_ID:
out_file = f"{study_name} {specification_name} meta analysis individual level.csv"
df_long.to_csv(out_file, index=False)

print("done")

FILTER SUMMARY:
  human: start=1435 - attn=103 - straight=8 - duration=1 = final=1323
  twin : start=1435 - nonpaired_drop=112 = final=1323
Q_indiv_mean: no observation dropped 
I_indiv_mean: no observation dropped 
Q_org_mean: no observation dropped 
I_org_mean: no observation dropped 
scenario_close_QI: no observation dropped 
scenario_open_QI: no observation dropped 
               study name persona specification      variable name  \
0  quantitative intuition       default persona       Q_indiv_mean   
1  quantitative intuition       default persona       I_indiv_mean   
2  quantitative intuition       default persona         Q_org_mean   
3  quantitative intuition       default persona         I_org_mean   
4  quantitative intuition       default persona  scenario_close_QI   
5  quantitative intuition       default persona   scenario_open_QI   

   correlation between the responses from humans vs. their twins  CI_lower  \
0                                           0.513884      