## 1. State - how is subjective sleepiness related to objective graph measures? (Analysed for each session separately)

In [None]:
import os
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from statsmodels.stats.multitest import multipletests

# ---- Input / output locations -------------------------------------------
# Survey table (semicolon-separated) holding the KSS sleepiness ratings.
sleep_csv = "/Users/patrycjascislewska/Analizy_neuro/Graphs/DATA/SURVEYS/surveys_sleep_circadian_emotional_data_traits_states.csv"

# One CSV per global graph metric; analyze_one_metric() reads each file and
# expects a `subject_id` column plus `ses-1`, `ses-2`, `ses-3` columns.
graph_metric_paths = {
    "global_efficiency":   "/Users/patrycjascislewska/Analizy_neuro/Graphs/DATA/graph_metrics/global_efficiency_all_subjects.csv",
    "average_clustering":  "/Users/patrycjascislewska/Analizy_neuro/Graphs/DATA/graph_metrics/average_clustering_all_subjects.csv",
    "average_path_length": "/Users/patrycjascislewska/Analizy_neuro/Graphs/DATA/graph_metrics/average_path_length_all_subjects.csv",
    "modularity":          "/Users/patrycjascislewska/Analizy_neuro/Graphs/DATA/graph_metrics/modularity_all_subjects.csv",
    "avg_graph_distance":  "/Users/patrycjascislewska/Analizy_neuro/Graphs/DATA/graph_metrics/avg_graph_distance_all_subjects.csv",
}

# Directory that receives the per-metric and combined result tables.
out_dir = "/Users/patrycjascislewska/Analizy_neuro/Graphs/DATA/SURVEYS/all_results_state_traits_paper"
os.makedirs(out_dir, exist_ok=True)


# ---- Load the survey table and normalise the subject identifier ----------
sleep = pd.read_csv(sleep_csv, sep=";")
sleep = sleep.rename(columns={"Subject_ID": "subject_id"})
# IDs are compared as stripped strings so they match the graph-metric files.
sleep["subject_id"] = sleep["subject_id"].astype(str).str.strip()

# Keep KSS columns (raw units)
kss_cols = ["KSS-B1", "KSS-A1", "KSS-C1"]  # B=ses-1, A=ses-2, C=ses-3
sleep = sleep[["subject_id"] + kss_cols]

# Reshape to long format: one row per subject and KSS column.
sleep_long = sleep.melt(
    id_vars=["subject_id"],
    value_vars=kss_cols,
    var_name="kss_session",
    value_name="sleepiness"
)
# Translate the KSS column names into the session labels used by the graph files.
kss_to_ses = {"KSS-B1": "ses-1", "KSS-A1": "ses-2", "KSS-C1": "ses-3"}
sleep_long["session"] = sleep_long["kss_session"].map(kss_to_ses)

# ---- Settings shared by fit_model() and analyze_one_metric() -------------
session_order = ["ses-1", "ses-2", "ses-3"]
# Human-readable session labels written to the result tables.
nice = {"ses-1":"Baseline (B)", "ses-2":"Acute (A)", "ses-3":"Chronic (C)"}
# No global intercept: one intercept and one sleepiness slope per session.
formula = "graph_metric ~ 0 + C(session) + C(session):sleepiness"

def fit_model(df, re_formula):
    """Fit the module-level ``formula`` as a mixed model grouped by subject.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format data containing the formula variables and ``subject_id``.
    re_formula : str
        Random-effects formula handed to ``smf.mixedlm``.

    Returns
    -------
    The fitted results object. Maximum likelihood is tried first; if that
    fit raises, the same model is refitted with REML.
    """
    mixed = smf.mixedlm(formula, df, groups=df["subject_id"], re_formula=re_formula)
    # Both attempts share the optimiser configuration; only `reml` differs.
    optimiser = {"method": "lbfgs", "maxiter": 200}
    try:
        result = mixed.fit(reml=False, **optimiser)
    except Exception:
        result = mixed.fit(reml=True, **optimiser)
    return result

def analyze_one_metric(metric_name, graph_csv_path):
    """Estimate per-session sleepiness slopes for one graph metric.

    The wide metric table is reshaped to long format, inner-joined with the
    module-level ``sleep_long`` table on subject and session, and passed to
    ``fit_model``. The ``C(session)[...]:sleepiness`` terms are collected,
    Benjamini-Hochberg corrected across the session slopes of this metric
    only, written to ``out_dir`` and printed.

    Parameters
    ----------
    metric_name : str
        Label used in the result table, messages and output file name.
    graph_csv_path : str
        Comma-separated file with a ``subject_id`` column and the columns
        ``ses-1``, ``ses-2`` and ``ses-3``.

    Returns
    -------
    pandas.DataFrame
        One row per session slope found in the fitted model (possibly empty).

    Raises
    ------
    ValueError
        If no complete subject-session rows remain after the merge, or if a
        subject-session pair occurs more than once.
    """
    import warnings

    graph = pd.read_csv(graph_csv_path, sep=",")
    graph["subject_id"] = graph["subject_id"].astype(str).str.strip()

    graph_long = graph.melt(
        id_vars=["subject_id"],
        value_vars=["ses-1", "ses-2", "ses-3"],
        var_name="session",
        value_name="graph_metric"
    )

    df = pd.merge(sleep_long, graph_long, on=["subject_id", "session"], how="inner")
    df = df.dropna(subset=["sleepiness", "graph_metric"])

    # Validate with explicit exceptions: `assert` disappears under `python -O`,
    # and checking duplicates directly avoids a groupby over the categorical
    # session column.
    if df.empty:
        raise ValueError(
            f"[{metric_name}] No complete subject-session rows after merging "
            "sleepiness with the graph metric."
        )
    if df.duplicated(subset=["subject_id", "session"]).any():
        raise ValueError(f"[{metric_name}] Duplicate rows per subject-session detected.")

    df["session"] = pd.Categorical(df["session"], categories=session_order, ordered=True)

    # Prefer a random slope for sleepiness; fall back to a random intercept
    # only when that richer model cannot be fitted at all.
    try:
        m = fit_model(df, re_formula="~sleepiness")
    except Exception:
        m = fit_model(df, re_formula="~1")

    # A non-converged optimiser still returns estimates; make that visible
    # instead of saving them silently.
    if not getattr(m, "converged", True):
        warnings.warn(
            f"[{metric_name}] Mixed model did not converge; "
            "interpret the saved estimates with caution.",
            RuntimeWarning,
            stacklevel=2,
        )

    params = m.params
    bse = m.bse
    conf = m.conf_int()
    conf.columns = ["CI_low", "CI_high"]

    rows = []
    for s in session_order:
        term = f"C(session)[{s}]:sleepiness"
        if term in params.index:
            beta = params[term]
            se = bse[term]
            rows.append({
                "metric": metric_name,
                "session": nice[s],
                "slope (Δgraph per 1 KSS pt)": beta,
                "SE": se,
                "z": beta / se,
                "p_raw": m.pvalues[term],
                "CI_low": conf.loc[term, "CI_low"],
                "CI_high": conf.loc[term, "CI_high"],
                "n_obs": int(m.model.endog.size),
                "n_subj": df["subject_id"].nunique(),
            })
    print(m.summary())
    slopes = pd.DataFrame(rows)

    # ---- FDR per metric (across its 3 session slopes) ----
    if not slopes.empty:
        rej, p_fdr, _, _ = multipletests(slopes["p_raw"].values, alpha=0.05, method="fdr_bh")
        slopes["p_FDR"] = p_fdr
        slopes["FDR_sig@0.05"] = rej

    # save per-metric CSV
    out_csv = os.path.join(out_dir, f"{metric_name}_sleepiness_mixed_slopes_FDR.csv")
    slopes.to_csv(out_csv, index=False)

    # print a neat preview
    print(f"\n=== {metric_name} ===")
    if "p_FDR" in slopes.columns:
        print(slopes[["metric","session","slope (Δgraph per 1 KSS pt)","SE","z","p_raw","p_FDR","FDR_sig@0.05","CI_low","CI_high","n_obs","n_subj"]]
              .to_string(index=False))
    else:
        print("(no rows)")

    return slopes

# Run the sleepiness model for every configured graph metric and keep each
# per-metric slope table.
all_results = []
for metric, path in graph_metric_paths.items():
    res = analyze_one_metric(metric, path)
    all_results += [res]

# Stack the per-metric tables into one file next to the individual outputs.
combined_csv = os.path.join(out_dir, "ALL_global_graph_metrics_KSS_mixed_model.csv")
combined = pd.concat(all_results, ignore_index=True)
combined.to_csv(combined_csv, index=False)

print(
    f"\nSaved per-metric tables in: {out_dir}",
    f"Combined results: {combined_csv}",
    sep="\n",
)


  assert df.groupby(["subject_id","session"]).size().max() == 1, \
  assert df.groupby(["subject_id","session"]).size().max() == 1, \


                 Mixed Linear Model Regression Results
Model:                 MixedLM     Dependent Variable:     graph_metric
No. Observations:      83          Method:                 ML          
No. Groups:            28          Scale:                  0.0002      
Min. group size:       2           Log-Likelihood:         230.2968    
Max. group size:       3           Converged:              Yes         
Mean group size:       3.0                                             
-----------------------------------------------------------------------
                             Coef.  Std.Err.   z    P>|z| [0.025 0.975]
-----------------------------------------------------------------------
C(session)[ses-1]             0.418    0.006 74.738 0.000  0.407  0.429
C(session)[ses-2]             0.406    0.009 45.884 0.000  0.388  0.423
C(session)[ses-3]             0.399    0.009 43.922 0.000  0.381  0.417
C(session)[ses-1]:sleepiness -0.004    0.002 -2.676 0.007 -0.008 -0.001
C(session

  assert df.groupby(["subject_id","session"]).size().max() == 1, \
  assert df.groupby(["subject_id","session"]).size().max() == 1, \


                 Mixed Linear Model Regression Results
Model:                 MixedLM     Dependent Variable:     graph_metric
No. Observations:      83          Method:                 ML          
No. Groups:            28          Scale:                  0.0007      
Min. group size:       2           Log-Likelihood:         167.5199    
Max. group size:       3           Converged:              No          
Mean group size:       3.0                                             
-----------------------------------------------------------------------
                             Coef.  Std.Err.   z    P>|z| [0.025 0.975]
-----------------------------------------------------------------------
C(session)[ses-1]             0.446    0.012 36.174 0.000  0.422  0.470
C(session)[ses-2]             0.479    0.019 25.417 0.000  0.442  0.516
C(session)[ses-3]             0.455    0.016 27.680 0.000  0.423  0.488
C(session)[ses-1]:sleepiness  0.000    0.003  0.021 0.984 -0.007  0.007
C(session

  assert df.groupby(["subject_id","session"]).size().max() == 1, \


## 2. Traits - which trait predicts differences in brain functional networks in the baseline condition?

In [None]:
# === Graph metrics vs traits at baseline (ses-1): HC3 + z-scored traits ===
import os
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from statsmodels.stats.multitest import fdrcorrection  # BH FDR


# One CSV per global graph metric; each file is read with autodetect_read()
# and reshaped by melt_metric(), which picks up every column starting with "ses-".
metric_files = {
    "global_efficiency": "/Users/patrycjascislewska/Analizy_neuro/Graphs/DATA/graph_metrics/global_efficiency_all_subjects.csv",
    "average_clustering": "/Users/patrycjascislewska/Analizy_neuro/Graphs/DATA/graph_metrics/average_clustering_all_subjects.csv",
    "average_path_length": "/Users/patrycjascislewska/Analizy_neuro/Graphs/DATA/graph_metrics/average_path_length_all_subjects.csv",
    "modularity": "/Users/patrycjascislewska/Analizy_neuro/Graphs/DATA/graph_metrics/modularity_all_subjects.csv",
    "avg_graph_distance": "/Users/patrycjascislewska/Analizy_neuro/Graphs/DATA/graph_metrics/avg_graph_distance_all_subjects.csv",
}
# Semicolon-separated survey table that holds the trait columns.
survey_file = "/Users/patrycjascislewska/Analizy_neuro/Graphs/DATA/SURVEYS/surveys_sleep_circadian_emotional_data2.csv"
# Survey columns used as predictors (z-scored before the regression).
traits = ["AM", "ME", "PSQI"]
# Category order for the `session` column built in melt_metric().
session_order = ["ses-1", "ses-2", "ses-3"]


def autodetect_read(path):
    """Read a delimited text file, letting pandas infer the separator.

    ``sep=None`` together with the Python parsing engine makes
    ``pandas.read_csv`` sniff the delimiter from the file content.
    """
    sniffing_options = {"sep": None, "engine": "python"}
    return pd.read_csv(path, **sniffing_options)

def _find_id_col(cols):
    """Return the first column label that looks like a subject identifier.

    A label matches when its lower-cased string form is one of
    ``subject_id``, ``subjectid``, ``id`` or ``subject``. Returns ``None``
    when no label matches.
    """
    accepted = ("subject_id", "subjectid", "id", "subject")
    matches = (label for label in cols if str(label).lower() in accepted)
    return next(matches, None)

def _zscore_safe(x):
    """Z-score a Series, ignoring NaNs, without dividing by zero.

    Values are coerced to numeric first (unparseable entries become NaN).
    The population standard deviation (``ddof=0``) is used. When it is zero
    or not finite, a Series of zeros on the same index is returned instead.
    """
    values = pd.to_numeric(x, errors="coerce")
    spread = np.nanstd(values, ddof=0)
    if np.isfinite(spread) and spread != 0:
        return (values - np.nanmean(values)) / spread
    # Constant or all-missing input: no variation to standardise.
    return pd.Series(np.zeros(len(values)), index=values.index)

def load_surveys(path):
    """Load the semicolon-separated survey table used for the trait models.

    The subject identifier column (located with ``_find_id_col``) is renamed
    to ``subject_id`` and stored as stripped strings. Every column listed in
    the module-level ``traits`` is coerced to numeric, with unparseable
    entries becoming NaN.

    Raises
    ------
    ValueError
        If no subject identifier column is found.
    """
    surveys = pd.read_csv(path, sep=";")
    id_column = _find_id_col(surveys.columns)
    if id_column is None:
        raise ValueError("Could not find subject id in surveys.")

    surveys = surveys.rename(columns={id_column: "subject_id"})
    surveys["subject_id"] = surveys["subject_id"].astype("string").str.strip()

    numeric_traits = {
        trait: pd.to_numeric(surveys[trait], errors="coerce") for trait in traits
    }
    return surveys.assign(**numeric_traits)

def melt_metric(df, metric_name):
    """Reshape a wide per-subject metric table into long format.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table with a subject identifier column and one column per
        session whose name starts with ``ses-``.
    metric_name : str
        Name given to the value column of the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``subject_id``, ``session`` (ordered categorical following
        the module-level ``session_order``) and ``metric_name`` (numeric).

    Raises
    ------
    ValueError
        If no subject identifier column is found.
    """
    id_column = _find_id_col(df.columns)
    if id_column is None:
        raise ValueError(f"No subject id column in {metric_name} file.")

    wide = df.rename(columns={id_column: "subject_id"}).copy()
    wide["subject_id"] = wide["subject_id"].astype("string").str.strip()

    session_columns = [col for col in wide.columns if str(col).startswith("ses-")]
    tidy = wide.melt(
        id_vars="subject_id",
        value_vars=session_columns,
        var_name="session",
        value_name=metric_name,
    )
    tidy["session"] = pd.Categorical(
        tidy["session"], categories=session_order, ordered=True
    )
    tidy[metric_name] = pd.to_numeric(tidy[metric_name], errors="coerce")
    return tidy

#ANALYSIS: baseline only
# For every graph metric: regress the ses-1 value on the z-scored traits
# (OLS with HC3 robust standard errors) and BH-correct the trait p-values
# within that metric.
surv = load_surveys(survey_file)
rows = []

# Declared up front so that an empty result is still a well-formed table
# (sorting a column-less DataFrame by "metric" would raise KeyError).
result_columns = ["metric", "session", "predictor", "coef", "p_raw",
                  "p_FDR", "significant", "N", "R2"]
# Keep the trait name next to its model term instead of recovering it later
# with str.replace, which would mangle a trait whose name contains "z_".
trait_terms = [(t, f"z_{t}") for t in traits]
# Intercept + one slope per trait, plus at least one residual degree of
# freedom; with fewer rows the HC3 covariance is undefined.
min_obs = len(traits) + 2

for metric_name, mpath in metric_files.items():
    met = autodetect_read(mpath)
    long_df = melt_metric(met, metric_name)
    df = long_df.merge(surv[["subject_id"] + traits], on="subject_id", how="inner")

    d1 = df[df["session"] == "ses-1"].dropna(subset=[metric_name] + traits).copy()
    if len(d1) < min_obs:
        print(f"[{metric_name}] only {len(d1)} complete ses-1 rows "
              f"(need at least {min_obs}); skipping.")
        continue

    # z-score traits within the ses-1 analysis subset
    for t, term in trait_terms:
        d1[term] = _zscore_safe(d1[t])

    # Named `trait_formula` so the module-level `formula` that fit_model()
    # in section 1 reads is not overwritten by running this cell.
    trait_formula = f"{metric_name} ~ " + " + ".join(term for _, term in trait_terms)
    fit = smf.ols(trait_formula, data=d1).fit(cov_type="HC3")

    # collect p-values only for terms present
    present = [(t, term) for t, term in trait_terms if term in fit.params.index]
    pvals = [float(fit.pvalues[term]) for _, term in present]
    rej, p_fdr = (fdrcorrection(pvals, alpha=0.05, method="indep") if pvals else ([], []))

    for (tname, term), sig, padj in zip(present, rej, p_fdr):
        rows.append({
            "metric": metric_name,
            "session": "ses-1",
            "predictor": tname,
            "coef": float(fit.params[term]),
            "p_raw": float(fit.pvalues[term]),
            "p_FDR": float(padj),
            "significant": bool(sig),
            "N": int(fit.nobs),
            "R2": float(fit.rsquared),
        })

res_base = (
    pd.DataFrame(rows, columns=result_columns)
    .sort_values(["metric", "predictor"])
    .reset_index(drop=True)
)

#save
out_csv = "/Users/patrycjascislewska/Analizy_neuro/Graphs/DATA/SURVEYS/all_results_state_traits_paper/global_graph_metrics_ses1_traits.csv"
os.makedirs(os.path.dirname(out_csv), exist_ok=True)
res_base.to_csv(out_csv, index=False)
print("Saved:", out_csv)
print(res_base)


Saved: /Users/patrycjascislewska/Analizy_neuro/Graphs/DATA/SURVEYS/all_results_state_traits_paper/global_graph_metrics_ses1_traits.csv
                 metric session predictor      coef     p_raw     p_FDR  \
0    average_clustering   ses-1        AM -0.007407  0.357775  0.536663   
1    average_clustering   ses-1        ME  0.003466  0.669851  0.669851   
2    average_clustering   ses-1      PSQI -0.015035  0.070021  0.210062   
3   average_path_length   ses-1        AM -0.011461  0.835376  0.835376   
4   average_path_length   ses-1        ME -0.008262  0.823043  0.835376   
5   average_path_length   ses-1      PSQI -0.028680  0.519921  0.835376   
6    avg_graph_distance   ses-1        AM -0.127199  0.354120  0.671778   
7    avg_graph_distance   ses-1        ME -0.034687  0.697944  0.697944   
8    avg_graph_distance   ses-1      PSQI -0.078104  0.447852  0.671778   
9     global_efficiency   ses-1        AM  0.000950  0.796437  0.796437   
10    global_efficiency   ses-1        M

## 3. Traits - which trait predicts the differences / reorganization after sleep deprivation? 

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import statsmodels.formula.api as smf
from statsmodels.stats.multitest import fdrcorrection  # BH FDR


# One CSV of kappa values per nodal measure; analyze_metric() treats every
# column other than the subject identifier as a separate outcome.
kappa_files = {
    "degree":     "/Users/patrycjascislewska/Analizy_neuro/Graphs/DATA/HDI_kappa_results_within_subject/kappas_degree.csv",
    "closeness":  "/Users/patrycjascislewska/Analizy_neuro/Graphs/DATA/HDI_kappa_results_within_subject/kappas_closeness.csv",
    "clustering": "/Users/patrycjascislewska/Analizy_neuro/Graphs/DATA/HDI_kappa_results_within_subject/kappas_clustering.csv",
}
# Semicolon-separated survey table that holds the trait columns.
survey_file = "/Users/patrycjascislewska/Analizy_neuro/Graphs/DATA/SURVEYS/surveys_sleep_circadian_emotional_data2.csv"

# Survey columns used as predictors (z-scored inside analyze_metric()).
psych_cols = ["AM", "ME", "PSQI"]  

def _find_id_col(cols):
    """Return the first column label that looks like a subject identifier.

    A label matches when its lower-cased string form is one of
    ``subject_id``, ``subjectid``, ``id`` or ``subject``; ``None`` is
    returned when nothing matches. Labels are converted with ``str`` first,
    so non-string labels (for example integer column names) are skipped
    instead of raising ``AttributeError``.
    """
    accepted = ("subject_id", "subjectid", "id", "subject")
    for c in cols:
        if str(c).lower() in accepted:
            return c
    return None

def _zscore_safe(x):
    """Standardise a Series to zero mean and unit (population) variance.

    Input is coerced to numeric with invalid entries turned into NaN, and
    NaNs are ignored when computing the mean and standard deviation. A
    Series of zeros with the input's index is returned when the standard
    deviation is zero or not finite.
    """
    numeric = pd.to_numeric(x, errors="coerce")
    scale = np.nanstd(numeric, ddof=0)

    degenerate = (not np.isfinite(scale)) or scale == 0
    if degenerate:
        flat = np.zeros(len(numeric))
        return pd.Series(flat, index=numeric.index)

    centre = np.nanmean(numeric)
    return (numeric - centre) / scale

def analyze_metric(metric_name: str, kappa_path: str, survey_path: str, psych_cols: list) -> pd.DataFrame:
    """Regress every kappa column of one measure on the z-scored traits.

    The kappa table and the survey table are inner-joined on the subject
    identifier. For each kappa column an OLS model with all traits as
    predictors is fitted with HC3 robust standard errors, and the trait
    p-values are BH-corrected within that kappa column. One line per
    predictor is printed.

    Parameters
    ----------
    metric_name : str
        Label stored in the ``Metric`` column and printed in the header.
    kappa_path : str
        CSV with a subject identifier column; every other column is treated
        as an outcome.
    survey_path : str
        Semicolon-separated survey CSV containing ``psych_cols``.
    psych_cols : list
        Names of the survey columns used as predictors.

    Returns
    -------
    pandas.DataFrame
        One row per (kappa column, trait); empty when nothing could be fitted.

    Raises
    ------
    ValueError
        If either file lacks a recognisable subject identifier column.
    """
    kappa_df = pd.read_csv(kappa_path)
    survey_df = pd.read_csv(survey_path, sep=";")

    kid = _find_id_col(kappa_df.columns)
    sid = _find_id_col(survey_df.columns)
    if kid is None or sid is None:
        raise ValueError("Could not find subject ID column in kappa or survey file.")
    kappa_df = kappa_df.rename(columns={kid: "subject_id"})
    survey_df = survey_df.rename(columns={sid: "subject_id"})
    # Compare identifiers as stripped strings in both tables.
    for frame in (kappa_df, survey_df):
        frame["subject_id"] = frame["subject_id"].astype("string").str.strip()

    df = pd.merge(kappa_df, survey_df, on="subject_id", how="inner")

    kappa_cols = [c for c in kappa_df.columns if c != "subject_id"]
    # Keep each trait next to its model term so the trait name never has to
    # be recovered with str.replace (which would mangle names containing "z_").
    trait_terms = [(c, f"z_{c}") for c in psych_cols]
    rhs = " + ".join(term for _, term in trait_terms)
    # Intercept + one slope per trait, plus at least one residual degree of
    # freedom; with fewer rows the HC3 covariance is undefined.
    min_obs = len(psych_cols) + 2

    all_rows = []
    print("\n#############################")
    print(f"### METRIC: {metric_name.upper()} ###")
    print("#############################")

    for k_col in kappa_cols:

        cols = [k_col] + psych_cols
        sub = df[["subject_id"] + cols].copy()
        for c in cols:
            sub[c] = pd.to_numeric(sub[c], errors="coerce")
        sub = sub.dropna(subset=cols)

        print(f"\n=== {k_col} ===")
        if sub.empty:
            print("No data after dropna; skipping.")
            continue
        if len(sub) < min_obs:
            print(f"Only {len(sub)} complete rows (need at least {min_obs}); skipping.")
            continue

        # z-score traits within the complete-case subset of this outcome
        for c, term in trait_terms:
            sub[term] = _zscore_safe(sub[c])

        # OLS with all traits (HC3 robust SEs)
        formula = f"{k_col} ~ {rhs}"
        fit = smf.ols(formula, data=sub).fit(cov_type="HC3")

        p_values_ols, tmp_ols = [], []
        for trait, term in trait_terms:
            if term in fit.params.index:
                tmp_ols.append({
                    "Metric": metric_name, "Kappa_Column": k_col, "Psych": trait,
                    "N": int(fit.nobs), "Test": "OLS",
                    "Effect": float(fit.params[term]),
                    "p": float(fit.pvalues[term]),
                    "R2": float(fit.rsquared)
                })
                p_values_ols.append(float(fit.pvalues[term]))

        if p_values_ols:
            reject_ols, p_fdr_ols = fdrcorrection(p_values_ols, alpha=0.05, method="indep")
        else:
            reject_ols, p_fdr_ols = [], []

        for row, sig, pf in zip(tmp_ols, reject_ols, p_fdr_ols):
            row["p_FDR"] = pf
            row["Sig_FDR"] = bool(sig)
            all_rows.append(row)
            print(f"OLS     | {row['Psych']:8} | N={row['N']:3d} | beta = {row['Effect']: .3f} | p = {row['p']:.4f} | p_FDR = {pf:.4f} | Sig_FDR = {sig} | R2={row['R2']:.3f}")

    return pd.DataFrame(all_rows)

#ALL METRICS
# Imported here because this cell's own import list does not include `os`.
import os

# Run the trait regressions for every kappa file and stack the results.
results = []
for name, path in kappa_files.items():
    res = analyze_metric(name, path, survey_file, psych_cols)
    results.append(res)

combined = pd.concat(results, ignore_index=True) if results else pd.DataFrame()

#save
out_csv = "/Users/patrycjascislewska/Analizy_neuro/Graphs/DATA/SURVEYS/all_results_state_traits_paper/kappa_vs_traits.csv"
# Create the target directory first, as the other sections do, so this cell
# also works when it is run on its own.
os.makedirs(os.path.dirname(out_csv), exist_ok=True)
combined.to_csv(out_csv, index=False)
print(f"\nSaved combined results to: {out_csv}")



#############################
### METRIC: DEGREE ###
#############################

=== κ_Acute_vs_Control ===
OLS     | AM       | N= 28 | beta =  0.019 | p = 0.5736 | p_FDR = 0.7897 | Sig_FDR = False | R2=0.086
OLS     | ME       | N= 28 | beta = -0.043 | p = 0.2903 | p_FDR = 0.7897 | Sig_FDR = False | R2=0.086
OLS     | PSQI     | N= 28 | beta = -0.011 | p = 0.7897 | p_FDR = 0.7897 | Sig_FDR = False | R2=0.086

=== κ_Chronic_vs_Control ===
OLS     | AM       | N= 28 | beta =  0.006 | p = 0.8280 | p_FDR = 0.8280 | Sig_FDR = False | R2=0.152
OLS     | ME       | N= 28 | beta = -0.034 | p = 0.3697 | p_FDR = 0.5546 | Sig_FDR = False | R2=0.152
OLS     | PSQI     | N= 28 | beta = -0.068 | p = 0.0379 | p_FDR = 0.1136 | Sig_FDR = False | R2=0.152

=== κ_Chronic_vs_Acute ===
OLS     | AM       | N= 28 | beta =  0.021 | p = 0.4452 | p_FDR = 0.6677 | Sig_FDR = False | R2=0.189
OLS     | ME       | N= 28 | beta = -0.064 | p = 0.0070 | p_FDR = 0.0211 | Sig_FDR = True | R2=0.189
OLS     | PSQI 