<a href="https://colab.research.google.com/github/SynapTechUMNG/MicroestadosEEG01/blob/main/MS_AnalisysPipeline_03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Descargar un dataset BIDS de OpenNeuro a Google Drive es un flujo de trabajo muy habitual. Este notebook (pensado para Colab) contiene las celdas necesarias para:

Montar Google Drive

Descargar el dataset de OpenNeuro dentro de Drive (usando datalad o openneuro-py)

Verificar que es BIDS EEG/MEG válido

Cargar con MNE y correr un pipeline de microestados (defaults razonables) con pycrostates

# **0) Celda: Montar Drive**

In [1]:
# Mount Google Drive at /content/drive; the CONFIG cell below stores the
# dataset under this mount point (DATA_ROOT).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **1) Celda CONFIG (EDITA AQUÍ)**

In [3]:
# ====== EDIT HERE ======
DATA_ROOT = "/content/drive/MyDrive/EEG_BIDS"  # folder on Drive that holds the datasets
DS_ID = "ds007006"                             # OpenNeuro dataset ID, e.g. "ds004000"
DOWNLOAD_METHOD = "datalad"                    # "datalad" (recommended) or "openneuro"

# Microstate defaults
K_MODE = "fixed"                               # "fixed" or "sweep_4_8"
K_FIXED = 4                                    # number of maps when K_MODE="fixed"
GFP_MIN_PEAK_DISTANCE_SEC = 0.01               # 10 ms
BPASS = (1., 40.)                              # band-pass edges in Hz
RESAMPLE = 250                                 # Hz (None = do not resample)
CROP_SEC = None                                # e.g. 120 for quick tests; None = whole recording

# Smoothing / segmentation (predict)
SMOOTH_FACTOR = 10                             # 0 = no smoothing (pycrostates predict)
HALF_WINDOW_SIZE = 1
MIN_SEGMENT_LENGTH_SEC = 0.03                  # 30 ms
# =======================

import os
import pathlib

# From here on DATA_ROOT is a Path; the dataset lives in a sub-folder named
# after its OpenNeuro ID.
DATA_ROOT = pathlib.Path(DATA_ROOT)
DATA_ROOT.mkdir(parents=True, exist_ok=True)
DATASET_PATH = DATA_ROOT.joinpath(DS_ID)
print("DATASET_PATH:", DATASET_PATH)

DATASET_PATH: /content/drive/MyDrive/EEG_BIDS/ds007006


# **2) Descargar desde OpenNeuro a Drive (elige una)**
## **Opción A** (recomendada): datalad (rápida, eficiente; baja solo lo que pides)

In [4]:
!apt-get -qq update
!apt-get -qq install -y git-annex
!pip -q install datalad

if not DATASET_PATH.exists():
    !datalad install -s https://github.com/OpenNeuroDatasets/ds007006.git {DATASET_PATH}

# Trae metadatos BIDS básicos
!datalad -C {DATASET_PATH} get dataset_description.json participants.tsv 2>/dev/null || true

# Trae EEG (ajusta patrones si lo necesitas)
!datalad -C {DATASET_PATH} get -n
!datalad -C {DATASET_PATH} get "sub-*/**/*_eeg.*" "sub-*/**/*_channels.tsv" "sub-*/**/*_events.tsv" 2>/dev/null || true

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Selecting previously unselected package netbase.
(Reading database ... 117540 files and directories currently installed.)
Preparing to unpack .../0-netbase_6.3_all.deb ...
Unpacking netbase (6.3) ...
Selecting previously unselected package libc-ares2:amd64.
Preparing to unpack .../1-libc-ares2_1.18.1-1ubuntu0.22.04.3_amd64.deb ...
Unpacking libc-ares2:amd64 (1.18.1-1ubuntu0.22.04.3) ...
Selecting previously unselected package libaria2-0:amd64.
Preparing to unpack .../2-libaria2-0_1.36.0-1_amd64.deb ...
Unpacking libaria2-0:amd64 (1.36.0-1) ...
Selecting previously unselected package aria2.
Preparing to unpack .../3-aria2_1.36.0-1_amd64.deb ...
Unpacking aria2 (1.36.0-1) ...
Selecting previously unselected package git-annex.
Preparing to unpack .../4-git-annex_8.20210223-2ubuntu2_amd64.deb ...
Unpacki

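### Tip: `datalad get` recibe rutas concretas; los patrones (p. ej. `sub-*/**/*_eeg.*`) los debe expandir el shell o Python antes de llamarlo. Si tu dataset es muy grande, es mejor bajar solo resting-state o solo ciertos sujetos.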

## **Opción B:** openneuro-py (descarga “clásica”)

In [None]:
!pip -q install openneuro-py
!openneuro download --dataset {DS_ID} --target_dir {DATASET_PATH}

# **3) Celda “CHECK PATHS” (verifica BIDS y qué archivos hay)**

In [6]:
import glob, json
import os

# Sanity check: the dataset description must exist and at least one EEG file
# must be readable.
description_file = DATASET_PATH / "dataset_description.json"
assert description_file.exists(), "Falta dataset_description.json (¿descarga incompleta?)."

# Sorted so the listing is stable between runs.
eeg_files = sorted(glob.glob(str(DATASET_PATH / "sub-*/**/*_eeg.*"), recursive=True))
print("EEG files encontrados:", len(eeg_files))
for f in eeg_files[:20]:
    print(" -", f)

assert len(eeg_files) > 0, "No veo archivos *_eeg.* dentro de sub-*/. Revisa si el dataset es iEEG/MEG o si falta descargar."

# A DataLad clone leaves not-yet-downloaded files as dangling symlinks.
# glob still lists them, but os.path.exists() is False, so they cannot be read.
eeg_files_missing = [f for f in eeg_files if not os.path.exists(f)]
if eeg_files_missing:
    print(f"Aviso: {len(eeg_files_missing)} archivos *_eeg.* aún sin contenido descargado, p.ej.:")
    for f in eeg_files_missing[:5]:
        print(" -", f)
assert len(eeg_files_missing) < len(eeg_files), (
    "Ningún archivo *_eeg.* tiene contenido descargado (solo enlaces de git-annex). "
    "Ejecuta 'datalad get' sobre los archivos que necesites."
)

AssertionError: Falta dataset_description.json (¿descarga incompleta?).

# **4) Cargar BIDS con MNE + preprocesado mínimo (EEG)**

In [None]:
!pip -q install mne mne-bids pycrostates

import mne
from mne_bids import BIDSPath, get_entity_vals, read_raw_bids

mne.set_log_level("WARNING")

subjects = sorted(get_entity_vals(DATASET_PATH, "subject"))
assert subjects, "No detecto sujetos en el BIDS."
sub = subjects[0]

tasks = sorted(get_entity_vals(DATASET_PATH, "task", subject=sub))
# Heurística: si existe algo tipo rest, lo prioriza
task = None
if tasks:
    rest_like = [t for t in tasks if "rest" in t.lower()]
    task = rest_like[0] if rest_like else tasks[0]

runs = sorted(get_entity_vals(DATASET_PATH, "run", subject=sub, task=task)) if task else []
run = runs[0] if runs else None

print("Usando:", dict(subject=sub, task=task, run=run))

bp = BIDSPath(root=DATASET_PATH, subject=sub, task=task, run=run)
raw = read_raw_bids(bp, verbose=False)  # también intenta poblar bads/anotaciones desde sidecars si existen :contentReference[oaicite:2]{index=2}
raw.load_data()

# Nos quedamos con EEG (y solo EEG)
raw.pick("eeg")

# (Opcional) recorte para pruebas
if CROP_SEC is not None:
    raw.crop(tmax=float(CROP_SEC))

# Referencia promedio (muy estándar para microestados)
raw.set_eeg_reference("average", projection=False)

# Filtro típico
raw.filter(BPASS[0], BPASS[1], fir_design="firwin")

# (Opcional) resample para acelerar
if RESAMPLE is not None:
    raw.resample(RESAMPLE)

# Si no hay posiciones de electrodos (para topomaps), intento estándar 10-20 SOLO para visualización
mont = raw.get_montage()
ch_pos = None if mont is None else mont.get_positions().get("ch_pos", {})
if (mont is None) or (ch_pos is None) or (len(ch_pos) == 0):
    try:
        raw.set_montage("standard_1020", match_case=False, on_missing="ignore")
        print("Montage: standard_1020 (fallback para topomaps)")
    except Exception as e:
        print("No pude setear montage (topomaps pueden fallar):", repr(e))

print(raw)

# **5) Microestados (picos GFP → ModKMeans → backfitting)**

## **⚠️ Ojo:** en pycrostates, `extract_gfp_peaks` recibe `min_peak_distance` en muestras (no en segundos), así que aquí se convierte desde segundos usando la frecuencia de muestreo.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pycrostates.preprocessing import extract_gfp_peaks
from pycrostates.cluster import ModKMeans

# Both distances are passed in samples, so convert from seconds using the
# current (possibly resampled) sampling rate.
sfreq = raw.info["sfreq"]
min_peak_samp = max(1, int(round(GFP_MIN_PEAK_DISTANCE_SEC * sfreq)))
min_seg_samp  = max(0, int(round(MIN_SEGMENT_LENGTH_SEC * sfreq)))

# 1) GFP peaks; the result is passed directly to fit()
peaks = extract_gfp_peaks(raw, picks="eeg", min_peak_distance=min_peak_samp, reject_by_annotation=True)
print("GFP peaks extraídos:", peaks.get_data().shape)


def fit_model(k: int):
    """Fit a ModKMeans with ``k`` cluster centers on the GFP peaks."""
    model = ModKMeans(n_clusters=k, random_state=42, n_init=20, max_iter=300, tol=1e-6)
    model.fit(peaks)
    return model


def segment_raw(model):
    """Backfit a fitted model on ``raw`` and return the segmentation."""
    return model.predict(
        raw,
        picks="eeg",
        factor=SMOOTH_FACTOR,
        half_window_size=HALF_WINDOW_SIZE,
        min_segment_length=min_seg_samp,
        reject_edges=True,
        reject_by_annotation=True,
    )


def fit_and_segment(k: int):
    """Fit ``k`` microstate maps and segment ``raw``.

    Returns
    -------
    tuple
        ``(cluster, seg)``: the fitted ModKMeans and the segmentation of ``raw``.
    """
    model = fit_model(k)
    return model, segment_raw(model)


if K_MODE == "fixed":
    cluster, seg = fit_and_segment(K_FIXED)
    best_k = K_FIXED
elif K_MODE == "sweep_4_8":
    # Simple sweep k=4..8, keeping the k with the highest clustering GEV.
    # Only the winning model is backfitted; segmenting every candidate just to
    # discard it is wasted work.
    # NOTE(review): GEV usually grows with k, so this rule tends to select the
    # largest k — consider a clustering-quality score instead; confirm with the
    # analysis owner.
    results = []
    models = {}
    for k in range(4, 9):
        models[k] = fit_model(k)
        gev = getattr(models[k], "GEV_", np.nan)
        results.append({"k": k, "GEV_cluster": float(gev)})
    res_df = pd.DataFrame(results).sort_values("GEV_cluster", ascending=False)
    display(res_df)
    best_k = int(res_df.iloc[0]["k"])
    cluster = models[best_k]
    seg = segment_raw(cluster)
else:
    # A typo in K_MODE used to fall silently into the sweep branch.
    raise ValueError(f"K_MODE debe ser 'fixed' o 'sweep_4_8'; recibido: {K_MODE!r}")

print("k elegido:", best_k)

# **6) Figuras + parámetros (métricas) y export a Drive**

## En RawSegmentation, las métricas típicas salen con compute_parameters() (duración, ocurrencia, cobertura, etc.)

In [None]:
# Output folders (the figures live inside the derivatives folder)
out_dir = DATASET_PATH / "derivatives" / "microstates_pycrostates"
fig_dir = out_dir / "figures"
fig_dir.mkdir(parents=True, exist_ok=True)

# File-name tag built only from the entities that exist, so a dataset without
# task/run does not produce names such as "task-None_run-None".
tag = "_".join(
    f"{key}-{value}"
    for key, value in (("sub", sub), ("task", task), ("run", run), ("k", best_k))
    if value is not None
)

# Microstate maps. Extra keyword arguments of cluster.plot() go to
# mne.viz.plot_topomap, so `contours` is passed directly (there is no
# `topomap_args` parameter). Saving through the returned Figure avoids writing
# whatever pyplot considers the current figure after an inline show.
maps_fig = cluster.plot(contours=0)
maps_fig.tight_layout()
maps_fig.savefig(fig_dir / f"microstates_maps_{tag}.png", dpi=150)
plt.show()

# First seconds of the segmentation
seg_fig = seg.plot(tmin=0, tmax=min(10, raw.times[-1]))
seg_fig.tight_layout()
seg_fig.savefig(fig_dir / f"microstates_seg_{tag}.png", dpi=150)
plt.show()

# Parameters (dict)
params = seg.compute_parameters(norm_gfp=True, return_dist=False)

# Transitions (optional)
tm = seg.compute_transition_matrix(stat="probability")

# Save parameters
df = pd.DataFrame([params])
df.insert(0, "subject", sub)
df.insert(1, "task", task)
df.insert(2, "run", run)
df.insert(3, "k", best_k)
csv_path = out_dir / f"microstates_params_{tag}.csv"
df.to_csv(csv_path, index=False)
print("Guardado:", csv_path)

# Save transition matrix
tm_path = out_dir / f"microstates_transition_{tag}.csv"
pd.DataFrame(tm).to_csv(tm_path, index=False)
print("Guardado:", tm_path)

# Save model (centers) for reproducibility. The channel names come from the
# fitted model, so they match the rows of cluster_centers_ even when bad
# channels were excluded from the fit.
npz_path = out_dir / f"microstates_model_{tag}.npz"
np.savez(npz_path, cluster_centers=cluster.cluster_centers_, ch_names=cluster.info["ch_names"])
print("Guardado:", npz_path)

# **7) (Opcional) Correr “por todos los sujetos” y consolidar CSV**

## Si quieres, pega esta celda y te genera un CSV “long” con parámetros de todos los sub-* (con la misma lógica de task/run por sujeto, tomando el primero/rest-like).

In [None]:
def summarize_subject(subject_id):
    """Run the microstate pipeline on one subject and return a CSV row.

    Everything happens in local variables, so the notebook-level ``sub``,
    ``raw``, ``cluster`` and ``seg`` from the single-subject cells stay
    untouched. No montage is set here because this cell draws no topomaps.

    Parameters
    ----------
    subject_id : str
        Subject label without the ``sub-`` prefix.

    Returns
    -------
    dict
        ``subject``, ``session``, ``task``, ``run``, ``k`` plus every entry
        returned by ``compute_parameters``.

    Raises
    ------
    FileNotFoundError
        If the subject has no ``*_eeg.*`` file.
    ValueError
        If K_MODE is unknown or no model yields a finite GEV.
    """
    # get_entity_vals cannot filter by subject/task; BIDSPath.match() lists the
    # subject's EEG files and also exposes the session entity.
    eeg_paths = BIDSPath(
        root=DATASET_PATH, subject=subject_id, datatype="eeg", suffix="eeg"
    ).match()
    if not eeg_paths:
        raise FileNotFoundError(f"sub-{subject_id}: no encuentro archivos *_eeg.*")

    def preference(path):
        task_name = path.task or ""
        return (
            "rest" not in task_name.lower(),  # rest-like tasks first
            task_name,
            path.session or "",
            str(path.run or ""),
            path.acquisition or "",
        )

    bids_path = min(eeg_paths, key=preference).copy().update(extension=None)

    recording = read_raw_bids(bids_path, verbose=False)
    recording.load_data()
    recording.pick("eeg")

    # Same optional crop as the single-subject cell, clamped to the recording length.
    if CROP_SEC is not None:
        recording.crop(tmax=min(float(CROP_SEC), recording.times[-1]))

    recording.set_eeg_reference("average", projection=False)
    recording.filter(BPASS[0], BPASS[1], fir_design="firwin")
    if RESAMPLE is not None:
        recording.resample(RESAMPLE)

    # Both distances are passed in samples.
    rate = recording.info["sfreq"]
    peak_distance = max(1, int(round(GFP_MIN_PEAK_DISTANCE_SEC * rate)))
    segment_length = max(0, int(round(MIN_SEGMENT_LENGTH_SEC * rate)))

    gfp_peaks = extract_gfp_peaks(
        recording, picks="eeg", min_peak_distance=peak_distance, reject_by_annotation=True
    )

    # Fixed k, or a quick sweep keeping the highest GEV.
    if K_MODE == "fixed":
        chosen_k = K_FIXED
        model = ModKMeans(n_clusters=chosen_k, random_state=42, n_init=20, max_iter=300, tol=1e-6)
        model.fit(gfp_peaks)
    elif K_MODE == "sweep_4_8":
        best_gev, chosen_k, model = -np.inf, None, None
        for k in range(4, 9):
            candidate = ModKMeans(n_clusters=k, random_state=42, n_init=20, max_iter=300, tol=1e-6)
            candidate.fit(gfp_peaks)
            gev = float(getattr(candidate, "GEV_", np.nan))
            if gev > best_gev:
                best_gev, chosen_k, model = gev, k, candidate
        if model is None:
            raise ValueError(f"sub-{subject_id}: ningún modelo produjo un GEV válido")
    else:
        raise ValueError(f"K_MODE debe ser 'fixed' o 'sweep_4_8'; recibido: {K_MODE!r}")

    segmentation = model.predict(
        recording, picks="eeg",
        factor=SMOOTH_FACTOR, half_window_size=HALF_WINDOW_SIZE,
        min_segment_length=segment_length,
        reject_edges=True, reject_by_annotation=True
    )
    parameters = segmentation.compute_parameters(norm_gfp=True, return_dist=False)

    return {
        "subject": subject_id,
        "session": bids_path.session,
        "task": bids_path.task,
        "run": bids_path.run,
        "k": chosen_k,
        **parameters,
    }


all_rows = []
failed_subjects = []

for subject_id in subjects:
    # One unreadable subject (e.g. content not downloaded) must not discard the
    # rows already computed; failures are reported, not hidden.
    try:
        all_rows.append(summarize_subject(subject_id))
    except Exception as err:
        failed_subjects.append((subject_id, repr(err)))
        print(f"sub-{subject_id}: omitido ->", repr(err))

assert all_rows, f"Ningún sujeto pudo procesarse. Errores: {failed_subjects}"
if failed_subjects:
    print(f"Sujetos omitidos: {len(failed_subjects)} de {len(subjects)}")

group_df = pd.DataFrame(all_rows)
group_dir = DATASET_PATH / "derivatives" / "microstates_pycrostates"
group_dir.mkdir(parents=True, exist_ok=True)  # may not exist if the export cell was skipped
group_csv = (group_dir / "microstates_params_ALL.csv")
group_df.to_csv(group_csv, index=False)
print("Guardado:", group_csv)
group_df.head()