In [25]:
import numpy as np
from pathlib import Path

from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score


def collect_paths_and_labels(dataset_root: str):
    """
    dataset_root/
      classA/*.npy
      classB/*.npy
      ...
    を想定。class名のソート順にラベル index を振る。
    """
    root = Path(dataset_root)
    class_names = sorted([d.name for d in root.iterdir() if d.is_dir()])
    if not class_names:
        raise RuntimeError(f"No class directories under: {root}")

    class_to_idx = {c: i for i, c in enumerate(class_names)}
    paths = []
    labels = []
    for c in class_names:
        for p in sorted((root / c).glob("*.npy")):
            paths.append(str(p))
            labels.append(class_to_idx[c])

    if not paths:
        raise RuntimeError(f"No .npy found under: {root}")

    return paths, np.array(labels, dtype=np.int64), class_names


import numpy as np

def extract_features_mean_std(
    npy_path: str,
    seq_start: int,
    seq_end: int,
    taxel_indices=None,   # 例: [0,1,2,3,8,9,10,11]
    axis_indices=None,    # 例: [0,1,2] or [2] (Zだけ)
    channel_mask=None,    # (n_taxels,3) bool を直接指定したい場合（任意）
):
    """
    超単純特徴：時間平均 + 時間std
    入力 .npy は (T, n_taxels, 3) を想定

    - taxel_indices: 使うセル番号（0..n_taxels-1）
    - axis_indices : 使う軸番号（0,1,2）= (X,Y,Z) 想定
    - channel_mask : taxel×axis をより細かく指定したいときの bool マスク
                     shape=(n_taxels,3)
    """
    arr = np.load(npy_path)  # (T, n_taxels, 3)
    if arr.ndim != 3 or arr.shape[-1] != 3:
        raise ValueError(f"Invalid shape {arr.shape} in {npy_path}")

    T, n_taxels, n_axes = arr.shape
    if seq_end > T:
        raise ValueError(f"seq_end({seq_end}) > T({T}) in {npy_path}")

    x = arr[seq_start:seq_end].astype(np.float32)  # (L, n_taxels, 3)
    L = x.shape[0]

    # ---- チャネル選択 ----
    if channel_mask is not None:
        m = np.asarray(channel_mask, dtype=bool)
        if m.shape != (n_taxels, 3):
            raise ValueError(f"channel_mask shape must be {(n_taxels,3)} but got {m.shape}")
        x_flat = x.reshape(L, -1)                 # (L, n_taxels*3)
        x_flat = x_flat[:, m.reshape(-1)]         # (L, selected_channels)
    else:
        if taxel_indices is None:
            taxel_indices = np.arange(n_taxels)
        else:
            taxel_indices = np.asarray(taxel_indices, dtype=int)
            if taxel_indices.min() < 0 or taxel_indices.max() >= n_taxels:
                raise ValueError(f"taxel_indices out of range 0..{n_taxels-1}: {taxel_indices}")

        if axis_indices is None:
            axis_indices = np.arange(3)
        else:
            axis_indices = np.asarray(axis_indices, dtype=int)
            if axis_indices.min() < 0 or axis_indices.max() >= 3:
                raise ValueError(f"axis_indices out of range 0..2: {axis_indices}")

        x_sel = x[:, taxel_indices, :][:, :, axis_indices]  # (L, k_taxels, k_axes)
        x_flat = x_sel.reshape(L, -1)                       # (L, k_taxels*k_axes)

    # ---- 特徴（mean + std）----
    mu = x_flat.mean(axis=0)
    sd = x_flat.std(axis=0)
    feat = np.concatenate([mu, sd], axis=0)  # (2*channels,)
    return feat



def build_feature_matrix(paths, seq_start: int, seq_end: int,
                         taxel_indices=None, axis_indices=None, channel_mask=None):
    X = np.stack([
        extract_features_mean_std(p, seq_start, seq_end,
                                  taxel_indices=taxel_indices,
                                  axis_indices=axis_indices,
                                  channel_mask=channel_mask)
        for p in paths
    ], axis=0)
    return X


def run_linear_baseline_cv(dataset_root: str, seq_start: int, seq_end: int,
                           n_splits: int = 5, seed: int = 0,
                           taxel_indices=None, axis_indices=None, channel_mask=None):
    paths, y, class_names = collect_paths_and_labels(dataset_root)

    rng = np.random.RandomState(seed)
    perm = rng.permutation(len(paths))
    paths = [paths[i] for i in perm]
    y = y[perm]

    X = build_feature_matrix(paths, seq_start, seq_end,
                             taxel_indices=taxel_indices,
                             axis_indices=axis_indices,
                             channel_mask=channel_mask)

    print(f"[features] X shape={X.shape} (N, 2*selected_channels)")

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    clf = Pipeline([
        ("scaler", StandardScaler()),
        ("lr", LogisticRegression(
            max_iter=5000,
            multi_class="multinomial",
            solver="lbfgs",
            n_jobs=None
        )),
    ])

    dummy = DummyClassifier(strategy="most_frequent")

    accs = []
    dummy_accs = []

    for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y)):
        # ★ 見た目＆学習の順序もシャッフル（分割集合自体は同じ）
        rng_fold = np.random.RandomState(seed + 1000 + fold)
        tr_idx = tr_idx[rng_fold.permutation(len(tr_idx))]
        va_idx = va_idx[rng_fold.permutation(len(va_idx))]

        Xtr, Xva = X[tr_idx], X[va_idx]
        ytr, yva = y[tr_idx], y[va_idx]

        clf.fit(Xtr, ytr)
        pred = clf.predict(Xva)
        acc = accuracy_score(yva, pred)
        accs.append(acc)

        dummy.fit(Xtr, ytr)
        dpred = dummy.predict(Xva)
        dacc = accuracy_score(yva, dpred)
        dummy_accs.append(dacc)

        # ここで print すると、yva が「連続ラベル」になりにくい
        print(f"[fold {fold}] acc={acc*100:.2f}%  dummy={dacc*100:.2f}%")

    print("-" * 60)
    print(f"[CV] mean acc = {np.mean(accs)*100:.2f}%  std = {np.std(accs)*100:.2f}%")
    print(f"[CV] dummy    = {np.mean(dummy_accs)*100:.2f}%")
    return accs



# ==== 実行例 ====
if __name__ == "__main__":
    dataset_root = "./All_materials/"  # あなたのデータ
    seq_start = 400
    seq_end = 1200
    taxels = [0]
    run_linear_baseline_cv("./normalized_dataset/20251215_161803/", 400, 1200, n_splits=5, seed=0,
                       taxel_indices=taxels, axis_indices=[0])


[features] X shape=(250, 2) (N, 2*selected_channels)
[fold 0] acc=68.00%  dummy=4.00%
[fold 1] acc=60.00%  dummy=4.00%
[fold 2] acc=66.00%  dummy=4.00%
[fold 3] acc=58.00%  dummy=4.00%
[fold 4] acc=72.00%  dummy=4.00%
------------------------------------------------------------
[CV] mean acc = 64.80%  std = 5.15%
[CV] dummy    = 4.00%




In [9]:
rng = np.random.RandomState(0)
y = rng.permutation(y)

accs = []
for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y_shuf)):
    clf.fit(X[tr_idx], y_shuf[tr_idx])
    pred = clf.predict(X[va_idx])
    acc = accuracy_score(y_shuf[va_idx], pred)
    accs.append(acc)
    print(f"[shuf fold {fold}] acc={acc*100:.2f}%")
print("mean:", np.mean(accs)*100)

NameError: name 'y' is not defined