In [15]:
import numpy as np
from pathlib import Path

from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score


def collect_paths_and_labels(dataset_root: str):
    """
    dataset_root/
      classA/*.npy
      classB/*.npy
      ...
    を想定。class名のソート順にラベル index を振る。
    """
    root = Path(dataset_root)
    class_names = sorted([d.name for d in root.iterdir() if d.is_dir()])
    if not class_names:
        raise RuntimeError(f"No class directories under: {root}")

    class_to_idx = {c: i for i, c in enumerate(class_names)}
    paths = []
    labels = []
    for c in class_names:
        for p in sorted((root / c).glob("*.npy")):
            paths.append(str(p))
            labels.append(class_to_idx[c])

    if not paths:
        raise RuntimeError(f"No .npy found under: {root}")

    return paths, np.array(labels, dtype=np.int64), class_names


def extract_features_mean_std(npy_path: str, seq_start: int, seq_end: int):
    """
    超単純特徴：時間平均(48) + 時間std(48) の 96次元
    入力 .npy は (T, n_taxels, 3) を想定
    """
    arr = np.load(npy_path)  # (T, n_taxels, 3)
    if arr.ndim != 3 or arr.shape[-1] != 3:
        raise ValueError(f"Invalid shape {arr.shape} in {npy_path}")

    if seq_end > arr.shape[0]:
        raise ValueError(f"seq_end({seq_end}) > T({arr.shape[0]}) in {npy_path}")

    x = arr[seq_start:seq_end]  # (seq_len, n_taxels, 3)
    x = x.reshape(x.shape[0], -1).astype(np.float32)  # (seq_len, 48)

    mu = x.mean(axis=0)  # (48,)
    sd = x.std(axis=0)   # (48,)
    feat = np.concatenate([mu, sd], axis=0)  # (96,)
    return feat


def build_feature_matrix(paths, seq_start: int, seq_end: int):
    X = np.stack([extract_features_mean_std(p, seq_start, seq_end) for p in paths], axis=0)
    return X


def run_linear_baseline_cv(dataset_root: str, seq_start: int, seq_end: int,
                           n_splits: int = 5, seed: int = 0):
    paths, y, class_names = collect_paths_and_labels(dataset_root)
    print(f"[dataset] root={dataset_root}")
    print(f"[dataset] num_classes={len(class_names)} num_samples={len(paths)}")
    print(f"[slice] seq_start={seq_start} seq_end={seq_end} (len={seq_end-seq_start})")

    # ★ 重要：paths と y を「ペアのまま」先にシャッフル（順序依存を消す）
    rng = np.random.RandomState(seed)
    perm = rng.permutation(len(paths))
    paths = [paths[i] for i in perm]
    y = y[perm]

    X = build_feature_matrix(paths, seq_start, seq_end)
    print(f"[features] X shape={X.shape} (should be [N, 96])")

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    clf = Pipeline([
        ("scaler", StandardScaler()),
        ("lr", LogisticRegression(
            max_iter=5000,
            multi_class="multinomial",
            solver="lbfgs",
            n_jobs=None
        )),
    ])

    dummy = DummyClassifier(strategy="most_frequent")

    accs = []
    dummy_accs = []

    for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y)):
        # ★ 見た目＆学習の順序もシャッフル（分割集合自体は同じ）
        rng_fold = np.random.RandomState(seed + 1000 + fold)
        tr_idx = tr_idx[rng_fold.permutation(len(tr_idx))]
        va_idx = va_idx[rng_fold.permutation(len(va_idx))]

        Xtr, Xva = X[tr_idx], X[va_idx]
        ytr, yva = y[tr_idx], y[va_idx]

        clf.fit(Xtr, ytr)
        pred = clf.predict(Xva)
        acc = accuracy_score(yva, pred)
        accs.append(acc)

        dummy.fit(Xtr, ytr)
        dpred = dummy.predict(Xva)
        dacc = accuracy_score(yva, dpred)
        dummy_accs.append(dacc)

        # ここで print すると、yva が「連続ラベル」になりにくい
        print("xva :", ytr)
        print("yva head:", yva[:30])
        print(f"[fold {fold}] acc={acc*100:.2f}%  dummy={dacc*100:.2f}%")

    print("-" * 60)
    print(f"[CV] mean acc = {np.mean(accs)*100:.2f}%  std = {np.std(accs)*100:.2f}%")
    print(f"[CV] dummy    = {np.mean(dummy_accs)*100:.2f}%")
    return accs



# ==== 実行例 ====
if __name__ == "__main__":
    dataset_root = "./All_materials/"  # あなたのデータ
    seq_start = 400
    seq_end = 1200
    run_linear_baseline_cv(dataset_root, seq_start, seq_end, n_splits=5, seed=0)


[dataset] root=./All_materials/
[dataset] num_classes=25 num_samples=250
[slice] seq_start=400 seq_end=1200 (len=800)
[features] X shape=(250, 96) (should be [N, 96])
xva : [14  1  3 22  2 21 23 10 22 10 18 18  6 20 21  3  0  3 11  8 18  9 21 15
  2 19 15 24 23 11  2 14  9 13 16  4 13  5  1 14 21 14 20  8  9  8  7  0
  3 17 23 19  6 12 11  5 20 14  1 19 18 13  7 15 22 10 20 18  4  5 12 14
 16 16 17 22  7  9 22 11 10 13 24  3 12 19 10 18  4  9  1 12 10  7  9  6
  0  8 22  2 23 16 15 19 12  0 24  6 24 17  8 23 14 20 13 20  5 23  0  3
 16 16  5  1  3 10 15 18 16 12 15 19 14  0 24 16 22 11 17 13  9 17  7  8
 13 21 13  1  5  1  2  2  3 11 12 22 24  4 23  6 23 21 15 17  5  7 18  6
  7  4  0  9 19  6 20 11 15 20  4  1  2  5 17 17  7  8 21 11  6 10  2 21
 19 12  4 24  8  0  4 24]
yva head: [16 22 24 23 10  9  8  5 19 17  0  2  3 14  2 10  7 15 13  1 11  5 22 21
  4  8 24  4 20 15]
[fold 0] acc=100.00%  dummy=4.00%
xva : [ 7  6 24 11  3  1 14 11 21 12 23 20  1 22 10 17  5 24  1 10  9  0 22 22
 



In [9]:
rng = np.random.RandomState(0)
y = rng.permutation(y)

accs = []
for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y_shuf)):
    clf.fit(X[tr_idx], y_shuf[tr_idx])
    pred = clf.predict(X[va_idx])
    acc = accuracy_score(y_shuf[va_idx], pred)
    accs.append(acc)
    print(f"[shuf fold {fold}] acc={acc*100:.2f}%")
print("mean:", np.mean(accs)*100)

NameError: name 'y' is not defined