In [None]:
# Mount Google Drive at /content/drive so later cells can read and write files
# under that path. `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import os
import glob
import numpy as np

PROCESSED_DIR = "/content/drive/MyDrive/sleep-insights/processed"
OUT_PATH = os.path.join(
    PROCESSED_DIR,
    "sleep_edf_all_25_nights_with_ids.npz"
)

# Target number of samples per epoch. Some per-night files carry one extra
# trailing sample (3001), which is trimmed below so all nights can be stacked.
EPOCH_SAMPLES = 3000

# Find all per-night npz files. Anything whose name contains "sleep_edf_all"
# is a previously written combined file (including OUT_PATH) and is skipped.
night_files = sorted(
    f for f in glob.glob(f"{PROCESSED_DIR}/*.npz")
    if "sleep_edf_all" not in os.path.basename(f)
)

print(f"Found {len(night_files)} nightly files")

# Fail early with a clear message instead of letting np.concatenate raise
# "need at least one array to concatenate" further down.
if not night_files:
    raise FileNotFoundError(
        f"No per-night .npz files found in {PROCESSED_DIR}"
    )

# NOTE(review): the recorded run of this cell lists both SC4001.npz and
# SC4001E0.npz. If these are the same recording exported twice, it is counted
# twice under different night ids, which would leak data across CV folds --
# confirm and remove the stray file if so.

X_all = []
y_all = []
night_ids_all = []

for night_idx, f in enumerate(night_files):
    # np.load on an .npz returns a lazily-read archive that holds the file
    # open; the context manager closes it once both arrays are in memory.
    with np.load(f) as data:
        X = data["X"]
        y = data["y"]

    # Fix off-by-one issue if present (3001 -> 3000)
    if X.shape[-1] == EPOCH_SAMPLES + 1:
        X = X[:, :, :EPOCH_SAMPLES]

    # Every epoch needs exactly one label. Raised explicitly rather than via
    # `assert` so the check still runs when assertions are disabled.
    if X.shape[0] != y.shape[0]:
        raise ValueError(f"Mismatch in {f}")

    X_all.append(X)
    y_all.append(y)

    # Assign this night index to all its epochs
    night_ids_all.append(
        np.full(len(y), night_idx, dtype=np.int32)
    )

    print(
        f"{os.path.basename(f)} \u2192 "
        f"epochs: {len(y)} | night_id: {night_idx}"
    )

# Stack everything along the epoch axis; the three arrays stay row-aligned
X_all = np.concatenate(X_all, axis=0)
y_all = np.concatenate(y_all, axis=0)
night_ids_all = np.concatenate(night_ids_all, axis=0)

print("\n\u2705 Final dataset shapes")
print("X:", X_all.shape)
print("y:", y_all.shape)
print("night_ids:", night_ids_all.shape)

print("Unique nights:", np.unique(night_ids_all).shape[0])

# Save
np.savez(
    OUT_PATH,
    X=X_all,
    y=y_all,
    night_ids=night_ids_all
)

print(f"\n\u2705 Saved combined dataset with night IDs:\n{OUT_PATH}")

Found 25 nightly files
SC4001.npz → epochs: 2649 | night_id: 0
SC4001E0.npz → epochs: 153 | night_id: 1
SC4002E0.npz → epochs: 150 | night_id: 2
SC4011E0.npz → epochs: 125 | night_id: 3
SC4012E0.npz → epochs: 170 | night_id: 4
SC4021E0.npz → epochs: 160 | night_id: 5
SC4022E0.npz → epochs: 177 | night_id: 6
SC4031E0.npz → epochs: 118 | night_id: 7
SC4032E0.npz → epochs: 122 | night_id: 8
SC4041E0.npz → epochs: 159 | night_id: 9
SC4042E0.npz → epochs: 173 | night_id: 10
SC4051E0.npz → epochs: 129 | night_id: 11
SC4052E0.npz → epochs: 136 | night_id: 12
SC4061E0.npz → epochs: 77 | night_id: 13
SC4062E0.npz → epochs: 97 | night_id: 14
SC4071E0.npz → epochs: 115 | night_id: 15
SC4072E0.npz → epochs: 178 | night_id: 16
SC4081E0.npz → epochs: 141 | night_id: 17
SC4082E0.npz → epochs: 155 | night_id: 18
SC4091E0.npz → epochs: 137 | night_id: 19
SC4092E0.npz → epochs: 102 | night_id: 20
SC4101E0.npz → epochs: 61 | night_id: 21
SC4102E0.npz → epochs: 116 | night_id: 22
SC4111E0.npz → epochs: 12

In [None]:
#extract the features


import numpy as np
from scipy.signal import welch

fs = 100  # sampling rate in Hz passed to welch; resolvable frequencies go up to fs / 2


def bandpower(epoch, low_freq, high_freq, sample_rate=None):
  """Integrate the Welch power spectral density of `epoch` over a band.

  Args:
    epoch: 1-D signal (one channel of one epoch).
    low_freq: lower band edge in Hz (inclusive).
    high_freq: upper band edge in Hz (inclusive).
    sample_rate: sampling rate in Hz; defaults to the module-level `fs`.

  Returns:
    Trapezoidal integral of the PSD over [low_freq, high_freq]. Both edges
    are included so the integral spans the full band; an empty band yields 0.
  """
  rate = fs if sample_rate is None else sample_rate
  # 4-second segments give a 0.25 Hz frequency resolution.
  freq, psd = welch(epoch, fs=rate, nperseg=int(4 * rate))
  mask = (freq >= low_freq) & (freq <= high_freq)
  # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid; look the
  # name up lazily so older NumPy still works and newer NumPy does not warn.
  integrate = np.trapezoid if hasattr(np, "trapezoid") else np.trapz
  return integrate(psd[mask], freq[mask])


def extract_features(X, channel=0, eps=1e-6):
  """Compute spectral band-power features for every epoch in `X`.

  Args:
    X: iterable of epochs; each epoch is indexed as `epoch[channel]` to get
      the 1-D signal that is analysed.
    channel: index of the channel to use (default 0, the original behaviour).
    eps: small constant added to the theta-ratio denominator to avoid
      division by zero (default 1e-6, the original value).

  Returns:
    Array of shape (n_epochs, 7) with columns
    [delta, theta, alpha, sigma, beta, delta / total, theta / (alpha + beta)].
    An empty `X` yields an array of shape (0, 7).
  """
  n_features = 7  # five band powers + two ratios
  features = []
  for epoch in X:
    sig = epoch[channel]
    delta = bandpower(sig, 0.5, 4)
    theta = bandpower(sig, 4, 8)
    alpha = bandpower(sig, 8, 12)
    sigma = bandpower(sig, 12, 15)
    beta = bandpower(sig, 15, 30)

    total = delta + theta + alpha + beta + sigma
    # A flat (all-zero) epoch has zero total power; report a ratio of 0
    # instead of 0/0 = NaN. `!= 0` (not `> 0`) lets a NaN total still
    # propagate as NaN so upstream data problems stay visible.
    delta_ratio = delta / total if total != 0 else 0.0

    # NOTE(review): `eps` is an absolute constant. If the signals are stored
    # in volts rather than microvolts, band powers may be far smaller than
    # 1e-6 and this ratio would be dominated by eps -- confirm the units.
    theta_ratio = theta / (alpha + beta + eps)

    feats = [
        delta, theta, alpha, sigma, beta,
        delta_ratio,  # see if delta is dominating
        theta_ratio   # see if theta is dominating
    ]
    features.append(feats)
  # reshape keeps the result 2-D even when X is empty
  return np.array(features).reshape(-1, n_features)

# Build the feature matrix for the combined dataset: one row per epoch in X_all.
X_feat = extract_features(X_all)

# Quick shape check: (number of epochs, number of features).
print(X_feat.shape)







  return np.trapz(psd[mask], freq[mask])


(5849, 7)


In [None]:
# Sanity checks on the feature matrix before any modelling.

# Features and labels must stay row-aligned (one label per epoch).
rows_match = X_feat.shape[0] == y_all.shape[0]
print("X_feat rows == y rows?", rows_match)

# Look for non-finite feature values.
nan_present = np.isnan(X_feat).any()
inf_present = np.isinf(X_feat).any()
print("Any NaNs?", nan_present)
print("Any infs?", inf_present)

# Class balance: (distinct labels, number of epochs per label).
label_counts = np.unique(y_all, return_counts=True)
print(label_counts)


X_feat rows == y rows? True
Any NaNs? False
Any infs? False
(array([0, 1, 2, 3, 4]), array([2307,  724, 1272, 1144,  402]))


In [None]:
from sklearn.model_selection import GroupKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import classification_report, balanced_accuracy_score, f1_score, confusion_matrix
import numpy as np

# Standardise each feature, then classify with LDA. Because the scaler lives
# inside the pipeline it is re-fitted on the training rows of every fold.
clf = make_pipeline(
    StandardScaler(),
    LinearDiscriminantAnalysis()
)

# 5 folds. n_splits is the number of folds, not the number of nights per fold:
# all epochs sharing a night id (group) are kept together in the same fold, so
# no night is split between train and test.
# NOTE(review): grouping is by night, not by subject. If several nights come
# from the same person (the file names look like subject + night number),
# the same subject can appear in both train and test -- confirm, and group by
# subject id if so.
gkf = GroupKFold(n_splits=5)

# Per-fold scores, summarised after the loop.
bal_accs, macro_f1s = [], []

# Folds are numbered from 1 for display.
for fold, (train, test) in enumerate(gkf.split(X_feat, y_all, groups=night_ids_all),1):
  # fit() re-trains the whole pipeline from scratch on this fold's training rows
  clf.fit(X_feat[train], y_all[train])
  pred = clf.predict(X_feat[test])

  # Both metrics weight every class equally, regardless of its support.
  bal = balanced_accuracy_score(y_all[test], pred)
  mf1 = f1_score(y_all[test], pred, average='macro')

  bal_accs.append(bal)
  macro_f1s.append(mf1)

  print(f"Fold {fold}:")
  print("balanced acc:", round(bal, 3), "| Macro F1: ", round(mf1, 3))
  print(classification_report(y_all[test], pred, digits=3))


# NOTE(review): in the recorded output Fold 1 has 2649 test epochs, the same
# count as the single file SC4001.npz, so that fold appears to be one outlier
# recording -- check that file before trusting the averages below.
print("\nOverall:")
print("Balanced Acc mean+- std", np.mean(bal_accs), "+-", np.std(bal_accs))
print("Macro F1 mean+- std", np.mean(macro_f1s), "+-", np.std(macro_f1s))




Fold 1:
balanced acc: 0.516 | Macro F1:  0.248
              precision    recall  f1-score   support

           0      1.000     0.041     0.079      1996
           1      0.172     0.534     0.261        58
           2      0.145     0.948     0.252       250
           3      0.282     0.886     0.428       220
           4      0.328     0.168     0.222       125

    accuracy                          0.214      2649
   macro avg      0.386     0.516     0.248      2649
weighted avg      0.810     0.214     0.135      2649

Fold 2:
balanced acc: 0.532 | Macro F1:  0.521
              precision    recall  f1-score   support

           0      0.250     0.493     0.332        73
           1      0.569     0.422     0.484       166
           2      0.575     0.644     0.607       233
           3      0.877     0.702     0.780       275
           4      0.406     0.400     0.403        65

    accuracy                          0.585       812
   macro avg      0.535     0.532    