In [2]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as ds
from scipy.signal import butter, filtfilt, stft
from pathlib import Path

#### MERGE EXCEL SHEETS INTO ONE PARQUET FILE (ONE SESSION ID PER SHEET)

In [3]:
def combine_excel_sheets_to_parquet_eeg(excel_file_path, output_parquet_path):
    """Merge every sheet of an Excel workbook into a single Parquet file.

    The header row of the first sheet names the columns. Every sheet is
    written with those column names as float32, plus an int32 ``session_id``
    column holding the zero-based index of the sheet the row came from.

    Sheets after the first are read without a header row. Any row that holds
    a non-numeric or missing value (including a stray header row) is coerced
    to NaN and dropped. A sheet whose column count differs from the first
    sheet is skipped with a message.

    Parameters
    ----------
    excel_file_path : str or path-like
        Workbook to read (opened with the ``openpyxl`` engine).
    output_parquet_path : str or path-like
        Destination Parquet file (snappy-compressed); overwritten if present.

    Returns
    -------
    None
        Progress is reported with ``print``.
    """
    # Open the workbook once and parse every sheet from the same handle;
    # calling pd.read_excel(path, ...) per sheet would re-open the file each time.
    with pd.ExcelFile(excel_file_path, engine="openpyxl") as xls:
        sheet_names = xls.sheet_names

        # The first sheet supplies the column names and is reused below as
        # session 0, so it is only parsed once.
        df_ref = xls.parse(sheet_names[0], header=0)
        column_names = df_ref.columns.tolist()

        # ---- DEFINE FIXED SCHEMA (CRITICAL) ----
        # Every table written must match this schema exactly.
        schema = pa.schema(
            [pa.field(col, pa.float32()) for col in column_names]
            + [pa.field("session_id", pa.int32())]
        )
        # --------------------------------------

        # The context manager closes the writer (and finalises the file)
        # even if a sheet raises part-way through.
        with pq.ParquetWriter(
            output_parquet_path,
            schema,
            compression="snappy"
        ) as parquet_writer:
            for sheet_idx, sheet_name in enumerate(sheet_names):
                print(f"Processing sheet {sheet_idx}: {sheet_name}")

                if sheet_idx == 0:
                    df = df_ref
                else:
                    df = xls.parse(sheet_name, header=None)

                if df.shape[1] != len(column_names):
                    print(f"Skipping {sheet_name}: column mismatch")
                    continue

                # Force numeric EEG, drop incomplete rows, enforce dtypes.
                # The chain builds a new frame, so df_ref is never mutated.
                df = (
                    df.set_axis(column_names, axis=1)
                    .apply(pd.to_numeric, errors="coerce")
                    .dropna()
                    .astype("float32")
                    .assign(session_id=sheet_idx)
                    .astype({"session_id": "int32"})
                )

                # Convert using FIXED schema
                table = pa.Table.from_pandas(
                    df,
                    schema=schema,
                    preserve_index=False
                )

                parquet_writer.write_table(table)
                print(f"  Added {len(df)} samples")

    print("Parquet file written successfully with fixed schema.")

In [4]:
# Folder holding the raw workbook; the merged Parquet file is written here too.
RAW_DATA_PATH = Path("./Data_raw/")

INPUT_FILE = "right eye open_1.xlsx"
OUTPUT_FILE = "eeg_data.parquet"

# Merge all sheets of the workbook into one Parquet file.
combine_excel_sheets_to_parquet_eeg(
    excel_file_path=RAW_DATA_PATH / INPUT_FILE,
    output_parquet_path=RAW_DATA_PATH / OUTPUT_FILE,
)

Processing sheet 0: Sheet1
  Added 169500 samples
Processing sheet 1: Sheet2
  Added 30000 samples
Processing sheet 2: Sheet3
  Added 30000 samples
Processing sheet 3: Sheet4
  Added 30000 samples
Parquet file written successfully with fixed schema.


#### CONSTANTS

In [5]:
# Name of the merged Parquet file (resolved against RAW_DATA_PATH when read).
PARQUET_FILE = "eeg_data.parquet"

# Columns selected from the Parquet file, in the order they appear on the
# channel axis of every trial.
EEG_CHANNELS = [
    "Fp1-A1",
    "Fp2-A2",
    "P3-A1",
    "P4-A2",
    "Pz-Aav",
    "O1-A1",
    "O2-A2",
    "Cz-Aav",
]

FS = 125          # sampling rate passed to the filter and the STFT
WINDOW_SEC = 10   # length of one trial, in seconds
SAMPLES_PER_TRIAL = WINDOW_SEC * FS

#### BAND-PASS FILTERING: Theta + Alpha (4–13 Hz)

In [6]:
def bandpass_filter(eeg, fs, low=4, high=13, order=4, axis=-1):
    """Apply a zero-phase Butterworth band-pass filter along one axis.

    The filter is designed with ``scipy.signal.butter`` and applied forward
    and backward with ``scipy.signal.filtfilt``.

    Parameters
    ----------
    eeg : array_like
        Signal to filter, e.g. ``[channels, samples]`` or a single 1-D channel.
    fs : float
        Sampling rate, in the same units as ``low`` and ``high``.
    low, high : float
        Lower and upper cutoff frequencies; must satisfy
        ``0 < low < high < fs / 2``.
    order : int
        Order passed to ``butter``.
    axis : int
        Axis to filter along. The default (last axis) is the sample axis of a
        ``[channels, samples]`` array and also accepts 1-D input.

    Returns
    -------
    numpy.ndarray
        Filtered signal with the same shape as ``eeg``.

    Raises
    ------
    ValueError
        If the cutoffs are not ordered inside ``(0, fs / 2)``, or (raised by
        ``filtfilt``) if the signal is too short for its edge padding.
    """
    nyq = 0.5 * fs
    # Fail early with a readable message instead of a filter-design error.
    if not 0 < low < high < nyq:
        raise ValueError(
            f"Cutoffs must satisfy 0 < low < high < fs/2; "
            f"got low={low}, high={high}, fs={fs}"
        )
    # butter expects cutoffs normalised to the Nyquist frequency.
    b, a = butter(order, [low / nyq, high / nyq], btype="band")
    return filtfilt(b, a, eeg, axis=axis)

#### TIME–FREQUENCY TRANSFORM (STFT)

In [7]:
def compute_stft_trial(trial, fs, low=4, high=13):
    """Return the STFT magnitude of one trial, restricted to a frequency band.

    The STFT uses windows of ``fs`` samples with ``fs // 2`` samples of
    overlap, computed along the last axis for all channels in one call.

    Parameters
    ----------
    trial : array_like
        Samples shaped ``[channels, samples]``.
    fs : int
        Sampling rate; also used as the window length in samples.
    low, high : float
        Inclusive frequency limits of the bins to keep (defaults match the
        defaults of ``bandpass_filter``).

    Returns
    -------
    numpy.ndarray
        Magnitudes shaped ``[channels, kept_freq_bins, time_frames]``.
    """
    # stft places the frequency axis where the sample axis was and appends
    # the segment-time axis last, so a [C, N] input yields [C, F, T].
    f, _, Z = stft(
        np.asarray(trial),
        fs=fs,
        nperseg=fs,
        noverlap=fs // 2,
        axis=-1
    )
    mask = (f >= low) & (f <= high)
    return np.abs(Z[..., mask, :])  # [C, F, T]

In [8]:
# Open the merged Parquet file as a dataset so it can be read batch by batch.
dataset = ds.dataset(RAW_DATA_PATH / PARQUET_FILE, format="parquet")

# Per-session rolling buffers: samples read so far that have not yet been cut
# into a trial. One buffer per session, so a trial never mixes two sessions.
session_buffers = {}  # session_id → [C, N_samples]

# One [C, F, T] STFT-magnitude array per trial, in processing order.
# NOTE: the session a trial came from is not stored alongside it.
tf_trials = []  # final output (can also stream-save if huge)

for batch in dataset.to_batches(batch_size=200_000):
    df = batch.to_pandas()

    # A batch can contain rows from more than one session; handle each separately.
    for sid in df["session_id"].unique():
        df_s = df[df["session_id"] == sid]

        # Keep only the EEG columns and put channels on axis 0.
        eeg_chunk = df_s[EEG_CHANNELS].values.T  # [C, chunk_len]

        # Append this chunk to whatever was left over from earlier batches.
        if sid not in session_buffers:
            session_buffers[sid] = eeg_chunk
        else:
            session_buffers[sid] = np.concatenate(
                [session_buffers[sid], eeg_chunk], axis=1
            )

        # While enough samples exist → extract non-overlapping trials.
        # Fewer than SAMPLES_PER_TRIAL leftover samples stay in the buffer;
        # nothing flushes them after the loop, so a session's tail is dropped.
        while session_buffers[sid].shape[1] >= SAMPLES_PER_TRIAL:
            trial = session_buffers[sid][:, :SAMPLES_PER_TRIAL]
            session_buffers[sid] = session_buffers[sid][:, SAMPLES_PER_TRIAL:]

            # Filter (applied to each trial on its own, after segmentation)
            trial = bandpass_filter(trial, fs=FS)

            # STFT
            tf = compute_stft_trial(trial, fs=FS)
            tf_trials.append(tf)

    print(f"Processed batch, total trials so far: {len(tf_trials)}")


Processed batch, total trials so far: 135
Processed batch, total trials so far: 159
Processed batch, total trials so far: 183
Processed batch, total trials so far: 207


#### CONVERT TO **PyTorch** TENSOR

In [9]:
import torch

In [12]:
# Stack the per-trial arrays into one array and convert it to a float32 tensor.
X = torch.tensor(np.array(tf_trials), dtype=torch.float32)

# Save the tensor (relative path, so it lands in the current working directory).
torch.save(X, "X_eeg_tf.pt")
print("Saved X_eeg_tf.pt with shape:", X.shape)

Saved X_eeg_tf.pt with shape: torch.Size([207, 8, 10, 21])


In [15]:
# Descriptive metadata for the saved tensor. Dimensions are read from X and
# acquisition settings from the notebook constants, so the record cannot
# drift from the data if the channel list, window or sampling rate changes.
assert X.ndim == 4, "expected X shaped [trials, channels, freq_bins, time_bins]"

metadata = {
    "num_trials": X.shape[0],
    "channels": X.shape[1],
    "freq_bins": X.shape[2],
    "time_bins": X.shape[3],
    "sampling_rate": FS,
    "window_sec": WINDOW_SEC,
    "freq_band": "4–13 Hz",
    "preprocessing": "parquet_chunked_session_buffered"
}

In [16]:
# Save the metadata dict with torch.save (relative path, current working directory).
torch.save(metadata, "eeg_metadata.pt")