Import Necessary Libraries

In [1]:
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import mne
import os
import argparse
import pandas as pd
from typing import Dict
from loguru import logger
# from DataCollection.actions import Action
from scipy import stats
from scipy import signal
from scipy import integrate
from sklearn.decomposition import PCA
from dataclasses import dataclass

In [6]:
print(os.getcwd())

/Users/yuelei/Desktop/Universum-clean/ModelDevelopment


In [2]:
matplotlib.use("TkAgg")  # TkAgg is a backend that uses the Tkinter GUI toolkit for rendering interactive plots in a separate interactive window, where you can zoom, pan, or save the plot.
EPSILON = 1e-8  # constant EPSILON is a very small positive number, to avoid Division by Zero, and prevent Logarithmic Errors

Define the `Action` data class, which encapsulates information about user actions in the EEG dataset.

In [3]:
@dataclass
class Action:
    action_value: int
    text: str
    audio: str
    image: str

Load data from the directory

In [10]:
# Dynamically determine the base path (parent directory of ModelDevelopment)
base_path = os.path.abspath(os.path.join(os.getcwd(), ".."))  # Move up one level
# Construct the path to the target data directory
data_directory_path = os.path.join(base_path, "DataCollection", "data", "EEGdata", "110", "1", "1")

# Check if directory exists
if os.path.exists(data_directory_path):
    # Recursively search for the CSV files
    found_files = []
    for root, dirs, files in os.walk(data_directory_path):
        for file in files:
            if file in ["eeg_data_raw.csv", "accelerometer_data.csv", "action_data.csv"]:
                found_files.append(os.path.join(root, file))
    
    # Verify found files
    print("Found files:", found_files)
    
    # Load the files if all are found
    if len(found_files) == 3:
        eeg_data = pd.read_csv([f for f in found_files if "eeg_data_raw.csv" in f][0])
        accel_data = pd.read_csv([f for f in found_files if "accelerometer_data.csv" in f][0])
        action_data = pd.read_csv([f for f in found_files if "action_data.csv" in f][0])
        print("EEG Data Shape:", eeg_data.shape)
        print("Accelerometer Data Shape:", accel_data.shape)
        print("Action Data Shape:", action_data.shape)
    else:
        print("Error: Not all required files were found!")
else:
    print("Error: Directory does not exist!")

print("Directory exists:", os.path.exists(data_directory_path))
print("Files in directory:", os.listdir(data_directory_path) if os.path.exists(data_directory_path) else "Directory not found")

eeg_data = pd.read_csv(os.path.join(data_directory_path, "eeg_data_raw.csv"))
accel_data = pd.read_csv(os.path.join(data_directory_path, "accelerometer_data.csv"))
action_data = pd.read_csv(os.path.join(data_directory_path, "action_data.csv"))

Found files: ['/Users/yuelei/Desktop/Universum-clean/DataCollection/data/EEGdata/110/1/1/eeg_data_raw.csv', '/Users/yuelei/Desktop/Universum-clean/DataCollection/data/EEGdata/110/1/1/action_data.csv', '/Users/yuelei/Desktop/Universum-clean/DataCollection/data/EEGdata/110/1/1/accelerometer_data.csv']
EEG Data Shape: (20127, 9)
Accelerometer Data Shape: (582, 9)
Action Data Shape: (57, 2)
Directory exists: True
Files in directory: ['eeg_data_raw.csv', 'action_data.csv', 'accelerometer_data.csv']


Print data shapes

In [11]:
print("EEG Data Shape:", eeg_data.shape)
print("Accelerometer Data Shape:", accel_data.shape)
print("Action Data Shape:", action_data.shape)

EEG Data Shape: (20127, 9)
Accelerometer Data Shape: (582, 9)
Action Data Shape: (57, 2)


Process action data
Remove consecutive duplicate actions and end collection rows

In [12]:
action_data = action_data.loc[
    (abs(action_data["action_value"] - action_data["action_value"].shift(1)) > 0)
].reset_index(drop=True)
action_data = action_data[action_data["action_value"] != 5]
print("Processed Action Data Shape:", action_data.shape)

Processed Action Data Shape: (16, 2)


Convert timestamps to seconds since epoch for all datasets

In [14]:
#Check the column names of each DataFrame
print("EEG Data Columns:", eeg_data.columns)
print("Accelerometer Data Columns:", accel_data.columns)
print("Action Data Columns:", action_data.columns)

EEG Data Columns: Index(['141.270631407', '-49718.23046875', '-2875.260986328125',
       '-6550.75927734375', '12230.6044921875', '8940.00390625',
       '-21440.103515625', '-6175.11572265625', '-69093.8359375'],
      dtype='object')
Accelerometer Data Columns: Index(['timestamp', 'device_id', 'x', 'y', 'z', 'pitch', 'roll',
       'acceleration', 'inclination'],
      dtype='object')
Action Data Columns: Index(['timestamp', 'action_value'], dtype='object')


In [15]:
def convert_timestamp_to_time_since_last_epoch(df, timestamp_col="timestamp"):
    if timestamp_col in df.columns:
        df[timestamp_col] = pd.to_datetime(df[timestamp_col]).astype("int64") / 10**9
    else:
        print(f"Warning: Column '{timestamp_col}' not found in DataFrame.")
    return df

# Convert timestamps for accelerometer and action data
accel_data = convert_timestamp_to_time_since_last_epoch(accel_data)
action_data = convert_timestamp_to_time_since_last_epoch(action_data)
print("Timestamps converted for Accelerometer and Action Data.")

# EEG data handling (no 'timestamp' column)
print("EEG Data does not contain a timestamp column. Skipping conversion for EEG Data.")

Timestamps converted for Accelerometer and Action Data.
EEG Data does not contain a timestamp column. Skipping conversion for EEG Data.


Align data to experiment start and end time

In [34]:
# Ensure the EEG data has timestamps (generated or actual)
if "timestamp" not in eeg_data.columns:
    sampling_frequency = 256  # Replace with actual sampling frequency
    eeg_data["timestamp"] = np.arange(len(eeg_data)) / sampling_frequency

# Align the data based on experiment start and end times
experiment_start_time = max(
    [accel_data["timestamp"].min(), eeg_data["timestamp"].min()]
)
experiment_end_time = min(
    [accel_data["timestamp"].max(), eeg_data["timestamp"].max()]
)

print("Experiment Start Time:", experiment_start_time)
print("Experiment End Time:", experiment_end_time)

def align_data_to_experiment_start_and_end_time(df, start_time, end_time):
    return df[(df["timestamp"] >= start_time) & (df["timestamp"] <= end_time)]

eeg_data = align_data_to_experiment_start_and_end_time(
    eeg_data, experiment_start_time, experiment_end_time
)
accel_data = align_data_to_experiment_start_and_end_time(
    accel_data, experiment_start_time, experiment_end_time
)
action_data = align_data_to_experiment_start_and_end_time(
    action_data, experiment_start_time, experiment_end_time
)

print("EEG Data Shape after alignment:", eeg_data.shape)
print("Accelerometer Data Shape after alignment:", accel_data.shape)
print("Action Data Shape after alignment:", action_data.shape)

Experiment Start Time: nan
Experiment End Time: nan
EEG Data Shape after alignment: (0, 10)
Accelerometer Data Shape after alignment: (0, 9)
Action Data Shape after alignment: (0, 2)


Check the Original Data:

In [35]:
# Debugging to ensure valid data is loaded
print("Initial EEG Data Shape:", eeg_data.shape)
print("Initial Accelerometer Data Shape:", accel_data.shape)

Initial EEG Data Shape: (0, 10)
Initial Accelerometer Data Shape: (0, 9)


Verify Timestamp Conversion

In [36]:
print("EEG Data Before Timestamp Conversion:")
print(eeg_data.head())
print("Accelerometer Data Before Timestamp Conversion:")
print(accel_data.head())

# Check if timestamps were dropped
print("EEG Data After Timestamp Conversion:")
print(eeg_data)
print("Accelerometer Data After Timestamp Conversion:")
print(accel_data)

EEG Data Before Timestamp Conversion:
Empty DataFrame
Columns: [141.270631407, -49718.23046875, -2875.260986328125, -6550.75927734375, 12230.6044921875, 8940.00390625, -21440.103515625, -6175.11572265625, -69093.8359375, timestamp]
Index: []
Accelerometer Data Before Timestamp Conversion:
Empty DataFrame
Columns: [timestamp, device_id, x, y, z, pitch, roll, acceleration, inclination]
Index: []
EEG Data After Timestamp Conversion:
Empty DataFrame
Columns: [141.270631407, -49718.23046875, -2875.260986328125, -6550.75927734375, 12230.6044921875, 8940.00390625, -21440.103515625, -6175.11572265625, -69093.8359375, timestamp]
Index: []
Accelerometer Data After Timestamp Conversion:
Empty DataFrame
Columns: [timestamp, device_id, x, y, z, pitch, roll, acceleration, inclination]
Index: []


Ensure all timestamps are numeric and drop invalid rows

In [37]:
def ensure_numeric_timestamps(df, timestamp_col="timestamp"):
    if timestamp_col in df.columns:
        try:
            # Convert timestamp to datetime and then to seconds since the epoch
            df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors="coerce")
            df[timestamp_col] = df[timestamp_col].astype("int64") / 10**9  # Convert to float
        except Exception as e:
            print(f"Error processing '{timestamp_col}': {e}")
            df[timestamp_col] = None  # Handle cases where conversion fails
        df = df.dropna(subset=[timestamp_col])  # Drop rows where timestamp is NaN
    else:
        print(f"Warning: '{timestamp_col}' not found in DataFrame.")
        df = pd.DataFrame()  # Return empty DataFrame if no timestamp column
    return df

Process timestamps

In [38]:
# Apply to EEG and Accelerometer data
eeg_data = ensure_numeric_timestamps(eeg_data, timestamp_col="timestamp")
accel_data = ensure_numeric_timestamps(accel_data, timestamp_col="timestamp")

# Debugging processed data
print("EEG Data Shape After Timestamp Processing:", eeg_data.shape)
print("Accelerometer Data Shape After Timestamp Processing:", accel_data.shape)

EEG Data Shape After Timestamp Processing: (0, 10)
Accelerometer Data Shape After Timestamp Processing: (0, 9)


Handle empty DataFrames before interpolation

In [None]:
if eeg_data.empty or accel_data.empty:
    print("Error: One or both data sets are empty. Interpolation cannot proceed.")
else:
    # Align accelerometer data with EEG timestamps
    accel_data_columns = accel_data.columns
    accel_data_np = accel_data.to_numpy()
    new_accel_data = np.zeros((len(eeg_data), accel_data_np.shape[1]))
    
    for i in range(accel_data_np.shape[1]):
        try:
            new_accel_data[:, i] = np.interp(
                eeg_data["timestamp"], accel_data["timestamp"], accel_data_np[:, i]
            )
        except ValueError as e:
            print(f"Interpolation error for column {accel_data_columns[i]}: {e}")
    
    # Create a DataFrame from the interpolated data
    accel_data = pd.DataFrame(new_accel_data, columns=accel_data_columns)
    print("Accelerometer Data Shape after interpolation:", accel_data.shape)


Error: One or both data sets are empty. Interpolation cannot proceed.


: 

Interpolate accelerometer data to align with EEG timestamps

In [21]:
print(eeg_data["timestamp"].head())
print(accel_data["timestamp"].head())
# Ensure all columns in accel_data are numeric
for col in accel_data.columns:
    accel_data[col] = pd.to_numeric(accel_data[col], errors="coerce")

# Drop rows with invalid (NaN) values
accel_data = accel_data.dropna()

# Perform interpolation
accel_data_columns = accel_data.columns
accel_data_np = accel_data.to_numpy()
new_accel_data = np.zeros((len(eeg_data), accel_data_np.shape[1]))
for i in range(accel_data_np.shape[1]):
    new_accel_data[:, i] = np.interp(
        eeg_data["timestamp"], accel_data["timestamp"], accel_data_np[:, i]
    )
accel_data = pd.DataFrame(new_accel_data, columns=accel_data_columns)
print("Accelerometer Data Shape after interpolation:", accel_data.shape)


Series([], Name: timestamp, dtype: float64)
Series([], Name: timestamp, dtype: float64)


ValueError: array of sample points is empty

Create MNE raw object

In [None]:
ch_names = eeg_data.columns[1:].tolist()
ch_types = ["eeg"] * len(ch_names)
info = mne.create_info(ch_names=ch_names, sfreq=256, ch_types=ch_types)
eeg_data_array = eeg_data[ch_names].to_numpy().T
raw = mne.io.RawArray(eeg_data_array, info)
print("Raw MNE Object created.")

Apply band-pass filter

In [None]:
raw.filter(l_freq=1, h_freq=45, fir_design="firwin")
print("Band-pass filter applied.")

Epoch the data

In [None]:
events = []
eeg_data["timestamp"] = pd.to_datetime(eeg_data["timestamp"])
action_data["timestamp"] = pd.to_datetime(action_data["timestamp"])
for index, row in action_data.iterrows():
    sample = np.argmin(np.abs(eeg_data["timestamp"] - row["timestamp"]))
    events.append([sample, 0, int(row["action_value"])])
events = np.array(events)
event_dict = {"action_value": action_data["action_value"].unique().tolist()}

epochs = mne.Epochs(
    raw, events, event_id=event_dict, tmin=0, tmax=2, preload=True
)
x = epochs.get_data()
print("Epoch Data Shape:", x.shape)

Compute SNR before filtering

In [None]:
snr_values = np.apply_along_axis(lambda signal: 10 * np.log10(np.mean(signal**2) / np.var(signal - np.mean(signal))), axis=-1, arr=x)
print("SNR Shape:", snr_values.shape)

Apply Wiener filter and compute SNR after filtering

In [None]:
x_filtered = signal.wiener(x.reshape(-1)).reshape(x.shape)
snr_filtered = np.apply_along_axis(lambda signal: 10 * np.log10(np.mean(signal**2) / np.var(signal - np.mean(signal))), axis=-1, arr=x_filtered)
print("SNR After Filtering Shape:", snr_filtered.shape)

Feature extraction

In [None]:
num_epochs, num_channels, num_samples = x_filtered.shape
x_flattened = x_filtered.reshape(num_channels, -1)
pca = PCA(n_components=32)
pca_features = pca.fit_transform(x_flattened.T).T
print("PCA Features Shape:", pca_features.shape)


In [8]:
def convert_timestamp_to_time_since_last_epoch(df):
    """Converts the timestamp to time since the last epoch."""
    df["timestamp"] = pd.to_datetime(df["timestamp"]).astype(int) / 10**9
    #Converts the timestamp column to pandas datetime format
    #a column of datetime64[ns] objects as the number of nanoseconds since the Unix epoch (January 1, 1970, 00:00:00 UTC).
    #Then convert the datetime64[ns] type to int, it returns the raw nanoseconds since the epoch.
    #Dividing by 10**9 (1 billion) converts the nanosecond-based timestamp into seconds.
    return df

def align_data_to_experiment_start_and_end_time(df, start_time: float, end_time: float):
    """Aligns the data to the experiment start and end time."""
    #Ensures that the end_time is greater than the start_time before proceeding.
    #If this condition is not true, the program will raise an AssertionError and stop execution.
    assert end_time > start_time
    #Selects rows where the timestamp value is greater than or equal to start_time
    #Selects rows where the timestamp value is less than or equal to end_time
    #The & operator performs an element-wise logical "AND". 
    #Only rows that satisfy both conditions will be included.
    #Returns a new DataFrame with only the rows that satisfy the time range condition
    return df[(df["timestamp"] >= start_time) & (df["timestamp"] <= end_time)]

def time_align_accel_data_by_linearly_interpolating(accel_data, eeg_data):
    """Aligns accelerometer data with EEG data using linear interpolation."""
    #align accel_data to match the timestamps in eeg_data by interpolating the accelerometer data values.
    #Stores the column names of accel_data as accel_data_columns for later use in the final DataFrame
    #to ensure the returned DataFrame has the same structure as the original
    accel_data_columns = accel_data.columns
    #Converts accel_data to a 2D Numpy Array
    #where rows correspond to timestamps and columns correspond to accelerometer measurements.
    accel_data_np = accel_data.to_numpy()
    #Creates an empty NumPy array to store the interpolated accelerometer data.
    #The number of rows matches the number of rows in eeg_data (the number of EEG timestamps).
    #The number of columns matches the number of columns in accel_data_np.
    new_accel_data = np.zeros((len(eeg_data), accel_data_np.shape[1]))
    for i in range(accel_data_np.shape[1]):
        new_accel_data[:, i] = np.interp(
            eeg_data["timestamp"],
            accel_data["timestamp"],
            accel_data_np[:, i],
        )
    #new_accel_data is transformed back into a DataFrame using the original column names
    return pd.DataFrame(new_accel_data, columns=accel_data_columns)