# EEG Focus Analysis Pipeline

This notebook demonstrates the process of retrieving EEG data and associated focus labels from Google Cloud Firestore, extracting and preparing the raw EEG signal from the RIGHT_TEMP channel (sampled at 256 Hz), and performing signal processing and feature extraction. The workflow includes:

1. **Data Retrieval:** Fetching EEG and focus label data from Firestore.
2. **Preprocessing:** Extracting the RIGHT_TEMP channel, aligning time, and organizing data into a DataFrame.
3. **Signal Processing:** Filtering (bandpass and notch), detrending, and flattening the data for analysis.
4. **Spectral Analysis:** Computing the power spectral density (PSD) for each segment.
5. **Feature Extraction:** Calculating bandpower for standard EEG bands and computing focus and engagement indices.
6. **Comparison:** Normalizing and comparing the computed indices to the ground-truth focus label for further study.

Each code cell below is commented to clarify its purpose and the steps involved.

In [None]:
# Import and initialize Firebase Admin SDK using credentials from environment variable
import firebase_admin
from google.cloud import firestore
from firebase_admin import credentials
import os
from dotenv import load_dotenv

load_dotenv()  # Load environment variables from .env file

# Resolve the service-account key path up front so that a missing variable
# fails with an explicit message instead of passing None further down.
credentials_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
if not credentials_path:
    raise ValueError(
        "GOOGLE_APPLICATION_CREDENTIALS is not set; "
        "define it in the environment or in the .env file"
    )

# Initialize Firebase app with the Google Application Credentials.
# Notebook cells are frequently re-run: initialize_app() raises ValueError when
# the default app already exists, so reuse the existing app when there is one.
cred = credentials.Certificate(credentials_path)
try:
    firebase_admin.get_app()
except ValueError:
    firebase_admin.initialize_app(cred)

In [None]:
# Create the Firestore client used for every read in this notebook
firestore_client = firestore.Client()

# Walk down to the per-user document inside the top-level "eeg_data" collection.
# NOTE(review): the document id is a hard-coded personal e-mail address;
# consider loading it from configuration instead of committing it to source.
collection_ref = firestore_client.collection("eeg_data")
document_ref = collection_ref.document("cristiana.principato@gmail.com")

# Sub-collections of that document. Despite the `_doc` suffix these are
# collection references; they are streamed document by document later on.
focus_doc = document_ref.collection("focus_sessions")
eeg_doc = document_ref.collection("live_data")


In [None]:
# Import custom helper functions (assumed to contain utility code for this analysis)
from awearfunction_pd import *

In [None]:
# Download all EEG and focus label documents as lists of dictionaries
eeg_data = [doc_snapshot.to_dict() for doc_snapshot in eeg_doc.stream()]
focus_data = [doc_snapshot.to_dict() for doc_snapshot in focus_doc.stream()]

# Print the first entry of each for inspection.
# An empty collection would make `[0]` raise IndexError, so report it instead.
for label, docs in (("EEG", eeg_data), ("focus", focus_data)):
    if docs:
        print(docs[0])
    else:
        print(f"No {label} documents found")

In [None]:
from datetime import datetime, timezone

# Keep only RIGHT_TEMP waveforms that hold exactly 256 samples
# (one full second at 256 Hz); everything else is skipped.
raw_data = []
timestamps = []
utc_timestamps = []
for entry in eeg_data:
    waveform = entry.get("waveformRIGHT_TEMP")
    # Documents without the channel, or with a partial segment, are ignored
    # rather than aborting the whole extraction with a KeyError/TypeError.
    if waveform is None or len(waveform) != 256:
        continue

    raw_data.append(waveform)
    timestamps.append(entry["timestamp"])

    dt = datetime.fromisoformat(str(entry["timestamp"]))
    if dt.tzinfo is None:
        # Naive timestamps are interpreted as UTC.
        dt = dt.replace(tzinfo=timezone.utc)
    # Aware timestamps keep their own offset: overwriting tzinfo on them would
    # relabel the wall-clock time as UTC and shift the resulting epoch value.
    utc_timestamps.append(dt.timestamp())

In [None]:
# Samples per retained segment (fixed by the length filter applied above)
# and the number of segments that passed that filter
n_samples, n_segments = 256, len(raw_data)

In [None]:
import numpy as np
import pandas as pd

fs = 256  # Sampling rate in Hz

# Stack the per-segment sample lists into one array (one row per segment)
raw_data = np.array(raw_data)

# Expand every per-segment value so that there is one entry per sample
segments = np.repeat([f'seg_{seg_idx}' for seg_idx in range(n_segments)], n_samples)
timestamps = np.repeat(timestamps, n_samples)
utc_timestamps = np.repeat(utc_timestamps, n_samples)

# Time offset of each sample inside its own segment, in seconds
time_s = np.tile(np.arange(n_samples), n_segments) / fs


In [None]:
# Build the long-form table: one row per sample, with the segment label,
# both timestamp representations and the within-segment time alongside it
long_df = pd.DataFrame(
    dict(
        sample_value=raw_data.flatten(),
        segment=segments,
        time_UTC=utc_timestamps,
        timestamp=timestamps,
        time_sample=time_s,
    )
)

long_df.head()

In [None]:
from scipy.signal import butter, filtfilt, iirnotch, detrend

# Bandpass filter: retain frequencies between 1-47 Hz (typical EEG range)
def bandpass_filter(x, fs, lowcut=1.0, highcut=47.0, order=4):
    """Band-pass `x` with a Butterworth filter applied forward and backward.

    Parameters:
        x: signal samples to filter.
        fs: sampling rate in Hz.
        lowcut: lower pass-band edge in Hz.
        highcut: upper pass-band edge in Hz.
        order: order handed to `scipy.signal.butter`.

    Returns:
        The filtered signal, same length as `x`.
    """
    nyquist = fs / 2.0
    # butter() expects the band edges as fractions of the Nyquist frequency
    passband = [lowcut / nyquist, highcut / nyquist]
    numerator, denominator = butter(order, passband, btype='bandpass')
    return filtfilt(numerator, denominator, x)

# Notch filter: remove 60 Hz powerline noise
def notch_filter(x, fs, freq=60.0, Q=30.0):
    """Suppress a narrow band around `freq` using an IIR notch run forward and backward.

    Parameters:
        x: signal samples to filter.
        fs: sampling rate in Hz.
        freq: centre frequency of the notch in Hz.
        Q: quality factor handed to `scipy.signal.iirnotch`.

    Returns:
        The filtered signal, same length as `x`.
    """
    # iirnotch() takes the notch frequency as a fraction of the Nyquist frequency
    normalized_freq = freq / (fs / 2.0)
    numerator, denominator = iirnotch(normalized_freq, Q)
    return filtfilt(numerator, denominator, x)

# Full preprocessing pipeline for a segment: bandpass, notch, and detrend
def preprocess_segment(x, fs):
    """Clean one segment: band-pass, then notch, then detrend.

    Parameters:
        x: samples of a single segment.
        fs: sampling rate in Hz, forwarded to both filters.

    Returns:
        The processed segment.
    """
    band_limited = bandpass_filter(x, fs)
    hum_removed = notch_filter(band_limited, fs)
    return detrend(hum_removed)

# Run the preprocessing chain on each raw segment independently and
# collect the results into one array (one row per segment)
filtered = np.array(
    [preprocess_segment(raw_segment, fs) for raw_segment in raw_data]
)



In [None]:
# Add the filtered signal to the DataFrame for further analysis.
# `filtered` is built segment by segment from `raw_data`, so flattening it
# yields values in the same order as the 'sample_value' column
# (which comes from raw_data.flatten()).
long_df['filtered_value'] = filtered.flatten()

In [None]:
from scipy.signal import welch

# Compute the power spectral density (PSD) for a segment using Welch's method
def compute_psd(seg, fs):
    """Estimate the PSD of one segment with `scipy.signal.welch`.

    The Welch window length equals the segment length and a Hann window
    is used.

    Parameters:
        seg: samples of a single segment.
        fs: sampling rate in Hz.

    Returns:
        A `(freqs, psd)` pair as produced by `welch`.
    """
    window_length = len(seg)
    return welch(seg, fs=fs, window='hann', nperseg=window_length)

# Example: compute PSD for the first segment
# NOTE(review): filtered[0] presumes at least one segment survived the
# 256-sample length check — confirm the upstream data is non-empty.
freqs, psd = compute_psd(filtered[0], fs)



In [None]:
# EEG frequency bands as (low, high) edges in Hz, keyed by band name.
# Adjacent bands share their edge frequency, and delta's lower edge (0.5 Hz)
# sits below the 1 Hz default low cut of bandpass_filter in this notebook.
bands = dict(
    delta=(0.5, 4),
    theta=(4, 8),
    alpha=(8, 13),
    beta=(13, 30),
    gamma=(30, 47),
)

# Compute bandpower for a given frequency band
def bandpower(freqs, psd, band):
    """Integrate the PSD over one frequency band with the trapezoidal rule.

    Parameters:
        freqs: frequency axis of the PSD (array-like).
        psd: PSD values aligned with `freqs` (array-like).
        band: `(low, high)` band edges; both edges are inclusive.

    Returns:
        The integrated power. Evaluates to 0.0 when fewer than two
        frequency bins fall inside the band.
    """
    freqs = np.asarray(freqs)
    psd = np.asarray(psd)
    low, high = band
    in_band = (freqs >= low) & (freqs <= high)
    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid;
    # prefer the new name and fall back only on older NumPy releases.
    integrate = getattr(np, 'trapezoid', None) or np.trapz
    return integrate(psd[in_band], freqs[in_band])

# Extract features: bandpowers and indices
def extract_band_features(freqs, psd):
    """Compute per-band power plus three ratio indices for one PSD.

    Parameters:
        freqs: frequency axis of the PSD.
        psd: PSD values aligned with `freqs`.

    Returns:
        A dict with one entry per band in `bands`, followed by
        'theta_beta_ratio' (theta / beta), 'engagement_index' (alpha / beta)
        and 'focus_index' (beta / (theta + alpha)). A ratio is NaN whenever
        its denominator is not strictly positive.
    """
    def _safe_ratio(numerator, denominator):
        # Guard against division by zero or by a non-positive/NaN power
        return numerator / denominator if denominator > 0 else np.nan

    features = {}
    for band_name, limits in bands.items():
        features[band_name] = bandpower(freqs, psd, limits)

    theta = features['theta']
    alpha = features['alpha']
    beta = features['beta']

    features['theta_beta_ratio'] = _safe_ratio(theta, beta)
    features['engagement_index'] = _safe_ratio(alpha, beta)
    features['focus_index'] = _safe_ratio(beta, theta + alpha)
    return features

# Build the feature table: one PSD per filtered segment, one feature row per PSD
segment_spectra = (compute_psd(segment, fs) for segment in filtered)
features = [
    extract_band_features(seg_freqs, seg_psd)
    for seg_freqs, seg_psd in segment_spectra
]
features_df = pd.DataFrame(features)

features_df.head()




# Focus Index

Focus Index = Beta Power / (Alpha Power + Theta Power)

High = alert, concentrated
Low = distracted, drowsy


# Engagement Index
Engagement Index = Alpha Power / Beta Power

High = relaxed, less engaged (alpha dominates beta)
Low = alert, concentrated (beta dominates alpha)

In [None]:
# Z-score the focus and engagement indices into '<name>_norm' columns
# (subtract the column mean, divide by the column standard deviation)
for index_name in ('focus_index', 'engagement_index'):
    index_values = features_df[index_name]
    centered = index_values - index_values.mean()
    features_df[f'{index_name}_norm'] = centered / index_values.std()


In [None]:
import plotly.graph_objects as go

# Construct time vector: every feature row summarizes one segment of
# n_samples samples recorded at fs Hz, so rows are spaced n_samples / fs apart.
# NOTE(review): this assumes the retained segments are contiguous in time;
# segments dropped by the length filter would leave gaps — confirm.
segment_duration = n_samples / fs  # 1.0 s per segment for 256 samples at 256 Hz
features_df['time_s'] = features_df.index * segment_duration

fig = go.Figure()

# Plot the z-scored columns so the traces match the 'Index (z-scored)' axis
# label; the raw indices live on different scales and are not comparable.
for column, label, color in (
    ('focus_index_norm', 'Focus Index', 'blue'),
    ('engagement_index_norm', 'Engagement Index', 'orange'),
):
    fig.add_trace(go.Scatter(
        x=features_df['time_s'],
        y=features_df[column],
        mode='lines+markers',
        name=label,
        line=dict(color=color),
    ))

fig.update_layout(
    title='EEG-Derived Focus and Engagement Over Time',
    xaxis_title='Time (s)',
    yaxis_title='Index (z-scored)',
    width=900,
    height=400,
    template='plotly_white'
)

fig.show()

In [None]:
import plotly.express as px

# Distribution of the raw (non-normalized) engagement index across segments
fig = px.histogram(
    features_df,
    x='engagement_index',
    nbins=1000,
    title='Histogram of Engagement Index',
)
fig.show()

In [None]:
# Range of the raw engagement index, shown as a (min, max) pair
(features_df['engagement_index'].min(), features_df['engagement_index'].max())