In [None]:
#%load_ext cudf


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

In [None]:
pd.set_option('display.max_columns', None)

## **21_EEG**






In [None]:
df_21_EEG = pd.read_csv('data/STData/21/21_EEG.csv')

In [None]:
df_21_EEG.head()

In [None]:
df_21_EEG.shape

In [None]:
df_21_EEG.columns

In [None]:
df_21_EEG.info()

In [None]:
df_21_EEG.isnull().sum()

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_21_EEG.isnull(), cmap='YlGnBu')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` and `Elements` columns.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  
- The `Elements` column is almost entirely null; dropping it likely won’t meaningfully impact our data quality.


In [None]:
df_21_EEG['QuestionKey'].unique()

In [None]:
df_21_EEG['TimeStamp'] = pd.to_datetime(df_21_EEG['TimeStamp'])

In [None]:
df_21_EEG.describe()

In [None]:
df_21_EEG.head(3)

In [None]:
df_21_EEG.drop('Elements', axis=1, inplace=True)

In [None]:
df_21_EEG['QuestionKey'].unique()

In [None]:
# Replace missing QuestionKey values with the literal string 'None'.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is chained assignment, which emits a FutureWarning in
# pandas 2.x and leaves the DataFrame unchanged under Copy-on-Write.
df_21_EEG['QuestionKey'] = df_21_EEG['QuestionKey'].fillna('None')

In [None]:
df_21_EEG['QuestionKey'].value_counts()

In [None]:
df_21_EEG['HeadBandOn'].unique()

In [None]:
df_21_EEG['HeadBandOn'].isnull().sum()

In [None]:
df_21_EEG.shape

In [None]:
df_21_EEG.dropna(inplace=True)

In [None]:
df_21_EEG.shape

In [None]:
df_21_EEG.drop('HeadBandOn', axis=1, inplace=True)

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_21_EEG.isnull(), cmap='YlGnBu')
plt.show()

## Status Update

- All **null** (or missing) values in the dataset have been handled / eliminated.
- There are no remaining null values in any column.
- The dataset is now “complete” in the sense that every cell has a valid (non-null) entry.


In [None]:
df_21_EEG.columns

In [None]:
cols = ['Delta_TP9', 'Delta_AF7',
       'Delta_AF8', 'Delta_TP10', 'Theta_TP9', 'Theta_AF7', 'Theta_AF8',
       'Theta_TP10', 'Alpha_TP9', 'Alpha_AF7', 'Alpha_AF8', 'Alpha_TP10',
       'Beta_TP9', 'Beta_AF7', 'Beta_AF8', 'Beta_TP10', 'Gamma_TP9',
       'Gamma_AF7', 'Gamma_AF8', 'Gamma_TP10', 'RAW_TP9', 'RAW_AF7', 'RAW_AF8',
       'RAW_TP10', 'AUX_RIGHT', 'Accelerometer_X', 'Accelerometer_Y',
       'Accelerometer_Z', 'Gyro_X', 'Gyro_Y', 'Gyro_Z',
       'HSI_TP9', 'HSI_AF7', 'HSI_AF8', 'HSI_TP10', 'Battery']

In [None]:
# Disabled alternative to the matplotlib time-series loop in the next cell:
# it rasterizes each column with datashader (ds.Canvas / tf.shade) and shows
# the image with imshow. Kept for reference only; nothing in this cell runs.
# The frame name was corrected to df_21_EEG (df_1_EEG is not defined in this
# notebook). NOTE(review): assumes a 'UnixTime' column exists — confirm before
# re-enabling.
#
# from IPython.display import display, Markdown

# canvas = ds.Canvas(plot_width=1000, plot_height=600)

# for col in cols:
#     # Add a markdown cell before each plot for better separation and labeling
#     display(Markdown(f'### {col} over Time'))
#     agg = canvas.line(df_21_EEG, x='UnixTime', y=col)
#     img = tf.shade(agg)
#     plt.figure(figsize=(16, 10))
#     plt.imshow(img.to_pil())
#     # plt.axis('off')  # Removed this line to show axes
#     plt.xlabel("UnixTime") # Add x-axis label
#     plt.ylabel(col) # Add y-axis label
#     plt.show()

In [None]:
from IPython.display import Markdown, display

# Draw one 16x10 line chart per entry in `cols`, plotted against TimeStamp.
# A rendered markdown heading precedes each chart so the long output is easy
# to scan.
for col in cols:
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_21_EEG['TimeStamp'], df_21_EEG[col])
    ax.set_xlabel("TimeStamp")
    ax.set_ylabel(col)
    plt.show()

In [None]:
plt.figure(figsize=(28,12))
sns.heatmap(df_21_EEG.corr(numeric_only=True), cmap='YlGnBu', annot=True)
plt.show()

## Notes & Observations

### Dropping `AUX_RIGHT`
- The `AUX_RIGHT` channel comes from the auxiliary electrode input of the EEG headset.  
- Upon inspection, its values appeared as **flat high-amplitude noise** (750–950 range) without any meaningful oscillatory EEG patterns.  
- Correlation analysis also showed **no significant relationship** between `AUX_RIGHT` and other EEG features.  
- Since it does not carry useful information and only adds noise/dimensionality, we **dropped `AUX_RIGHT`** from the dataset.




In [None]:
df_21_EEG.drop('AUX_RIGHT', axis=1, inplace=True)

In [None]:
df_21_EEG.head()

In [None]:
HSI_cols = ["HSI_TP9", "HSI_AF7", "HSI_AF8", "HSI_TP10"]

In [None]:
for col in HSI_cols:
    print(f"Unique values for {col}: {df_21_EEG[col].unique()}")

In [None]:
df_21_EEG['HSI_TP9'].value_counts()

In [None]:
df_21_EEG['HSI_TP10'].value_counts()

In [None]:
df_21_EEG['HSI_AF7'].value_counts()

In [None]:
df_21_EEG['HSI_AF8'].value_counts()

In [None]:
df_21_EEG.describe()

## Notes & Observations

### Handling `HSI_*` Columns
- The `HSI_TP9`, `HSI_AF7`, `HSI_AF8`, `HSI_TP10` columns represent **Headset Signal Integrity** for each electrode:  
  - `1 = Good connection`  
  - `2 = Medium connection`  
  - `4 = Bad connection`  
- We need to pay attention to these values because:
  - Bad connections (`4`) indicate unreliable EEG readings.  
  - Medium connections (`2`) may still be usable but should be treated with caution.  
- Instead of dropping rows, we will **convert HSI values into binary flags**:  
  - **1 = Bad connection present**  
  - **0 = Otherwise (Good or Medium)**  
- This preserves all data while giving the model information about electrode reliability.  
- After creating these flags, the raw `HSI_*` columns can be removed to reduce dimensionality.


In [None]:
df_21_EEG.head()

In [None]:
signal_flags_values = { 1: 0, 2: 0, 4: 1}

In [None]:
# Add one '<column>_flag' int8 column per HSI electrode column by translating
# the raw HSI codes through signal_flags_values (defined in the previous cell).
# A code that is missing from that mapping becomes NaN, which makes the int8
# cast raise instead of silently producing a wrong flag.
for hsi_col in ('HSI_TP9', 'HSI_TP10', 'HSI_AF7', 'HSI_AF8'):
    mapped = df_21_EEG[hsi_col].map(signal_flags_values)
    df_21_EEG[f'{hsi_col}_flag'] = mapped.astype(np.int8)

In [None]:
df_21_EEG.drop(['HSI_TP9', 'HSI_TP10', 'HSI_AF7', 'HSI_AF8'], axis=1, inplace=True)

In [None]:
# Annotated correlation heatmap of the numeric columns, drawn after the raw
# HSI_* columns were replaced by their *_flag counterparts.
# (A stray trailing 'a' after the heatmap call made this cell a SyntaxError.)
plt.figure(figsize=(28,12))
sns.heatmap(df_21_EEG.corr(numeric_only=True), cmap='YlGnBu', annot=True)
plt.show()

## Notes & Observations

### Dropping RAW EEG Channels

- The EEG dataset includes `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` columns, which represent the unprocessed voltage readings from each electrode.  
- Upon inspection:
  - These RAW signals are **highly noisy** and show no clear oscillatory patterns typical of brainwave activity.  
  - Correlation analysis with other features shows **very low correlations** (near 0.0), indicating they do not contribute predictive value.  
  - Plots of RAW vs. time reveal large fluctuations without meaningful structure.

- Reason for dropping:
  - The dataset already provides **frequency band powers** (`Delta`, `Theta`, `Alpha`, `Beta`, `Gamma`) for each electrode, which are **derived from RAW signals** and are far more informative.  
  - Keeping RAW adds unnecessary dimensionality and noise, which can negatively affect machine learning models.

- Action taken:
  - `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` are **dropped from preprocessing**.  
  - The focus will be on the **precomputed band power features** for all EEG electrodes, which are sufficient for cognitive load prediction and regression tasks.


In [None]:
df_21_EEG.drop(['RAW_TP9',	'RAW_AF7',	'RAW_AF8',	'RAW_TP10'], axis=1, inplace=True)

In [None]:
df_21_EEG.head()

In [None]:
# Histogram with a KDE overlay for every numeric column except 'UnixTime',
# arranged on a grid that is n_cols panels wide.
numeric_cols = df_21_EEG.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

n_cols = 4  # panels per row; change to taste
n_rows = -(-len(cols_to_plot) // n_cols)  # ceiling division

# Each panel gets roughly 5x4 inches.
fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df_21_EEG[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# Pairwise scatter/histogram grid over the cleaned frame.
# sns.pairplot is a figure-level function that creates its own figure, so the
# plt.figure(figsize=...) call that used to precede it only produced an extra
# empty figure and had no effect on the pairplot size; it has been removed.
sns.pairplot(df_21_EEG)
plt.show()

# **22 EEG**




In [None]:
df_22_EEG = pd.read_csv('data/STData/22/22_EEG.csv')

In [None]:
df_22_EEG.head()

In [None]:
df_22_EEG.shape

In [None]:
df_22_EEG.columns

In [None]:
df_22_EEG.info()

In [None]:
df_22_EEG.isnull().sum()

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_22_EEG.isnull(), cmap='YlGnBu')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` and `Elements` columns.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  
- The `Elements` column is almost entirely null; dropping it likely won’t meaningfully impact our data quality.


In [None]:
df_22_EEG['QuestionKey'].unique()

In [None]:
df_22_EEG['TimeStamp'] = pd.to_datetime(df_22_EEG['TimeStamp'])

In [None]:
df_22_EEG.describe()

In [None]:
df_22_EEG.head(3)

In [None]:
df_22_EEG.drop('Elements', axis=1, inplace=True)

In [None]:
df_22_EEG['QuestionKey'].unique()

In [None]:
# Replace missing QuestionKey values with the literal string 'None'.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is chained assignment, which emits a FutureWarning in
# pandas 2.x and leaves the DataFrame unchanged under Copy-on-Write.
df_22_EEG['QuestionKey'] = df_22_EEG['QuestionKey'].fillna('None')

In [None]:
df_22_EEG['QuestionKey'].value_counts()

In [None]:
df_22_EEG['HeadBandOn'].unique()

In [None]:
df_22_EEG['HeadBandOn'].isnull().sum()

In [None]:
df_22_EEG.shape

In [None]:
df_22_EEG.dropna(inplace=True)

In [None]:
df_22_EEG.shape

In [None]:
df_22_EEG.drop('HeadBandOn', axis=1, inplace=True)

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_22_EEG.isnull(), cmap='YlGnBu')
plt.show()

## Status Update

- All **null** (or missing) values in the dataset have been handled / eliminated.
- There are no remaining null values in any column.
- The dataset is now “complete” in the sense that every cell has a valid (non-null) entry.


In [None]:
# List the columns that remain for participant 22 after cleaning.
# (The cell previously referenced df_2_EEG, which is not defined before this
# point and raised NameError on a fresh kernel.)
df_22_EEG.columns


In [None]:
cols = ['Delta_TP9', 'Delta_AF7',
       'Delta_AF8', 'Delta_TP10', 'Theta_TP9', 'Theta_AF7', 'Theta_AF8',
       'Theta_TP10', 'Alpha_TP9', 'Alpha_AF7', 'Alpha_AF8', 'Alpha_TP10',
       'Beta_TP9', 'Beta_AF7', 'Beta_AF8', 'Beta_TP10', 'Gamma_TP9',
       'Gamma_AF7', 'Gamma_AF8', 'Gamma_TP10', 'RAW_TP9', 'RAW_AF7', 'RAW_AF8',
       'RAW_TP10', 'AUX_RIGHT', 'Accelerometer_X', 'Accelerometer_Y',
       'Accelerometer_Z', 'Gyro_X', 'Gyro_Y', 'Gyro_Z',
       'HSI_TP9', 'HSI_AF7', 'HSI_AF8', 'HSI_TP10', 'Battery']

In [None]:
from IPython.display import Markdown, display

# Draw one 16x10 line chart per entry in `cols`, plotted against TimeStamp.
# A rendered markdown heading precedes each chart so the long output is easy
# to scan.
for col in cols:
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_22_EEG['TimeStamp'], df_22_EEG[col])
    ax.set_xlabel("TimeStamp")
    ax.set_ylabel(col)
    plt.show()

In [None]:
plt.figure(figsize=(28,12))
sns.heatmap(df_22_EEG.corr(numeric_only=True), cmap='YlGnBu', annot=True)
plt.show()

## Notes & Observations

### Dropping `AUX_RIGHT`
- The `AUX_RIGHT` channel comes from the auxiliary electrode input of the EEG headset.  
- Upon inspection, its values appeared as **flat high-amplitude noise** (750–950 range) without any meaningful oscillatory EEG patterns.  
- Correlation analysis also showed **no significant relationship** between `AUX_RIGHT` and other EEG features.  
- Since it does not carry useful information and only adds noise/dimensionality, we **dropped `AUX_RIGHT`** from the dataset.




In [None]:
df_22_EEG.drop('AUX_RIGHT', axis=1, inplace=True)

In [None]:
df_22_EEG.head()

In [None]:
HSI_cols = ["HSI_TP9", "HSI_AF7", "HSI_AF8", "HSI_TP10"]

In [None]:
# Print the distinct codes present in each HSI_* column of participant 22.
# (The loop previously read from df_2_EEG, which is not defined before this
# point and raised NameError on a fresh kernel.)
for col in HSI_cols:
    print(f"Unique values for {col}: {df_22_EEG[col].unique()}")

In [None]:
df_22_EEG['HSI_TP9'].value_counts()

In [None]:
df_22_EEG['HSI_TP10'].value_counts()

In [None]:
df_22_EEG['HSI_AF7'].value_counts()

In [None]:
df_22_EEG['HSI_AF8'].value_counts()

In [None]:
df_22_EEG.describe()

## Notes & Observations

### Handling `HSI_*` Columns
- The `HSI_TP9`, `HSI_AF7`, `HSI_AF8`, `HSI_TP10` columns represent **Headset Signal Integrity** for each electrode:  
  - `1 = Good connection`  
  - `2 = Medium connection`  
  - `4 = Bad connection`  
- We need to pay attention to these values because:
  - Bad connections (`4`) indicate unreliable EEG readings.  
  - Medium connections (`2`) may still be usable but should be treated with caution.  
- Instead of dropping rows, we will **convert HSI values into binary flags**:  
  - **1 = Bad connection present**  
  - **0 = Otherwise (Good or Medium)**  
- This preserves all data while giving the model information about electrode reliability.  
- After creating these flags, the raw `HSI_*` columns can be removed to reduce dimensionality.


In [None]:
df_22_EEG.head()

In [None]:
signal_flags_values = { 1: 0, 2: 0, 4: 1}

In [None]:
# Add one '<column>_flag' int8 column per HSI electrode column by translating
# the raw HSI codes through signal_flags_values (defined in the previous cell).
# A code that is missing from that mapping becomes NaN, which makes the int8
# cast raise instead of silently producing a wrong flag.
# (A stray trailing '2' on the last line made this cell a SyntaxError.)
df_22_EEG['HSI_TP9_flag'] = df_22_EEG['HSI_TP9'].map(signal_flags_values).astype(np.int8)
df_22_EEG['HSI_TP10_flag'] = df_22_EEG['HSI_TP10'].map(signal_flags_values).astype(np.int8)
df_22_EEG['HSI_AF7_flag'] = df_22_EEG['HSI_AF7'].map(signal_flags_values).astype(np.int8)
df_22_EEG['HSI_AF8_flag'] = df_22_EEG['HSI_AF8'].map(signal_flags_values).astype(np.int8)

In [None]:
df_22_EEG.drop(['HSI_TP9', 'HSI_TP10', 'HSI_AF7', 'HSI_AF8'], axis=1, inplace=True)

In [None]:
# Annotated correlation heatmap of the numeric columns, drawn after the raw
# HSI_* columns were replaced by their *_flag counterparts.
# (A stray trailing 'a' after the heatmap call made this cell a SyntaxError.)
plt.figure(figsize=(28,12))
sns.heatmap(df_22_EEG.corr(numeric_only=True), cmap='YlGnBu', annot=True)
plt.show()

## Notes & Observations

### Dropping RAW EEG Channels

- The EEG dataset includes `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` columns, which represent the unprocessed voltage readings from each electrode.  
- Upon inspection:
  - These RAW signals are **highly noisy** and show no clear oscillatory patterns typical of brainwave activity.  
  - Correlation analysis with other features shows **very low correlations** (near 0.0), indicating they do not contribute predictive value.  
  - Plots of RAW vs. time reveal large fluctuations without meaningful structure.

- Reason for dropping:
  - The dataset already provides **frequency band powers** (`Delta`, `Theta`, `Alpha`, `Beta`, `Gamma`) for each electrode, which are **derived from RAW signals** and are far more informative.  
  - Keeping RAW adds unnecessary dimensionality and noise, which can negatively affect machine learning models.

- Action taken:
  - `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` are **dropped from preprocessing**.  
  - The focus will be on the **precomputed band power features** for all EEG electrodes, which are sufficient for cognitive load prediction and regression tasks.


In [None]:
df_22_EEG.drop(['RAW_TP9',	'RAW_AF7',	'RAW_AF8',	'RAW_TP10'], axis=1, inplace=True)

In [None]:
df_22_EEG.head()

In [None]:
# Histogram with a KDE overlay for every numeric column except 'UnixTime',
# arranged on a grid that is n_cols panels wide.
numeric_cols = df_22_EEG.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

n_cols = 4  # panels per row; change to taste
n_rows = -(-len(cols_to_plot) // n_cols)  # ceiling division

# Each panel gets roughly 5x4 inches.
fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df_22_EEG[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# Pairwise scatter/histogram grid over the cleaned frame.
# sns.pairplot is a figure-level function that creates its own figure, so the
# plt.figure(figsize=...) call that used to precede it only produced an extra
# empty figure and had no effect on the pairplot size; it has been removed.
sns.pairplot(df_22_EEG)
plt.show()

# **23 EEG**

In [None]:
df_23_EEG = pd.read_csv('data/STData/23/23_EEG.csv')

In [None]:
df_23_EEG.head()

In [None]:
df_23_EEG.shape

In [None]:
df_23_EEG.columns

In [None]:
df_23_EEG.info()

In [None]:
df_23_EEG.isnull().sum()

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_23_EEG.isnull(), cmap='YlGnBu')
plt.show()

In [None]:
df_23_EEG['QuestionKey'].unique()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` and `Elements` columns.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  
- The `Elements` column is almost entirely null; dropping it likely won’t meaningfully impact our data quality.


In [None]:
df_23_EEG['QuestionKey'].unique()

In [None]:
df_23_EEG['TimeStamp'] = pd.to_datetime(df_23_EEG['TimeStamp'])

In [None]:
df_23_EEG.describe()

In [None]:
df_23_EEG.head(3)

In [None]:
df_23_EEG.drop('Elements', axis=1, inplace=True)

In [None]:
df_23_EEG['QuestionKey'].unique()

In [None]:
# Replace missing QuestionKey values with the literal string 'None'.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is chained assignment, which emits a FutureWarning in
# pandas 2.x and leaves the DataFrame unchanged under Copy-on-Write.
df_23_EEG['QuestionKey'] = df_23_EEG['QuestionKey'].fillna('None')

In [None]:
df_23_EEG['QuestionKey'].value_counts()

In [None]:
df_23_EEG['HeadBandOn'].unique()

In [None]:
df_23_EEG['HeadBandOn'].isnull().sum()

In [None]:
df_23_EEG.shape

In [None]:
df_23_EEG.dropna(inplace=True)

In [None]:
df_23_EEG.shape

In [None]:
df_23_EEG.drop('HeadBandOn', axis=1, inplace=True)

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_23_EEG.isnull(), cmap='YlGnBu')
plt.show()

## Status Update

- All **null** (or missing) values in the dataset have been handled / eliminated.
- There are no remaining null values in any column.
- The dataset is now “complete” in the sense that every cell has a valid (non-null) entry.


In [None]:
df_23_EEG.columns


In [None]:
cols = ['Delta_TP9', 'Delta_AF7',
       'Delta_AF8', 'Delta_TP10', 'Theta_TP9', 'Theta_AF7', 'Theta_AF8',
       'Theta_TP10', 'Alpha_TP9', 'Alpha_AF7', 'Alpha_AF8', 'Alpha_TP10',
       'Beta_TP9', 'Beta_AF7', 'Beta_AF8', 'Beta_TP10', 'Gamma_TP9',
       'Gamma_AF7', 'Gamma_AF8', 'Gamma_TP10', 'RAW_TP9', 'RAW_AF7', 'RAW_AF8',
       'RAW_TP10', 'AUX_RIGHT', 'Accelerometer_X', 'Accelerometer_Y',
       'Accelerometer_Z', 'Gyro_X', 'Gyro_Y', 'Gyro_Z',
       'HSI_TP9', 'HSI_AF7', 'HSI_AF8', 'HSI_TP10', 'Battery']

In [None]:
from IPython.display import Markdown, display

# Draw one 16x10 line chart per entry in `cols`, plotted against TimeStamp.
# A rendered markdown heading precedes each chart so the long output is easy
# to scan.
for col in cols:
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_23_EEG['TimeStamp'], df_23_EEG[col])
    ax.set_xlabel("TimeStamp")
    ax.set_ylabel(col)
    plt.show()

In [None]:
plt.figure(figsize=(28,12))
sns.heatmap(df_23_EEG.corr(numeric_only=True), cmap='YlGnBu', annot=True)
plt.show()

## Notes & Observations

### Dropping `AUX_RIGHT`
- The `AUX_RIGHT` channel comes from the auxiliary electrode input of the EEG headset.  
- Upon inspection, its values appeared as **flat high-amplitude noise** (750–950 range) without any meaningful oscillatory EEG patterns.  
- Correlation analysis also showed **no significant relationship** between `AUX_RIGHT` and other EEG features.  
- Since it does not carry useful information and only adds noise/dimensionality, we **dropped `AUX_RIGHT`** from the dataset.




In [None]:
df_23_EEG.drop('AUX_RIGHT', axis=1, inplace=True)

In [None]:
df_23_EEG.head()

In [None]:
HSI_cols = ["HSI_TP9", "HSI_AF7", "HSI_AF8", "HSI_TP10"]

In [None]:
for col in HSI_cols:
    print(f"Unique values for {col}: {df_23_EEG[col].unique()}")

In [None]:
df_23_EEG['HSI_TP9'].value_counts()

In [None]:
df_23_EEG['HSI_TP10'].value_counts()

In [None]:
df_23_EEG['HSI_AF7'].value_counts()

In [None]:
df_23_EEG['HSI_AF8'].value_counts()

In [None]:
df_23_EEG.describe()

## Notes & Observations

### Handling `HSI_*` Columns
- The `HSI_TP9`, `HSI_AF7`, `HSI_AF8`, `HSI_TP10` columns represent **Headset Signal Integrity** for each electrode:  
  - `1 = Good connection`  
  - `2 = Medium connection`  
  - `4 = Bad connection`  
- We need to pay attention to these values because:
  - Bad connections (`4`) indicate unreliable EEG readings.  
  - Medium connections (`2`) may still be usable but should be treated with caution.  
- Instead of dropping rows, we will **convert HSI values into binary flags**:  
  - **1 = Bad connection present**  
  - **0 = Otherwise (Good or Medium)**  
- This preserves all data while giving the model information about electrode reliability.  
- After creating these flags, the raw `HSI_*` columns can be removed to reduce dimensionality.


In [None]:
df_23_EEG.head()

In [None]:
signal_flags_values = { 1: 0, 2: 0, 4: 1}

In [None]:
# Add one '<column>_flag' int8 column per HSI electrode column by translating
# the raw HSI codes through signal_flags_values (defined in the previous cell).
# A code that is missing from that mapping becomes NaN, which makes the int8
# cast raise instead of silently producing a wrong flag.
for hsi_col in ('HSI_TP9', 'HSI_TP10', 'HSI_AF7', 'HSI_AF8'):
    mapped = df_23_EEG[hsi_col].map(signal_flags_values)
    df_23_EEG[f'{hsi_col}_flag'] = mapped.astype(np.int8)

In [None]:
df_23_EEG.drop(['HSI_TP9', 'HSI_TP10', 'HSI_AF7', 'HSI_AF8'], axis=1, inplace=True)

In [None]:
# Annotated correlation heatmap of the numeric columns, drawn after the raw
# HSI_* columns were replaced by their *_flag counterparts.
# (A stray trailing 'a' after the heatmap call made this cell a SyntaxError.)
plt.figure(figsize=(28,12))
sns.heatmap(df_23_EEG.corr(numeric_only=True), cmap='YlGnBu', annot=True)
plt.show()

## Notes & Observations

### Dropping RAW EEG Channels

- The EEG dataset includes `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` columns, which represent the unprocessed voltage readings from each electrode.  
- Upon inspection:
  - These RAW signals are **highly noisy** and show no clear oscillatory patterns typical of brainwave activity.  
  - Correlation analysis with other features shows **very low correlations** (near 0.0), indicating they do not contribute predictive value.  
  - Plots of RAW vs. time reveal large fluctuations without meaningful structure.

- Reason for dropping:
  - The dataset already provides **frequency band powers** (`Delta`, `Theta`, `Alpha`, `Beta`, `Gamma`) for each electrode, which are **derived from RAW signals** and are far more informative.  
  - Keeping RAW adds unnecessary dimensionality and noise, which can negatively affect machine learning models.

- Action taken:
  - `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` are **dropped from preprocessing**.  
  - The focus will be on the **precomputed band power features** for all EEG electrodes, which are sufficient for cognitive load prediction and regression tasks.


In [None]:
df_23_EEG.drop(['RAW_TP9',	'RAW_AF7',	'RAW_AF8',	'RAW_TP10'], axis=1, inplace=True)

In [None]:
df_23_EEG.head()

In [None]:
# Histogram with a KDE overlay for every numeric column except 'UnixTime',
# arranged on a grid that is n_cols panels wide.
numeric_cols = df_23_EEG.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

n_cols = 4  # panels per row; change to taste
n_rows = -(-len(cols_to_plot) // n_cols)  # ceiling division

# Each panel gets roughly 5x4 inches.
fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df_23_EEG[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# Pairwise scatter/histogram grid over the cleaned frame.
# sns.pairplot is a figure-level function that creates its own figure, so the
# plt.figure(figsize=...) call that used to precede it only produced an extra
# empty figure and had no effect on the pairplot size; it has been removed.
sns.pairplot(df_23_EEG)
plt.show()

# ***24 EEG***

In [None]:
df_24_EEG = pd.read_csv('data/STData/24/24_EEG.csv')

In [None]:
df_24_EEG.head()

In [None]:
df_24_EEG.shape

In [None]:
df_24_EEG.columns

In [None]:
df_24_EEG.info()

In [None]:
df_24_EEG.isnull().sum()

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_24_EEG.isnull(), cmap='YlGnBu')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` and `Elements` columns.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  
- The `Elements` column is almost entirely null; dropping it likely won’t meaningfully impact our data quality.


In [None]:
df_24_EEG['QuestionKey'].unique()

In [None]:
# Parse the TimeStamp column of participant 24 into datetime64 values.
# (The right-hand side previously read from df_4_EEG, which is not defined
# before this point and raised NameError on a fresh kernel.)
df_24_EEG['TimeStamp'] = pd.to_datetime(df_24_EEG['TimeStamp'])

In [None]:
df_24_EEG.describe()

In [None]:
df_24_EEG.head(3)

In [None]:
df_24_EEG.drop('Elements', axis=1, inplace=True)

In [None]:
df_24_EEG['QuestionKey'].unique()

In [None]:
# Replace missing QuestionKey values with the literal string 'None'.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is chained assignment, which emits a FutureWarning in
# pandas 2.x and leaves the DataFrame unchanged under Copy-on-Write.
df_24_EEG['QuestionKey'] = df_24_EEG['QuestionKey'].fillna('None')

In [None]:
df_24_EEG['QuestionKey'].value_counts()

In [None]:
df_24_EEG['HeadBandOn'].unique()

In [None]:
df_24_EEG['HeadBandOn'].isnull().sum()

In [None]:
df_24_EEG.shape

In [None]:
df_24_EEG.dropna(inplace=True)

In [None]:
df_24_EEG.shape

In [None]:
df_24_EEG.drop('HeadBandOn', axis=1, inplace=True)

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_24_EEG.isnull(), cmap='YlGnBu')
plt.show()

## Status Update

- All **null** (or missing) values in the dataset have been handled / eliminated.
- There are no remaining null values in any column.
- The dataset is now “complete” in the sense that every cell has a valid (non-null) entry.


In [None]:
df_24_EEG.columns


In [None]:
cols = ['Delta_TP9', 'Delta_AF7',
       'Delta_AF8', 'Delta_TP10', 'Theta_TP9', 'Theta_AF7', 'Theta_AF8',
       'Theta_TP10', 'Alpha_TP9', 'Alpha_AF7', 'Alpha_AF8', 'Alpha_TP10',
       'Beta_TP9', 'Beta_AF7', 'Beta_AF8', 'Beta_TP10', 'Gamma_TP9',
       'Gamma_AF7', 'Gamma_AF8', 'Gamma_TP10', 'RAW_TP9', 'RAW_AF7', 'RAW_AF8',
       'RAW_TP10', 'AUX_RIGHT', 'Accelerometer_X', 'Accelerometer_Y',
       'Accelerometer_Z', 'Gyro_X', 'Gyro_Y', 'Gyro_Z',
       'HSI_TP9', 'HSI_AF7', 'HSI_AF8', 'HSI_TP10', 'Battery']

In [None]:
from IPython.display import Markdown, display

# Draw one 16x10 line chart per entry in `cols`, plotted against TimeStamp.
# A rendered markdown heading precedes each chart so the long output is easy
# to scan.
for col in cols:
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_24_EEG['TimeStamp'], df_24_EEG[col])
    ax.set_xlabel("TimeStamp")
    ax.set_ylabel(col)
    plt.show()

In [None]:
plt.figure(figsize=(28,12))
sns.heatmap(df_24_EEG.corr(numeric_only=True), cmap='YlGnBu', annot=True)
plt.show()

## Notes & Observations

### Dropping `AUX_RIGHT`
- The `AUX_RIGHT` channel comes from the auxiliary electrode input of the EEG headset.  
- Upon inspection, its values appeared as **flat high-amplitude noise** (750–950 range) without any meaningful oscillatory EEG patterns.  
- Correlation analysis also showed **no significant relationship** between `AUX_RIGHT` and other EEG features.  
- Since it does not carry useful information and only adds noise/dimensionality, we **dropped `AUX_RIGHT`** from the dataset.




In [None]:
df_24_EEG.drop('AUX_RIGHT', axis=1, inplace=True)

In [None]:
df_24_EEG.head()

In [None]:
HSI_cols = ["HSI_TP9", "HSI_AF7", "HSI_AF8", "HSI_TP10"]

In [None]:
for col in HSI_cols:
    print(f"Unique values for {col}: {df_24_EEG[col].unique()}")

In [None]:
df_24_EEG['HSI_TP9'].value_counts()

In [None]:
df_24_EEG['HSI_TP10'].value_counts()

In [None]:
df_24_EEG['HSI_AF7'].value_counts()

In [None]:
df_24_EEG['HSI_AF8'].value_counts()

In [None]:
df_24_EEG.describe()

## Notes & Observations

### Handling `HSI_*` Columns
- The `HSI_TP9`, `HSI_AF7`, `HSI_AF8`, `HSI_TP10` columns represent **Headset Signal Integrity** for each electrode:  
  - `1 = Good connection`  
  - `2 = Medium connection`  
  - `4 = Bad connection`  
- We need to pay attention to these values because:
  - Bad connections (`4`) indicate unreliable EEG readings.  
  - Medium connections (`2`) may still be usable but should be treated with caution.  
- Instead of dropping rows, we will **convert HSI values into binary flags**:  
  - **1 = Bad connection present**  
  - **0 = Otherwise (Good or Medium)**  
- This preserves all data while giving the model information about electrode reliability.  
- After creating these flags, the raw `HSI_*` columns can be removed to reduce dimensionality.


In [None]:
df_24_EEG.head()

In [None]:
signal_flags_values = { 1: 0, 2: 0, 4: 1}

In [None]:
# Add one '<column>_flag' int8 column per HSI electrode column by translating
# the raw HSI codes through signal_flags_values (defined in the previous cell).
# A code that is missing from that mapping becomes NaN, which makes the int8
# cast raise instead of silently producing a wrong flag.
for hsi_col in ('HSI_TP9', 'HSI_TP10', 'HSI_AF7', 'HSI_AF8'):
    mapped = df_24_EEG[hsi_col].map(signal_flags_values)
    df_24_EEG[f'{hsi_col}_flag'] = mapped.astype(np.int8)

In [None]:
df_24_EEG.drop(['HSI_TP9', 'HSI_TP10', 'HSI_AF7', 'HSI_AF8'], axis=1, inplace=True)

In [None]:
# Annotated correlation heatmap of the numeric columns, drawn after the raw
# HSI_* columns were replaced by their *_flag counterparts.
# (A stray trailing 'a' after the heatmap call made this cell a SyntaxError.)
plt.figure(figsize=(28,12))
sns.heatmap(df_24_EEG.corr(numeric_only=True), cmap='YlGnBu', annot=True)
plt.show()

## Notes & Observations

### Dropping RAW EEG Channels

- The EEG dataset includes `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` columns, which represent the unprocessed voltage readings from each electrode.  
- Upon inspection:
  - These RAW signals are **highly noisy** and show no clear oscillatory patterns typical of brainwave activity.  
  - Correlation analysis with other features shows **very low correlations** (near 0.0), indicating they do not contribute predictive value.  
  - Plots of RAW vs. time reveal large fluctuations without meaningful structure.

- Reason for dropping:
  - The dataset already provides **frequency band powers** (`Delta`, `Theta`, `Alpha`, `Beta`, `Gamma`) for each electrode, which are **derived from RAW signals** and are far more informative.  
  - Keeping RAW adds unnecessary dimensionality and noise, which can negatively affect machine learning models.

- Action taken:
  - `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` are **dropped from preprocessing**.  
  - The focus will be on the **precomputed band power features** for all EEG electrodes, which are sufficient for cognitive load prediction and regression tasks.


In [None]:
df_24_EEG.drop(['RAW_TP9',	'RAW_AF7',	'RAW_AF8',	'RAW_TP10'], axis=1, inplace=True)

In [None]:
df_24_EEG.head()

In [None]:
# Histogram with a KDE overlay for every numeric column except 'UnixTime',
# arranged on a grid that is n_cols panels wide.
numeric_cols = df_24_EEG.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

n_cols = 4  # panels per row; change to taste
n_rows = -(-len(cols_to_plot) // n_cols)  # ceiling division

# Each panel gets roughly 5x4 inches.
fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df_24_EEG[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# Pairwise scatter/histogram grid over the cleaned frame.
# sns.pairplot is a figure-level function that creates its own figure, so the
# plt.figure(figsize=...) call that used to precede it only produced an extra
# empty figure and had no effect on the pairplot size; it has been removed.
sns.pairplot(df_24_EEG)
plt.show()

# ***25 EEG***

In [None]:
df_25_EEG = pd.read_csv('data/STData/25/25_EEG.csv')

In [None]:
df_25_EEG.head()

In [None]:
df_25_EEG.shape

In [None]:
df_25_EEG.columns

In [None]:
df_25_EEG.info()

In [None]:
df_25_EEG.isnull().sum()

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_25_EEG.isnull(), cmap='YlGnBu')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` and `Elements` columns.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  
- The `Elements` column is almost entirely null; dropping it likely won’t meaningfully impact our data quality.


In [None]:
df_25_EEG['QuestionKey'].unique()

In [None]:
df_25_EEG['TimeStamp'] = pd.to_datetime(df_25_EEG['TimeStamp'])

In [None]:
df_25_EEG.describe()

In [None]:
df_25_EEG.head(3)

In [None]:
df_25_EEG.drop('Elements', axis=1, inplace=True)

In [None]:
df_25_EEG['QuestionKey'].unique()

In [None]:
# Replace missing QuestionKey values with the literal string 'None'.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is chained assignment, which emits a FutureWarning in
# pandas 2.x and leaves the DataFrame unchanged under Copy-on-Write.
df_25_EEG['QuestionKey'] = df_25_EEG['QuestionKey'].fillna('None')

In [None]:
df_25_EEG['QuestionKey'].value_counts()

In [None]:
df_25_EEG['HeadBandOn'].unique()

In [None]:
df_25_EEG['HeadBandOn'].isnull().sum()

In [None]:
df_25_EEG.shape

In [None]:
df_25_EEG.dropna(inplace=True)

In [None]:
df_25_EEG.shape

In [None]:
df_25_EEG.drop('HeadBandOn', axis=1, inplace=True)

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_25_EEG.isnull(), cmap='YlGnBu')
plt.show()

## Status Update

- All **null** (or missing) values in the dataset have been handled / eliminated.
- There are no remaining null values in any column.
- The dataset is now “complete” in the sense that every cell has a valid (non-null) entry.


In [None]:
df_25_EEG.columns


In [None]:
cols = ['Delta_TP9', 'Delta_AF7',
       'Delta_AF8', 'Delta_TP10', 'Theta_TP9', 'Theta_AF7', 'Theta_AF8',
       'Theta_TP10', 'Alpha_TP9', 'Alpha_AF7', 'Alpha_AF8', 'Alpha_TP10',
       'Beta_TP9', 'Beta_AF7', 'Beta_AF8', 'Beta_TP10', 'Gamma_TP9',
       'Gamma_AF7', 'Gamma_AF8', 'Gamma_TP10', 'RAW_TP9', 'RAW_AF7', 'RAW_AF8',
       'RAW_TP10', 'AUX_RIGHT', 'Accelerometer_X', 'Accelerometer_Y',
       'Accelerometer_Z', 'Gyro_X', 'Gyro_Y', 'Gyro_Z',
       'HSI_TP9', 'HSI_AF7', 'HSI_AF8', 'HSI_TP10', 'Battery']

In [None]:
from IPython.display import display, Markdown

# One full-size line plot per entry in `cols`, drawn against TimeStamp.
# A rendered Markdown heading is emitted above each figure so the long
# output stays easy to scan.
for col in cols:
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_25_EEG['TimeStamp'], df_25_EEG[col])
    ax.set_xlabel("TimeStamp")
    ax.set_ylabel(col)
    plt.show()

In [None]:
plt.figure(figsize=(28,12))
sns.heatmap(df_25_EEG.corr(numeric_only=True), cmap='YlGnBu', annot=True)
plt.show()

## Notes & Observations

### Dropping `AUX_RIGHT`
- The `AUX_RIGHT` channel comes from the auxiliary electrode input of the EEG headset.  
- Upon inspection, its values appeared as **flat high-amplitude noise** (750–950 range) without any meaningful oscillatory EEG patterns.  
- Correlation analysis also showed **no significant relationship** between `AUX_RIGHT` and other EEG features.  
- Since it does not carry useful information and only adds noise/dimensionality, we **dropped `AUX_RIGHT`** from the dataset.




In [None]:
df_25_EEG.drop('AUX_RIGHT', axis=1, inplace=True)

In [None]:
df_25_EEG.head()

In [None]:
HSI_cols = ["HSI_TP9", "HSI_AF7", "HSI_AF8", "HSI_TP10"]

In [None]:
for col in HSI_cols:
    print(f"Unique values for {col}: {df_25_EEG[col].unique()}")

In [None]:
df_25_EEG['HSI_TP9'].value_counts()

In [None]:
df_25_EEG['HSI_TP10'].value_counts()

In [None]:
df_25_EEG['HSI_AF7'].value_counts()

In [None]:
df_25_EEG['HSI_AF8'].value_counts()

In [None]:
df_25_EEG.describe()

## Notes & Observations

### Handling `HSI_*` Columns
- The `HSI_TP9`, `HSI_AF7`, `HSI_AF8`, `HSI_TP10` columns represent **Headset Signal Integrity** for each electrode:  
  - `1 = Good connection`  
  - `2 = Medium connection`  
  - `4 = Bad connection`  
- We need to pay attention to these values because:
  - Bad connections (`4`) indicate unreliable EEG readings.  
  - Medium connections (`2`) may still be usable but should be treated with caution.  
- Instead of dropping rows, we will **convert HSI values into binary flags**:  
  - **1 = Bad connection present**  
  - **0 = Otherwise (Good or Medium)**  
- This preserves all data while giving the model information about electrode reliability.  
- After creating these flags, the raw `HSI_*` columns can be removed to reduce dimensionality.


In [None]:
df_25_EEG.head()

In [None]:
signal_flags_values = { 1: 0, 2: 0, 4: 1}

In [None]:
# Build one <HSI column>_flag per electrode by translating the HSI code through
# signal_flags_values and storing the result as int8.
# Electrode order is kept so the new columns are appended in the same order.
for electrode in ('TP9', 'TP10', 'AF7', 'AF8'):
    hsi_codes = df_25_EEG[f'HSI_{electrode}']
    df_25_EEG[f'HSI_{electrode}_flag'] = hsi_codes.map(signal_flags_values).astype(np.int8)

In [None]:
df_25_EEG.drop(['HSI_TP9', 'HSI_TP10', 'HSI_AF7', 'HSI_AF8'], axis=1, inplace=True)

In [None]:
# Annotated correlation heatmap of the numeric columns after the HSI columns
# were replaced by their flags (a stray trailing character made this cell a
# SyntaxError).
plt.figure(figsize=(28,12))
sns.heatmap(df_25_EEG.corr(numeric_only=True), cmap='YlGnBu', annot=True)
plt.show()

## Notes & Observations

### Dropping RAW EEG Channels

- The EEG dataset includes `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` columns, which represent the unprocessed voltage readings from each electrode.  
- Upon inspection:
  - These RAW signals are **highly noisy** and show no clear oscillatory patterns typical of brainwave activity.  
  - Correlation analysis with other features shows **very low correlations** (near 0.0), indicating they do not contribute predictive value.  
  - Plots of RAW vs. time reveal large fluctuations without meaningful structure.

- Reason for dropping:
  - The dataset already provides **frequency band powers** (`Delta`, `Theta`, `Alpha`, `Beta`, `Gamma`) for each electrode, which are **derived from RAW signals** and are far more informative.  
  - Keeping RAW adds unnecessary dimensionality and noise, which can negatively affect machine learning models.

- Action taken:
  - `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` are **dropped from preprocessing**.  
  - The focus will be on the **precomputed band power features** for all EEG electrodes, which are sufficient for cognitive load prediction and regression tasks.


In [None]:
df_25_EEG.drop(['RAW_TP9',	'RAW_AF7',	'RAW_AF8',	'RAW_TP10'], axis=1, inplace=True)

In [None]:
df_25_EEG.head()

In [None]:
# Histograms (with a KDE overlay) for every numeric column except the
# time-related UnixTime column, arranged on a grid that is n_cols panels wide.
numeric_cols = df_25_EEG.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

n_cols = 4  # panels per row; adjust as needed
n_rows = -(-len(cols_to_plot) // n_cols)  # ceiling division

# Each panel is allotted roughly 5 x 4 inches.
fig = plt.figure(figsize=(5 * n_cols, 4 * n_rows))

for panel, col in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, panel)
    sns.histplot(df_25_EEG[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# sns.pairplot is a figure-level function: it builds its own PairGrid figure,
# so a preceding plt.figure(...) only produced an empty extra figure and its
# figsize was never applied. Use pairplot's height/aspect arguments to resize.
sns.pairplot(df_25_EEG)
plt.show()

## **26_EEG**

In [None]:
df_26_EEG = pd.read_csv('data/STData/26/26_EEG.csv')

In [None]:
df_26_EEG.head()

In [None]:
df_26_EEG.shape

In [None]:
df_26_EEG.columns

In [None]:
df_26_EEG.info()

In [None]:
df_26_EEG.isnull().sum()

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_26_EEG.isnull(), cmap='YlGnBu')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` and `Elements` columns.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  
- The `Elements` column is almost entirely null; dropping it likely won’t meaningfully impact our data quality.


In [None]:
df_26_EEG['QuestionKey'].unique()

In [None]:
# Parse this participant's own TimeStamp column (the source frame was
# mistyped as df_6_EEG, i.e. a different/undefined frame).
df_26_EEG['TimeStamp'] = pd.to_datetime(df_26_EEG['TimeStamp'])

In [None]:
df_26_EEG.describe()

In [None]:
df_26_EEG.head(3)

In [None]:
df_26_EEG.drop('Elements', axis=1, inplace=True)

In [None]:
df_26_EEG['QuestionKey'].unique()

In [None]:
# Rows without a QuestionKey are labelled with the string 'None'.
# Assign the result back rather than calling fillna(inplace=True) on the column
# selection: that chained in-place call triggers a FutureWarning in pandas 2.2
# and leaves the frame unchanged once Copy-on-Write is active.
df_26_EEG['QuestionKey'] = df_26_EEG['QuestionKey'].fillna('None')

In [None]:
df_26_EEG['QuestionKey'].value_counts()

In [None]:
df_26_EEG['HeadBandOn'].unique()

In [None]:
df_26_EEG['HeadBandOn'].isnull().sum()

In [None]:
df_26_EEG.shape

In [None]:
df_26_EEG.dropna(inplace=True)

In [None]:
df_26_EEG.shape

In [None]:
df_26_EEG.drop('HeadBandOn', axis=1, inplace=True)

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_26_EEG.isnull(), cmap='YlGnBu')
plt.show()

## Status Update

- All **null** (or missing) values in the dataset have been handled / eliminated.
- There are no remaining null values in any column.
- The dataset is now “complete” in the sense that every cell has a valid (non-null) entry.


In [None]:
df_26_EEG.columns


In [None]:
cols = ['Delta_TP9', 'Delta_AF7',
       'Delta_AF8', 'Delta_TP10', 'Theta_TP9', 'Theta_AF7', 'Theta_AF8',
       'Theta_TP10', 'Alpha_TP9', 'Alpha_AF7', 'Alpha_AF8', 'Alpha_TP10',
       'Beta_TP9', 'Beta_AF7', 'Beta_AF8', 'Beta_TP10', 'Gamma_TP9',
       'Gamma_AF7', 'Gamma_AF8', 'Gamma_TP10', 'RAW_TP9', 'RAW_AF7', 'RAW_AF8',
       'RAW_TP10', 'AUX_RIGHT', 'Accelerometer_X', 'Accelerometer_Y',
       'Accelerometer_Z', 'Gyro_X', 'Gyro_Y', 'Gyro_Z',
       'HSI_TP9', 'HSI_AF7', 'HSI_AF8', 'HSI_TP10', 'Battery']

In [None]:
from IPython.display import display, Markdown

# One full-size line plot per entry in `cols`, drawn against TimeStamp.
# A rendered Markdown heading is emitted above each figure so the long
# output stays easy to scan.
for col in cols:
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_26_EEG['TimeStamp'], df_26_EEG[col])
    ax.set_xlabel("TimeStamp")
    ax.set_ylabel(col)
    plt.show()

In [None]:
plt.figure(figsize=(28,12))
sns.heatmap(df_26_EEG.corr(numeric_only=True), cmap='YlGnBu', annot=True)
plt.show()

## Notes & Observations

### Dropping `AUX_RIGHT`
- The `AUX_RIGHT` channel comes from the auxiliary electrode input of the EEG headset.  
- Upon inspection, its values appeared as **flat high-amplitude noise** (750–950 range) without any meaningful oscillatory EEG patterns.  
- Correlation analysis also showed **no significant relationship** between `AUX_RIGHT` and other EEG features.  
- Since it does not carry useful information and only adds noise/dimensionality, we **dropped `AUX_RIGHT`** from the dataset.




In [None]:
df_26_EEG.drop('AUX_RIGHT', axis=1, inplace=True)

In [None]:
df_26_EEG.head()

In [None]:
HSI_cols = ["HSI_TP9", "HSI_AF7", "HSI_AF8", "HSI_TP10"]

In [None]:
for col in HSI_cols:
    print(f"Unique values for {col}: {df_26_EEG[col].unique()}")

In [None]:
df_26_EEG['HSI_TP9'].value_counts()

In [None]:
df_26_EEG['HSI_TP10'].value_counts()

In [None]:
df_26_EEG['HSI_AF7'].value_counts()

In [None]:
df_26_EEG['HSI_AF8'].value_counts()

In [None]:
df_26_EEG.describe()

## Notes & Observations

### Handling `HSI_*` Columns
- The `HSI_TP9`, `HSI_AF7`, `HSI_AF8`, `HSI_TP10` columns represent **Headset Signal Integrity** for each electrode:  
  - `1 = Good connection`  
  - `2 = Medium connection`  
  - `4 = Bad connection`  
- We need to pay attention to these values because:
  - Bad connections (`4`) indicate unreliable EEG readings.  
  - Medium connections (`2`) may still be usable but should be treated with caution.  
- Instead of dropping rows, we will **convert HSI values into binary flags**:  
  - **1 = Bad connection present**  
  - **0 = Otherwise (Good or Medium)**  
- This preserves all data while giving the model information about electrode reliability.  
- After creating these flags, the raw `HSI_*` columns can be removed to reduce dimensionality.


In [None]:
df_26_EEG.head()

In [None]:
signal_flags_values = { 1: 0, 2: 0, 4: 1}

In [None]:
# Build one <HSI column>_flag per electrode by translating the HSI code through
# signal_flags_values and storing the result as int8.
# Electrode order is kept so the new columns are appended in the same order.
for electrode in ('TP9', 'TP10', 'AF7', 'AF8'):
    hsi_codes = df_26_EEG[f'HSI_{electrode}']
    df_26_EEG[f'HSI_{electrode}_flag'] = hsi_codes.map(signal_flags_values).astype(np.int8)

In [None]:
df_26_EEG.drop(['HSI_TP9', 'HSI_TP10', 'HSI_AF7', 'HSI_AF8'], axis=1, inplace=True)

In [None]:
# Annotated correlation heatmap of the numeric columns after the HSI columns
# were replaced by their flags (a stray trailing character made this cell a
# SyntaxError).
plt.figure(figsize=(28,12))
sns.heatmap(df_26_EEG.corr(numeric_only=True), cmap='YlGnBu', annot=True)
plt.show()

## Notes & Observations

### Dropping RAW EEG Channels

- The EEG dataset includes `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` columns, which represent the unprocessed voltage readings from each electrode.  
- Upon inspection:
  - These RAW signals are **highly noisy** and show no clear oscillatory patterns typical of brainwave activity.  
  - Correlation analysis with other features shows **very low correlations** (near 0.0), indicating they do not contribute predictive value.  
  - Plots of RAW vs. time reveal large fluctuations without meaningful structure.

- Reason for dropping:
  - The dataset already provides **frequency band powers** (`Delta`, `Theta`, `Alpha`, `Beta`, `Gamma`) for each electrode, which are **derived from RAW signals** and are far more informative.  
  - Keeping RAW adds unnecessary dimensionality and noise, which can negatively affect machine learning models.

- Action taken:
  - `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` are **dropped from preprocessing**.  
  - The focus will be on the **precomputed band power features** for all EEG electrodes, which are sufficient for cognitive load prediction and regression tasks.


In [None]:
df_26_EEG.drop(['RAW_TP9',	'RAW_AF7',	'RAW_AF8',	'RAW_TP10'], axis=1, inplace=True)

In [None]:
df_26_EEG.head()

In [None]:
# Histograms (with a KDE overlay) for every numeric column except the
# time-related UnixTime column, arranged on a grid that is n_cols panels wide.
numeric_cols = df_26_EEG.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

n_cols = 4  # panels per row; adjust as needed
n_rows = -(-len(cols_to_plot) // n_cols)  # ceiling division

# Each panel is allotted roughly 5 x 4 inches.
fig = plt.figure(figsize=(5 * n_cols, 4 * n_rows))

for panel, col in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, panel)
    sns.histplot(df_26_EEG[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# sns.pairplot is a figure-level function: it builds its own PairGrid figure,
# so a preceding plt.figure(...) only produced an empty extra figure and its
# figsize was never applied. Use pairplot's height/aspect arguments to resize.
sns.pairplot(df_26_EEG)
plt.show()

## **27_EEG**

In [None]:
df_27_EEG = pd.read_csv('data/STData/27/27_EEG.csv')

In [None]:
df_27_EEG.head()

In [None]:
df_27_EEG.shape

In [None]:
df_27_EEG.columns

In [None]:
df_27_EEG.info()

In [None]:
df_27_EEG.isnull().sum()

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_27_EEG.isnull(), cmap='YlGnBu')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` and `Elements` columns.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  
- The `Elements` column is almost entirely null; dropping it likely won’t meaningfully impact our data quality.


In [None]:
df_27_EEG['QuestionKey'].unique()

In [None]:
df_27_EEG['TimeStamp'] = pd.to_datetime(df_27_EEG['TimeStamp'])

In [None]:
df_27_EEG.describe()

In [None]:
df_27_EEG.head(3)

In [None]:
df_27_EEG.drop('Elements', axis=1, inplace=True)

In [None]:
df_27_EEG['QuestionKey'].unique()

In [None]:
# Rows without a QuestionKey are labelled with the string 'None'.
# Assign the result back rather than calling fillna(inplace=True) on the column
# selection: that chained in-place call triggers a FutureWarning in pandas 2.2
# and leaves the frame unchanged once Copy-on-Write is active.
df_27_EEG['QuestionKey'] = df_27_EEG['QuestionKey'].fillna('None')

In [None]:
df_27_EEG['QuestionKey'].value_counts()

In [None]:
df_27_EEG['HeadBandOn'].unique()

In [None]:
df_27_EEG['HeadBandOn'].isnull().sum()

In [None]:
df_27_EEG.shape

In [None]:
df_27_EEG.dropna(inplace=True)

In [None]:
df_27_EEG.shape

In [None]:
df_27_EEG.drop('HeadBandOn', axis=1, inplace=True)

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_27_EEG.isnull(), cmap='YlGnBu')
plt.show()

## Status Update

- All **null** (or missing) values in the dataset have been handled / eliminated.
- There are no remaining null values in any column.
- The dataset is now “complete” in the sense that every cell has a valid (non-null) entry.


In [None]:
df_27_EEG.columns


In [None]:
cols = ['Delta_TP9', 'Delta_AF7',
       'Delta_AF8', 'Delta_TP10', 'Theta_TP9', 'Theta_AF7', 'Theta_AF8',
       'Theta_TP10', 'Alpha_TP9', 'Alpha_AF7', 'Alpha_AF8', 'Alpha_TP10',
       'Beta_TP9', 'Beta_AF7', 'Beta_AF8', 'Beta_TP10', 'Gamma_TP9',
       'Gamma_AF7', 'Gamma_AF8', 'Gamma_TP10', 'RAW_TP9', 'RAW_AF7', 'RAW_AF8',
       'RAW_TP10', 'AUX_RIGHT', 'Accelerometer_X', 'Accelerometer_Y',
       'Accelerometer_Z', 'Gyro_X', 'Gyro_Y', 'Gyro_Z',
       'HSI_TP9', 'HSI_AF7', 'HSI_AF8', 'HSI_TP10', 'Battery']

In [None]:
from IPython.display import display, Markdown

# One full-size line plot per entry in `cols`, drawn against TimeStamp.
# A rendered Markdown heading is emitted above each figure so the long
# output stays easy to scan.
for col in cols:
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_27_EEG['TimeStamp'], df_27_EEG[col])
    ax.set_xlabel("TimeStamp")
    ax.set_ylabel(col)
    plt.show()

In [None]:
plt.figure(figsize=(28,12))
sns.heatmap(df_27_EEG.corr(numeric_only=True), cmap='YlGnBu', annot=True)
plt.show()

## Notes & Observations

### Dropping `AUX_RIGHT`
- The `AUX_RIGHT` channel comes from the auxiliary electrode input of the EEG headset.  
- Upon inspection, its values appeared as **flat high-amplitude noise** (750–950 range) without any meaningful oscillatory EEG patterns.  
- Correlation analysis also showed **no significant relationship** between `AUX_RIGHT` and other EEG features.  
- Since it does not carry useful information and only adds noise/dimensionality, we **dropped `AUX_RIGHT`** from the dataset.




In [None]:
df_27_EEG.drop('AUX_RIGHT', axis=1, inplace=True)

In [None]:
df_27_EEG.head()

In [None]:
HSI_cols = ["HSI_TP9", "HSI_AF7", "HSI_AF8", "HSI_TP10"]

In [None]:
for col in HSI_cols:
    print(f"Unique values for {col}: {df_27_EEG[col].unique()}")

In [None]:
df_27_EEG['HSI_TP9'].value_counts()

In [None]:
df_27_EEG['HSI_TP10'].value_counts()

In [None]:
df_27_EEG['HSI_AF7'].value_counts()

In [None]:
df_27_EEG['HSI_AF8'].value_counts()

In [None]:
df_27_EEG.describe()

## Notes & Observations

### Handling `HSI_*` Columns
- The `HSI_TP9`, `HSI_AF7`, `HSI_AF8`, `HSI_TP10` columns represent **Headset Signal Integrity** for each electrode:  
  - `1 = Good connection`  
  - `2 = Medium connection`  
  - `4 = Bad connection`  
- We need to pay attention to these values because:
  - Bad connections (`4`) indicate unreliable EEG readings.  
  - Medium connections (`2`) may still be usable but should be treated with caution.  
- Instead of dropping rows, we will **convert HSI values into binary flags**:  
  - **1 = Bad connection present**  
  - **0 = Otherwise (Good or Medium)**  
- This preserves all data while giving the model information about electrode reliability.  
- After creating these flags, the raw `HSI_*` columns can be removed to reduce dimensionality.


In [None]:
df_27_EEG.head()

In [None]:
signal_flags_values = { 1: 0, 2: 0, 4: 1}

In [None]:
# Build one <HSI column>_flag per electrode by translating the HSI code through
# signal_flags_values and storing the result as int8.
# Electrode order is kept so the new columns are appended in the same order.
for electrode in ('TP9', 'TP10', 'AF7', 'AF8'):
    hsi_codes = df_27_EEG[f'HSI_{electrode}']
    df_27_EEG[f'HSI_{electrode}_flag'] = hsi_codes.map(signal_flags_values).astype(np.int8)

In [None]:
df_27_EEG.drop(['HSI_TP9', 'HSI_TP10', 'HSI_AF7', 'HSI_AF8'], axis=1, inplace=True)

In [None]:
# Annotated correlation heatmap of the numeric columns after the HSI columns
# were replaced by their flags (a stray trailing character made this cell a
# SyntaxError).
plt.figure(figsize=(28,12))
sns.heatmap(df_27_EEG.corr(numeric_only=True), cmap='YlGnBu', annot=True)
plt.show()

## Notes & Observations

### Dropping RAW EEG Channels

- The EEG dataset includes `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` columns, which represent the unprocessed voltage readings from each electrode.  
- Upon inspection:
  - These RAW signals are **highly noisy** and show no clear oscillatory patterns typical of brainwave activity.  
  - Correlation analysis with other features shows **very low correlations** (near 0.0), indicating they do not contribute predictive value.  
  - Plots of RAW vs. time reveal large fluctuations without meaningful structure.

- Reason for dropping:
  - The dataset already provides **frequency band powers** (`Delta`, `Theta`, `Alpha`, `Beta`, `Gamma`) for each electrode, which are **derived from RAW signals** and are far more informative.  
  - Keeping RAW adds unnecessary dimensionality and noise, which can negatively affect machine learning models.

- Action taken:
  - `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` are **dropped from preprocessing**.  
  - The focus will be on the **precomputed band power features** for all EEG electrodes, which are sufficient for cognitive load prediction and regression tasks.


In [None]:
df_27_EEG.drop(['RAW_TP9',	'RAW_AF7',	'RAW_AF8',	'RAW_TP10'], axis=1, inplace=True)

In [None]:
df_27_EEG.head()

In [None]:
# Histograms (with a KDE overlay) for every numeric column except the
# time-related UnixTime column, arranged on a grid that is n_cols panels wide.
numeric_cols = df_27_EEG.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

n_cols = 4  # panels per row; adjust as needed
n_rows = -(-len(cols_to_plot) // n_cols)  # ceiling division

# Each panel is allotted roughly 5 x 4 inches.
fig = plt.figure(figsize=(5 * n_cols, 4 * n_rows))

for panel, col in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, panel)
    sns.histplot(df_27_EEG[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# sns.pairplot is a figure-level function: it builds its own PairGrid figure,
# so a preceding plt.figure(...) only produced an empty extra figure and its
# figsize was never applied. Use pairplot's height/aspect arguments to resize.
sns.pairplot(df_27_EEG)
plt.show()

## **28_EEG**

In [None]:
df_28_EEG.head()

In [None]:
df_28_EEG = pd.read_csv('data/STData/8/8_EEG.csv')

In [None]:
df_28_EEG.shape


In [None]:
df_28_EEG.columns

In [None]:
df_28_EEG.info()

In [None]:
df_28_EEG.isnull().sum()

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_28_EEG.isnull(), cmap='YlGnBu')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` and `Elements` columns.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  
- The `Elements` column is almost entirely null; dropping it likely won’t meaningfully impact our data quality.


In [None]:
df_28_EEG['QuestionKey'].unique()

In [None]:
df_28_EEG['TimeStamp'] = pd.to_datetime(df_28_EEG['TimeStamp'])

In [None]:
df_28_EEG.describe()

In [None]:
df_28_EEG.head(3)

In [None]:
df_28_EEG.drop('Elements', axis=1, inplace=True)

In [None]:
df_28_EEG['QuestionKey'].unique()

In [None]:
# Rows without a QuestionKey are labelled with the string 'None'.
# Assign the result back rather than calling fillna(inplace=True) on the column
# selection: that chained in-place call triggers a FutureWarning in pandas 2.2
# and leaves the frame unchanged once Copy-on-Write is active.
df_28_EEG['QuestionKey'] = df_28_EEG['QuestionKey'].fillna('None')

In [None]:
df_28_EEG['QuestionKey'].value_counts()

In [None]:
df_28_EEG['HeadBandOn'].unique()

In [None]:
df_28_EEG['HeadBandOn'].isnull().sum()

In [None]:
df_28_EEG.shape

In [None]:
df_28_EEG.dropna(inplace=True)

In [None]:
df_28_EEG.shape

In [None]:
df_28_EEG.drop('HeadBandOn', axis=1, inplace=True)

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_28_EEG.isnull(), cmap='YlGnBu')
plt.show()

## Status Update

- All **null** (or missing) values in the dataset have been handled / eliminated.
- There are no remaining null values in any column.
- The dataset is now “complete” in the sense that every cell has a valid (non-null) entry.


In [None]:
df_28_EEG.columns


In [None]:
cols = ['Delta_TP9', 'Delta_AF7',
       'Delta_AF8', 'Delta_TP10', 'Theta_TP9', 'Theta_AF7', 'Theta_AF8',
       'Theta_TP10', 'Alpha_TP9', 'Alpha_AF7', 'Alpha_AF8', 'Alpha_TP10',
       'Beta_TP9', 'Beta_AF7', 'Beta_AF8', 'Beta_TP10', 'Gamma_TP9',
       'Gamma_AF7', 'Gamma_AF8', 'Gamma_TP10', 'RAW_TP9', 'RAW_AF7', 'RAW_AF8',
       'RAW_TP10', 'AUX_RIGHT', 'Accelerometer_X', 'Accelerometer_Y',
       'Accelerometer_Z', 'Gyro_X', 'Gyro_Y', 'Gyro_Z',
       'HSI_TP9', 'HSI_AF7', 'HSI_AF8', 'HSI_TP10', 'Battery']

In [None]:
from IPython.display import display, Markdown

# One full-size line plot per entry in `cols`, drawn against TimeStamp.
# A rendered Markdown heading is emitted above each figure so the long
# output stays easy to scan.
for col in cols:
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_28_EEG['TimeStamp'], df_28_EEG[col])
    ax.set_xlabel("TimeStamp")
    ax.set_ylabel(col)
    plt.show()

In [None]:
plt.figure(figsize=(28,12))
sns.heatmap(df_28_EEG.corr(numeric_only=True), cmap='YlGnBu', annot=True)
plt.show()

## Notes & Observations

### Dropping `AUX_RIGHT`
- The `AUX_RIGHT` channel comes from the auxiliary electrode input of the EEG headset.  
- Upon inspection, its values appeared as **flat high-amplitude noise** (750–950 range) without any meaningful oscillatory EEG patterns.  
- Correlation analysis also showed **no significant relationship** between `AUX_RIGHT` and other EEG features.  
- Since it does not carry useful information and only adds noise/dimensionality, we **dropped `AUX_RIGHT`** from the dataset.




In [None]:
df_28_EEG.drop('AUX_RIGHT', axis=1, inplace=True)

In [None]:
df_28_EEG.head()

In [None]:
HSI_cols = ["HSI_TP9", "HSI_AF7", "HSI_AF8", "HSI_TP10"]

In [None]:
for col in HSI_cols:
    print(f"Unique values for {col}: {df_28_EEG[col].unique()}")

In [None]:
df_28_EEG['HSI_TP9'].value_counts()

In [None]:
df_28_EEG['HSI_TP10'].value_counts()

In [None]:
df_28_EEG['HSI_AF7'].value_counts()

In [None]:
df_28_EEG['HSI_AF8'].value_counts()

In [None]:
df_28_EEG.describe()

## Notes & Observations

### Handling `HSI_*` Columns
- The `HSI_TP9`, `HSI_AF7`, `HSI_AF8`, `HSI_TP10` columns represent **Headset Signal Integrity** for each electrode:  
  - `1 = Good connection`  
  - `2 = Medium connection`  
  - `4 = Bad connection`  
- We need to pay attention to these values because:
  - Bad connections (`4`) indicate unreliable EEG readings.  
  - Medium connections (`2`) may still be usable but should be treated with caution.  
- Instead of dropping rows, we will **convert HSI values into binary flags**:  
  - **1 = Bad connection present**  
  - **0 = Otherwise (Good or Medium)**  
- This preserves all data while giving the model information about electrode reliability.  
- After creating these flags, the raw `HSI_*` columns can be removed to reduce dimensionality.


In [None]:
df_28_EEG.head()

In [None]:
signal_flags_values = { 1: 0, 2: 0, 4: 1}

In [None]:
# Build one <HSI column>_flag per electrode by translating the HSI code through
# signal_flags_values and storing the result as int8.
# Electrode order is kept so the new columns are appended in the same order.
for electrode in ('TP9', 'TP10', 'AF7', 'AF8'):
    hsi_codes = df_28_EEG[f'HSI_{electrode}']
    df_28_EEG[f'HSI_{electrode}_flag'] = hsi_codes.map(signal_flags_values).astype(np.int8)

In [None]:
df_28_EEG.drop(['HSI_TP9', 'HSI_TP10', 'HSI_AF7', 'HSI_AF8'], axis=1, inplace=True)

In [None]:
# Annotated correlation heatmap of the numeric columns after the HSI columns
# were replaced by their flags (a stray trailing character made this cell a
# SyntaxError).
plt.figure(figsize=(28,12))
sns.heatmap(df_28_EEG.corr(numeric_only=True), cmap='YlGnBu', annot=True)
plt.show()

## Notes & Observations

### Dropping RAW EEG Channels

- The EEG dataset includes `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` columns, which represent the unprocessed voltage readings from each electrode.  
- Upon inspection:
  - These RAW signals are **highly noisy** and show no clear oscillatory patterns typical of brainwave activity.  
  - Correlation analysis with other features shows **very low correlations** (near 0.0), indicating they do not contribute predictive value.  
  - Plots of RAW vs. time reveal large fluctuations without meaningful structure.

- Reason for dropping:
  - The dataset already provides **frequency band powers** (`Delta`, `Theta`, `Alpha`, `Beta`, `Gamma`) for each electrode, which are **derived from RAW signals** and are far more informative.  
  - Keeping RAW adds unnecessary dimensionality and noise, which can negatively affect machine learning models.

- Action taken:
  - `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` are **dropped from preprocessing**.  
  - The focus will be on the **precomputed band power features** for all EEG electrodes, which are sufficient for cognitive load prediction and regression tasks.


In [None]:
df_28_EEG.drop(['RAW_TP9',	'RAW_AF7',	'RAW_AF8',	'RAW_TP10'], axis=1, inplace=True)

In [None]:
df_28_EEG.head()

In [None]:
# Histograms (with a KDE overlay) for every numeric column except the
# time-related UnixTime column, arranged on a grid that is n_cols panels wide.
numeric_cols = df_28_EEG.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

n_cols = 4  # panels per row; adjust as needed
n_rows = -(-len(cols_to_plot) // n_cols)  # ceiling division

# Each panel is allotted roughly 5 x 4 inches.
fig = plt.figure(figsize=(5 * n_cols, 4 * n_rows))

for panel, col in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, panel)
    sns.histplot(df_28_EEG[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# sns.pairplot is a figure-level function: it builds its own PairGrid figure,
# so a preceding plt.figure(...) only produced an empty extra figure and its
# figsize was never applied. Use pairplot's height/aspect arguments to resize.
sns.pairplot(df_28_EEG)
plt.show()

## **29_EEG**

In [None]:
df_29_EEG = pd.read_csv('data/STData/29/29_EEG.csv')

In [None]:
df_29_EEG.head()

In [None]:
df_29_EEG.shape

In [None]:
df_29_EEG.columns

In [None]:
df_29_EEG.info()

In [None]:
df_29_EEG.isnull().sum()

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_29_EEG.isnull(), cmap='YlGnBu')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` and `Elements` columns.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  
- The `Elements` column is almost entirely null; dropping it likely won’t meaningfully impact our data quality.


In [None]:
df_29_EEG['QuestionKey'].unique()

In [None]:
df_29_EEG['TimeStamp'] = pd.to_datetime(df_29_EEG['TimeStamp'])

In [None]:
df_29_EEG.describe()

In [None]:
df_29_EEG.drop('Elements', axis=1, inplace=True)

In [None]:
df_29_EEG['QuestionKey'].unique()

In [None]:
# Rows without a QuestionKey are labelled with the string 'None'.
# Assign the result back rather than calling fillna(inplace=True) on the column
# selection: that chained in-place call triggers a FutureWarning in pandas 2.2
# and leaves the frame unchanged once Copy-on-Write is active.
df_29_EEG['QuestionKey'] = df_29_EEG['QuestionKey'].fillna('None')

In [None]:
df_29_EEG['QuestionKey'].value_counts()

In [None]:
df_29_EEG['HeadBandOn'].unique()

In [None]:
df_29_EEG['HeadBandOn'].isnull().sum()

In [None]:
df_29_EEG.shape

In [None]:
df_29_EEG.dropna(inplace=True)

In [None]:
df_29_EEG.shape

In [None]:
df_29_EEG.drop('HeadBandOn', axis=1, inplace=True)

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_29_EEG.isnull(), cmap='YlGnBu')
plt.show()

## Status Update

- All **null** (or missing) values in the dataset have been handled / eliminated.
- There are no remaining null values in any column.
- The dataset is now “complete” in the sense that every cell has a valid (non-null) entry.


In [None]:
df_29_EEG.columns


In [None]:
cols = ['Delta_TP9', 'Delta_AF7',
       'Delta_AF8', 'Delta_TP10', 'Theta_TP9', 'Theta_AF7', 'Theta_AF8',
       'Theta_TP10', 'Alpha_TP9', 'Alpha_AF7', 'Alpha_AF8', 'Alpha_TP10',
       'Beta_TP9', 'Beta_AF7', 'Beta_AF8', 'Beta_TP10', 'Gamma_TP9',
       'Gamma_AF7', 'Gamma_AF8', 'Gamma_TP10', 'RAW_TP9', 'RAW_AF7', 'RAW_AF8',
       'RAW_TP10', 'AUX_RIGHT', 'Accelerometer_X', 'Accelerometer_Y',
       'Accelerometer_Z', 'Gyro_X', 'Gyro_Y', 'Gyro_Z',
       'HSI_TP9', 'HSI_AF7', 'HSI_AF8', 'HSI_TP10', 'Battery']

In [None]:
from IPython.display import display, Markdown


# One full-size line chart per signal, each preceded by a rendered Markdown heading.
for col in cols:
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_29_EEG['TimeStamp'], df_29_EEG[col])
    ax.set_xlabel("TimeStamp")
    ax.set_ylabel(col)
    plt.show()

In [None]:
# Annotated correlation matrix of the numeric columns.
fig, ax = plt.subplots(figsize=(28, 12))
corr_matrix = df_29_EEG.corr(numeric_only=True)
sns.heatmap(corr_matrix, cmap='YlGnBu', annot=True, ax=ax)
plt.show()

## Notes & Observations

### Dropping `AUX_RIGHT`
- The `AUX_RIGHT` channel comes from the auxiliary electrode input of the EEG headset.  
- Upon inspection, its values appeared as **flat high-amplitude noise** (750–950 range) without any meaningful oscillatory EEG patterns.  
- Correlation analysis also showed **no significant relationship** between `AUX_RIGHT` and other EEG features.  
- Since it does not carry useful information and only adds noise/dimensionality, we **dropped `AUX_RIGHT`** from the dataset.




In [None]:
# Remove the AUX_RIGHT channel in place (rationale in the notes above).
df_29_EEG.drop(columns='AUX_RIGHT', inplace=True)

In [None]:
# Preview the first rows after removing AUX_RIGHT.
df_29_EEG.head()

In [None]:
# Names of the four per-electrode HSI columns.
HSI_cols = [f"HSI_{site}" for site in ("TP9", "AF7", "AF8", "TP10")]

In [None]:
# Report which HSI codes actually occur for each electrode.
for col in HSI_cols:
    distinct_codes = df_29_EEG[col].unique()
    print(f"Unique values for {col}: {distinct_codes}")

In [None]:
# Row count per HSI code for electrode TP9.
df_29_EEG['HSI_TP9'].value_counts()

In [None]:
# Row count per HSI code for electrode TP10.
df_29_EEG['HSI_TP10'].value_counts()

In [None]:
# Row count per HSI code for electrode AF7.
df_29_EEG['HSI_AF7'].value_counts()

In [None]:
# Row count per HSI code for electrode AF8.
df_29_EEG['HSI_AF8'].value_counts()

In [None]:
# Summary statistics with AUX_RIGHT removed and the HSI columns still present.
df_29_EEG.describe()

## Notes & Observations

### Handling `HSI_*` Columns
- The `HSI_TP9`, `HSI_AF7`, `HSI_AF8`, `HSI_TP10` columns hold the **Horseshoe Indicator (HSI)**, the headband's per-electrode signal-quality (fit) code:  
  - `1 = Good connection`  
  - `2 = Medium connection`  
  - `4 = Bad connection`  
- We need to pay attention to these values because:
  - Bad connections (`4`) indicate unreliable EEG readings.  
  - Medium connections (`2`) may still be usable but should be treated with caution.  
- Instead of dropping rows, we will **convert HSI values into binary flags**:  
  - **1 = Bad connection present**  
  - **0 = Otherwise (Good or Medium)**  
- This preserves all data while giving the model information about electrode reliability.  
- After creating these flags, the raw `HSI_*` columns can be removed to reduce dimensionality.


In [None]:
# Preview the first rows before the HSI flag columns are added.
df_29_EEG.head()

In [None]:
# HSI code -> binary flag: only code 4 (bad connection) maps to 1; codes 1 and 2 map to 0.
signal_flags_values = {code: int(code == 4) for code in (1, 2, 4)}

In [None]:
# Add one int8 "bad connection" flag column per electrode, derived from its HSI code.
# The electrode order fixes the order in which the new columns are appended.
for electrode in ('TP9', 'TP10', 'AF7', 'AF8'):
    hsi_codes = df_29_EEG[f'HSI_{electrode}']
    df_29_EEG[f'HSI_{electrode}_flag'] = hsi_codes.map(signal_flags_values).astype(np.int8)

In [None]:
# The flag columns replace the original HSI code columns, so remove the latter in place.
df_29_EEG.drop(columns=['HSI_TP9', 'HSI_TP10', 'HSI_AF7', 'HSI_AF8'], inplace=True)

In [None]:
# Annotated correlation matrix of the numeric columns, now including the HSI flag columns.
# (A stray trailing character after the heatmap call made this cell a SyntaxError.)
plt.figure(figsize=(28, 12))
sns.heatmap(df_29_EEG.corr(numeric_only=True), cmap='YlGnBu', annot=True)
plt.show()

## Notes & Observations

### Dropping RAW EEG Channels

- The EEG dataset includes `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` columns, which represent the unprocessed voltage readings from each electrode.  
- Upon inspection:
  - These RAW signals are **highly noisy** and show no clear oscillatory patterns typical of brainwave activity.  
  - Correlation analysis with other features shows **very low correlations** (near 0.0), indicating they do not contribute predictive value.  
  - Plots of RAW vs. time reveal large fluctuations without meaningful structure.

- Reason for dropping:
  - The dataset already provides **frequency band powers** (`Delta`, `Theta`, `Alpha`, `Beta`, `Gamma`) for each electrode, which are **derived from RAW signals** and are far more informative.  
  - Keeping RAW adds unnecessary dimensionality and noise, which can negatively affect machine learning models.

- Action taken:
  - `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` are **dropped from preprocessing**.  
  - The focus will be on the **precomputed band power features** for all EEG electrodes, which are sufficient for cognitive load prediction and regression tasks.


In [None]:
# Remove the four RAW electrode channels in place (rationale in the notes above).
df_29_EEG.drop(columns=['RAW_TP9', 'RAW_AF7', 'RAW_AF8', 'RAW_TP10'], inplace=True)

In [None]:
# Preview the first rows after removing the RAW channels.
df_29_EEG.head()

In [None]:
# Histogram (with KDE overlay) for every numeric column except UnixTime.
numeric_cols = df_29_EEG.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid layout: a fixed number of columns and as many rows as needed (ceiling division).
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for i, col in enumerate(cols_to_plot):
    # Subplot positions are 1-based, hence i + 1.
    ax = fig.add_subplot(n_rows, n_cols, i + 1)
    sns.histplot(df_29_EEG[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# Pairwise scatter plots / distributions of the numeric columns.
# sns.pairplot is a figure-level function that creates its own figure, so no
# plt.figure(...) call is made here (it would only add an extra, empty figure).
sns.pairplot(df_29_EEG)
plt.show()

## **30_EEG**

In [None]:
# Load the 30_EEG.csv recording; the path is relative to the current working directory.
df_30_EEG = pd.read_csv('data/STData/30/30_EEG.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
df_30_EEG.head()

In [None]:
# (rows, columns) of the freshly loaded frame.
df_30_EEG.shape

In [None]:
# Column labels as read from the CSV.
df_30_EEG.columns

In [None]:
# Per-column dtype and non-null count, plus overall memory usage.
df_30_EEG.info()

In [None]:
# Number of missing values in each column.
df_30_EEG.isna().sum()

In [None]:
# Visualise where the missing values sit: one heatmap cell per DataFrame cell.
fig, ax = plt.subplots(figsize=(16, 10))
null_mask = df_30_EEG.isnull()
sns.heatmap(null_mask, cmap='YlGnBu', ax=ax)
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` and `Elements` columns.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  
- The `Elements` column is almost entirely null; dropping it likely won’t meaningfully impact our data quality.


In [None]:
# List the distinct QuestionKey labels (NaN included) before any cleaning.
df_30_EEG['QuestionKey'].unique()

In [None]:
# Convert TimeStamp to pandas datetime values; the time-series plots below use it as the x-axis.
df_30_EEG['TimeStamp'] = pd.to_datetime(df_30_EEG['TimeStamp'])

In [None]:
# Summary statistics of the frame after the TimeStamp conversion.
df_30_EEG.describe()

In [None]:
# Preview the first three rows after the TimeStamp conversion.
df_30_EEG.head(3)

In [None]:
# Remove the Elements column in place (the notes above describe it as almost entirely null).
df_30_EEG.drop(columns='Elements', inplace=True)

In [None]:
# Re-inspect the distinct QuestionKey labels right before the NaN fill.
df_30_EEG['QuestionKey'].unique()

In [None]:
# Replace missing QuestionKey values with the literal string 'None'.
# The result is assigned back rather than using fillna(inplace=True) on the
# selected column: that chained in-place form emits a FutureWarning in
# pandas 2.x and leaves the frame untouched under Copy-on-Write.
df_30_EEG['QuestionKey'] = df_30_EEG['QuestionKey'].fillna('None')

In [None]:
# Number of rows per QuestionKey label, including the 'None' placeholder.
df_30_EEG['QuestionKey'].value_counts()

In [None]:
# Distinct values present in the HeadBandOn column.
df_30_EEG['HeadBandOn'].unique()

In [None]:
# Count of missing HeadBandOn entries.
df_30_EEG['HeadBandOn'].isna().sum()

In [None]:
# (rows, columns) before dropping rows that contain missing values.
df_30_EEG.shape

In [None]:
# Drop, in place, every row that still has at least one missing value.
df_30_EEG.dropna(axis=0, how='any', inplace=True)

In [None]:
# (rows, columns) after the dropna, for comparison with the shape shown just before it.
df_30_EEG.shape

In [None]:
# Remove the HeadBandOn column in place.
df_30_EEG.drop(columns='HeadBandOn', inplace=True)

In [None]:
# Visual check for remaining missing values: one heatmap cell per DataFrame cell.
fig, ax = plt.subplots(figsize=(16, 10))
null_mask = df_30_EEG.isnull()
sns.heatmap(null_mask, cmap='YlGnBu', ax=ax)
plt.show()

## Status Update

- All **null** (or missing) values in the dataset have been handled / eliminated.
- There are no remaining null values in any column.
- The dataset is now “complete” in the sense that every cell has a valid (non-null) entry.


In [None]:
# Column labels remaining after the null-handling steps.
df_30_EEG.columns


In [None]:
# Signals to plot against time: band powers and RAW for the four electrodes,
# then the auxiliary channel, motion sensors, HSI codes and battery level.
cols = [
    *(f'{prefix}_{site}'
      for prefix in ('Delta', 'Theta', 'Alpha', 'Beta', 'Gamma', 'RAW')
      for site in ('TP9', 'AF7', 'AF8', 'TP10')),
    'AUX_RIGHT',
    *(f'Accelerometer_{axis}' for axis in 'XYZ'),
    *(f'Gyro_{axis}' for axis in 'XYZ'),
    *(f'HSI_{site}' for site in ('TP9', 'AF7', 'AF8', 'TP10')),
    'Battery',
]

In [None]:
from IPython.display import display, Markdown


# One full-size line chart per signal, each preceded by a rendered Markdown heading.
for col in cols:
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_30_EEG['TimeStamp'], df_30_EEG[col])
    ax.set_xlabel("TimeStamp")
    ax.set_ylabel(col)
    plt.show()

In [None]:
# Annotated correlation matrix of the numeric columns.
fig, ax = plt.subplots(figsize=(28, 12))
corr_matrix = df_30_EEG.corr(numeric_only=True)
sns.heatmap(corr_matrix, cmap='YlGnBu', annot=True, ax=ax)
plt.show()

## Notes & Observations

### Dropping `AUX_RIGHT`
- The `AUX_RIGHT` channel comes from the auxiliary electrode input of the EEG headset.  
- Upon inspection, its values appeared as **flat high-amplitude noise** (750–950 range) without any meaningful oscillatory EEG patterns.  
- Correlation analysis also showed **no significant relationship** between `AUX_RIGHT` and other EEG features.  
- Since it does not carry useful information and only adds noise/dimensionality, we **dropped `AUX_RIGHT`** from the dataset.




In [None]:
# Remove the AUX_RIGHT channel in place (rationale in the notes above).
df_30_EEG.drop(columns='AUX_RIGHT', inplace=True)

In [None]:
# Preview the first rows after removing AUX_RIGHT.
df_30_EEG.head()

In [None]:
# Names of the four per-electrode HSI columns.
HSI_cols = [f"HSI_{site}" for site in ("TP9", "AF7", "AF8", "TP10")]

In [None]:
# Report which HSI codes actually occur for each electrode.
for col in HSI_cols:
    distinct_codes = df_30_EEG[col].unique()
    print(f"Unique values for {col}: {distinct_codes}")

In [None]:
# Row count per HSI code for electrode TP9.
df_30_EEG['HSI_TP9'].value_counts()

In [None]:
# Row count per HSI code for electrode TP10.
df_30_EEG['HSI_TP10'].value_counts()

In [None]:
# Row count per HSI code for electrode AF7.
df_30_EEG['HSI_AF7'].value_counts()

In [None]:
# Row count per HSI code for electrode AF8.
df_30_EEG['HSI_AF8'].value_counts()

In [None]:
# Summary statistics with AUX_RIGHT removed and the HSI columns still present.
df_30_EEG.describe()

## Notes & Observations

### Handling `HSI_*` Columns
- The `HSI_TP9`, `HSI_AF7`, `HSI_AF8`, `HSI_TP10` columns hold the **Horseshoe Indicator (HSI)**, the headband's per-electrode signal-quality (fit) code:  
  - `1 = Good connection`  
  - `2 = Medium connection`  
  - `4 = Bad connection`  
- We need to pay attention to these values because:
  - Bad connections (`4`) indicate unreliable EEG readings.  
  - Medium connections (`2`) may still be usable but should be treated with caution.  
- Instead of dropping rows, we will **convert HSI values into binary flags**:  
  - **1 = Bad connection present**  
  - **0 = Otherwise (Good or Medium)**  
- This preserves all data while giving the model information about electrode reliability.  
- After creating these flags, the raw `HSI_*` columns can be removed to reduce dimensionality.


In [None]:
# Preview the first rows before the HSI flag columns are added.
df_30_EEG.head()

In [None]:
# HSI code -> binary flag: only code 4 (bad connection) maps to 1; codes 1 and 2 map to 0.
signal_flags_values = {code: int(code == 4) for code in (1, 2, 4)}

In [None]:
# Add one int8 "bad connection" flag column per electrode, derived from its HSI code.
# The electrode order fixes the order in which the new columns are appended.
for electrode in ('TP9', 'TP10', 'AF7', 'AF8'):
    hsi_codes = df_30_EEG[f'HSI_{electrode}']
    df_30_EEG[f'HSI_{electrode}_flag'] = hsi_codes.map(signal_flags_values).astype(np.int8)

In [None]:
# The flag columns replace the original HSI code columns, so remove the latter in place.
df_30_EEG.drop(columns=['HSI_TP9', 'HSI_TP10', 'HSI_AF7', 'HSI_AF8'], inplace=True)

In [None]:
# Annotated correlation matrix of the numeric columns, now including the HSI flag columns.
# (A stray trailing character after the heatmap call made this cell a SyntaxError.)
plt.figure(figsize=(28, 12))
sns.heatmap(df_30_EEG.corr(numeric_only=True), cmap='YlGnBu', annot=True)
plt.show()

## Notes & Observations

### Dropping RAW EEG Channels

- The EEG dataset includes `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` columns, which represent the unprocessed voltage readings from each electrode.  
- Upon inspection:
  - These RAW signals are **highly noisy** and show no clear oscillatory patterns typical of brainwave activity.  
  - Correlation analysis with other features shows **very low correlations** (near 0.0), indicating they do not contribute predictive value.  
  - Plots of RAW vs. time reveal large fluctuations without meaningful structure.

- Reason for dropping:
  - The dataset already provides **frequency band powers** (`Delta`, `Theta`, `Alpha`, `Beta`, `Gamma`) for each electrode, which are **derived from RAW signals** and are far more informative.  
  - Keeping RAW adds unnecessary dimensionality and noise, which can negatively affect machine learning models.

- Action taken:
  - `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` are **dropped from preprocessing**.  
  - The focus will be on the **precomputed band power features** for all EEG electrodes, which are sufficient for cognitive load prediction and regression tasks.


In [None]:
# Remove the four RAW electrode channels in place (rationale in the notes above).
df_30_EEG.drop(columns=['RAW_TP9', 'RAW_AF7', 'RAW_AF8', 'RAW_TP10'], inplace=True)

In [None]:
# Preview the first rows after removing the RAW channels.
df_30_EEG.head()

In [None]:
# Histogram (with KDE overlay) for every numeric column except UnixTime.
numeric_cols = df_30_EEG.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid layout: a fixed number of columns and as many rows as needed (ceiling division).
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for i, col in enumerate(cols_to_plot):
    # Subplot positions are 1-based, hence i + 1.
    ax = fig.add_subplot(n_rows, n_cols, i + 1)
    sns.histplot(df_30_EEG[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# Pairwise scatter plots / distributions of the numeric columns.
# sns.pairplot is a figure-level function that creates its own figure, so no
# plt.figure(...) call is made here (it would only add an extra, empty figure).
sns.pairplot(df_30_EEG)
plt.show()