In [None]:
%load_ext cudf


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

In [None]:
pd.set_option('display.max_columns', None)

## **21_EEG**






In [None]:
df_1_EEG = pd.read_csv('data/STData/1/1_EEG.csv')

In [None]:
df_1_EEG.head()

In [None]:
df_1_EEG.shape

In [None]:
df_1_EEG.columns

In [None]:
df_1_EEG.info()

In [None]:
df_1_EEG.isnull().sum()

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_1_EEG.isnull(), cmap='YlGnBu')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` and `Elements` columns.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  
- The `Elements` column is almost entirely null; dropping it likely won’t meaningfully impact our data quality.


In [None]:
df_1_EEG['QuestionKey'].unique()

In [None]:
df_1_EEG['TimeStamp'] = pd.to_datetime(df_1_EEG['TimeStamp'])

In [None]:
df_1_EEG.describe()

In [None]:
df_1_EEG.head(3)

In [None]:
df_1_EEG.drop('Elements', axis=1, inplace=True)

In [None]:
df_1_EEG['QuestionKey'].unique()

In [None]:
# Label the "no question displayed" intervals explicitly. The filled column is
# assigned back because calling fillna(inplace=True) on a df[...] selection is
# chained assignment: it warns in pandas 2.x and leaves the frame unchanged
# once Copy-on-Write is enabled.
df_1_EEG['QuestionKey'] = df_1_EEG['QuestionKey'].fillna('None')

In [None]:
df_1_EEG['QuestionKey'].value_counts()

In [None]:
df_1_EEG['HeadBandOn'].unique()

In [None]:
df_1_EEG['HeadBandOn'].isnull().sum()

In [None]:
df_1_EEG.shape

In [None]:
df_1_EEG.dropna(inplace=True)

In [None]:
df_1_EEG.shape

In [None]:
df_1_EEG.drop('HeadBandOn', axis=1, inplace=True)

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_1_EEG.isnull(), cmap='YlGnBu')
plt.show()

## Status Update

- All **null** (or missing) values in the dataset have been handled / eliminated.
- There are no remaining null values in any column.
- The dataset is now “complete” in the sense that every cell has a valid (non-null) entry.


In [None]:
df_1_EEG.columns

In [None]:
cols = ['Delta_TP9', 'Delta_AF7',
       'Delta_AF8', 'Delta_TP10', 'Theta_TP9', 'Theta_AF7', 'Theta_AF8',
       'Theta_TP10', 'Alpha_TP9', 'Alpha_AF7', 'Alpha_AF8', 'Alpha_TP10',
       'Beta_TP9', 'Beta_AF7', 'Beta_AF8', 'Beta_TP10', 'Gamma_TP9',
       'Gamma_AF7', 'Gamma_AF8', 'Gamma_TP10', 'RAW_TP9', 'RAW_AF7', 'RAW_AF8',
       'RAW_TP10', 'AUX_RIGHT', 'Accelerometer_X', 'Accelerometer_Y',
       'Accelerometer_Z', 'Gyro_X', 'Gyro_Y', 'Gyro_Z',
       'HSI_TP9', 'HSI_AF7', 'HSI_AF8', 'HSI_TP10', 'Battery']

In [None]:
# NOTE: this whole cell is intentionally disabled. It is a datashader-based
# variant of the per-signal line plots (rasterising each column against
# UnixTime); the cell below draws the same signals with plain matplotlib.
# from IPython.display import display, Markdown

# canvas = ds.Canvas(plot_width=1000, plot_height=600)

# for col in cols:
#     # Add a markdown cell before each plot for better separation and labeling
#     display(Markdown(f'### {col} over Time'))
#     agg = canvas.line(df_1_EEG, x='UnixTime', y=col)
#     img = tf.shade(agg)
#     plt.figure(figsize=(16, 10))
#     plt.imshow(img.to_pil())
#     # plt.axis('off')  # Removed this line to show axes
#     plt.xlabel("UnixTime") # Add x-axis label
#     plt.ylabel(col) # Add y-axis label
#     plt.show()

In [None]:
from IPython.display import Markdown, display

# One full-size line chart per signal in `cols`, each preceded by a rendered
# markdown heading so the plots are easy to tell apart when scrolling.
for col in cols:
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_1_EEG['TimeStamp'], df_1_EEG[col])
    ax.set_xlabel("TimeStamp")
    ax.set_ylabel(col)
    plt.show()

In [None]:
plt.figure(figsize=(8,12))
sns.heatmap(df_1_EEG.corr(numeric_only=True), cmap='YlGnBu', annot=True)
plt.show()

## Notes & Observations

### Dropping `AUX_RIGHT`
- The `AUX_RIGHT` channel comes from the auxiliary electrode input of the EEG headset.  
- Upon inspection, its values appeared as **flat high-amplitude noise** (750–950 range) without any meaningful oscillatory EEG patterns.  
- Correlation analysis also showed **no significant relationship** between `AUX_RIGHT` and other EEG features.  
- Since it does not carry useful information and only adds noise/dimensionality, we **dropped `AUX_RIGHT`** from the dataset.




In [None]:
df_1_EEG.drop('AUX_RIGHT', axis=1, inplace=True)

In [None]:
df_1_EEG.head()

In [None]:
HSI_cols = ["HSI_TP9", "HSI_AF7", "HSI_AF8", "HSI_TP10"]

In [None]:
for col in HSI_cols:
    print(f"Unique values for {col}: {df_1_EEG[col].unique()}")

In [None]:
df_1_EEG['HSI_TP9'].value_counts()

In [None]:
df_1_EEG['HSI_TP10'].value_counts()

In [None]:
df_1_EEG['HSI_AF7'].value_counts()

In [None]:
df_1_EEG['HSI_AF8'].value_counts()

In [None]:
df_1_EEG.describe()

## Notes & Observations

### Handling `HSI_*` Columns
- The `HSI_TP9`, `HSI_AF7`, `HSI_AF8`, `HSI_TP10` columns represent **Headset Signal Integrity** for each electrode:  
  - `1 = Good connection`  
  - `2 = Medium connection`  
  - `4 = Bad connection`  
- We need to pay attention to these values because:
  - Bad connections (`4`) indicate unreliable EEG readings.  
  - Medium connections (`2`) may still be usable but should be treated with caution.  
- Instead of dropping rows, we will **convert HSI values into binary flags**:  
  - **1 = Bad connection present**  
  - **0 = Otherwise (Good or Medium)**  
- This preserves all data while giving the model information about electrode reliability.  
- After creating these flags, the raw `HSI_*` columns can be removed to reduce dimensionality.


In [None]:
df_1_EEG.head()

In [None]:
signal_flags_values = { 1: 0, 2: 0, 4: 1}

In [None]:
# Build one int8 flag column per electrode by passing its HSI reading through
# the `signal_flags_values` lookup (columns are created in the order
# TP9, TP10, AF7, AF8).
for electrode in ('TP9', 'TP10', 'AF7', 'AF8'):
    hsi_reading = df_1_EEG[f'HSI_{electrode}']
    df_1_EEG[f'HSI_{electrode}_flag'] = hsi_reading.map(signal_flags_values).astype(np.int8)

In [None]:
df_1_EEG.drop(['HSI_TP9', 'HSI_TP10', 'HSI_AF7', 'HSI_AF8'], axis=1, inplace=True)

In [None]:
plt.figure(figsize=(20,12))
sns.heatmap(df_1_EEG.corr(numeric_only=True), cmap='YlGnBu', annot=True)
plt.show()

## Notes & Observations

### Dropping RAW EEG Channels

- The EEG dataset includes `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` columns, which represent the unprocessed voltage readings from each electrode.  
- Upon inspection:
  - These RAW signals are **highly noisy** and show no clear oscillatory patterns typical of brainwave activity.  
  - Correlation analysis with other features shows **very low correlations** (near 0.0), indicating they do not contribute predictive value.  
  - Plots of RAW vs. time reveal large fluctuations without meaningful structure.

- Reason for dropping:
  - The dataset already provides **frequency band powers** (`Delta`, `Theta`, `Alpha`, `Beta`, `Gamma`) for each electrode, which are **derived from RAW signals** and are far more informative.  
  - Keeping RAW adds unnecessary dimensionality and noise, which can negatively affect machine learning models.

- Action taken:
  - `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` are **dropped from preprocessing**.  
  - The focus will be on the **precomputed band power features** for all EEG electrodes, which are sufficient for cognitive load prediction and regression tasks.


In [None]:
df_1_EEG.drop(['RAW_TP9',	'RAW_AF7',	'RAW_AF8',	'RAW_TP10'], axis=1, inplace=True)

In [None]:
df_1_EEG.head()

In [None]:
# Distribution overview: one histogram (with KDE overlay) per numeric column,
# leaving out the UnixTime stamp.
numeric_cols = df_1_EEG.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid layout: fixed number of columns, as many rows as needed (ceiling division).
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(5 * n_cols, 4 * n_rows))

for i, col in enumerate(cols_to_plot):
    ax = fig.add_subplot(n_rows, n_cols, i + 1)
    sns.histplot(df_1_EEG[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# `sns.pairplot` is a figure-level function that builds its own figure, so the
# former `plt.figure(figsize=...)` call only produced an extra empty figure and
# its size never applied to the grid.
# NOTE(review): the grid grows quadratically with the number of numeric columns
# and draws every row - consider a row sample or column subset if this is slow.
sns.pairplot(df_1_EEG)
plt.show()

# **22 EEG**




In [None]:
df_2_EEG = pd.read_csv('data/STData/2/2_EEG.csv')

In [None]:
df_2_EEG.head()

In [None]:
df_2_EEG.shape

In [None]:
df_2_EEG.columns

In [None]:
df_2_EEG.info()

In [None]:
df_2_EEG.isnull().sum()

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_2_EEG.isnull(), cmap='YlGnBu')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` and `Elements` columns.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  
- The `Elements` column is almost entirely null; dropping it likely won’t meaningfully impact our data quality.


In [None]:
df_2_EEG['QuestionKey'].unique()

In [None]:
df_2_EEG['TimeStamp'] = pd.to_datetime(df_2_EEG['TimeStamp'])

In [None]:
df_2_EEG.describe()

In [None]:
df_2_EEG.head(3)

In [None]:
df_2_EEG.drop('Elements', axis=1, inplace=True)

In [None]:
df_2_EEG['QuestionKey'].unique()

In [None]:
# Label the "no question displayed" intervals explicitly. The filled column is
# assigned back because calling fillna(inplace=True) on a df[...] selection is
# chained assignment: it warns in pandas 2.x and leaves the frame unchanged
# once Copy-on-Write is enabled.
df_2_EEG['QuestionKey'] = df_2_EEG['QuestionKey'].fillna('None')

In [None]:
df_2_EEG['QuestionKey'].value_counts()

In [None]:
df_2_EEG['HeadBandOn'].unique()

In [None]:
df_2_EEG['HeadBandOn'].isnull().sum()

In [None]:
df_2_EEG.shape

In [None]:
df_2_EEG.dropna(inplace=True)

In [None]:
df_2_EEG.shape

In [None]:
df_2_EEG.drop('HeadBandOn', axis=1, inplace=True)

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_2_EEG.isnull(), cmap='YlGnBu')
plt.show()

## Status Update

- All **null** (or missing) values in the dataset have been handled / eliminated.
- There are no remaining null values in any column.
- The dataset is now “complete” in the sense that every cell has a valid (non-null) entry.


In [None]:
df_2_EEG.columns


In [None]:
cols = ['Delta_TP9', 'Delta_AF7',
       'Delta_AF8', 'Delta_TP10', 'Theta_TP9', 'Theta_AF7', 'Theta_AF8',
       'Theta_TP10', 'Alpha_TP9', 'Alpha_AF7', 'Alpha_AF8', 'Alpha_TP10',
       'Beta_TP9', 'Beta_AF7', 'Beta_AF8', 'Beta_TP10', 'Gamma_TP9',
       'Gamma_AF7', 'Gamma_AF8', 'Gamma_TP10', 'RAW_TP9', 'RAW_AF7', 'RAW_AF8',
       'RAW_TP10', 'AUX_RIGHT', 'Accelerometer_X', 'Accelerometer_Y',
       'Accelerometer_Z', 'Gyro_X', 'Gyro_Y', 'Gyro_Z',
       'HSI_TP9', 'HSI_AF7', 'HSI_AF8', 'HSI_TP10', 'Battery']

In [None]:
from IPython.display import Markdown, display

# One full-size line chart per signal in `cols`, each preceded by a rendered
# markdown heading so the plots are easy to tell apart when scrolling.
for col in cols:
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_2_EEG['TimeStamp'], df_2_EEG[col])
    ax.set_xlabel("TimeStamp")
    ax.set_ylabel(col)
    plt.show()

In [None]:
plt.figure(figsize=(8,12))
sns.heatmap(df_2_EEG.corr(numeric_only=True), cmap='YlGnBu', annot=True)
plt.show()

## Notes & Observations

### Dropping `AUX_RIGHT`
- The `AUX_RIGHT` channel comes from the auxiliary electrode input of the EEG headset.  
- Upon inspection, its values appeared as **flat high-amplitude noise** (750–950 range) without any meaningful oscillatory EEG patterns.  
- Correlation analysis also showed **no significant relationship** between `AUX_RIGHT` and other EEG features.  
- Since it does not carry useful information and only adds noise/dimensionality, we **dropped `AUX_RIGHT`** from the dataset.




In [None]:
df_2_EEG.drop('AUX_RIGHT', axis=1, inplace=True)

In [None]:
df_2_EEG.head()

In [None]:
HSI_cols = ["HSI_TP9", "HSI_AF7", "HSI_AF8", "HSI_TP10"]

In [None]:
for col in HSI_cols:
    print(f"Unique values for {col}: {df_2_EEG[col].unique()}")

In [None]:
df_2_EEG['HSI_TP9'].value_counts()

In [None]:
df_2_EEG['HSI_TP10'].value_counts()

In [None]:
df_2_EEG['HSI_AF7'].value_counts()

In [None]:
df_2_EEG['HSI_AF8'].value_counts()

In [None]:
df_2_EEG.describe()

## Notes & Observations

### Handling `HSI_*` Columns
- The `HSI_TP9`, `HSI_AF7`, `HSI_AF8`, `HSI_TP10` columns represent **Headset Signal Integrity** for each electrode:  
  - `1 = Good connection`  
  - `2 = Medium connection`  
  - `4 = Bad connection`  
- We need to pay attention to these values because:
  - Bad connections (`4`) indicate unreliable EEG readings.  
  - Medium connections (`2`) may still be usable but should be treated with caution.  
- Instead of dropping rows, we will **convert HSI values into binary flags**:  
  - **1 = Bad connection present**  
  - **0 = Otherwise (Good or Medium)**  
- This preserves all data while giving the model information about electrode reliability.  
- After creating these flags, the raw `HSI_*` columns can be removed to reduce dimensionality.


In [None]:
df_2_EEG.head()

In [None]:
signal_flags_values = { 1: 0, 2: 0, 4: 1}

In [None]:
# Build one int8 flag column per electrode by passing its HSI reading through
# the `signal_flags_values` lookup (columns are created in the order
# TP9, TP10, AF7, AF8).
for electrode in ('TP9', 'TP10', 'AF7', 'AF8'):
    hsi_reading = df_2_EEG[f'HSI_{electrode}']
    df_2_EEG[f'HSI_{electrode}_flag'] = hsi_reading.map(signal_flags_values).astype(np.int8)

In [None]:
df_2_EEG.drop(['HSI_TP9', 'HSI_TP10', 'HSI_AF7', 'HSI_AF8'], axis=1, inplace=True)

In [None]:
plt.figure(figsize=(24,12))
sns.heatmap(df_2_EEG.corr(numeric_only=True), cmap='YlGnBu', annot=True)
plt.show()

## Notes & Observations

### Dropping RAW EEG Channels

- The EEG dataset includes `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` columns, which represent the unprocessed voltage readings from each electrode.  
- Upon inspection:
  - These RAW signals are **highly noisy** and show no clear oscillatory patterns typical of brainwave activity.  
  - Correlation analysis with other features shows **very low correlations** (near 0.0), indicating they do not contribute predictive value.  
  - Plots of RAW vs. time reveal large fluctuations without meaningful structure.

- Reason for dropping:
  - The dataset already provides **frequency band powers** (`Delta`, `Theta`, `Alpha`, `Beta`, `Gamma`) for each electrode, which are **derived from RAW signals** and are far more informative.  
  - Keeping RAW adds unnecessary dimensionality and noise, which can negatively affect machine learning models.

- Action taken:
  - `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` are **dropped from preprocessing**.  
  - The focus will be on the **precomputed band power features** for all EEG electrodes, which are sufficient for cognitive load prediction and regression tasks.


In [None]:
df_2_EEG.drop(['RAW_TP9',	'RAW_AF7',	'RAW_AF8',	'RAW_TP10'], axis=1, inplace=True)

In [None]:
df_2_EEG.head()

In [None]:
# Distribution overview: one histogram (with KDE overlay) per numeric column,
# leaving out the UnixTime stamp.
numeric_cols = df_2_EEG.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid layout: fixed number of columns, as many rows as needed (ceiling division).
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(5 * n_cols, 4 * n_rows))

for i, col in enumerate(cols_to_plot):
    ax = fig.add_subplot(n_rows, n_cols, i + 1)
    sns.histplot(df_2_EEG[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# `sns.pairplot` is a figure-level function that builds its own figure, so the
# former `plt.figure(figsize=...)` call only produced an extra empty figure and
# its size never applied to the grid.
# NOTE(review): the grid grows quadratically with the number of numeric columns
# and draws every row - consider a row sample or column subset if this is slow.
sns.pairplot(df_2_EEG)
plt.show()

**23 EEG**

In [None]:
df_3_EEG = pd.read_csv('data/STData/3/3_EEG.csv')

In [None]:
df_3_EEG.head()

In [None]:
df_3_EEG.shape

In [None]:
df_3_EEG.columns

In [None]:
df_3_EEG.info()

In [None]:
df_3_EEG.isnull().sum()

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_3_EEG.isnull(), cmap='YlGnBu')
plt.show()

In [None]:
df_3_EEG['QuestionKey'].unique()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` and `Elements` columns.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  
- The `Elements` column is almost entirely null; dropping it likely won’t meaningfully impact our data quality.


In [None]:
df_3_EEG['QuestionKey'].unique()

In [None]:
df_3_EEG['TimeStamp'] = pd.to_datetime(df_3_EEG['TimeStamp'])

In [None]:
df_3_EEG.describe()

In [None]:
df_3_EEG.head(3)

In [None]:
df_3_EEG.drop('Elements', axis=1, inplace=True)

In [None]:
df_3_EEG['QuestionKey'].unique()

In [None]:
# Label the "no question displayed" intervals explicitly. The filled column is
# assigned back because calling fillna(inplace=True) on a df[...] selection is
# chained assignment: it warns in pandas 2.x and leaves the frame unchanged
# once Copy-on-Write is enabled.
df_3_EEG['QuestionKey'] = df_3_EEG['QuestionKey'].fillna('None')

In [None]:
df_3_EEG['QuestionKey'].value_counts()

In [None]:
df_3_EEG['HeadBandOn'].unique()

In [None]:
df_3_EEG['HeadBandOn'].isnull().sum()

In [None]:
df_3_EEG.shape

In [None]:
df_3_EEG.dropna(inplace=True)

In [None]:
df_3_EEG.shape

In [None]:
df_3_EEG.drop('HeadBandOn', axis=1, inplace=True)

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_3_EEG.isnull(), cmap='YlGnBu')
plt.show()

## Status Update

- All **null** (or missing) values in the dataset have been handled / eliminated.
- There are no remaining null values in any column.
- The dataset is now “complete” in the sense that every cell has a valid (non-null) entry.


In [None]:
df_3_EEG.columns


In [None]:
cols = ['Delta_TP9', 'Delta_AF7',
       'Delta_AF8', 'Delta_TP10', 'Theta_TP9', 'Theta_AF7', 'Theta_AF8',
       'Theta_TP10', 'Alpha_TP9', 'Alpha_AF7', 'Alpha_AF8', 'Alpha_TP10',
       'Beta_TP9', 'Beta_AF7', 'Beta_AF8', 'Beta_TP10', 'Gamma_TP9',
       'Gamma_AF7', 'Gamma_AF8', 'Gamma_TP10', 'RAW_TP9', 'RAW_AF7', 'RAW_AF8',
       'RAW_TP10', 'AUX_RIGHT', 'Accelerometer_X', 'Accelerometer_Y',
       'Accelerometer_Z', 'Gyro_X', 'Gyro_Y', 'Gyro_Z',
       'HSI_TP9', 'HSI_AF7', 'HSI_AF8', 'HSI_TP10', 'Battery']

In [None]:
from IPython.display import Markdown, display

# One full-size line chart per signal in `cols`, each preceded by a rendered
# markdown heading so the plots are easy to tell apart when scrolling.
for col in cols:
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_3_EEG['TimeStamp'], df_3_EEG[col])
    ax.set_xlabel("TimeStamp")
    ax.set_ylabel(col)
    plt.show()

In [None]:
plt.figure(figsize=(8,12))
sns.heatmap(df_3_EEG.corr(numeric_only=True), cmap='YlGnBu', annot=True)
plt.show()

## Notes & Observations

### Dropping `AUX_RIGHT`
- The `AUX_RIGHT` channel comes from the auxiliary electrode input of the EEG headset.  
- Upon inspection, its values appeared as **flat high-amplitude noise** (750–950 range) without any meaningful oscillatory EEG patterns.  
- Correlation analysis also showed **no significant relationship** between `AUX_RIGHT` and other EEG features.  
- Since it does not carry useful information and only adds noise/dimensionality, we **dropped `AUX_RIGHT`** from the dataset.




In [None]:
df_3_EEG.drop('AUX_RIGHT', axis=1, inplace=True)

In [None]:
df_3_EEG.head()

In [None]:
HSI_cols = ["HSI_TP9", "HSI_AF7", "HSI_AF8", "HSI_TP10"]

In [None]:
for col in HSI_cols:
    print(f"Unique values for {col}: {df_3_EEG[col].unique()}")

In [None]:
df_3_EEG['HSI_TP9'].value_counts()

In [None]:
df_3_EEG['HSI_TP10'].value_counts()

In [None]:
df_3_EEG['HSI_AF7'].value_counts()

In [None]:
df_3_EEG['HSI_AF8'].value_counts()

In [None]:
df_3_EEG.describe()

## Notes & Observations

### Handling `HSI_*` Columns
- The `HSI_TP9`, `HSI_AF7`, `HSI_AF8`, `HSI_TP10` columns represent **Headset Signal Integrity** for each electrode:  
  - `1 = Good connection`  
  - `2 = Medium connection`  
  - `4 = Bad connection`  
- We need to pay attention to these values because:
  - Bad connections (`4`) indicate unreliable EEG readings.  
  - Medium connections (`2`) may still be usable but should be treated with caution.  
- Instead of dropping rows, we will **convert HSI values into binary flags**:  
  - **1 = Bad connection present**  
  - **0 = Otherwise (Good or Medium)**  
- This preserves all data while giving the model information about electrode reliability.  
- After creating these flags, the raw `HSI_*` columns can be removed to reduce dimensionality.


In [None]:
df_3_EEG.head()

In [None]:
signal_flags_values = { 1: 0, 2: 0, 4: 1}

In [None]:
# Build one int8 flag column per electrode by passing its HSI reading through
# the `signal_flags_values` lookup (columns are created in the order
# TP9, TP10, AF7, AF8).
for electrode in ('TP9', 'TP10', 'AF7', 'AF8'):
    hsi_reading = df_3_EEG[f'HSI_{electrode}']
    df_3_EEG[f'HSI_{electrode}_flag'] = hsi_reading.map(signal_flags_values).astype(np.int8)

In [None]:
df_3_EEG.drop(['HSI_TP9', 'HSI_TP10', 'HSI_AF7', 'HSI_AF8'], axis=1, inplace=True)

In [None]:
plt.figure(figsize=(24,12))
sns.heatmap(df_3_EEG.corr(numeric_only=True), cmap='YlGnBu', annot=True)
plt.show()

## Notes & Observations

### Dropping RAW EEG Channels

- The EEG dataset includes `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` columns, which represent the unprocessed voltage readings from each electrode.  
- Upon inspection:
  - These RAW signals are **highly noisy** and show no clear oscillatory patterns typical of brainwave activity.  
  - Correlation analysis with other features shows **very low correlations** (near 0.0), indicating they do not contribute predictive value.  
  - Plots of RAW vs. time reveal large fluctuations without meaningful structure.

- Reason for dropping:
  - The dataset already provides **frequency band powers** (`Delta`, `Theta`, `Alpha`, `Beta`, `Gamma`) for each electrode, which are **derived from RAW signals** and are far more informative.  
  - Keeping RAW adds unnecessary dimensionality and noise, which can negatively affect machine learning models.

- Action taken:
  - `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` are **dropped from preprocessing**.  
  - The focus will be on the **precomputed band power features** for all EEG electrodes, which are sufficient for cognitive load prediction and regression tasks.


In [None]:
df_3_EEG.drop(['RAW_TP9',	'RAW_AF7',	'RAW_AF8',	'RAW_TP10'], axis=1, inplace=True)

In [None]:
df_3_EEG.head()

In [None]:
# Distribution overview: one histogram (with KDE overlay) per numeric column,
# leaving out the UnixTime stamp.
numeric_cols = df_3_EEG.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid layout: fixed number of columns, as many rows as needed (ceiling division).
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(5 * n_cols, 4 * n_rows))

for i, col in enumerate(cols_to_plot):
    ax = fig.add_subplot(n_rows, n_cols, i + 1)
    sns.histplot(df_3_EEG[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# `sns.pairplot` is a figure-level function that builds its own figure, so the
# former `plt.figure(figsize=...)` call only produced an extra empty figure and
# its size never applied to the grid.
# NOTE(review): the grid grows quadratically with the number of numeric columns
# and draws every row - consider a row sample or column subset if this is slow.
sns.pairplot(df_3_EEG)
plt.show()

# ***24 EEG***

In [None]:
df_4_EEG = pd.read_csv('data/STData/4/4_EEG.csv')

In [None]:
df_4_EEG.head()

In [None]:
df_4_EEG.shape

In [None]:
df_4_EEG.columns

In [None]:
df_4_EEG.info()

In [None]:
df_4_EEG.isnull().sum()

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_4_EEG.isnull(), cmap='YlGnBu')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` and `Elements` columns.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  
- The `Elements` column is almost entirely null; dropping it likely won’t meaningfully impact our data quality.


In [None]:
df_4_EEG['QuestionKey'].unique()

In [None]:
df_4_EEG['TimeStamp'] = pd.to_datetime(df_4_EEG['TimeStamp'])

In [None]:
df_4_EEG.describe()

In [None]:
df_4_EEG.head(3)

In [None]:
df_4_EEG.drop('Elements', axis=1, inplace=True)

In [None]:
df_4_EEG['QuestionKey'].unique()

In [None]:
# Label the "no question displayed" intervals explicitly. The filled column is
# assigned back because calling fillna(inplace=True) on a df[...] selection is
# chained assignment: it warns in pandas 2.x and leaves the frame unchanged
# once Copy-on-Write is enabled.
df_4_EEG['QuestionKey'] = df_4_EEG['QuestionKey'].fillna('None')

In [None]:
df_4_EEG['QuestionKey'].value_counts()

In [None]:
df_4_EEG['HeadBandOn'].unique()

In [None]:
df_4_EEG['HeadBandOn'].isnull().sum()

In [None]:
df_4_EEG.shape

In [None]:
df_4_EEG.dropna(inplace=True)

In [None]:
df_4_EEG.shape

In [None]:
df_4_EEG.drop('HeadBandOn', axis=1, inplace=True)

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_4_EEG.isnull(), cmap='YlGnBu')
plt.show()

## Status Update

- All **null** (or missing) values in the dataset have been handled / eliminated.
- There are no remaining null values in any column.
- The dataset is now “complete” in the sense that every cell has a valid (non-null) entry.


In [None]:
df_4_EEG.columns


In [None]:
cols = ['Delta_TP9', 'Delta_AF7',
       'Delta_AF8', 'Delta_TP10', 'Theta_TP9', 'Theta_AF7', 'Theta_AF8',
       'Theta_TP10', 'Alpha_TP9', 'Alpha_AF7', 'Alpha_AF8', 'Alpha_TP10',
       'Beta_TP9', 'Beta_AF7', 'Beta_AF8', 'Beta_TP10', 'Gamma_TP9',
       'Gamma_AF7', 'Gamma_AF8', 'Gamma_TP10', 'RAW_TP9', 'RAW_AF7', 'RAW_AF8',
       'RAW_TP10', 'AUX_RIGHT', 'Accelerometer_X', 'Accelerometer_Y',
       'Accelerometer_Z', 'Gyro_X', 'Gyro_Y', 'Gyro_Z',
       'HSI_TP9', 'HSI_AF7', 'HSI_AF8', 'HSI_TP10', 'Battery']

In [None]:
from IPython.display import Markdown, display

# One full-size line chart per signal in `cols`, each preceded by a rendered
# markdown heading so the plots are easy to tell apart when scrolling.
for col in cols:
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_4_EEG['TimeStamp'], df_4_EEG[col])
    ax.set_xlabel("TimeStamp")
    ax.set_ylabel(col)
    plt.show()

In [None]:
plt.figure(figsize=(8,12))
sns.heatmap(df_4_EEG.corr(numeric_only=True), cmap='YlGnBu', annot=True)
plt.show()

## Notes & Observations

### Dropping `AUX_RIGHT`
- The `AUX_RIGHT` channel comes from the auxiliary electrode input of the EEG headset.  
- Upon inspection, its values appeared as **flat high-amplitude noise** (750–950 range) without any meaningful oscillatory EEG patterns.  
- Correlation analysis also showed **no significant relationship** between `AUX_RIGHT` and other EEG features.  
- Since it does not carry useful information and only adds noise/dimensionality, we **dropped `AUX_RIGHT`** from the dataset.




In [None]:
df_4_EEG.drop('AUX_RIGHT', axis=1, inplace=True)

In [None]:
df_4_EEG.head()

In [None]:
HSI_cols = ["HSI_TP9", "HSI_AF7", "HSI_AF8", "HSI_TP10"]

In [None]:
for col in HSI_cols:
    print(f"Unique values for {col}: {df_4_EEG[col].unique()}")

In [None]:
df_4_EEG['HSI_TP9'].value_counts()

In [None]:
df_4_EEG['HSI_TP10'].value_counts()

In [None]:
df_4_EEG['HSI_AF7'].value_counts()

In [None]:
df_4_EEG['HSI_AF8'].value_counts()

In [None]:
df_4_EEG.describe()

## Notes & Observations

### Handling `HSI_*` Columns
- The `HSI_TP9`, `HSI_AF7`, `HSI_AF8`, `HSI_TP10` columns represent **Headset Signal Integrity** for each electrode:  
  - `1 = Good connection`  
  - `2 = Medium connection`  
  - `4 = Bad connection`  
- We need to pay attention to these values because:
  - Bad connections (`4`) indicate unreliable EEG readings.  
  - Medium connections (`2`) may still be usable but should be treated with caution.  
- Instead of dropping rows, we will **convert HSI values into binary flags**:  
  - **1 = Bad connection present**  
  - **0 = Otherwise (Good or Medium)**  
- This preserves all data while giving the model information about electrode reliability.  
- After creating these flags, the raw `HSI_*` columns can be removed to reduce dimensionality.


In [None]:
df_4_EEG.head()

In [None]:
signal_flags_values = { 1: 0, 2: 0, 4: 1}

In [None]:
# Build one int8 flag column per electrode by passing its HSI reading through
# the `signal_flags_values` lookup (columns are created in the order
# TP9, TP10, AF7, AF8).
for electrode in ('TP9', 'TP10', 'AF7', 'AF8'):
    hsi_reading = df_4_EEG[f'HSI_{electrode}']
    df_4_EEG[f'HSI_{electrode}_flag'] = hsi_reading.map(signal_flags_values).astype(np.int8)

In [None]:
df_4_EEG.drop(['HSI_TP9', 'HSI_TP10', 'HSI_AF7', 'HSI_AF8'], axis=1, inplace=True)

In [None]:
plt.figure(figsize=(24,12))
sns.heatmap(df_4_EEG.corr(numeric_only=True), cmap='YlGnBu', annot=True)
plt.show()

## Notes & Observations

### Dropping RAW EEG Channels

- The EEG dataset includes `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` columns, which represent the unprocessed voltage readings from each electrode.  
- Upon inspection:
  - These RAW signals are **highly noisy** and show no clear oscillatory patterns typical of brainwave activity.  
  - Correlation analysis with other features shows **very low correlations** (near 0.0), indicating they do not contribute predictive value.  
  - Plots of RAW vs. time reveal large fluctuations without meaningful structure.

- Reason for dropping:
  - The dataset already provides **frequency band powers** (`Delta`, `Theta`, `Alpha`, `Beta`, `Gamma`) for each electrode, which are **derived from RAW signals** and are far more informative.  
  - Keeping RAW adds unnecessary dimensionality and noise, which can negatively affect machine learning models.

- Action taken:
  - `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` are **dropped from preprocessing**.  
  - The focus will be on the **precomputed band power features** for all EEG electrodes, which are sufficient for cognitive load prediction and regression tasks.


In [None]:
df_4_EEG.drop(['RAW_TP9',	'RAW_AF7',	'RAW_AF8',	'RAW_TP10'], axis=1, inplace=True)

In [None]:
df_4_EEG.head()

In [None]:
# Distribution overview: one histogram (with KDE overlay) per numeric column,
# leaving out the UnixTime stamp.
numeric_cols = df_4_EEG.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid layout: fixed number of columns, as many rows as needed (ceiling division).
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(5 * n_cols, 4 * n_rows))

for i, col in enumerate(cols_to_plot):
    ax = fig.add_subplot(n_rows, n_cols, i + 1)
    sns.histplot(df_4_EEG[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# `sns.pairplot` is a figure-level function that builds its own figure, so the
# former `plt.figure(figsize=...)` call only produced an extra empty figure and
# its size never applied to the grid.
# NOTE(review): the grid grows quadratically with the number of numeric columns
# and draws every row - consider a row sample or column subset if this is slow.
sns.pairplot(df_4_EEG)
plt.show()

***25 EEG***

In [None]:
df_5_EEG = pd.read_csv('data/STData/5/5_EEG.csv')

In [None]:
df_5_EEG.head()

In [None]:
df_5_EEG.shape

In [None]:
df_5_EEG.columns

In [None]:
df_5_EEG.info()

In [None]:
df_5_EEG.isnull().sum()

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_5_EEG.isnull(), cmap='YlGnBu')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` and `Elements` columns.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  
- The `Elements` column is almost entirely null; dropping it likely won’t meaningfully impact our data quality.


In [None]:
df_5_EEG['QuestionKey'].unique()

In [None]:
df_5_EEG['TimeStamp'] = pd.to_datetime(df_5_EEG['TimeStamp'])

In [None]:
df_5_EEG.describe()

In [None]:
df_5_EEG.head(3)

In [None]:
df_5_EEG.drop('Elements', axis=1, inplace=True)

In [None]:
df_5_EEG['QuestionKey'].unique()

In [None]:
# Label the "no question displayed" intervals explicitly. The filled column is
# assigned back because calling fillna(inplace=True) on a df[...] selection is
# chained assignment: it warns in pandas 2.x and leaves the frame unchanged
# once Copy-on-Write is enabled.
df_5_EEG['QuestionKey'] = df_5_EEG['QuestionKey'].fillna('None')

In [None]:
df_5_EEG['QuestionKey'].value_counts()

In [None]:
df_5_EEG['HeadBandOn'].unique()

In [None]:
df_5_EEG['HeadBandOn'].isnull().sum()

In [None]:
df_5_EEG.shape

In [None]:
df_5_EEG.dropna(inplace=True)

In [None]:
df_5_EEG.shape

In [None]:
df_5_EEG.drop('HeadBandOn', axis=1, inplace=True)

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_5_EEG.isnull(), cmap='YlGnBu')
plt.show()

## Status Update

- All **null** (or missing) values in the dataset have been handled / eliminated.
- There are no remaining null values in any column.
- The dataset is now “complete” in the sense that every cell has a valid (non-null) entry.


In [None]:
df_5_EEG.columns


In [None]:
cols = ['Delta_TP9', 'Delta_AF7',
       'Delta_AF8', 'Delta_TP10', 'Theta_TP9', 'Theta_AF7', 'Theta_AF8',
       'Theta_TP10', 'Alpha_TP9', 'Alpha_AF7', 'Alpha_AF8', 'Alpha_TP10',
       'Beta_TP9', 'Beta_AF7', 'Beta_AF8', 'Beta_TP10', 'Gamma_TP9',
       'Gamma_AF7', 'Gamma_AF8', 'Gamma_TP10', 'RAW_TP9', 'RAW_AF7', 'RAW_AF8',
       'RAW_TP10', 'AUX_RIGHT', 'Accelerometer_X', 'Accelerometer_Y',
       'Accelerometer_Z', 'Gyro_X', 'Gyro_Y', 'Gyro_Z',
       'HSI_TP9', 'HSI_AF7', 'HSI_AF8', 'HSI_TP10', 'Battery']

In [None]:
from IPython.display import Markdown, display

# One full-size line chart per signal listed in `cols`. A rendered Markdown
# heading is emitted ahead of each figure so the plots are easy to tell apart
# when scrolling through the cell output.
for col in cols:
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_5_EEG['TimeStamp'], df_5_EEG[col])
    ax.set_xlabel("TimeStamp")
    ax.set_ylabel(col)
    plt.show()

In [None]:
# Annotated correlation matrix of the numeric columns. Drawn on the same
# 24x12 canvas the notebook uses for its other correlation heatmaps; the
# previous 8x12 canvas was too cramped for the per-cell annotations.
plt.figure(figsize=(24, 12))
sns.heatmap(df_5_EEG.corr(numeric_only=True), cmap='YlGnBu', annot=True)
plt.show()

## Notes & Observations

### Dropping `AUX_RIGHT`
- The `AUX_RIGHT` channel comes from the auxiliary electrode input of the EEG headset.  
- Upon inspection, its values appeared as **flat high-amplitude noise** (750–950 range) without any meaningful oscillatory EEG patterns.  
- Correlation analysis also showed **no significant relationship** between `AUX_RIGHT` and other EEG features.  
- Since it does not carry useful information and only adds noise/dimensionality, we **dropped `AUX_RIGHT`** from the dataset.




In [None]:
df_5_EEG.drop('AUX_RIGHT', axis=1, inplace=True)

In [None]:
df_5_EEG.head()

In [None]:
HSI_cols = ["HSI_TP9", "HSI_AF7", "HSI_AF8", "HSI_TP10"]

In [None]:
for col in HSI_cols:
    print(f"Unique values for {col}: {df_5_EEG[col].unique()}")

In [None]:
df_5_EEG['HSI_TP9'].value_counts()

In [None]:
df_5_EEG['HSI_TP10'].value_counts()

In [None]:
df_5_EEG['HSI_AF7'].value_counts()

In [None]:
df_5_EEG['HSI_AF8'].value_counts()

In [None]:
df_5_EEG.describe()

## Notes & Observations

### Handling `HSI_*` Columns
- The `HSI_TP9`, `HSI_AF7`, `HSI_AF8`, `HSI_TP10` columns represent **Headset Signal Integrity** for each electrode:  
  - `1 = Good connection`  
  - `2 = Medium connection`  
  - `4 = Bad connection`  
- We need to pay attention to these values because:
  - Bad connections (`4`) indicate unreliable EEG readings.  
  - Medium connections (`2`) may still be usable but should be treated with caution.  
- Instead of dropping rows, we will **convert HSI values into binary flags**:  
  - **1 = Bad connection present**  
  - **0 = Otherwise (Good or Medium)**  
- This preserves all data while giving the model information about electrode reliability.  
- After creating these flags, the raw `HSI_*` columns can be removed to reduce dimensionality.


In [None]:
df_5_EEG.head()

In [None]:
signal_flags_values = { 1: 0, 2: 0, 4: 1}

In [None]:
# Derive one int8 flag column per electrode by translating the raw HSI code
# through `signal_flags_values` (defined in the preceding cell).
# Electrode order is kept as TP9, TP10, AF7, AF8 so the new columns are
# appended in the same order as before. A code missing from the mapping
# becomes NaN and makes the int8 cast raise.
for electrode in ('TP9', 'TP10', 'AF7', 'AF8'):
    df_5_EEG[f'HSI_{electrode}_flag'] = (
        df_5_EEG[f'HSI_{electrode}'].map(signal_flags_values).astype(np.int8)
    )

In [None]:
df_5_EEG.drop(['HSI_TP9', 'HSI_TP10', 'HSI_AF7', 'HSI_AF8'], axis=1, inplace=True)

In [None]:
# Correlation matrix (numeric columns only) after the HSI codes were replaced
# by their flag columns, rendered as an annotated heatmap.
fig, ax = plt.subplots(figsize=(24, 12))
sns.heatmap(df_5_EEG.corr(numeric_only=True), annot=True, cmap='YlGnBu', ax=ax)
plt.show()

## Notes & Observations

### Dropping RAW EEG Channels

- The EEG dataset includes `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` columns, which represent the unprocessed voltage readings from each electrode.  
- Upon inspection:
  - These RAW signals are **highly noisy** and show no clear oscillatory patterns typical of brainwave activity.  
  - Correlation analysis with other features shows **very low correlations** (near 0.0), indicating they do not contribute predictive value.  
  - Plots of RAW vs. time reveal large fluctuations without meaningful structure.

- Reason for dropping:
  - The dataset already provides **frequency band powers** (`Delta`, `Theta`, `Alpha`, `Beta`, `Gamma`) for each electrode, which are **derived from RAW signals** and are far more informative.  
  - Keeping RAW adds unnecessary dimensionality and noise, which can negatively affect machine learning models.

- Action taken:
  - `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` are **dropped from preprocessing**.  
  - The focus will be on the **precomputed band power features** for all EEG electrodes, which are sufficient for cognitive load prediction and regression tasks.


In [None]:
# Remove the four RAW_* electrode columns from the frame in place.
df_5_EEG.drop(columns=['RAW_TP9', 'RAW_AF7', 'RAW_AF8', 'RAW_TP10'], inplace=True)

In [None]:
df_5_EEG.head()

In [None]:
# Histogram (with KDE overlay) for every numeric column except 'UnixTime',
# arranged on a grid that is four panels wide.
numeric_cols = df_5_EEG.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid geometry: fixed width, as many rows as needed (ceiling division).
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df_5_EEG[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# Pairwise scatter/histogram matrix of the cleaned frame.
# pairplot is a figure-level seaborn function that creates its own figure,
# so no plt.figure(...) call precedes it: that call only produced an extra
# empty figure and never affected the plot size (size is controlled through
# pairplot's `height` / `aspect` arguments).
sns.pairplot(df_5_EEG)
plt.show()

# ***26 EEG***

In [None]:
df_6_EEG = pd.read_csv('data/STData/6/6_EEG.csv')

In [None]:
df_6_EEG.head()

In [None]:
df_6_EEG.shape

In [None]:
df_6_EEG.columns

In [None]:
df_6_EEG.info()

In [None]:
df_6_EEG.isnull().sum()

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_6_EEG.isnull(), cmap='YlGnBu')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` and `Elements` columns.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  
- The `Elements` column is almost entirely null; dropping it likely won’t meaningfully impact our data quality.


In [None]:
df_6_EEG['QuestionKey'].unique()

In [None]:
df_6_EEG['TimeStamp'] = pd.to_datetime(df_6_EEG['TimeStamp'])

In [None]:
df_6_EEG.describe()

In [None]:
df_6_EEG.head(3)

In [None]:
df_6_EEG.drop('Elements', axis=1, inplace=True)

In [None]:
df_6_EEG['QuestionKey'].unique()

In [None]:
# Label rows with no question on screen as the string 'None'.
# The result is assigned back to the column on purpose: calling
# fillna(inplace=True) on `df[col]` is chained assignment, which warns on
# pandas 2.x and leaves the frame unchanged once Copy-on-Write is enabled.
df_6_EEG['QuestionKey'] = df_6_EEG['QuestionKey'].fillna('None')

In [None]:
df_6_EEG['QuestionKey'].value_counts()

In [None]:
df_6_EEG['HeadBandOn'].unique()

In [None]:
df_6_EEG['HeadBandOn'].isnull().sum()

In [None]:
df_6_EEG.shape

In [None]:
df_6_EEG.dropna(inplace=True)

In [None]:
df_6_EEG.shape

In [None]:
df_6_EEG.drop('HeadBandOn', axis=1, inplace=True)

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_6_EEG.isnull(), cmap='YlGnBu')
plt.show()

## Status Update

- All **null** (or missing) values in the dataset have been handled / eliminated.
- There are no remaining null values in any column.
- The dataset is now “complete” in the sense that every cell has a valid (non-null) entry.


In [None]:
df_6_EEG.columns


In [None]:
cols = ['Delta_TP9', 'Delta_AF7',
       'Delta_AF8', 'Delta_TP10', 'Theta_TP9', 'Theta_AF7', 'Theta_AF8',
       'Theta_TP10', 'Alpha_TP9', 'Alpha_AF7', 'Alpha_AF8', 'Alpha_TP10',
       'Beta_TP9', 'Beta_AF7', 'Beta_AF8', 'Beta_TP10', 'Gamma_TP9',
       'Gamma_AF7', 'Gamma_AF8', 'Gamma_TP10', 'RAW_TP9', 'RAW_AF7', 'RAW_AF8',
       'RAW_TP10', 'AUX_RIGHT', 'Accelerometer_X', 'Accelerometer_Y',
       'Accelerometer_Z', 'Gyro_X', 'Gyro_Y', 'Gyro_Z',
       'HSI_TP9', 'HSI_AF7', 'HSI_AF8', 'HSI_TP10', 'Battery']

In [None]:
from IPython.display import Markdown, display

# One full-size line chart per signal listed in `cols`. A rendered Markdown
# heading is emitted ahead of each figure so the plots are easy to tell apart
# when scrolling through the cell output.
for col in cols:
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_6_EEG['TimeStamp'], df_6_EEG[col])
    ax.set_xlabel("TimeStamp")
    ax.set_ylabel(col)
    plt.show()

In [None]:
# Annotated correlation matrix of the numeric columns. Drawn on the same
# 24x12 canvas the notebook uses for its other correlation heatmaps; the
# previous 8x12 canvas was too cramped for the per-cell annotations.
plt.figure(figsize=(24, 12))
sns.heatmap(df_6_EEG.corr(numeric_only=True), cmap='YlGnBu', annot=True)
plt.show()

## Notes & Observations

### Dropping `AUX_RIGHT`
- The `AUX_RIGHT` channel comes from the auxiliary electrode input of the EEG headset.  
- Upon inspection, its values appeared as **flat high-amplitude noise** (750–950 range) without any meaningful oscillatory EEG patterns.  
- Correlation analysis also showed **no significant relationship** between `AUX_RIGHT` and other EEG features.  
- Since it does not carry useful information and only adds noise/dimensionality, we **dropped `AUX_RIGHT`** from the dataset.




In [None]:
df_6_EEG.drop('AUX_RIGHT', axis=1, inplace=True)

In [None]:
df_6_EEG.head()

In [None]:
HSI_cols = ["HSI_TP9", "HSI_AF7", "HSI_AF8", "HSI_TP10"]

In [None]:
for col in HSI_cols:
    print(f"Unique values for {col}: {df_6_EEG[col].unique()}")

In [None]:
df_6_EEG['HSI_TP9'].value_counts()

In [None]:
df_6_EEG['HSI_TP10'].value_counts()

In [None]:
df_6_EEG['HSI_AF7'].value_counts()

In [None]:
df_6_EEG['HSI_AF8'].value_counts()

In [None]:
df_6_EEG.describe()

## Notes & Observations

### Handling `HSI_*` Columns
- The `HSI_TP9`, `HSI_AF7`, `HSI_AF8`, `HSI_TP10` columns represent **Headset Signal Integrity** for each electrode:  
  - `1 = Good connection`  
  - `2 = Medium connection`  
  - `4 = Bad connection`  
- We need to pay attention to these values because:
  - Bad connections (`4`) indicate unreliable EEG readings.  
  - Medium connections (`2`) may still be usable but should be treated with caution.  
- Instead of dropping rows, we will **convert HSI values into binary flags**:  
  - **1 = Bad connection present**  
  - **0 = Otherwise (Good or Medium)**  
- This preserves all data while giving the model information about electrode reliability.  
- After creating these flags, the raw `HSI_*` columns can be removed to reduce dimensionality.


In [None]:
df_6_EEG.head()

In [None]:
signal_flags_values = { 1: 0, 2: 0, 4: 1}

In [None]:
# Derive one int8 flag column per electrode by translating the raw HSI code
# through `signal_flags_values` (defined in the preceding cell).
# Electrode order is kept as TP9, TP10, AF7, AF8 so the new columns are
# appended in the same order as before. A code missing from the mapping
# becomes NaN and makes the int8 cast raise.
for electrode in ('TP9', 'TP10', 'AF7', 'AF8'):
    df_6_EEG[f'HSI_{electrode}_flag'] = (
        df_6_EEG[f'HSI_{electrode}'].map(signal_flags_values).astype(np.int8)
    )

In [None]:
df_6_EEG.drop(['HSI_TP9', 'HSI_TP10', 'HSI_AF7', 'HSI_AF8'], axis=1, inplace=True)

In [None]:
# Annotated correlation matrix (numeric columns only) after the HSI codes
# were replaced by their flag columns. Uses the 24x12 canvas that the same
# step uses for the other participants; the previous 8x12 canvas was too
# cramped for the per-cell annotations.
plt.figure(figsize=(24, 12))
sns.heatmap(df_6_EEG.corr(numeric_only=True), cmap='YlGnBu', annot=True)
plt.show()

## Notes & Observations

### Dropping RAW EEG Channels

- The EEG dataset includes `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` columns, which represent the unprocessed voltage readings from each electrode.  
- Upon inspection:
  - These RAW signals are **highly noisy** and show no clear oscillatory patterns typical of brainwave activity.  
  - Correlation analysis with other features shows **very low correlations** (near 0.0), indicating they do not contribute predictive value.  
  - Plots of RAW vs. time reveal large fluctuations without meaningful structure.

- Reason for dropping:
  - The dataset already provides **frequency band powers** (`Delta`, `Theta`, `Alpha`, `Beta`, `Gamma`) for each electrode, which are **derived from RAW signals** and are far more informative.  
  - Keeping RAW adds unnecessary dimensionality and noise, which can negatively affect machine learning models.

- Action taken:
  - `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` are **dropped from preprocessing**.  
  - The focus will be on the **precomputed band power features** for all EEG electrodes, which are sufficient for cognitive load prediction and regression tasks.


In [None]:
# Remove the four RAW_* electrode columns from the frame in place.
df_6_EEG.drop(columns=['RAW_TP9', 'RAW_AF7', 'RAW_AF8', 'RAW_TP10'], inplace=True)

In [None]:
df_6_EEG.head()

In [None]:
# Histogram (with KDE overlay) for every numeric column except 'UnixTime',
# arranged on a grid that is four panels wide.
numeric_cols = df_6_EEG.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid geometry: fixed width, as many rows as needed (ceiling division).
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df_6_EEG[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# Pairwise scatter/histogram matrix of the cleaned frame.
# pairplot is a figure-level seaborn function that creates its own figure,
# so no plt.figure(...) call precedes it: that call only produced an extra
# empty figure and never affected the plot size (size is controlled through
# pairplot's `height` / `aspect` arguments).
sns.pairplot(df_6_EEG)
plt.show()

# ***27 EEG***

In [None]:
df_7_EEG = pd.read_csv('data/STData/7/7_EEG.csv')

In [None]:
df_7_EEG.head()

In [None]:
df_7_EEG.shape

In [None]:
df_7_EEG.columns

In [None]:
df_7_EEG.info()

In [None]:
df_7_EEG.isnull().sum()

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_7_EEG.isnull(), cmap='YlGnBu')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` and `Elements` columns.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  
- The `Elements` column is almost entirely null; dropping it likely won’t meaningfully impact our data quality.


In [None]:
df_7_EEG['QuestionKey'].unique()

In [None]:
df_7_EEG['TimeStamp'] = pd.to_datetime(df_7_EEG['TimeStamp'])

In [None]:
df_7_EEG.describe()

In [None]:
df_7_EEG.head(3)

In [None]:
df_7_EEG.drop('Elements', axis=1, inplace=True)

In [None]:
df_7_EEG['QuestionKey'].unique()

In [None]:
# Label rows with no question on screen as the string 'None'.
# The result is assigned back to the column on purpose: calling
# fillna(inplace=True) on `df[col]` is chained assignment, which warns on
# pandas 2.x and leaves the frame unchanged once Copy-on-Write is enabled.
df_7_EEG['QuestionKey'] = df_7_EEG['QuestionKey'].fillna('None')

In [None]:
df_7_EEG['QuestionKey'].value_counts()

In [None]:
df_7_EEG['HeadBandOn'].unique()

In [None]:
df_7_EEG['HeadBandOn'].isnull().sum()

In [None]:
df_7_EEG.shape

In [None]:
df_7_EEG.dropna(inplace=True)

In [None]:
df_7_EEG.shape

In [None]:
df_7_EEG.drop('HeadBandOn', axis=1, inplace=True)

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_7_EEG.isnull(), cmap='YlGnBu')
plt.show()

## Status Update

- All **null** (or missing) values in the dataset have been handled / eliminated.
- There are no remaining null values in any column.
- The dataset is now “complete” in the sense that every cell has a valid (non-null) entry.


In [None]:
df_7_EEG.columns


In [None]:
cols = ['Delta_TP9', 'Delta_AF7',
       'Delta_AF8', 'Delta_TP10', 'Theta_TP9', 'Theta_AF7', 'Theta_AF8',
       'Theta_TP10', 'Alpha_TP9', 'Alpha_AF7', 'Alpha_AF8', 'Alpha_TP10',
       'Beta_TP9', 'Beta_AF7', 'Beta_AF8', 'Beta_TP10', 'Gamma_TP9',
       'Gamma_AF7', 'Gamma_AF8', 'Gamma_TP10', 'RAW_TP9', 'RAW_AF7', 'RAW_AF8',
       'RAW_TP10', 'AUX_RIGHT', 'Accelerometer_X', 'Accelerometer_Y',
       'Accelerometer_Z', 'Gyro_X', 'Gyro_Y', 'Gyro_Z',
       'HSI_TP9', 'HSI_AF7', 'HSI_AF8', 'HSI_TP10', 'Battery']

In [None]:
from IPython.display import Markdown, display

# One full-size line chart per signal listed in `cols`. A rendered Markdown
# heading is emitted ahead of each figure so the plots are easy to tell apart
# when scrolling through the cell output.
for col in cols:
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_7_EEG['TimeStamp'], df_7_EEG[col])
    ax.set_xlabel("TimeStamp")
    ax.set_ylabel(col)
    plt.show()

In [None]:
# Annotated correlation matrix of the numeric columns. Drawn on the same
# 24x12 canvas the notebook uses for its other correlation heatmaps; the
# previous 8x12 canvas was too cramped for the per-cell annotations.
plt.figure(figsize=(24, 12))
sns.heatmap(df_7_EEG.corr(numeric_only=True), cmap='YlGnBu', annot=True)
plt.show()

## Notes & Observations

### Dropping `AUX_RIGHT`
- The `AUX_RIGHT` channel comes from the auxiliary electrode input of the EEG headset.  
- Upon inspection, its values appeared as **flat high-amplitude noise** (750–950 range) without any meaningful oscillatory EEG patterns.  
- Correlation analysis also showed **no significant relationship** between `AUX_RIGHT` and other EEG features.  
- Since it does not carry useful information and only adds noise/dimensionality, we **dropped `AUX_RIGHT`** from the dataset.




In [None]:
df_7_EEG.drop('AUX_RIGHT', axis=1, inplace=True)

In [None]:
df_7_EEG.head()

In [None]:
HSI_cols = ["HSI_TP9", "HSI_AF7", "HSI_AF8", "HSI_TP10"]

In [None]:
for col in HSI_cols:
    print(f"Unique values for {col}: {df_7_EEG[col].unique()}")

In [None]:
df_7_EEG['HSI_TP9'].value_counts()

In [None]:
df_7_EEG['HSI_TP10'].value_counts()

In [None]:
df_7_EEG['HSI_AF7'].value_counts()

In [None]:
df_7_EEG['HSI_AF8'].value_counts()

In [None]:
df_7_EEG.describe()

## Notes & Observations

### Handling `HSI_*` Columns
- The `HSI_TP9`, `HSI_AF7`, `HSI_AF8`, `HSI_TP10` columns represent **Headset Signal Integrity** for each electrode:  
  - `1 = Good connection`  
  - `2 = Medium connection`  
  - `4 = Bad connection`  
- We need to pay attention to these values because:
  - Bad connections (`4`) indicate unreliable EEG readings.  
  - Medium connections (`2`) may still be usable but should be treated with caution.  
- Instead of dropping rows, we will **convert HSI values into binary flags**:  
  - **1 = Bad connection present**  
  - **0 = Otherwise (Good or Medium)**  
- This preserves all data while giving the model information about electrode reliability.  
- After creating these flags, the raw `HSI_*` columns can be removed to reduce dimensionality.


In [None]:
df_7_EEG.head()

In [None]:
signal_flags_values = { 1: 0, 2: 0, 4: 1}

In [None]:
# Derive one int8 flag column per electrode by translating the raw HSI code
# through `signal_flags_values` (defined in the preceding cell).
# Electrode order is kept as TP9, TP10, AF7, AF8 so the new columns are
# appended in the same order as before. A code missing from the mapping
# becomes NaN and makes the int8 cast raise.
for electrode in ('TP9', 'TP10', 'AF7', 'AF8'):
    df_7_EEG[f'HSI_{electrode}_flag'] = (
        df_7_EEG[f'HSI_{electrode}'].map(signal_flags_values).astype(np.int8)
    )

In [None]:
df_7_EEG.drop(['HSI_TP9', 'HSI_TP10', 'HSI_AF7', 'HSI_AF8'], axis=1, inplace=True)

In [None]:
# Correlation matrix (numeric columns only) after the HSI codes were replaced
# by their flag columns, rendered as an annotated heatmap.
fig, ax = plt.subplots(figsize=(24, 12))
sns.heatmap(df_7_EEG.corr(numeric_only=True), annot=True, cmap='YlGnBu', ax=ax)
plt.show()

## Notes & Observations

### Dropping RAW EEG Channels

- The EEG dataset includes `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` columns, which represent the unprocessed voltage readings from each electrode.  
- Upon inspection:
  - These RAW signals are **highly noisy** and show no clear oscillatory patterns typical of brainwave activity.  
  - Correlation analysis with other features shows **very low correlations** (near 0.0), indicating they do not contribute predictive value.  
  - Plots of RAW vs. time reveal large fluctuations without meaningful structure.

- Reason for dropping:
  - The dataset already provides **frequency band powers** (`Delta`, `Theta`, `Alpha`, `Beta`, `Gamma`) for each electrode, which are **derived from RAW signals** and are far more informative.  
  - Keeping RAW adds unnecessary dimensionality and noise, which can negatively affect machine learning models.

- Action taken:
  - `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` are **dropped from preprocessing**.  
  - The focus will be on the **precomputed band power features** for all EEG electrodes, which are sufficient for cognitive load prediction and regression tasks.


In [None]:
# Remove the four RAW_* electrode columns from the frame in place.
df_7_EEG.drop(columns=['RAW_TP9', 'RAW_AF7', 'RAW_AF8', 'RAW_TP10'], inplace=True)

In [None]:
df_7_EEG.head()

In [None]:
# Histogram (with KDE overlay) for every numeric column except 'UnixTime',
# arranged on a grid that is four panels wide.
numeric_cols = df_7_EEG.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid geometry: fixed width, as many rows as needed (ceiling division).
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df_7_EEG[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# Pairwise scatter/histogram matrix of the cleaned frame.
# pairplot is a figure-level seaborn function that creates its own figure,
# so no plt.figure(...) call precedes it: that call only produced an extra
# empty figure and never affected the plot size (size is controlled through
# pairplot's `height` / `aspect` arguments).
sns.pairplot(df_7_EEG)
plt.show()

# ***28 EEG***

In [None]:
df_8_EEG = pd.read_csv('data/STData/8/8_EEG.csv')

In [None]:
df_8_EEG.head()

In [None]:
df_8_EEG.shape


In [None]:
df_8_EEG.columns

In [None]:
df_8_EEG.info()

In [None]:
df_8_EEG.isnull().sum()

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_8_EEG.isnull(), cmap='YlGnBu')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` and `Elements` columns.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  
- The `Elements` column is almost entirely null; dropping it likely won’t meaningfully impact our data quality.


In [None]:
df_8_EEG['QuestionKey'].unique()

In [None]:
df_8_EEG['TimeStamp'] = pd.to_datetime(df_8_EEG['TimeStamp'])

In [None]:
df_8_EEG.describe()

In [None]:
df_8_EEG.head(3)

In [None]:
df_8_EEG.drop('Elements', axis=1, inplace=True)

In [None]:
df_8_EEG['QuestionKey'].unique()

In [None]:
# Label rows with no question on screen as the string 'None'.
# The result is assigned back to the column on purpose: calling
# fillna(inplace=True) on `df[col]` is chained assignment, which warns on
# pandas 2.x and leaves the frame unchanged once Copy-on-Write is enabled.
df_8_EEG['QuestionKey'] = df_8_EEG['QuestionKey'].fillna('None')

In [None]:
df_8_EEG['QuestionKey'].value_counts()

In [None]:
df_8_EEG['HeadBandOn'].unique()

In [None]:
df_8_EEG['HeadBandOn'].isnull().sum()

In [None]:
df_8_EEG.shape

In [None]:
df_8_EEG.dropna(inplace=True)

In [None]:
df_8_EEG.shape

In [None]:
df_8_EEG.drop('HeadBandOn', axis=1, inplace=True)

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_8_EEG.isnull(), cmap='YlGnBu')
plt.show()

## Status Update

- All **null** (or missing) values in the dataset have been handled / eliminated.
- There are no remaining null values in any column.
- The dataset is now “complete” in the sense that every cell has a valid (non-null) entry.


In [None]:
df_8_EEG.columns


In [None]:
cols = ['Delta_TP9', 'Delta_AF7',
       'Delta_AF8', 'Delta_TP10', 'Theta_TP9', 'Theta_AF7', 'Theta_AF8',
       'Theta_TP10', 'Alpha_TP9', 'Alpha_AF7', 'Alpha_AF8', 'Alpha_TP10',
       'Beta_TP9', 'Beta_AF7', 'Beta_AF8', 'Beta_TP10', 'Gamma_TP9',
       'Gamma_AF7', 'Gamma_AF8', 'Gamma_TP10', 'RAW_TP9', 'RAW_AF7', 'RAW_AF8',
       'RAW_TP10', 'AUX_RIGHT', 'Accelerometer_X', 'Accelerometer_Y',
       'Accelerometer_Z', 'Gyro_X', 'Gyro_Y', 'Gyro_Z',
       'HSI_TP9', 'HSI_AF7', 'HSI_AF8', 'HSI_TP10', 'Battery']

In [None]:
from IPython.display import Markdown, display

# One full-size line chart per signal listed in `cols`. A rendered Markdown
# heading is emitted ahead of each figure so the plots are easy to tell apart
# when scrolling through the cell output.
for col in cols:
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_8_EEG['TimeStamp'], df_8_EEG[col])
    ax.set_xlabel("TimeStamp")
    ax.set_ylabel(col)
    plt.show()

In [None]:
# Annotated correlation matrix of the numeric columns. Drawn on the same
# 24x12 canvas the notebook uses for its other correlation heatmaps; the
# previous 8x12 canvas was too cramped for the per-cell annotations.
plt.figure(figsize=(24, 12))
sns.heatmap(df_8_EEG.corr(numeric_only=True), cmap='YlGnBu', annot=True)
plt.show()

## Notes & Observations

### Dropping `AUX_RIGHT`
- The `AUX_RIGHT` channel comes from the auxiliary electrode input of the EEG headset.  
- Upon inspection, its values appeared as **flat high-amplitude noise** (750–950 range) without any meaningful oscillatory EEG patterns.  
- Correlation analysis also showed **no significant relationship** between `AUX_RIGHT` and other EEG features.  
- Since it does not carry useful information and only adds noise/dimensionality, we **dropped `AUX_RIGHT`** from the dataset.




In [None]:
df_8_EEG.drop('AUX_RIGHT', axis=1, inplace=True)

In [None]:
df_8_EEG.head()

In [None]:
HSI_cols = ["HSI_TP9", "HSI_AF7", "HSI_AF8", "HSI_TP10"]

In [None]:
for col in HSI_cols:
    print(f"Unique values for {col}: {df_8_EEG[col].unique()}")

In [None]:
df_8_EEG['HSI_TP9'].value_counts()

In [None]:
df_8_EEG['HSI_TP10'].value_counts()

In [None]:
df_8_EEG['HSI_AF7'].value_counts()

In [None]:
df_8_EEG['HSI_AF8'].value_counts()

In [None]:
df_8_EEG.describe()

## Notes & Observations

### Handling `HSI_*` Columns
- The `HSI_TP9`, `HSI_AF7`, `HSI_AF8`, `HSI_TP10` columns represent **Headset Signal Integrity** for each electrode:  
  - `1 = Good connection`  
  - `2 = Medium connection`  
  - `4 = Bad connection`  
- We need to pay attention to these values because:
  - Bad connections (`4`) indicate unreliable EEG readings.  
  - Medium connections (`2`) may still be usable but should be treated with caution.  
- Instead of dropping rows, we will **convert HSI values into binary flags**:  
  - **1 = Bad connection present**  
  - **0 = Otherwise (Good or Medium)**  
- This preserves all data while giving the model information about electrode reliability.  
- After creating these flags, the raw `HSI_*` columns can be removed to reduce dimensionality.


In [None]:
df_8_EEG.head()

In [None]:
signal_flags_values = { 1: 0, 2: 0, 4: 1}

In [None]:
# Derive one int8 flag column per electrode by translating the raw HSI code
# through `signal_flags_values` (defined in the preceding cell).
# Electrode order is kept as TP9, TP10, AF7, AF8 so the new columns are
# appended in the same order as before. A code missing from the mapping
# becomes NaN and makes the int8 cast raise.
for electrode in ('TP9', 'TP10', 'AF7', 'AF8'):
    df_8_EEG[f'HSI_{electrode}_flag'] = (
        df_8_EEG[f'HSI_{electrode}'].map(signal_flags_values).astype(np.int8)
    )

In [None]:
df_8_EEG.drop(['HSI_TP9', 'HSI_TP10', 'HSI_AF7', 'HSI_AF8'], axis=1, inplace=True)

In [None]:
# Correlation matrix (numeric columns only) after the HSI codes were replaced
# by their flag columns, rendered as an annotated heatmap.
fig, ax = plt.subplots(figsize=(24, 12))
sns.heatmap(df_8_EEG.corr(numeric_only=True), annot=True, cmap='YlGnBu', ax=ax)
plt.show()

## Notes & Observations

### Dropping RAW EEG Channels

- The EEG dataset includes `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` columns, which represent the unprocessed voltage readings from each electrode.  
- Upon inspection:
  - These RAW signals are **highly noisy** and show no clear oscillatory patterns typical of brainwave activity.  
  - Correlation analysis with other features shows **very low correlations** (near 0.0), indicating they do not contribute predictive value.  
  - Plots of RAW vs. time reveal large fluctuations without meaningful structure.

- Reason for dropping:
  - The dataset already provides **frequency band powers** (`Delta`, `Theta`, `Alpha`, `Beta`, `Gamma`) for each electrode, which are **derived from RAW signals** and are far more informative.  
  - Keeping RAW adds unnecessary dimensionality and noise, which can negatively affect machine learning models.

- Action taken:
  - `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` are **dropped from preprocessing**.  
  - The focus will be on the **precomputed band power features** for all EEG electrodes, which are sufficient for cognitive load prediction and regression tasks.


In [None]:
# Remove the four RAW_* electrode columns from the frame in place.
df_8_EEG.drop(columns=['RAW_TP9', 'RAW_AF7', 'RAW_AF8', 'RAW_TP10'], inplace=True)

In [None]:
df_8_EEG.head()

In [None]:
# Histogram (with KDE overlay) for every numeric column except 'UnixTime',
# arranged on a grid that is four panels wide.
numeric_cols = df_8_EEG.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid geometry: fixed width, as many rows as needed (ceiling division).
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df_8_EEG[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# Pairwise scatter/histogram matrix of the cleaned frame.
# pairplot is a figure-level seaborn function that creates its own figure,
# so no plt.figure(...) call precedes it: that call only produced an extra
# empty figure and never affected the plot size (size is controlled through
# pairplot's `height` / `aspect` arguments).
sns.pairplot(df_8_EEG)
plt.show()

# ***29 EEG***

In [None]:
df_9_EEG = pd.read_csv('data/STData/9/9_EEG.csv')

In [None]:
df_9_EEG.head()

In [None]:
df_9_EEG.shape

In [None]:
df_9_EEG.columns

In [None]:
df_9_EEG.info()

In [None]:
df_9_EEG.isnull().sum()

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_9_EEG.isnull(), cmap='YlGnBu')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` and `Elements` columns.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  
- The `Elements` column is almost entirely null; dropping it likely won’t meaningfully impact our data quality.


In [None]:
df_9_EEG['QuestionKey'].unique()

In [None]:
df_9_EEG['TimeStamp'] = pd.to_datetime(df_9_EEG['TimeStamp'])

In [None]:
df_9_EEG.describe()

In [None]:
df_9_EEG.drop('Elements', axis=1, inplace=True)

In [None]:
df_9_EEG['QuestionKey'].unique()

In [None]:
# Label rows with no question on screen as the string 'None'.
# The result is assigned back to the column on purpose: calling
# fillna(inplace=True) on `df[col]` is chained assignment, which warns on
# pandas 2.x and leaves the frame unchanged once Copy-on-Write is enabled.
df_9_EEG['QuestionKey'] = df_9_EEG['QuestionKey'].fillna('None')

In [None]:
df_9_EEG['QuestionKey'].value_counts()

In [None]:
df_9_EEG['HeadBandOn'].unique()

In [None]:
df_9_EEG['HeadBandOn'].isnull().sum()

In [None]:
df_9_EEG.shape

In [None]:
df_9_EEG.dropna(inplace=True)

In [None]:
df_9_EEG.shape

In [None]:
df_9_EEG.drop('HeadBandOn', axis=1, inplace=True)

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(df_9_EEG.isnull(), cmap='YlGnBu')
plt.show()

## Status Update

- All **null** (or missing) values in the dataset have been handled / eliminated.
- There are no remaining null values in any column.
- The dataset is now “complete” in the sense that every cell has a valid (non-null) entry.


In [None]:
df_9_EEG.columns


In [None]:
cols = ['Delta_TP9', 'Delta_AF7',
       'Delta_AF8', 'Delta_TP10', 'Theta_TP9', 'Theta_AF7', 'Theta_AF8',
       'Theta_TP10', 'Alpha_TP9', 'Alpha_AF7', 'Alpha_AF8', 'Alpha_TP10',
       'Beta_TP9', 'Beta_AF7', 'Beta_AF8', 'Beta_TP10', 'Gamma_TP9',
       'Gamma_AF7', 'Gamma_AF8', 'Gamma_TP10', 'RAW_TP9', 'RAW_AF7', 'RAW_AF8',
       'RAW_TP10', 'AUX_RIGHT', 'Accelerometer_X', 'Accelerometer_Y',
       'Accelerometer_Z', 'Gyro_X', 'Gyro_Y', 'Gyro_Z',
       'HSI_TP9', 'HSI_AF7', 'HSI_AF8', 'HSI_TP10', 'Battery']

In [None]:
from IPython.display import Markdown, display

# Draw one full-size time-series figure per signal in `cols`. A rendered
# Markdown heading is emitted before each figure so the plots are easy to
# tell apart when scrolling through the output.
timestamps = df_9_EEG['TimeStamp']
for col in cols:
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(timestamps, df_9_EEG[col])
    ax.set_xlabel("TimeStamp")
    ax.set_ylabel(col)
    plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between all numeric columns
# (non-numeric columns are excluded by numeric_only=True).
corr_matrix = df_9_EEG.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(24, 12))
sns.heatmap(corr_matrix, cmap='YlGnBu', annot=True, ax=ax)
plt.show()

## Notes & Observations

### Dropping `AUX_RIGHT`
- The `AUX_RIGHT` channel comes from the auxiliary electrode input of the EEG headset.  
- Upon inspection, its values appeared as **flat high-amplitude noise** (750–950 range) without any meaningful oscillatory EEG patterns.  
- Correlation analysis also showed **no significant relationship** between `AUX_RIGHT` and other EEG features.  
- Since it does not carry useful information and only adds noise/dimensionality, we **dropped `AUX_RIGHT`** from the dataset.




In [None]:
df_9_EEG.drop('AUX_RIGHT', axis=1, inplace=True)

In [None]:
df_9_EEG.head()

In [None]:
HSI_cols = ["HSI_TP9", "HSI_AF7", "HSI_AF8", "HSI_TP10"]

In [None]:
# Print the distinct signal-integrity codes present in each HSI column.
for col in HSI_cols:
    distinct_codes = df_9_EEG[col].unique()
    print(f"Unique values for {col}: {distinct_codes}")

In [None]:
df_9_EEG['HSI_TP9'].value_counts()

In [None]:
df_9_EEG['HSI_TP10'].value_counts()

In [None]:
df_9_EEG['HSI_AF7'].value_counts()

In [None]:
df_9_EEG['HSI_AF8'].value_counts()

In [None]:
df_9_EEG.describe()

## Notes & Observations

### Handling `HSI_*` Columns
- The `HSI_TP9`, `HSI_AF7`, `HSI_AF8`, `HSI_TP10` columns represent **Headset Signal Integrity** for each electrode:  
  - `1 = Good connection`  
  - `2 = Medium connection`  
  - `4 = Bad connection`  
- We need to pay attention to these values because:
  - Bad connections (`4`) indicate unreliable EEG readings.  
  - Medium connections (`2`) may still be usable but should be treated with caution.  
- Instead of dropping rows, we will **convert HSI values into binary flags**:  
  - **1 = Bad connection present**  
  - **0 = Otherwise (Good or Medium)**  
- This preserves all data while giving the model information about electrode reliability.  
- After creating these flags, the raw `HSI_*` columns can be removed to reduce dimensionality.


In [None]:
df_9_EEG.head()

In [None]:
signal_flags_values = { 1: 0, 2: 0, 4: 1}

In [None]:
# Derive one compact int8 flag column per electrode by translating its HSI
# code through `signal_flags_values` (defined in the previous cell).
# The electrode order is kept so the new columns are appended in the same order.
for electrode in ('TP9', 'TP10', 'AF7', 'AF8'):
    hsi_codes = df_9_EEG[f'HSI_{electrode}']
    df_9_EEG[f'HSI_{electrode}_flag'] = hsi_codes.map(signal_flags_values).astype(np.int8)

In [None]:
df_9_EEG.drop(['HSI_TP9', 'HSI_TP10', 'HSI_AF7', 'HSI_AF8'], axis=1, inplace=True)

In [None]:
# Recompute and redraw the annotated correlation heatmap over the numeric
# columns currently present in the frame.
fig, ax = plt.subplots(figsize=(24, 12))
sns.heatmap(
    df_9_EEG.corr(numeric_only=True),
    cmap='YlGnBu',
    annot=True,
    ax=ax,
)
plt.show()

## Notes & Observations

### Dropping RAW EEG Channels

- The EEG dataset includes `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` columns, which represent the unprocessed voltage readings from each electrode.  
- Upon inspection:
  - These RAW signals are **highly noisy** and show no clear oscillatory patterns typical of brainwave activity.  
  - Correlation analysis with other features shows **very low correlations** (near 0.0), indicating they do not contribute predictive value.  
  - Plots of RAW vs. time reveal large fluctuations without meaningful structure.

- Reason for dropping:
  - The dataset already provides **frequency band powers** (`Delta`, `Theta`, `Alpha`, `Beta`, `Gamma`) for each electrode, which are **derived from RAW signals** and are far more informative.  
  - Keeping RAW adds unnecessary dimensionality and noise, which can negatively affect machine learning models.

- Action taken:
  - `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` are **dropped from preprocessing**.  
  - The focus will be on the **precomputed band power features** for all EEG electrodes, which are sufficient for cognitive load prediction and regression tasks.


In [None]:
# Remove the four unprocessed electrode channels in place.
raw_channel_cols = ['RAW_TP9', 'RAW_AF7', 'RAW_AF8', 'RAW_TP10']
df_9_EEG.drop(columns=raw_channel_cols, inplace=True)

In [None]:
df_9_EEG.head()

In [None]:
# Histogram (with KDE overlay) of every numeric column except 'UnixTime'.
numeric_cols = df_9_EEG.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid layout: fixed number of columns, as many rows as needed.
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)  # ceiling division

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df_9_EEG[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# sns.pairplot is a figure-level function: it builds its own figure (a
# PairGrid), so a preceding plt.figure(...) call would only produce an extra
# blank figure and its figsize would not apply to the pair plot.
sns.pairplot(df_9_EEG)
plt.show()

***30 EEG***

In [None]:
df_10_EEG = pd.read_csv('data/STData/10/10_EEG.csv')

In [None]:
df_10_EEG.head()

In [None]:
df_10_EEG.shape

In [None]:
df_10_EEG.columns

In [None]:
df_10_EEG.info()

In [None]:
df_10_EEG.isnull().sum()

In [None]:
# Missing-value map of the freshly loaded frame: one heatmap cell per
# DataFrame cell, coloured by whether isnull() is True.
fig, ax = plt.subplots(figsize=(16, 10))
null_mask = df_10_EEG.isnull()
sns.heatmap(null_mask, cmap='YlGnBu', ax=ax)
plt.show()

# Notes & Observations

- We observe that the `QuestionKey` and `Elements` columns consist entirely of **null** (missing) values.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  
- The `Elements` column is almost entirely null; dropping it likely won’t meaningfully impact our data quality.


In [None]:
df_10_EEG['QuestionKey'].unique()

In [None]:
df_10_EEG['TimeStamp'] = pd.to_datetime(df_10_EEG['TimeStamp'])

In [None]:
df_10_EEG.describe()

In [None]:
df_10_EEG.head(3)

In [None]:
df_10_EEG.drop('Elements', axis=1, inplace=True)

In [None]:
df_10_EEG['QuestionKey'].unique()

In [None]:
# Replace missing QuestionKey entries with the sentinel string 'None'.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is chained assignment, which pandas 2.x deprecates and which
# leaves the DataFrame unchanged when Copy-on-Write is enabled.
df_10_EEG['QuestionKey'] = df_10_EEG['QuestionKey'].fillna('None')

In [None]:
df_10_EEG['QuestionKey'].value_counts()

In [None]:
df_10_EEG['HeadBandOn'].unique()

In [None]:
df_10_EEG['HeadBandOn'].isnull().sum()

In [None]:
df_10_EEG.shape

In [None]:
df_10_EEG.dropna(inplace=True)

In [None]:
df_10_EEG.shape

In [None]:
df_10_EEG.drop('HeadBandOn', axis=1, inplace=True)

In [None]:
# Missing-value map of the frame after the cleaning steps: one heatmap cell
# per DataFrame cell, coloured by whether isnull() is True.
fig, ax = plt.subplots(figsize=(16, 10))
null_mask = df_10_EEG.isnull()
sns.heatmap(null_mask, cmap='YlGnBu', ax=ax)
plt.show()

## Status Update

- All **null** (or missing) values in the dataset have been handled / eliminated.
- There are no remaining null values in any column.
- The dataset is now “complete” in the sense that every cell has a valid (non-null) entry.


In [None]:
df_10_EEG.columns


In [None]:
# Signal columns to visualise, assembled from their naming pattern:
# <band>_<electrode> for the band powers and RAW channels, then the auxiliary
# input, the motion sensors (<sensor>_<axis>), the per-electrode HSI codes,
# and the battery level.
_electrodes = ['TP9', 'AF7', 'AF8', 'TP10']
cols = (
    [f'{band}_{site}'
     for band in ['Delta', 'Theta', 'Alpha', 'Beta', 'Gamma', 'RAW']
     for site in _electrodes]
    + ['AUX_RIGHT']
    + [f'{sensor}_{axis}' for sensor in ['Accelerometer', 'Gyro'] for axis in 'XYZ']
    + [f'HSI_{site}' for site in _electrodes]
    + ['Battery']
)

In [None]:
from IPython.display import Markdown, display

# Draw one full-size time-series figure per signal in `cols`. A rendered
# Markdown heading is emitted before each figure so the plots are easy to
# tell apart when scrolling through the output.
timestamps = df_10_EEG['TimeStamp']
for col in cols:
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(timestamps, df_10_EEG[col])
    ax.set_xlabel("TimeStamp")
    ax.set_ylabel(col)
    plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between all numeric columns
# (non-numeric columns are excluded by numeric_only=True).
corr_matrix = df_10_EEG.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(24, 12))
sns.heatmap(corr_matrix, cmap='YlGnBu', annot=True, ax=ax)
plt.show()

## Notes & Observations

### Dropping `AUX_RIGHT`
- The `AUX_RIGHT` channel comes from the auxiliary electrode input of the EEG headset.  
- Upon inspection, its values appeared as **flat high-amplitude noise** (750–950 range) without any meaningful oscillatory EEG patterns.  
- Correlation analysis also showed **no significant relationship** between `AUX_RIGHT` and other EEG features.  
- Since it does not carry useful information and only adds noise/dimensionality, we **dropped `AUX_RIGHT`** from the dataset.




In [None]:
df_10_EEG.drop('AUX_RIGHT', axis=1, inplace=True)

In [None]:
df_10_EEG.head()

In [None]:
HSI_cols = ["HSI_TP9", "HSI_AF7", "HSI_AF8", "HSI_TP10"]

In [None]:
# Print the distinct signal-integrity codes present in each HSI column.
for col in HSI_cols:
    distinct_codes = df_10_EEG[col].unique()
    print(f"Unique values for {col}: {distinct_codes}")

In [None]:
df_10_EEG['HSI_TP9'].value_counts()

In [None]:
df_10_EEG['HSI_TP10'].value_counts()

In [None]:
df_10_EEG['HSI_AF7'].value_counts()

In [None]:
df_10_EEG['HSI_AF8'].value_counts()

In [None]:
df_10_EEG.describe()

## Notes & Observations

### Handling `HSI_*` Columns
- The `HSI_TP9`, `HSI_AF7`, `HSI_AF8`, `HSI_TP10` columns represent **Headset Signal Integrity** for each electrode:  
  - `1 = Good connection`  
  - `2 = Medium connection`  
  - `4 = Bad connection`  
- We need to pay attention to these values because:
  - Bad connections (`4`) indicate unreliable EEG readings.  
  - Medium connections (`2`) may still be usable but should be treated with caution.  
- Instead of dropping rows, we will **convert HSI values into binary flags**:  
  - **1 = Bad connection present**  
  - **0 = Otherwise (Good or Medium)**  
- This preserves all data while giving the model information about electrode reliability.  
- After creating these flags, the raw `HSI_*` columns can be removed to reduce dimensionality.


In [None]:
df_10_EEG.head()

In [None]:
signal_flags_values = { 1: 0, 2: 0, 4: 1}

In [None]:
# Derive one compact int8 flag column per electrode by translating its HSI
# code through `signal_flags_values` (defined in the previous cell).
# The electrode order is kept so the new columns are appended in the same order.
for electrode in ('TP9', 'TP10', 'AF7', 'AF8'):
    hsi_codes = df_10_EEG[f'HSI_{electrode}']
    df_10_EEG[f'HSI_{electrode}_flag'] = hsi_codes.map(signal_flags_values).astype(np.int8)

In [None]:
df_10_EEG.drop(['HSI_TP9', 'HSI_TP10', 'HSI_AF7', 'HSI_AF8'], axis=1, inplace=True)

In [None]:
# Recompute and redraw the annotated correlation heatmap over the numeric
# columns currently present in the frame.
fig, ax = plt.subplots(figsize=(24, 12))
sns.heatmap(
    df_10_EEG.corr(numeric_only=True),
    cmap='YlGnBu',
    annot=True,
    ax=ax,
)
plt.show()

## Notes & Observations

### Dropping RAW EEG Channels

- The EEG dataset includes `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` columns, which represent the unprocessed voltage readings from each electrode.  
- Upon inspection:
  - These RAW signals are **highly noisy** and show no clear oscillatory patterns typical of brainwave activity.  
  - Correlation analysis with other features shows **very low correlations** (near 0.0), indicating they do not contribute predictive value.  
  - Plots of RAW vs. time reveal large fluctuations without meaningful structure.

- Reason for dropping:
  - The dataset already provides **frequency band powers** (`Delta`, `Theta`, `Alpha`, `Beta`, `Gamma`) for each electrode, which are **derived from RAW signals** and are far more informative.  
  - Keeping RAW adds unnecessary dimensionality and noise, which can negatively affect machine learning models.

- Action taken:
  - `RAW_TP9`, `RAW_AF7`, `RAW_AF8`, `RAW_TP10` are **dropped from preprocessing**.  
  - The focus will be on the **precomputed band power features** for all EEG electrodes, which are sufficient for cognitive load prediction and regression tasks.


In [None]:
# Remove the four unprocessed electrode channels in place.
raw_channel_cols = ['RAW_TP9', 'RAW_AF7', 'RAW_AF8', 'RAW_TP10']
df_10_EEG.drop(columns=raw_channel_cols, inplace=True)

In [None]:
df_10_EEG.head()

In [None]:
# Histogram (with KDE overlay) of every numeric column except 'UnixTime'.
numeric_cols = df_10_EEG.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid layout: fixed number of columns, as many rows as needed.
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)  # ceiling division

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df_10_EEG[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# sns.pairplot is a figure-level function: it builds its own figure (a
# PairGrid), so a preceding plt.figure(...) call would only produce an extra
# blank figure and its figsize would not apply to the pair plot.
sns.pairplot(df_10_EEG)
plt.show()

***10 EEG Dataset Not Used***

Upon further inspection, the `df_10_EEG` dataset was found to contain predominantly null or irrelevant data. Specifically:

- The `QuestionKey` and `Elements` columns are entirely null.
- The `Delta`, `Theta`, `Alpha`, `Beta`, and `Gamma` band power columns are all zero, indicating no meaningful EEG activity was recorded.
- The `HeadBandOn` column is uniformly 0, suggesting the headset was not worn correctly or at all during the recording.
- The `HSI_*` columns are all 4.0, indicating consistently bad signal integrity for all electrodes.

As a result, this dataset will not be used for further analysis or model training due to the lack of usable data.