## Student 21

In [None]:
%load_ext cudf

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
df_21_EYE = pd.read_csv('data/STData/21/21_EYE.csv')

In [None]:
df_21_EYE.head()

In [None]:
df_21_EYE.shape

In [None]:
df_21_EYE.columns

In [None]:
df_21_EYE.info()

In [None]:
df_21_EYE.isnull().sum()

In [None]:
# Missing-value map: one row per record, one column per field.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(df_21_EYE.isna(), cmap='viridis', ax=ax)
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` column.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  

In [None]:
df_21_EYE['QuestionKey'].unique()

In [None]:
df_21_EYE['Timestamp'] = pd.to_datetime(df_21_EYE['Timestamp'])

In [None]:
df_21_EYE.head(3)

In [None]:
# Assign the filled Series back: fillna(inplace=True) on df[col] acts on a
# temporary object and leaves the frame untouched under pandas Copy-on-Write.
df_21_EYE['QuestionKey'] = df_21_EYE['QuestionKey'].fillna('None')

In [None]:
df_21_EYE['QuestionKey'].value_counts()

In [None]:
# Redraw the missing-value map after QuestionKey has been filled.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(df_21_EYE.isna(), cmap='viridis', ax=ax)
plt.show()

In [None]:
df_21_EYE.isnull().sum()

In [None]:
# Discard every row that still has at least one missing value.
df_21_EYE = df_21_EYE.dropna()

In [None]:
df_21_EYE.head()

In [None]:
df_21_EYE['Row'].unique()

In [None]:
# Distribution of the Row values.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df_21_EYE['Row'], ax=ax)
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

In [None]:
# Remove the Row column from the frame.
df_21_EYE = df_21_EYE.drop(columns='Row')

In [None]:
df_21_EYE['ET_ValidityLeft'].unique()

In [None]:
df_21_EYE['ET_ValidityLeft'].value_counts()

In [None]:
df_21_EYE['ET_ValidityRight'].unique()

In [None]:
df_21_EYE['ET_ValidityRight'].value_counts()

In [None]:
# Side-by-side bar charts of how often each validity code occurs per eye.
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

for ax, flag_col in zip(axes, ['ET_ValidityLeft', 'ET_ValidityRight']):
    counts = df_21_EYE[flag_col].value_counts()  # computed once per panel
    sns.barplot(x=counts.index, y=counts.values, ax=ax)
    ax.set_title(f'Count of {flag_col}')
    ax.set_xlabel('Validity')
    ax.set_ylabel('Count')

fig.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

In [None]:
validity_map = {4.0: 1.0, 0.0: 0.0}

In [None]:
# Recode both validity flags through validity_map and store them as int8.
# A value that is not a key of validity_map maps to NaN, which makes the int8
# cast raise -- so this cell cannot be run twice on the same frame.
for flag_col in ('ET_ValidityLeft', 'ET_ValidityRight'):
    recoded = df_21_EYE[flag_col].map(validity_map)
    df_21_EYE[flag_col] = recoded.astype(np.int8)

In [None]:
df_21_EYE.head(3)

In [None]:
df_21_EYE.describe()

In [None]:
df_21_EYE[df_21_EYE['ET_ValidityLeft'] == 1].shape

In [None]:
df_21_EYE[df_21_EYE['ET_ValidityRight'] == 1].shape

In [None]:
# Left panel marks cells equal to -1, right panel marks cells equal to 1.
fig, (ax_neg, ax_pos) = plt.subplots(1, 2, figsize=(18, 8))

sns.heatmap(df_21_EYE.eq(-1), cmap='viridis', ax=ax_neg)
ax_neg.set_title('Heatmap of -1 Values')

sns.heatmap(df_21_EYE.eq(1), cmap='viridis', ax=ax_pos)
ax_pos.set_title('Heatmap of 1 Values')

fig.tight_layout()
plt.show()

In [None]:
df_21_EYE[df_21_EYE['ET_PupilLeft'] == -1].shape

In [None]:
df_21_EYE[df_21_EYE['ET_PupilRight'] == -1].shape

In [None]:
df_21_EYE[df_21_EYE['ET_PupilLeft'] == -1].shape[0] / df_21_EYE.shape[0]

In [None]:
df_21_EYE[df_21_EYE['ET_PupilRight'] == -1].shape[0] / df_21_EYE.shape[0]

In [None]:
# Keep only the rows flagged invalid for one eye at a time and mark where the
# -1 placeholder appears. Each title names the filter behind its panel so the
# two heatmaps can be told apart.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_21_EYE[df_21_EYE['ET_ValidityLeft'] == 1] == -1, cmap='viridis')
plt.title('Heatmap of -1 Values (ET_ValidityLeft == 1)')

plt.subplot(1, 2, 2)
sns.heatmap(df_21_EYE[df_21_EYE['ET_ValidityRight'] == 1] == -1, cmap='viridis')
plt.title('Heatmap of -1 Values (ET_ValidityRight == 1)')

plt.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

In [None]:
# Remove both pupil-size columns.
df_21_EYE = df_21_EYE.drop(columns=['ET_PupilLeft', 'ET_PupilRight'])

In [None]:
df_21_EYE.head()

In [None]:
# Mark cells equal to -1 (left) and equal to 1 (right) across the whole frame.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_21_EYE == -1, cmap='viridis')
plt.title('Heatmap of -1 Values')

plt.subplot(1, 2, 2)
sns.heatmap(df_21_EYE == 1, cmap='viridis')
# This panel shows the == 1 mask, so it must not reuse the "-1" title.
plt.title('Heatmap of 1 Values')

plt.tight_layout()
plt.show()

In [None]:
# The validity columns were recoded through validity_map to 0/1 (4.0 -> 1),
# so their mean is the share of rows flagged 1 and 1 - mean is the share flagged 0.
valid_left_ratio  = 1 - df_21_EYE['ET_ValidityLeft'].mean()
valid_right_ratio = 1 - df_21_EYE['ET_ValidityRight'].mean()

In [None]:
valid_left_ratio

In [None]:
valid_right_ratio

In [None]:
df_21_EYE.head()

In [None]:
# Histogram + KDE for every numeric column except UnixTime, on a grid four panels wide.
numeric_cols = df_21_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)  # ceiling division

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))
for position, name in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df_21_EYE[name], kde=True, ax=ax)
    ax.set_title(f'Distribution of {name}')
    ax.set_xlabel(name)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
df_21_EYE.columns

In [None]:
cols = ['ET_GazeLeftx', 'ET_GazeLefty',
       'ET_GazeRightx', 'ET_GazeRighty', 'ET_TimeSignal', 'ET_DistanceLeft',
       'ET_DistanceRight', 'ET_CameraLeftX', 'ET_CameraLeftY',
       'ET_CameraRightX', 'ET_CameraRightY', 'ET_ValidityLeft',
       'ET_ValidityRight']

In [None]:
from IPython.display import display, Markdown

# One large line chart per signal in `cols`, each preceded by a rendered Markdown heading.
for signal in cols:
    display(Markdown(f'### {signal} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_21_EYE['Timestamp'], df_21_EYE[signal])
    ax.set_xlabel("Timestamp")
    ax.set_ylabel(signal)
    plt.show()

In [None]:
# Box plot for every numeric column (nothing is excluded here), four panels per row.
numeric_cols = df_21_EYE.select_dtypes(include=np.number).columns

n_cols = 4
n_rows = -(-len(numeric_cols) // n_cols)  # ceiling division

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))
for position, name in enumerate(numeric_cols, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.boxplot(df_21_EYE[name], ax=ax)
    ax.set_title(f'Boxplot of {name}')
    ax.set_xlabel(name)

fig.tight_layout()
plt.show()

In [None]:
# Turn every -1 placeholder in the frame into NaN.
df_21_EYE = df_21_EYE.replace(-1, np.nan)

In [None]:
numeric_cols = df_21_EYE.select_dtypes(include=np.number).columns

# Fill each numeric column's NaNs with that column's mean. The result is
# assigned back because fillna(inplace=True) on df[col] modifies a temporary
# and does nothing to the frame under pandas Copy-on-Write.
for col in numeric_cols:
    df_21_EYE[col] = df_21_EYE[col].fillna(df_21_EYE[col].mean())

In [None]:
df_21_EYE.head()

In [None]:
# Histogram + KDE for every numeric column except UnixTime, on a grid four panels wide.
numeric_cols = df_21_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)  # ceiling division

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))
for position, name in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df_21_EYE[name], kde=True, ax=ax)
    ax.set_title(f'Distribution of {name}')
    ax.set_xlabel(name)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

# Eye Tracking Data Analysis

This notebook performs exploratory data analysis and cleaning on eye-tracking data.

## Data Loading and Initial Inspection

In [None]:
%load_ext cudf

Import necessary libraries for data manipulation, analysis, and visualization.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Import libraries for datashader, which can be used for visualizing large datasets.

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

Set pandas display options to show all columns.

In [None]:
pd.set_option('display.max_columns', None)

Load the eye-tracking data from the specified CSV file into a pandas DataFrame.

In [None]:
df_21_EYE = pd.read_csv('data/STData/21/21_EYE.csv')

Display the first few rows of the DataFrame to get an initial look at the data structure and content.

In [None]:
df_21_EYE.head()

Check the dimensions (number of rows and columns) of the DataFrame.

In [None]:
df_21_EYE.shape

List the names of all columns in the DataFrame.

In [None]:
df_21_EYE.columns

Display concise information about the DataFrame, including the data types of each column and the number of non-null values. This helps identify columns with missing data.

In [None]:
df_21_EYE.info()

Calculate and display the number of missing values in each column.

In [None]:
df_21_EYE.isnull().sum()

Visualize the distribution of missing values using a heatmap. This provides a visual representation of which columns have missing data and the extent of missingness.

In [None]:
# Missing-value map: one row per record, one column per field.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(df_21_EYE.isna(), cmap='viridis', ax=ax)
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` column.
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.
- These missing values in `QuestionKey` require additional investigation and context-aware handling.

Display the unique values in the `QuestionKey` column to understand the different types of questions or states recorded.

In [None]:
df_21_EYE['QuestionKey'].unique()

Convert the `Timestamp` column to datetime objects. This is crucial for any time series analysis or manipulation.

In [None]:
df_21_EYE['Timestamp'] = pd.to_datetime(df_21_EYE['Timestamp'])

Display the first few rows again to see the effect of the timestamp conversion.

In [None]:
df_21_EYE.head(3)

Fill the missing values in the `QuestionKey` column with the string 'None'. This helps to explicitly mark periods where no question was active.

In [None]:
# Assign the filled Series back: fillna(inplace=True) on df[col] acts on a
# temporary object and leaves the frame untouched under pandas Copy-on-Write.
df_21_EYE['QuestionKey'] = df_21_EYE['QuestionKey'].fillna('None')

Count the occurrences of each unique value in the `QuestionKey` column after filling missing values.

In [None]:
df_21_EYE['QuestionKey'].value_counts()

Display the heatmap of missing values again to confirm that the missing values in `QuestionKey` have been handled.

In [None]:
# Redraw the missing-value map after QuestionKey has been filled.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(df_21_EYE.isna(), cmap='viridis', ax=ax)
plt.show()

Recalculate and display the number of missing values per column to confirm the changes after handling `QuestionKey`.

In [None]:
df_21_EYE.isnull().sum()

Drop rows that still contain any missing values. This is done after handling `QuestionKey` separately.

In [None]:
# Discard every row that still has at least one missing value.
df_21_EYE = df_21_EYE.dropna()

Display the first few rows after dropping rows with missing values.

In [None]:
df_21_EYE.head()

Display the unique values in the `Row` column.

In [None]:
df_21_EYE['Row'].unique()

Plot a histogram of the `Row` column to visualize its distribution.

In [None]:
# Distribution of the Row values.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df_21_EYE['Row'], ax=ax)
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

Drop the `Row` column as it is not relevant for the analysis.

In [None]:
# Remove the Row column from the frame.
df_21_EYE = df_21_EYE.drop(columns='Row')

Display the unique values in the `ET_ValidityLeft` column.

In [None]:
df_21_EYE['ET_ValidityLeft'].unique()

Count the occurrences of each unique value in the `ET_ValidityLeft` column.

In [None]:
df_21_EYE['ET_ValidityLeft'].value_counts()

Display the unique values in the `ET_ValidityRight` column.

In [None]:
df_21_EYE['ET_ValidityRight'].unique()

Count the occurrences of each unique value in the `ET_ValidityRight` column.

In [None]:
df_21_EYE['ET_ValidityRight'].value_counts()

Visualize the counts of valid and invalid data for both left and right eyes using bar plots.

In [None]:
# Side-by-side bar charts of how often each validity code occurs per eye.
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

for ax, flag_col in zip(axes, ['ET_ValidityLeft', 'ET_ValidityRight']):
    counts = df_21_EYE[flag_col].value_counts()  # computed once per panel
    sns.barplot(x=counts.index, y=counts.values, ax=ax)
    ax.set_title(f'Count of {flag_col}')
    ax.set_xlabel('Validity')
    ax.set_ylabel('Count')

fig.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

Define a mapping to convert validity values from `0.0` and `4.0` to `0` and `1`.

In [None]:
validity_map = {4.0: 1.0, 0.0: 0.0}

Apply the mapping to the `ET_ValidityLeft` and `ET_ValidityRight` columns and convert the data type to integer.

In [None]:
# Recode both validity flags through validity_map and store them as int8.
# A value that is not a key of validity_map maps to NaN, which makes the int8
# cast raise -- so this cell cannot be run twice on the same frame.
for flag_col in ('ET_ValidityLeft', 'ET_ValidityRight'):
    recoded = df_21_EYE[flag_col].map(validity_map)
    df_21_EYE[flag_col] = recoded.astype(np.int8)

Display the first few rows to see the updated validity columns.

In [None]:
df_21_EYE.head(3)

Display descriptive statistics for the DataFrame, including count, mean, standard deviation, min, max, and quartiles for numeric columns.

In [None]:
df_21_EYE.describe()

Check the number of rows where `ET_ValidityLeft` is 1 (invalid data for the left eye).

In [None]:
df_21_EYE[df_21_EYE['ET_ValidityLeft'] == 1].shape

Check the number of rows where `ET_ValidityRight` is 1 (invalid data for the right eye).

In [None]:
df_21_EYE[df_21_EYE['ET_ValidityRight'] == 1].shape

Visualize the distribution of -1 and 1 values across columns using heatmaps. This helps identify columns with placeholder or indicator values.

In [None]:
# Left panel marks cells equal to -1, right panel marks cells equal to 1.
fig, (ax_neg, ax_pos) = plt.subplots(1, 2, figsize=(18, 8))

sns.heatmap(df_21_EYE.eq(-1), cmap='viridis', ax=ax_neg)
ax_neg.set_title('Heatmap of -1 Values')

sns.heatmap(df_21_EYE.eq(1), cmap='viridis', ax=ax_pos)
ax_pos.set_title('Heatmap of 1 Values')

fig.tight_layout()
plt.show()

Check the number of rows where `ET_PupilLeft` is -1.

In [None]:
df_21_EYE[df_21_EYE['ET_PupilLeft'] == -1].shape

Check the number of rows where `ET_PupilRight` is -1.

In [None]:
df_21_EYE[df_21_EYE['ET_PupilRight'] == -1].shape

Calculate the ratio of rows with -1 values in the `ET_PupilLeft` column to the total number of rows.

In [None]:
df_21_EYE[df_21_EYE['ET_PupilLeft'] == -1].shape[0] / df_21_EYE.shape[0]

Calculate the ratio of rows with -1 values in the `ET_PupilRight` column to the total number of rows.

In [None]:
df_21_EYE[df_21_EYE['ET_PupilRight'] == -1].shape[0] / df_21_EYE.shape[0]

Visualize the distribution of -1 values specifically for rows where validity is 1 (invalid data).

In [None]:
# Keep only the rows flagged invalid for one eye at a time and mark where the
# -1 placeholder appears. Each title names the filter behind its panel so the
# two heatmaps can be told apart.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_21_EYE[df_21_EYE['ET_ValidityLeft'] == 1] == -1, cmap='viridis')
plt.title('Heatmap of -1 Values (ET_ValidityLeft == 1)')

plt.subplot(1, 2, 2)
sns.heatmap(df_21_EYE[df_21_EYE['ET_ValidityRight'] == 1] == -1, cmap='viridis')
plt.title('Heatmap of -1 Values (ET_ValidityRight == 1)')

plt.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

Drop the `ET_PupilLeft` and `ET_PupilRight` columns due to the high percentage of invalid data.

In [None]:
# Remove both pupil-size columns.
df_21_EYE = df_21_EYE.drop(columns=['ET_PupilLeft', 'ET_PupilRight'])

Display the first few rows after dropping the pupil columns.

In [None]:
df_21_EYE.head()

Visualize the distribution of -1 and 1 values again after dropping the pupil columns.

In [None]:
# Mark cells equal to -1 (left) and equal to 1 (right) across the whole frame.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_21_EYE == -1, cmap='viridis')
plt.title('Heatmap of -1 Values')

plt.subplot(1, 2, 2)
sns.heatmap(df_21_EYE == 1, cmap='viridis')
# This panel shows the == 1 mask, so it must not reuse the "-1" title.
plt.title('Heatmap of 1 Values')

plt.tight_layout()
plt.show()

Calculate the ratio of valid data points for the left eye.

In [None]:
valid_left_ratio  = 1 - df_21_EYE['ET_ValidityLeft'].mean()

Display the ratio of valid data points for the left eye.

In [None]:
valid_left_ratio

Calculate the ratio of valid data points for the right eye.

In [None]:
valid_right_ratio = 1 - df_21_EYE['ET_ValidityRight'].mean()

Display the ratio of valid data points for the right eye.

In [None]:
valid_right_ratio

Display the first few rows of the DataFrame.

In [None]:
df_21_EYE.head()

Visualize the distributions of numeric columns using histograms with Kernel Density Estimate (KDE) plots. This helps understand the shape and spread of the data in each numeric column.

In [None]:
# Histogram + KDE for every numeric column except UnixTime, on a grid four panels wide.
numeric_cols = df_21_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)  # ceiling division

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))
for position, name in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df_21_EYE[name], kde=True, ax=ax)
    ax.set_title(f'Distribution of {name}')
    ax.set_xlabel(name)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

List the names of the columns in the DataFrame.

In [None]:
df_21_EYE.columns

Define a list of columns to be plotted over time.

In [None]:
cols = ['ET_GazeLeftx', 'ET_GazeLefty',
       'ET_GazeRightx', 'ET_GazeRighty', 'ET_TimeSignal', 'ET_DistanceLeft',
       'ET_DistanceRight', 'ET_CameraLeftX', 'ET_CameraLeftY',
       'ET_CameraRightX', 'ET_CameraRightY', 'ET_ValidityLeft',
       'ET_ValidityRight']

Plot each of the selected columns against the timestamp to visualize their trends and patterns over time. A Markdown heading is displayed before each plot for better readability.

In [None]:
from IPython.display import display, Markdown

# One large line chart per signal in `cols`, each preceded by a rendered Markdown heading.
for signal in cols:
    display(Markdown(f'### {signal} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_21_EYE['Timestamp'], df_21_EYE[signal])
    ax.set_xlabel("Timestamp")
    ax.set_ylabel(signal)
    plt.show()

Visualize the distributions of numeric columns using boxplots. This helps identify potential outliers and the spread of the data.

In [None]:
# Box plot for every numeric column (nothing is excluded here), four panels per row.
numeric_cols = df_21_EYE.select_dtypes(include=np.number).columns

n_cols = 4
n_rows = -(-len(numeric_cols) // n_cols)  # ceiling division

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))
for position, name in enumerate(numeric_cols, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.boxplot(df_21_EYE[name], ax=ax)
    ax.set_title(f'Boxplot of {name}')
    ax.set_xlabel(name)

fig.tight_layout()
plt.show()

Replace the placeholder value -1 with NaN (Not a Number) in the DataFrame. This is done before imputation.

In [None]:
# Turn every -1 placeholder in the frame into NaN.
df_21_EYE = df_21_EYE.replace(-1, np.nan)

Impute the missing values (NaN) in the numeric columns with the mean of each column.

In [None]:
numeric_cols = df_21_EYE.select_dtypes(include=np.number).columns

# Fill each numeric column's NaNs with that column's mean. The result is
# assigned back because fillna(inplace=True) on df[col] modifies a temporary
# and does nothing to the frame under pandas Copy-on-Write.
for col in numeric_cols:
    df_21_EYE[col] = df_21_EYE[col].fillna(df_21_EYE[col].mean())

Display the first few rows after imputing missing values.

In [None]:
df_21_EYE.head()

Visualize the distributions of numeric columns again using histograms with KDE plots after imputation. This helps assess the impact of imputation on the data distributions.

In [None]:
# Histogram + KDE for every numeric column except UnixTime, on a grid four panels wide.
numeric_cols = df_21_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)  # ceiling division

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))
for position, name in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df_21_EYE[name], kde=True, ax=ax)
    ax.set_title(f'Distribution of {name}')
    ax.set_xlabel(name)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

## Student 22

In [None]:
%load_ext cudf

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
df_22_EYE = pd.read_csv('data/STData/22/22_EYE.csv')

In [None]:
df_22_EYE.head()

In [None]:
df_22_EYE.shape

In [None]:
df_22_EYE.columns

In [None]:
df_22_EYE.info()

In [None]:
df_22_EYE.isnull().sum()

In [None]:
# Missing-value map: one row per record, one column per field.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(df_22_EYE.isna(), cmap='viridis', ax=ax)
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` column.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  

In [None]:
df_22_EYE['QuestionKey'].unique()

In [None]:
df_22_EYE['Timestamp'] = pd.to_datetime(df_22_EYE['Timestamp'])

In [None]:
df_22_EYE.head(3)

In [None]:
# Assign the filled Series back: fillna(inplace=True) on df[col] acts on a
# temporary object and leaves the frame untouched under pandas Copy-on-Write.
df_22_EYE['QuestionKey'] = df_22_EYE['QuestionKey'].fillna('None')

In [None]:
df_22_EYE['QuestionKey'].value_counts()

In [None]:
# Redraw the missing-value map after QuestionKey has been filled.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(df_22_EYE.isna(), cmap='viridis', ax=ax)
plt.show()

In [None]:
df_22_EYE.isnull().sum()

In [None]:
# Discard every row that still has at least one missing value.
df_22_EYE = df_22_EYE.dropna()

In [None]:
df_22_EYE.head()

In [None]:
df_22_EYE['Row'].unique()

In [None]:
# Distribution of the Row values.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df_22_EYE['Row'], ax=ax)
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

In [None]:
# Remove the Row column from the frame.
df_22_EYE = df_22_EYE.drop(columns='Row')

In [None]:
df_22_EYE['ET_ValidityLeft'].unique()

In [None]:
df_22_EYE['ET_ValidityLeft'].value_counts()

In [None]:
df_22_EYE['ET_ValidityRight'].unique()

In [None]:
df_22_EYE['ET_ValidityRight'].value_counts()

In [None]:
# Side-by-side bar charts of how often each validity code occurs per eye.
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

for ax, flag_col in zip(axes, ['ET_ValidityLeft', 'ET_ValidityRight']):
    counts = df_22_EYE[flag_col].value_counts()  # computed once per panel
    sns.barplot(x=counts.index, y=counts.values, ax=ax)
    ax.set_title(f'Count of {flag_col}')
    ax.set_xlabel('Validity')
    ax.set_ylabel('Count')

fig.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

In [None]:
validity_map = {4.0: 1.0, 0.0: 0.0}

In [None]:
# Recode both validity flags through validity_map and store them as int8.
# A value that is not a key of validity_map maps to NaN, which makes the int8
# cast raise -- so this cell cannot be run twice on the same frame.
for flag_col in ('ET_ValidityLeft', 'ET_ValidityRight'):
    recoded = df_22_EYE[flag_col].map(validity_map)
    df_22_EYE[flag_col] = recoded.astype(np.int8)

In [None]:
df_22_EYE.head(3)

In [None]:
df_22_EYE.describe()

In [None]:
df_22_EYE[df_22_EYE['ET_ValidityLeft'] == 1].shape

In [None]:
df_22_EYE[df_22_EYE['ET_ValidityRight'] == 1].shape

In [None]:
# Left panel marks cells equal to -1, right panel marks cells equal to 1.
fig, (ax_neg, ax_pos) = plt.subplots(1, 2, figsize=(18, 8))

sns.heatmap(df_22_EYE.eq(-1), cmap='viridis', ax=ax_neg)
ax_neg.set_title('Heatmap of -1 Values')

sns.heatmap(df_22_EYE.eq(1), cmap='viridis', ax=ax_pos)
ax_pos.set_title('Heatmap of 1 Values')

fig.tight_layout()
plt.show()

In [None]:
df_22_EYE[df_22_EYE['ET_PupilLeft'] == -1].shape

In [None]:
df_22_EYE[df_22_EYE['ET_PupilRight'] == -1].shape

In [None]:
df_22_EYE[df_22_EYE['ET_PupilLeft'] == -1].shape[0] / df_22_EYE.shape[0]

In [None]:
df_22_EYE[df_22_EYE['ET_PupilRight'] == -1].shape[0] / df_22_EYE.shape[0]

In [None]:
# Keep only the rows flagged invalid for one eye at a time and mark where the
# -1 placeholder appears. Each title names the filter behind its panel so the
# two heatmaps can be told apart.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_22_EYE[df_22_EYE['ET_ValidityLeft'] == 1] == -1, cmap='viridis')
plt.title('Heatmap of -1 Values (ET_ValidityLeft == 1)')

plt.subplot(1, 2, 2)
sns.heatmap(df_22_EYE[df_22_EYE['ET_ValidityRight'] == 1] == -1, cmap='viridis')
plt.title('Heatmap of -1 Values (ET_ValidityRight == 1)')

plt.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

In [None]:
# Remove both pupil-size columns.
df_22_EYE = df_22_EYE.drop(columns=['ET_PupilLeft', 'ET_PupilRight'])

In [None]:
df_22_EYE.head()

In [None]:
# Mark cells equal to -1 (left) and equal to 1 (right) across the whole frame.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_22_EYE == -1, cmap='viridis')
plt.title('Heatmap of -1 Values')

plt.subplot(1, 2, 2)
sns.heatmap(df_22_EYE == 1, cmap='viridis')
# This panel shows the == 1 mask, so it must not reuse the "-1" title.
plt.title('Heatmap of 1 Values')

plt.tight_layout()
plt.show()

In [None]:
# The validity columns were recoded through validity_map to 0/1 (4.0 -> 1),
# so their mean is the share of rows flagged 1 and 1 - mean is the share flagged 0.
valid_left_ratio  = 1 - df_22_EYE['ET_ValidityLeft'].mean()
valid_right_ratio = 1 - df_22_EYE['ET_ValidityRight'].mean()

In [None]:
valid_left_ratio

In [None]:
valid_right_ratio

In [None]:
df_22_EYE.head()

In [None]:
# Histogram + KDE for every numeric column except UnixTime, on a grid four panels wide.
numeric_cols = df_22_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)  # ceiling division

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))
for position, name in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df_22_EYE[name], kde=True, ax=ax)
    ax.set_title(f'Distribution of {name}')
    ax.set_xlabel(name)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
df_22_EYE.columns

In [None]:
cols = ['ET_GazeLeftx', 'ET_GazeLefty',
       'ET_GazeRightx', 'ET_GazeRighty', 'ET_TimeSignal', 'ET_DistanceLeft',
       'ET_DistanceRight', 'ET_CameraLeftX', 'ET_CameraLeftY',
       'ET_CameraRightX', 'ET_CameraRightY', 'ET_ValidityLeft',
       'ET_ValidityRight']

In [None]:
from IPython.display import display, Markdown

# One full-size line chart per signal, each preceded by a rendered Markdown
# heading so the figures are easy to tell apart in the output.
for col in cols:
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_22_EYE['Timestamp'], df_22_EYE[col])
    ax.set_xlabel("Timestamp")
    ax.set_ylabel(col)
    plt.show()

In [None]:
# Every numeric column gets a boxplot (no column is excluded here).
numeric_cols = df_22_EYE.select_dtypes(include=np.number).columns

# Grid geometry: fixed width, row count rounded up (ceiling division).
n_cols = 4
n_rows = -(-len(numeric_cols) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(numeric_cols, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.boxplot(df_22_EYE[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')
    ax.set_xlabel(col)

fig.tight_layout()
plt.show()

In [None]:
df_22_EYE.replace({-1: np.nan}, inplace=True)

In [None]:
# Mean-impute the NaNs in every numeric column.
numeric_cols = df_22_EYE.select_dtypes(include=np.number).columns

for col in numeric_cols:
    # Assign the filled Series back to the frame. Calling
    # fillna(..., inplace=True) on `df[col]` is chained assignment: recent
    # pandas versions warn about it, and with Copy-on-Write enabled it only
    # modifies a temporary, leaving the frame unchanged.
    # Series.mean() skips NaN by default, so the fill value is the mean of
    # the observed values.
    df_22_EYE[col] = df_22_EYE[col].fillna(df_22_EYE[col].mean())

In [None]:
df_22_EYE.head()

In [None]:
# Numeric columns to chart; UnixTime is left out of the histograms.
numeric_cols = df_22_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid geometry: fixed width, row count rounded up (ceiling division) so that
# every column gets its own panel.
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df_22_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

# Eye Tracking Data Analysis

This section repeats the exploratory data analysis and cleaning of the student 22 eye-tracking data, with a short explanation before each step.

## Data Loading and Initial Inspection

In [None]:
%load_ext cudf

Import necessary libraries for data manipulation, analysis, and visualization.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Import libraries for datashader, which can be used for visualizing large datasets.

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

Set pandas display options to show all columns.

In [None]:
pd.set_option('display.max_columns', None)

Load the eye-tracking data from the specified CSV file into a pandas DataFrame.

In [None]:
df_22_EYE = pd.read_csv('data/STData/22/22_EYE.csv')

Display the first few rows of the DataFrame to get an initial look at the data structure and content.

In [None]:
df_22_EYE.head()

Check the dimensions (number of rows and columns) of the DataFrame.

In [None]:
df_22_EYE.shape

List the names of all columns in the DataFrame.

In [None]:
df_22_EYE.columns

Display concise information about the DataFrame, including the data types of each column and the number of non-null values. This helps identify columns with missing data.

In [None]:
df_22_EYE.info()

Calculate and display the number of missing values in each column.

In [None]:
df_22_EYE.isnull().sum()

Visualize the distribution of missing values using a heatmap. This provides a visual representation of which columns have missing data and the extent of missingness.

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_22_EYE.isnull(), cmap='viridis')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` column.
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.
- These missing values in `QuestionKey` require additional investigation and context-aware handling.

Display the unique values in the `QuestionKey` column to understand the different types of questions or states recorded.

In [None]:
df_22_EYE['QuestionKey'].unique()

Convert the `Timestamp` column to datetime objects. This is crucial for any time series analysis or manipulation.

In [None]:
df_22_EYE['Timestamp'] = pd.to_datetime(df_22_EYE['Timestamp'])

Display the first few rows again to see the effect of the timestamp conversion.

In [None]:
df_22_EYE.head(3)

Fill the missing values in the `QuestionKey` column with the string 'None'. This helps to explicitly mark periods where no question was active.

In [None]:
# Label rows with no active question explicitly. The result is assigned back
# because `df[col].fillna(..., inplace=True)` is chained assignment, which
# pandas warns about and which does not update the frame under Copy-on-Write.
df_22_EYE['QuestionKey'] = df_22_EYE['QuestionKey'].fillna('None')

Count the occurrences of each unique value in the `QuestionKey` column after filling missing values.

In [None]:
df_22_EYE['QuestionKey'].value_counts()

Display the heatmap of missing values again to confirm that the missing values in `QuestionKey` have been handled.

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_22_EYE.isnull(), cmap='viridis')
plt.show()

Recalculate and display the number of missing values per column to confirm the changes after handling `QuestionKey`.

In [None]:
df_22_EYE.isnull().sum()

Drop rows that still contain any missing values. This is done after handling `QuestionKey` separately.

In [None]:
df_22_EYE.dropna(inplace=True)

Display the first few rows after dropping rows with missing values.

In [None]:
df_22_EYE.head()

Display the unique values in the `Row` column.

In [None]:
df_22_EYE['Row'].unique()

Plot a histogram of the `Row` column to visualize its distribution.

In [None]:
plt.figure(figsize=(8,6))
sns.histplot(df_22_EYE['Row'])
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

Drop the `Row` column as it is not relevant for the analysis.

In [None]:
df_22_EYE.drop('Row', axis=1, inplace=True)

Display the unique values in the `ET_ValidityLeft` column.

In [None]:
df_22_EYE['ET_ValidityLeft'].unique()

Count the occurrences of each unique value in the `ET_ValidityLeft` column.

In [None]:
df_22_EYE['ET_ValidityLeft'].value_counts()

Display the unique values in the `ET_ValidityRight` column.

In [None]:
df_22_EYE['ET_ValidityRight'].unique()

Count the occurrences of each unique value in the `ET_ValidityRight` column.

In [None]:
df_22_EYE['ET_ValidityRight'].value_counts()

Visualize the counts of valid and invalid data for both left and right eyes using bar plots.

In [None]:
# Tally each validity code once per eye, then chart the tallies side by side.
left_counts = df_22_EYE['ET_ValidityLeft'].value_counts()
right_counts = df_22_EYE['ET_ValidityRight'].value_counts()

fig, (ax_left, ax_right) = plt.subplots(1, 2, figsize=(12, 6))

sns.barplot(x=left_counts.index, y=left_counts.values, ax=ax_left)
ax_left.set_title('Count of ET_ValidityLeft')
ax_left.set_xlabel('Validity')
ax_left.set_ylabel('Count')

sns.barplot(x=right_counts.index, y=right_counts.values, ax=ax_right)
ax_right.set_title('Count of ET_ValidityRight')
ax_right.set_xlabel('Validity')
ax_right.set_ylabel('Count')

fig.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

Define a mapping to convert validity values from `0.0` and `4.0` to `0` and `1`.

In [None]:
validity_map = {4.0: 1.0, 0.0: 0.0}

Apply the mapping to the `ET_ValidityLeft` and `ET_ValidityRight` columns and convert the data type to integer.

In [None]:
# Recode each validity flag through `validity_map` and store it as int8.
# NOTE(review): Series.map yields NaN for any value that is not a key of the
# mapping, and the int8 cast would then fail. This presumably relies on the
# columns holding only 0.0 and 4.0, and on the cell being run once per load
# (re-running it would feed already-recoded 1 values through the map);
# confirm against the value counts above.
df_22_EYE['ET_ValidityLeft'] = df_22_EYE['ET_ValidityLeft'].map(validity_map).astype(np.int8)
df_22_EYE['ET_ValidityRight'] = df_22_EYE['ET_ValidityRight'].map(validity_map).astype(np.int8)

Display the first few rows to see the updated validity columns.

In [None]:
df_22_EYE.head(3)

Display descriptive statistics for the DataFrame, including count, mean, standard deviation, min, max, and quartiles for numeric columns.

In [None]:
df_22_EYE.describe()

Check the number of rows where `ET_ValidityLeft` is 1 (invalid data for the left eye).

In [None]:
df_22_EYE[df_22_EYE['ET_ValidityLeft'] == 1].shape

Check the number of rows where `ET_ValidityRight` is 1 (invalid data for the right eye).

In [None]:
df_22_EYE[df_22_EYE['ET_ValidityRight'] == 1].shape

Visualize the distribution of -1 and 1 values across columns using heatmaps. This helps identify columns with placeholder or indicator values.

In [None]:
# Two boolean masks of the whole frame, drawn on explicit axes:
# left = cells equal to -1, right = cells equal to 1.
fig, (ax_neg, ax_pos) = plt.subplots(1, 2, figsize=(18, 8))

sns.heatmap(df_22_EYE == -1, cmap='viridis', ax=ax_neg)
ax_neg.set_title('Heatmap of -1 Values')

sns.heatmap(df_22_EYE == 1, cmap='viridis', ax=ax_pos)
ax_pos.set_title('Heatmap of 1 Values')

fig.tight_layout()
plt.show()

Check the number of rows where `ET_PupilLeft` is -1.

In [None]:
df_22_EYE[df_22_EYE['ET_PupilLeft'] == -1].shape

Check the number of rows where `ET_PupilRight` is -1.

In [None]:
df_22_EYE[df_22_EYE['ET_PupilRight'] == -1].shape

Calculate the ratio of rows with -1 values in the `ET_PupilLeft` column to the total number of rows.

In [None]:
df_22_EYE[df_22_EYE['ET_PupilLeft'] == -1].shape[0] / df_22_EYE.shape[0]

Calculate the ratio of rows with -1 values in the `ET_PupilRight` column to the total number of rows.

In [None]:
df_22_EYE[df_22_EYE['ET_PupilRight'] == -1].shape[0] / df_22_EYE.shape[0]

Visualize the distribution of -1 values specifically for rows where validity is 1 (invalid data).

In [None]:
# -1 placeholders restricted to rows flagged invalid, one panel per eye.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_22_EYE[df_22_EYE['ET_ValidityLeft'] == 1] == -1, cmap='viridis')
# Name the row filter in the title: both panels used to carry the same label,
# so the left-eye and right-eye subsets could not be told apart.
plt.title('Heatmap of -1 Values (ET_ValidityLeft == 1)')

plt.subplot(1, 2, 2)
sns.heatmap(df_22_EYE[df_22_EYE['ET_ValidityRight'] == 1] == -1, cmap='viridis')
plt.title('Heatmap of -1 Values (ET_ValidityRight == 1)')

plt.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

Drop the `ET_PupilLeft` and `ET_PupilRight` columns due to the high percentage of invalid data.

In [None]:
df_22_EYE.drop(['ET_PupilLeft', 'ET_PupilRight'], axis=1, inplace=True)

Display the first few rows after dropping the pupil columns.

In [None]:
df_22_EYE.head()

Visualize the distribution of -1 and 1 values again after dropping the pupil columns.

In [None]:
# Side-by-side boolean masks of the frame: cells equal to -1 (left panel)
# and cells equal to 1 (right panel).
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_22_EYE == -1, cmap='viridis')
plt.title('Heatmap of -1 Values')

plt.subplot(1, 2, 2)
sns.heatmap(df_22_EYE == 1, cmap='viridis')
# This panel shows the `== 1` mask, so the title names 1 (it used to repeat
# the -1 title from the left panel).
plt.title('Heatmap of 1 Values')

plt.tight_layout()
plt.show()

Calculate the ratio of valid data points for the left eye.

In [None]:
valid_left_ratio  = 1 - df_22_EYE['ET_ValidityLeft'].mean()

Display the ratio of valid data points for the left eye.

In [None]:
valid_left_ratio

Calculate the ratio of valid data points for the right eye.

In [None]:
valid_right_ratio = 1 - df_22_EYE['ET_ValidityRight'].mean()

Display the ratio of valid data points for the right eye.

In [None]:
valid_right_ratio

Display the first few rows of the DataFrame.

In [None]:
df_22_EYE.head()

Visualize the distributions of numeric columns using histograms with Kernel Density Estimate (KDE) plots. This helps understand the shape and spread of the data in each numeric column.

In [None]:
# Numeric columns to chart; UnixTime is left out of the histograms.
numeric_cols = df_22_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid geometry: fixed width, row count rounded up (ceiling division) so that
# every column gets its own panel.
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df_22_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

List the names of the columns in the DataFrame.

In [None]:
df_22_EYE.columns

Define a list of columns to be plotted over time.

In [None]:
cols = ['ET_GazeLeftx', 'ET_GazeLefty',
       'ET_GazeRightx', 'ET_GazeRighty', 'ET_TimeSignal', 'ET_DistanceLeft',
       'ET_DistanceRight', 'ET_CameraLeftX', 'ET_CameraLeftY',
       'ET_CameraRightX', 'ET_CameraRightY', 'ET_ValidityLeft',
       'ET_ValidityRight']

Plot each of the selected columns against the timestamp to visualize their trends and patterns over time. A Markdown heading is rendered before each plot for better readability.

In [None]:
from IPython.display import display, Markdown

# One full-size line chart per signal, each preceded by a rendered Markdown
# heading so the figures are easy to tell apart in the output.
for col in cols:
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_22_EYE['Timestamp'], df_22_EYE[col])
    ax.set_xlabel("Timestamp")
    ax.set_ylabel(col)
    plt.show()

Visualize the distributions of numeric columns using boxplots. This helps identify potential outliers and the spread of the data.

In [None]:
# Every numeric column gets a boxplot (no column is excluded here).
numeric_cols = df_22_EYE.select_dtypes(include=np.number).columns

# Grid geometry: fixed width, row count rounded up (ceiling division).
n_cols = 4
n_rows = -(-len(numeric_cols) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(numeric_cols, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.boxplot(df_22_EYE[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')
    ax.set_xlabel(col)

fig.tight_layout()
plt.show()

Replace the placeholder value -1 with NaN (Not a Number) in the DataFrame. This is done before imputation.

In [None]:
df_22_EYE.replace({-1: np.nan}, inplace=True)

Impute the missing values (NaN) in the numeric columns with the mean of each column.

In [None]:
# Mean-impute the NaNs in every numeric column.
numeric_cols = df_22_EYE.select_dtypes(include=np.number).columns

for col in numeric_cols:
    # Assign the filled Series back to the frame. Calling
    # fillna(..., inplace=True) on `df[col]` is chained assignment: recent
    # pandas versions warn about it, and with Copy-on-Write enabled it only
    # modifies a temporary, leaving the frame unchanged.
    # Series.mean() skips NaN by default, so the fill value is the mean of
    # the observed values.
    df_22_EYE[col] = df_22_EYE[col].fillna(df_22_EYE[col].mean())

Display the first few rows after imputing missing values.

In [None]:
df_22_EYE.head()

Visualize the distributions of numeric columns again using histograms with KDE plots after imputation. This helps assess the impact of imputation on the data distributions.

In [None]:
# Numeric columns to chart; UnixTime is left out of the histograms.
numeric_cols = df_22_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid geometry: fixed width, row count rounded up (ceiling division) so that
# every column gets its own panel.
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df_22_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

## Student 23

In [None]:
%load_ext cudf

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
df_23_EYE = pd.read_csv('data/STData/23/23_EYE.csv')

In [None]:
df_23_EYE.head()

In [None]:
df_23_EYE.shape

In [None]:
df_23_EYE.columns

In [None]:
df_23_EYE.info()

In [None]:
df_23_EYE.isnull().sum()

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_23_EYE.isnull(), cmap='viridis')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` column.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  

In [None]:
df_23_EYE['QuestionKey'].unique()

In [None]:
df_23_EYE['Timestamp'] = pd.to_datetime(df_23_EYE['Timestamp'])

In [None]:
df_23_EYE.head(3)

In [None]:
# Label rows with no active question explicitly. The result is assigned back
# because `df[col].fillna(..., inplace=True)` is chained assignment, which
# pandas warns about and which does not update the frame under Copy-on-Write.
df_23_EYE['QuestionKey'] = df_23_EYE['QuestionKey'].fillna('None')

In [None]:
df_23_EYE['QuestionKey'].value_counts()

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_23_EYE.isnull(), cmap='viridis')
plt.show()

In [None]:
df_23_EYE.isnull().sum()

In [None]:
df_23_EYE.dropna(inplace=True)

In [None]:
df_23_EYE.head()

In [None]:
df_23_EYE['Row'].unique()

In [None]:
plt.figure(figsize=(8,6))
sns.histplot(df_23_EYE['Row'])
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

In [None]:
df_23_EYE.drop('Row', axis=1, inplace=True)

In [None]:
df_23_EYE['ET_ValidityLeft'].unique()

In [None]:
df_23_EYE['ET_ValidityLeft'].value_counts()

In [None]:
df_23_EYE['ET_ValidityRight'].unique()

In [None]:
df_23_EYE['ET_ValidityRight'].value_counts()

In [None]:
# Tally each validity code once per eye, then chart the tallies side by side.
left_counts = df_23_EYE['ET_ValidityLeft'].value_counts()
right_counts = df_23_EYE['ET_ValidityRight'].value_counts()

fig, (ax_left, ax_right) = plt.subplots(1, 2, figsize=(12, 6))

sns.barplot(x=left_counts.index, y=left_counts.values, ax=ax_left)
ax_left.set_title('Count of ET_ValidityLeft')
ax_left.set_xlabel('Validity')
ax_left.set_ylabel('Count')

sns.barplot(x=right_counts.index, y=right_counts.values, ax=ax_right)
ax_right.set_title('Count of ET_ValidityRight')
ax_right.set_xlabel('Validity')
ax_right.set_ylabel('Count')

fig.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

In [None]:
validity_map = {4.0: 1.0, 0.0: 0.0}

In [None]:
# Recode each validity flag through `validity_map` and store it as int8.
# NOTE(review): Series.map yields NaN for any value that is not a key of the
# mapping, and the int8 cast would then fail. This presumably relies on the
# columns holding only 0.0 and 4.0, and on the cell being run once per load
# (re-running it would feed already-recoded 1 values through the map);
# confirm against the value counts above.
df_23_EYE['ET_ValidityLeft'] = df_23_EYE['ET_ValidityLeft'].map(validity_map).astype(np.int8)
df_23_EYE['ET_ValidityRight'] = df_23_EYE['ET_ValidityRight'].map(validity_map).astype(np.int8)

In [None]:
df_23_EYE.head(3)

In [None]:
df_23_EYE.describe()

In [None]:
df_23_EYE[df_23_EYE['ET_ValidityLeft'] == 1].shape

In [None]:
df_23_EYE[df_23_EYE['ET_ValidityRight'] == 1].shape

In [None]:
# Two boolean masks of the whole frame, drawn on explicit axes:
# left = cells equal to -1, right = cells equal to 1.
fig, (ax_neg, ax_pos) = plt.subplots(1, 2, figsize=(18, 8))

sns.heatmap(df_23_EYE == -1, cmap='viridis', ax=ax_neg)
ax_neg.set_title('Heatmap of -1 Values')

sns.heatmap(df_23_EYE == 1, cmap='viridis', ax=ax_pos)
ax_pos.set_title('Heatmap of 1 Values')

fig.tight_layout()
plt.show()

In [None]:
df_23_EYE[df_23_EYE['ET_PupilLeft'] == -1].shape

In [None]:
df_23_EYE[df_23_EYE['ET_PupilRight'] == -1].shape

In [None]:
df_23_EYE[df_23_EYE['ET_PupilLeft'] == -1].shape[0] / df_23_EYE.shape[0]

In [None]:
df_23_EYE[df_23_EYE['ET_PupilRight'] == -1].shape[0] / df_23_EYE.shape[0]

In [None]:
# -1 placeholders restricted to rows flagged invalid, one panel per eye.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_23_EYE[df_23_EYE['ET_ValidityLeft'] == 1] == -1, cmap='viridis')
# Name the row filter in the title: both panels used to carry the same label,
# so the left-eye and right-eye subsets could not be told apart.
plt.title('Heatmap of -1 Values (ET_ValidityLeft == 1)')

plt.subplot(1, 2, 2)
sns.heatmap(df_23_EYE[df_23_EYE['ET_ValidityRight'] == 1] == -1, cmap='viridis')
plt.title('Heatmap of -1 Values (ET_ValidityRight == 1)')

plt.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

In [None]:
df_23_EYE.drop(['ET_PupilLeft', 'ET_PupilRight'], axis=1, inplace=True)

In [None]:
df_23_EYE.head()

In [None]:
# Side-by-side boolean masks of the frame: cells equal to -1 (left panel)
# and cells equal to 1 (right panel).
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_23_EYE == -1, cmap='viridis')
plt.title('Heatmap of -1 Values')

plt.subplot(1, 2, 2)
sns.heatmap(df_23_EYE == 1, cmap='viridis')
# This panel shows the `== 1` mask, so the title names 1 (it used to repeat
# the -1 title from the left panel).
plt.title('Heatmap of 1 Values')

plt.tight_layout()
plt.show()

In [None]:
valid_left_ratio  = 1 - df_23_EYE['ET_ValidityLeft'].mean()
valid_right_ratio = 1 - df_23_EYE['ET_ValidityRight'].mean()

In [None]:
valid_left_ratio

In [None]:
valid_right_ratio

In [None]:
df_23_EYE.head()

In [None]:
# Numeric columns to chart; UnixTime is left out of the histograms.
numeric_cols = df_23_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid geometry: fixed width, row count rounded up (ceiling division) so that
# every column gets its own panel.
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df_23_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
df_23_EYE.columns

In [None]:
cols = ['ET_GazeLeftx', 'ET_GazeLefty',
       'ET_GazeRightx', 'ET_GazeRighty', 'ET_TimeSignal', 'ET_DistanceLeft',
       'ET_DistanceRight', 'ET_CameraLeftX', 'ET_CameraLeftY',
       'ET_CameraRightX', 'ET_CameraRightY', 'ET_ValidityLeft',
       'ET_ValidityRight']

In [None]:
from IPython.display import display, Markdown

# One full-size line chart per signal, each preceded by a rendered Markdown
# heading so the figures are easy to tell apart in the output.
for col in cols:
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_23_EYE['Timestamp'], df_23_EYE[col])
    ax.set_xlabel("Timestamp")
    ax.set_ylabel(col)
    plt.show()

In [None]:
# Every numeric column gets a boxplot (no column is excluded here).
numeric_cols = df_23_EYE.select_dtypes(include=np.number).columns

# Grid geometry: fixed width, row count rounded up (ceiling division).
n_cols = 4
n_rows = -(-len(numeric_cols) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(numeric_cols, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.boxplot(df_23_EYE[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')
    ax.set_xlabel(col)

fig.tight_layout()
plt.show()

In [None]:
df_23_EYE.replace({-1: np.nan}, inplace=True)

In [None]:
# Mean-impute the NaNs in every numeric column.
numeric_cols = df_23_EYE.select_dtypes(include=np.number).columns

for col in numeric_cols:
    # Assign the filled Series back to the frame. Calling
    # fillna(..., inplace=True) on `df[col]` is chained assignment: recent
    # pandas versions warn about it, and with Copy-on-Write enabled it only
    # modifies a temporary, leaving the frame unchanged.
    # Series.mean() skips NaN by default, so the fill value is the mean of
    # the observed values.
    df_23_EYE[col] = df_23_EYE[col].fillna(df_23_EYE[col].mean())

In [None]:
df_23_EYE.head()

In [None]:
# Numeric columns to chart; UnixTime is left out of the histograms.
numeric_cols = df_23_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid geometry: fixed width, row count rounded up (ceiling division) so that
# every column gets its own panel.
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df_23_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

# Eye Tracking Data Analysis

This section repeats the exploratory data analysis and cleaning of the student 23 eye-tracking data, with a short explanation before each step.

## Data Loading and Initial Inspection

In [None]:
%load_ext cudf

Import necessary libraries for data manipulation, analysis, and visualization.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Import libraries for datashader, which can be used for visualizing large datasets.

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

Set pandas display options to show all columns.

In [None]:
pd.set_option('display.max_columns', None)

Load the eye-tracking data from the specified CSV file into a pandas DataFrame.

In [None]:
df_23_EYE = pd.read_csv('data/STData/23/23_EYE.csv')

Display the first few rows of the DataFrame to get an initial look at the data structure and content.

In [None]:
df_23_EYE.head()

Check the dimensions (number of rows and columns) of the DataFrame.

In [None]:
df_23_EYE.shape

List the names of all columns in the DataFrame.

In [None]:
df_23_EYE.columns

Display concise information about the DataFrame, including the data types of each column and the number of non-null values. This helps identify columns with missing data.

In [None]:
df_23_EYE.info()

Calculate and display the number of missing values in each column.

In [None]:
df_23_EYE.isnull().sum()

Visualize the distribution of missing values using a heatmap. This provides a visual representation of which columns have missing data and the extent of missingness.

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_23_EYE.isnull(), cmap='viridis')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` column.
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.
- These missing values in `QuestionKey` require additional investigation and context-aware handling.

Display the unique values in the `QuestionKey` column to understand the different types of questions or states recorded.

In [None]:
df_23_EYE['QuestionKey'].unique()

Convert the `Timestamp` column to datetime objects. This is crucial for any time series analysis or manipulation.

In [None]:
df_23_EYE['Timestamp'] = pd.to_datetime(df_23_EYE['Timestamp'])

Display the first few rows again to see the effect of the timestamp conversion.

In [None]:
df_23_EYE.head(3)

Fill the missing values in the `QuestionKey` column with the string 'None'. This helps to explicitly mark periods where no question was active.

In [None]:
# Label rows with no active question explicitly. The result is assigned back
# because `df[col].fillna(..., inplace=True)` is chained assignment, which
# pandas warns about and which does not update the frame under Copy-on-Write.
df_23_EYE['QuestionKey'] = df_23_EYE['QuestionKey'].fillna('None')

Count the occurrences of each unique value in the `QuestionKey` column after filling missing values.

In [None]:
df_23_EYE['QuestionKey'].value_counts()

Display the heatmap of missing values again to confirm that the missing values in `QuestionKey` have been handled.

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_23_EYE.isnull(), cmap='viridis')
plt.show()

Recalculate and display the number of missing values per column to confirm the changes after handling `QuestionKey`.

In [None]:
df_23_EYE.isnull().sum()

Drop rows that still contain any missing values. This is done after handling `QuestionKey` separately.

In [None]:
df_23_EYE.dropna(inplace=True)

Display the first few rows after dropping rows with missing values.

In [None]:
df_23_EYE.head()

Display the unique values in the `Row` column.

In [None]:
df_23_EYE['Row'].unique()

Plot a histogram of the `Row` column to visualize its distribution.

In [None]:
plt.figure(figsize=(8,6))
sns.histplot(df_23_EYE['Row'])
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

Drop the `Row` column as it is not relevant for the analysis.

In [None]:
df_23_EYE.drop('Row', axis=1, inplace=True)

Display the unique values in the `ET_ValidityLeft` column.

In [None]:
df_23_EYE['ET_ValidityLeft'].unique()

Count the occurrences of each unique value in the `ET_ValidityLeft` column.

In [None]:
df_23_EYE['ET_ValidityLeft'].value_counts()

Display the unique values in the `ET_ValidityRight` column.

In [None]:
df_23_EYE['ET_ValidityRight'].unique()

Count the occurrences of each unique value in the `ET_ValidityRight` column.

In [None]:
df_23_EYE['ET_ValidityRight'].value_counts()

Visualize the counts of valid and invalid data for both left and right eyes using bar plots.

In [None]:
# Tally each validity code once per eye, then chart the tallies side by side.
left_counts = df_23_EYE['ET_ValidityLeft'].value_counts()
right_counts = df_23_EYE['ET_ValidityRight'].value_counts()

fig, (ax_left, ax_right) = plt.subplots(1, 2, figsize=(12, 6))

sns.barplot(x=left_counts.index, y=left_counts.values, ax=ax_left)
ax_left.set_title('Count of ET_ValidityLeft')
ax_left.set_xlabel('Validity')
ax_left.set_ylabel('Count')

sns.barplot(x=right_counts.index, y=right_counts.values, ax=ax_right)
ax_right.set_title('Count of ET_ValidityRight')
ax_right.set_xlabel('Validity')
ax_right.set_ylabel('Count')

fig.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

Define a mapping to convert validity values from `0.0` and `4.0` to `0` and `1`.

In [None]:
validity_map = {4.0: 1.0, 0.0: 0.0}

Apply the mapping to the `ET_ValidityLeft` and `ET_ValidityRight` columns and convert the data type to integer.

In [None]:
# Recode each validity flag through `validity_map` and store it as int8.
# NOTE(review): Series.map yields NaN for any value that is not a key of the
# mapping, and the int8 cast would then fail. This presumably relies on the
# columns holding only 0.0 and 4.0, and on the cell being run once per load
# (re-running it would feed already-recoded 1 values through the map);
# confirm against the value counts above.
df_23_EYE['ET_ValidityLeft'] = df_23_EYE['ET_ValidityLeft'].map(validity_map).astype(np.int8)
df_23_EYE['ET_ValidityRight'] = df_23_EYE['ET_ValidityRight'].map(validity_map).astype(np.int8)

Display the first few rows to see the updated validity columns.

In [None]:
df_23_EYE.head(3)

Display descriptive statistics for the DataFrame, including count, mean, standard deviation, min, max, and quartiles for numeric columns.

In [None]:
df_23_EYE.describe()

Check the number of rows where `ET_ValidityLeft` is 1 (invalid data for the left eye).

In [None]:
df_23_EYE[df_23_EYE['ET_ValidityLeft'] == 1].shape

Check the number of rows where `ET_ValidityRight` is 1 (invalid data for the right eye).

In [None]:
df_23_EYE[df_23_EYE['ET_ValidityRight'] == 1].shape

Visualize the distribution of -1 and 1 values across columns using heatmaps. This helps identify columns with placeholder or indicator values.

In [None]:
# Side-by-side boolean masks of the frame: cells equal to -1 (left) and cells equal to 1 (right).
plt.figure(figsize=(18, 8))

for panel, probe_value in enumerate((-1, 1), start=1):
    plt.subplot(1, 2, panel)
    sns.heatmap(df_23_EYE == probe_value, cmap='viridis')
    plt.title(f'Heatmap of {probe_value} Values')

plt.tight_layout()
plt.show()

Check the number of rows where `ET_PupilLeft` is -1.

In [None]:
df_23_EYE[df_23_EYE['ET_PupilLeft'] == -1].shape

Check the number of rows where `ET_PupilRight` is -1.

In [None]:
df_23_EYE[df_23_EYE['ET_PupilRight'] == -1].shape

Calculate the ratio of rows with -1 values in the `ET_PupilLeft` column to the total number of rows.

In [None]:
# Share of rows whose left-pupil reading is the -1 placeholder.
len(df_23_EYE[df_23_EYE['ET_PupilLeft'] == -1]) / len(df_23_EYE)

Calculate the ratio of rows with -1 values in the `ET_PupilRight` column to the total number of rows.

In [None]:
# Share of rows whose right-pupil reading is the -1 placeholder.
len(df_23_EYE[df_23_EYE['ET_PupilRight'] == -1]) / len(df_23_EYE)

Visualize the distribution of -1 values specifically for rows where validity is 1 (invalid data).

In [None]:
# For each eye, keep only the rows flagged invalid (validity == 1) and show where the -1
# placeholder occurs in them. Each title names the eye the panel was filtered on.
plt.figure(figsize=(18, 8))

for panel, validity_col in enumerate(['ET_ValidityLeft', 'ET_ValidityRight'], start=1):
    invalid_rows = df_23_EYE[df_23_EYE[validity_col] == 1]
    plt.subplot(1, 2, panel)
    if invalid_rows.empty:
        # An empty frame cannot be drawn as a heatmap; leave the panel blank but labelled.
        plt.title(f'No rows with {validity_col} == 1')
        continue
    sns.heatmap(invalid_rows == -1, cmap='viridis')
    plt.title(f'Heatmap of -1 Values ({validity_col} == 1)')

plt.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

Drop the `ET_PupilLeft` and `ET_PupilRight` columns due to the high percentage of invalid data.

In [None]:
# Remove both pupil-size columns from the frame in place.
df_23_EYE.drop(columns=['ET_PupilLeft', 'ET_PupilRight'], inplace=True)

Display the first few rows after dropping the pupil columns.

In [None]:
df_23_EYE.head()

Visualize the distribution of -1 and 1 values again after dropping the pupil columns.

In [None]:
# Side-by-side boolean masks of the frame: cells equal to -1 (left) and cells equal to 1 (right).
# Each panel's title is built from the value its mask tests for.
plt.figure(figsize=(18, 8))

for panel, probe_value in enumerate((-1, 1), start=1):
    plt.subplot(1, 2, panel)
    sns.heatmap(df_23_EYE == probe_value, cmap='viridis')
    plt.title(f'Heatmap of {probe_value} Values')

plt.tight_layout()
plt.show()

Calculate the ratio of valid data points for the left eye.

In [None]:
# ET_ValidityLeft was recoded to a 0/1 invalid flag, so its mean is the invalid share;
# the complement is the share of valid left-eye samples.
valid_left_ratio = 1.0 - df_23_EYE['ET_ValidityLeft'].mean()

Display the ratio of valid data points for the left eye.

In [None]:
valid_left_ratio

Calculate the ratio of valid data points for the right eye.

In [None]:
# ET_ValidityRight was recoded to a 0/1 invalid flag, so its mean is the invalid share;
# the complement is the share of valid right-eye samples.
valid_right_ratio = 1.0 - df_23_EYE['ET_ValidityRight'].mean()

Display the ratio of valid data points for the right eye.

In [None]:
valid_right_ratio

Display the first few rows of the DataFrame.

In [None]:
df_23_EYE.head()

Visualize the distributions of numeric columns using histograms with Kernel Density Estimate (KDE) plots. This helps understand the shape and spread of the data in each numeric column.

In [None]:
# Histogram + KDE for every numeric column except UnixTime.
numeric_cols = df_23_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Fixed-width grid; ceil-divide the panel count to get the number of rows.
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(cols_to_plot, start=1):
    plt.subplot(n_rows, n_cols, position)
    sns.histplot(df_23_EYE[col], kde=True)
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

List the names of the columns in the DataFrame.

In [None]:
df_23_EYE.columns

Define a list of columns to be plotted over time.

In [None]:
# Signals charted against the timestamp, in plotting order.
cols = [
    'ET_GazeLeftx',
    'ET_GazeLefty',
    'ET_GazeRightx',
    'ET_GazeRighty',
    'ET_TimeSignal',
    'ET_DistanceLeft',
    'ET_DistanceRight',
    'ET_CameraLeftX',
    'ET_CameraLeftY',
    'ET_CameraRightX',
    'ET_CameraRightY',
    'ET_ValidityLeft',
    'ET_ValidityRight',
]

Plot each of the selected columns against the timestamp to visualize their trends and patterns over time. A Markdown heading is rendered before each plot for better readability.

In [None]:
from IPython.display import Markdown, display

# One full-width line chart per entry of `cols`, each preceded by a rendered Markdown heading.
for signal in cols:
    display(Markdown(f'### {signal} over Time'))
    plt.figure(figsize=(16, 10))
    plt.plot(df_23_EYE['Timestamp'], df_23_EYE[signal])
    plt.xlabel("Timestamp")
    plt.ylabel(signal)
    plt.show()

Visualize the distributions of numeric columns using boxplots. This helps identify potential outliers and the spread of the data.

In [None]:
# Box plot for every numeric column; no column is excluded here.
numeric_cols = df_23_EYE.select_dtypes(include=np.number).columns

# Fixed-width grid; ceil-divide the panel count to get the number of rows.
n_cols = 4
n_rows = -(-len(numeric_cols) // n_cols)

plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(numeric_cols, start=1):
    plt.subplot(n_rows, n_cols, position)
    sns.boxplot(df_23_EYE[col])
    plt.title(f'Boxplot of {col}')
    plt.xlabel(col)

plt.tight_layout()
plt.show()

Replace the placeholder value -1 with NaN (Not a Number) in the DataFrame. This is done before imputation.

In [None]:
# Turn every -1 placeholder in the frame into NaN, in place.
df_23_EYE.replace(to_replace=-1, value=np.nan, inplace=True)

Impute the missing values (NaN) in the numeric columns with the mean of each column.

In [None]:
# Mean-impute the NaNs in every numeric column.
# The filled column is assigned back instead of calling fillna(inplace=True) on df[col]:
# df[col] may be a temporary object, in which case an in-place fill never reaches the frame
# (pandas warns about this pattern, and under Copy-on-Write it has no effect).
numeric_cols = df_23_EYE.select_dtypes(include=np.number).columns

for col in numeric_cols:
    df_23_EYE[col] = df_23_EYE[col].fillna(df_23_EYE[col].mean())

Display the first few rows after imputing missing values.

In [None]:
df_23_EYE.head()

Visualize the distributions of numeric columns again using histograms with KDE plots after imputation. This helps assess the impact of imputation on the data distributions.

In [None]:
# Histogram + KDE for every numeric column except UnixTime.
numeric_cols = df_23_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Fixed-width grid; ceil-divide the panel count to get the number of rows.
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(cols_to_plot, start=1):
    plt.subplot(n_rows, n_cols, position)
    sns.histplot(df_23_EYE[col], kde=True)
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

## Student 24

In [None]:
%load_ext cudf

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

In [None]:
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [None]:
# Read the Student 24 eye-tracking CSV (path is relative to the working directory).
df_24_EYE = pd.read_csv('data/STData/24/24_EYE.csv')

In [None]:
# First rows, for a quick look at the layout.
df_24_EYE.head()

In [None]:
# (rows, columns)
df_24_EYE.shape

In [None]:
# Column names.
df_24_EYE.columns

In [None]:
# Dtypes and non-null counts per column.
df_24_EYE.info()

In [None]:
# Number of nulls per column.
df_24_EYE.isnull().sum()

In [None]:
# Map of missing values: one heatmap row per sample, one heatmap column per field.
plt.figure(figsize=(12, 8))
null_mask = df_24_EYE.isna()
sns.heatmap(null_mask, cmap='viridis')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` column.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  

In [None]:
# Distinct QuestionKey values (nulls included).
df_24_EYE['QuestionKey'].unique()

In [None]:
# Parse the Timestamp column into pandas datetimes.
df_24_EYE['Timestamp'] = pd.to_datetime(df_24_EYE['Timestamp'])

In [None]:
# Confirm the conversion on the first three rows.
df_24_EYE.head(3)

In [None]:
df_24_EYE['QuestionKey'].fillna('None', inplace=True)

In [None]:
df_24_EYE['QuestionKey'].value_counts()

In [None]:
# Map of the missing values that remain: one heatmap row per sample, one column per field.
plt.figure(figsize=(12, 8))
null_mask = df_24_EYE.isna()
sns.heatmap(null_mask, cmap='viridis')
plt.show()

In [None]:
# Number of nulls per column.
df_24_EYE.isnull().sum()

In [None]:
# Drop, in place, every row that still contains a null.
df_24_EYE.dropna(inplace=True)

In [None]:
# First rows after the drop.
df_24_EYE.head()

In [None]:
# Distinct values of the Row column.
df_24_EYE['Row'].unique()

In [None]:
# Histogram of the Row column.
plt.figure(figsize=(8, 6))
row_values = df_24_EYE['Row']
sns.histplot(row_values)
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

In [None]:
# Remove the Row column in place.
df_24_EYE.drop('Row', axis=1, inplace=True)

In [None]:
# Distinct left-eye validity codes.
df_24_EYE['ET_ValidityLeft'].unique()

In [None]:
# Samples per left-eye validity code.
df_24_EYE['ET_ValidityLeft'].value_counts()

In [None]:
# Distinct right-eye validity codes.
df_24_EYE['ET_ValidityRight'].unique()

In [None]:
# Samples per right-eye validity code.
df_24_EYE['ET_ValidityRight'].value_counts()

In [None]:
# Bar chart of how often each validity code occurs, one panel per eye.
plt.figure(figsize=(12, 6))

for panel, validity_col in enumerate(['ET_ValidityLeft', 'ET_ValidityRight'], start=1):
    code_counts = df_24_EYE[validity_col].value_counts()
    plt.subplot(1, 2, panel)
    sns.barplot(x=code_counts.index, y=code_counts.values)
    plt.title(f'Count of {validity_col}')
    plt.xlabel('Validity')
    plt.ylabel('Count')

plt.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

In [None]:
validity_map = {4.0: 1.0, 0.0: 0.0}

In [None]:
df_24_EYE['ET_ValidityLeft'] = df_24_EYE['ET_ValidityLeft'].map(validity_map).astype(np.int8)
df_24_EYE['ET_ValidityRight'] = df_24_EYE['ET_ValidityRight'].map(validity_map).astype(np.int8)

In [None]:
# First three rows, to check the recoded validity columns.
df_24_EYE.head(3)

In [None]:
# Summary statistics of the numeric columns.
df_24_EYE.describe()

In [None]:
# Shape of the subset flagged invalid for the left eye (first entry = row count).
df_24_EYE[df_24_EYE['ET_ValidityLeft'] == 1].shape

In [None]:
# Shape of the subset flagged invalid for the right eye (first entry = row count).
df_24_EYE[df_24_EYE['ET_ValidityRight'] == 1].shape

In [None]:
# Side-by-side boolean masks of the frame: cells equal to -1 (left) and cells equal to 1 (right).
plt.figure(figsize=(18, 8))

for panel, probe_value in enumerate((-1, 1), start=1):
    plt.subplot(1, 2, panel)
    sns.heatmap(df_24_EYE == probe_value, cmap='viridis')
    plt.title(f'Heatmap of {probe_value} Values')

plt.tight_layout()
plt.show()

In [None]:
# Shape of the subset whose left-pupil reading is -1 (first entry = row count).
df_24_EYE[df_24_EYE['ET_PupilLeft'] == -1].shape

In [None]:
# Shape of the subset whose right-pupil reading is -1 (first entry = row count).
df_24_EYE[df_24_EYE['ET_PupilRight'] == -1].shape

In [None]:
# Share of all rows whose left-pupil reading is -1.
df_24_EYE[df_24_EYE['ET_PupilLeft'] == -1].shape[0] / df_24_EYE.shape[0]

In [None]:
# Share of all rows whose right-pupil reading is -1.
df_24_EYE[df_24_EYE['ET_PupilRight'] == -1].shape[0] / df_24_EYE.shape[0]

In [None]:
# For each eye, keep only the rows flagged invalid (validity == 1) and show where the -1
# placeholder occurs in them. Each title names the eye the panel was filtered on.
plt.figure(figsize=(18, 8))

for panel, validity_col in enumerate(['ET_ValidityLeft', 'ET_ValidityRight'], start=1):
    invalid_rows = df_24_EYE[df_24_EYE[validity_col] == 1]
    plt.subplot(1, 2, panel)
    if invalid_rows.empty:
        # An empty frame cannot be drawn as a heatmap; leave the panel blank but labelled.
        plt.title(f'No rows with {validity_col} == 1')
        continue
    sns.heatmap(invalid_rows == -1, cmap='viridis')
    plt.title(f'Heatmap of -1 Values ({validity_col} == 1)')

plt.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

In [None]:
df_24_EYE.drop(['ET_PupilLeft', 'ET_PupilRight'], axis=1, inplace=True)

In [None]:
df_24_EYE.head()

In [None]:
# Side-by-side boolean masks of the frame: cells equal to -1 (left) and cells equal to 1 (right).
# Each panel's title is built from the value its mask tests for.
plt.figure(figsize=(18, 8))

for panel, probe_value in enumerate((-1, 1), start=1):
    plt.subplot(1, 2, panel)
    sns.heatmap(df_24_EYE == probe_value, cmap='viridis')
    plt.title(f'Heatmap of {probe_value} Values')

plt.tight_layout()
plt.show()

In [None]:
valid_left_ratio  = 1 - df_24_EYE['ET_ValidityLeft'].mean()
valid_right_ratio = 1 - df_24_EYE['ET_ValidityRight'].mean()

In [None]:
valid_left_ratio

In [None]:
valid_right_ratio

In [None]:
df_24_EYE.head()

In [None]:
# Histogram + KDE for every numeric column except UnixTime.
numeric_cols = df_24_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Fixed-width grid; ceil-divide the panel count to get the number of rows.
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(cols_to_plot, start=1):
    plt.subplot(n_rows, n_cols, position)
    sns.histplot(df_24_EYE[col], kde=True)
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Column names of the current frame.
df_24_EYE.columns

In [None]:
# Signals charted against the timestamp, in plotting order.
cols = [
    'ET_GazeLeftx',
    'ET_GazeLefty',
    'ET_GazeRightx',
    'ET_GazeRighty',
    'ET_TimeSignal',
    'ET_DistanceLeft',
    'ET_DistanceRight',
    'ET_CameraLeftX',
    'ET_CameraLeftY',
    'ET_CameraRightX',
    'ET_CameraRightY',
    'ET_ValidityLeft',
    'ET_ValidityRight',
]

In [None]:
from IPython.display import Markdown, display

# One full-width line chart per entry of `cols`, each preceded by a rendered Markdown heading.
for signal in cols:
    display(Markdown(f'### {signal} over Time'))
    plt.figure(figsize=(16, 10))
    plt.plot(df_24_EYE['Timestamp'], df_24_EYE[signal])
    plt.xlabel("Timestamp")
    plt.ylabel(signal)
    plt.show()

In [None]:
# Box plot for every numeric column; no column is excluded here.
numeric_cols = df_24_EYE.select_dtypes(include=np.number).columns

# Fixed-width grid; ceil-divide the panel count to get the number of rows.
n_cols = 4
n_rows = -(-len(numeric_cols) // n_cols)

plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(numeric_cols, start=1):
    plt.subplot(n_rows, n_cols, position)
    sns.boxplot(df_24_EYE[col])
    plt.title(f'Boxplot of {col}')
    plt.xlabel(col)

plt.tight_layout()
plt.show()

In [None]:
# Turn every -1 placeholder in the frame into NaN, in place.
df_24_EYE.replace(to_replace=-1, value=np.nan, inplace=True)

In [None]:
numeric_cols = df_24_EYE.select_dtypes(include=np.number).columns

for col in numeric_cols:
    df_24_EYE[col].fillna(df_24_EYE[col].mean(), inplace=True)

In [None]:
df_24_EYE.head()

In [None]:
# Histogram + KDE for every numeric column except UnixTime.
numeric_cols = df_24_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Fixed-width grid; ceil-divide the panel count to get the number of rows.
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(cols_to_plot, start=1):
    plt.subplot(n_rows, n_cols, position)
    sns.histplot(df_24_EYE[col], kde=True)
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

# Eye Tracking Data Analysis

This section performs exploratory data analysis and cleaning on the eye-tracking data for Student 24 (`data/STData/24/24_EYE.csv`), with a short explanation before each step.

## Data Loading and Initial Inspection

In [None]:
%load_ext cudf

Import necessary libraries for data manipulation, analysis, and visualization.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Import libraries for datashader, which can be used for visualizing large datasets.

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

Set pandas display options to show all columns.

In [None]:
pd.set_option('display.max_columns', None)

Load the eye-tracking data from the specified CSV file into a pandas DataFrame.

In [None]:
df_24_EYE = pd.read_csv('data/STData/24/24_EYE.csv')

Display the first few rows of the DataFrame to get an initial look at the data structure and content.

In [None]:
df_24_EYE.head()

Check the dimensions (number of rows and columns) of the DataFrame.

In [None]:
df_24_EYE.shape

List the names of all columns in the DataFrame.

In [None]:
df_24_EYE.columns

Display concise information about the DataFrame, including the data types of each column and the number of non-null values. This helps identify columns with missing data.

In [None]:
df_24_EYE.info()

Calculate and display the number of missing values in each column.

In [None]:
df_24_EYE.isnull().sum()

Visualize the distribution of missing values using a heatmap. This provides a visual representation of which columns have missing data and the extent of missingness.

In [None]:
# Map of missing values: one heatmap row per sample, one heatmap column per field.
plt.figure(figsize=(12, 8))
null_mask = df_24_EYE.isna()
sns.heatmap(null_mask, cmap='viridis')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` column.
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.
- These missing values in `QuestionKey` require additional investigation and context-aware handling.

Display the unique values in the `QuestionKey` column to understand the different types of questions or states recorded.

In [None]:
df_24_EYE['QuestionKey'].unique()

Convert the `Timestamp` column to datetime objects. This is crucial for any time series analysis or manipulation.

In [None]:
df_24_EYE['Timestamp'] = pd.to_datetime(df_24_EYE['Timestamp'])

Display the first few rows again to see the effect of the timestamp conversion.

In [None]:
df_24_EYE.head(3)

Fill the missing values in the `QuestionKey` column with the string 'None'. This helps to explicitly mark periods where no question was active.

In [None]:
# Label samples with no QuestionKey explicitly as the string 'None'.
# The result is assigned back instead of calling fillna(inplace=True) on the column selection:
# that selection may be a temporary object, so an in-place fill can fail to reach the frame
# (pandas warns about this pattern, and under Copy-on-Write it has no effect).
df_24_EYE['QuestionKey'] = df_24_EYE['QuestionKey'].fillna('None')

Count the occurrences of each unique value in the `QuestionKey` column after filling missing values.

In [None]:
df_24_EYE['QuestionKey'].value_counts()

Display the heatmap of missing values again to confirm that the missing values in `QuestionKey` have been handled.

In [None]:
# Map of the missing values that remain: one heatmap row per sample, one column per field.
plt.figure(figsize=(12, 8))
null_mask = df_24_EYE.isna()
sns.heatmap(null_mask, cmap='viridis')
plt.show()

Recalculate and display the number of missing values per column to confirm the changes after handling `QuestionKey`.

In [None]:
df_24_EYE.isnull().sum()

Drop rows that still contain any missing values. This is done after handling `QuestionKey` separately.

In [None]:
# Drop, in place, every row that contains at least one null.
df_24_EYE.dropna(axis=0, how='any', inplace=True)

Display the first few rows after dropping rows with missing values.

In [None]:
df_24_EYE.head()

Display the unique values in the `Row` column.

In [None]:
df_24_EYE['Row'].unique()

Plot a histogram of the `Row` column to visualize its distribution.

In [None]:
# Histogram of the Row column.
plt.figure(figsize=(8, 6))
row_values = df_24_EYE['Row']
sns.histplot(row_values)
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

Drop the `Row` column as it is not relevant for the analysis.

In [None]:
# Remove the Row column in place.
df_24_EYE.drop(columns='Row', inplace=True)

Display the unique values in the `ET_ValidityLeft` column.

In [None]:
df_24_EYE['ET_ValidityLeft'].unique()

Count the occurrences of each unique value in the `ET_ValidityLeft` column.

In [None]:
df_24_EYE['ET_ValidityLeft'].value_counts()

Display the unique values in the `ET_ValidityRight` column.

In [None]:
df_24_EYE['ET_ValidityRight'].unique()

Count the occurrences of each unique value in the `ET_ValidityRight` column.

In [None]:
df_24_EYE['ET_ValidityRight'].value_counts()

Visualize the counts of valid and invalid data for both left and right eyes using bar plots.

In [None]:
# Bar chart of how often each validity code occurs, one panel per eye.
plt.figure(figsize=(12, 6))

for panel, validity_col in enumerate(['ET_ValidityLeft', 'ET_ValidityRight'], start=1):
    code_counts = df_24_EYE[validity_col].value_counts()
    plt.subplot(1, 2, panel)
    sns.barplot(x=code_counts.index, y=code_counts.values)
    plt.title(f'Count of {validity_col}')
    plt.xlabel('Validity')
    plt.ylabel('Count')

plt.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

Define a mapping to convert validity values from `0.0` and `4.0` to `0` and `1`.

In [None]:
# Raw tracker code -> binary flag: 0.0 stays 0.0, 4.0 becomes 1.0.
validity_map = {
    0.0: 0.0,
    4.0: 1.0,
}

Apply the mapping to the `ET_ValidityLeft` and `ET_ValidityRight` columns and convert the data type to integer.

In [None]:
# Recode both validity columns to a compact int8 0/1 flag using validity_map.
# A code that is missing from validity_map maps to NaN, which makes the integer cast fail
# (this also happens if the cell is re-run after the columns already hold 0/1), so check
# first and report the offending codes by name.
for validity_col in ['ET_ValidityLeft', 'ET_ValidityRight']:
    recoded = df_24_EYE[validity_col].map(validity_map)
    unmapped = recoded.isna()
    if unmapped.any():
        unexpected = df_24_EYE.loc[unmapped, validity_col].unique().tolist()
        raise ValueError(
            f'{validity_col} contains codes not covered by validity_map: {unexpected}'
        )
    df_24_EYE[validity_col] = recoded.astype(np.int8)

Display the first few rows to see the updated validity columns.

In [None]:
df_24_EYE.head(3)

Display descriptive statistics for the DataFrame, including count, mean, standard deviation, min, max, and quartiles for numeric columns.

In [None]:
df_24_EYE.describe()

Check the number of rows where `ET_ValidityLeft` is 1 (invalid data for the left eye).

In [None]:
df_24_EYE[df_24_EYE['ET_ValidityLeft'] == 1].shape

Check the number of rows where `ET_ValidityRight` is 1 (invalid data for the right eye).

In [None]:
df_24_EYE[df_24_EYE['ET_ValidityRight'] == 1].shape

Visualize the distribution of -1 and 1 values across columns using heatmaps. This helps identify columns with placeholder or indicator values.

In [None]:
# Side-by-side boolean masks of the frame: cells equal to -1 (left) and cells equal to 1 (right).
plt.figure(figsize=(18, 8))

for panel, probe_value in enumerate((-1, 1), start=1):
    plt.subplot(1, 2, panel)
    sns.heatmap(df_24_EYE == probe_value, cmap='viridis')
    plt.title(f'Heatmap of {probe_value} Values')

plt.tight_layout()
plt.show()

Check the number of rows where `ET_PupilLeft` is -1.

In [None]:
df_24_EYE[df_24_EYE['ET_PupilLeft'] == -1].shape

Check the number of rows where `ET_PupilRight` is -1.

In [None]:
df_24_EYE[df_24_EYE['ET_PupilRight'] == -1].shape

Calculate the ratio of rows with -1 values in the `ET_PupilLeft` column to the total number of rows.

In [None]:
# Share of rows whose left-pupil reading is the -1 placeholder.
len(df_24_EYE[df_24_EYE['ET_PupilLeft'] == -1]) / len(df_24_EYE)

Calculate the ratio of rows with -1 values in the `ET_PupilRight` column to the total number of rows.

In [None]:
# Share of rows whose right-pupil reading is the -1 placeholder.
len(df_24_EYE[df_24_EYE['ET_PupilRight'] == -1]) / len(df_24_EYE)

Visualize the distribution of -1 values specifically for rows where validity is 1 (invalid data).

In [None]:
# For each eye, keep only the rows flagged invalid (validity == 1) and show where the -1
# placeholder occurs in them. Each title names the eye the panel was filtered on.
plt.figure(figsize=(18, 8))

for panel, validity_col in enumerate(['ET_ValidityLeft', 'ET_ValidityRight'], start=1):
    invalid_rows = df_24_EYE[df_24_EYE[validity_col] == 1]
    plt.subplot(1, 2, panel)
    if invalid_rows.empty:
        # An empty frame cannot be drawn as a heatmap; leave the panel blank but labelled.
        plt.title(f'No rows with {validity_col} == 1')
        continue
    sns.heatmap(invalid_rows == -1, cmap='viridis')
    plt.title(f'Heatmap of -1 Values ({validity_col} == 1)')

plt.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

Drop the `ET_PupilLeft` and `ET_PupilRight` columns due to the high percentage of invalid data.

In [None]:
# Remove both pupil-size columns from the frame in place.
df_24_EYE.drop(columns=['ET_PupilLeft', 'ET_PupilRight'], inplace=True)

Display the first few rows after dropping the pupil columns.

In [None]:
df_24_EYE.head()

Visualize the distribution of -1 and 1 values again after dropping the pupil columns.

In [None]:
# Side-by-side boolean masks of the frame: cells equal to -1 (left) and cells equal to 1 (right).
# Each panel's title is built from the value its mask tests for.
plt.figure(figsize=(18, 8))

for panel, probe_value in enumerate((-1, 1), start=1):
    plt.subplot(1, 2, panel)
    sns.heatmap(df_24_EYE == probe_value, cmap='viridis')
    plt.title(f'Heatmap of {probe_value} Values')

plt.tight_layout()
plt.show()

Calculate the ratio of valid data points for the left eye.

In [None]:
# ET_ValidityLeft was recoded to a 0/1 invalid flag, so its mean is the invalid share;
# the complement is the share of valid left-eye samples.
valid_left_ratio = 1.0 - df_24_EYE['ET_ValidityLeft'].mean()

Display the ratio of valid data points for the left eye.

In [None]:
valid_left_ratio

Calculate the ratio of valid data points for the right eye.

In [None]:
# ET_ValidityRight was recoded to a 0/1 invalid flag, so its mean is the invalid share;
# the complement is the share of valid right-eye samples.
valid_right_ratio = 1.0 - df_24_EYE['ET_ValidityRight'].mean()

Display the ratio of valid data points for the right eye.

In [None]:
valid_right_ratio

Display the first few rows of the DataFrame.

In [None]:
df_24_EYE.head()

Visualize the distributions of numeric columns using histograms with Kernel Density Estimate (KDE) plots. This helps understand the shape and spread of the data in each numeric column.

In [None]:
# Histogram + KDE for every numeric column except UnixTime.
numeric_cols = df_24_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Fixed-width grid; ceil-divide the panel count to get the number of rows.
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(cols_to_plot, start=1):
    plt.subplot(n_rows, n_cols, position)
    sns.histplot(df_24_EYE[col], kde=True)
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

List the names of the columns in the DataFrame.

In [None]:
df_24_EYE.columns

Define a list of columns to be plotted over time.

In [None]:
# Signals charted against the timestamp, in plotting order.
cols = [
    'ET_GazeLeftx',
    'ET_GazeLefty',
    'ET_GazeRightx',
    'ET_GazeRighty',
    'ET_TimeSignal',
    'ET_DistanceLeft',
    'ET_DistanceRight',
    'ET_CameraLeftX',
    'ET_CameraLeftY',
    'ET_CameraRightX',
    'ET_CameraRightY',
    'ET_ValidityLeft',
    'ET_ValidityRight',
]

Plot each of the selected columns against the timestamp to visualize their trends and patterns over time. A Markdown heading is rendered before each plot for better readability.

In [None]:
from IPython.display import Markdown, display

# One full-width line chart per entry of `cols`, each preceded by a rendered Markdown heading.
for signal in cols:
    display(Markdown(f'### {signal} over Time'))
    plt.figure(figsize=(16, 10))
    plt.plot(df_24_EYE['Timestamp'], df_24_EYE[signal])
    plt.xlabel("Timestamp")
    plt.ylabel(signal)
    plt.show()

Visualize the distributions of numeric columns using boxplots. This helps identify potential outliers and the spread of the data.

In [None]:
# Box plot for every numeric column; no column is excluded here.
numeric_cols = df_24_EYE.select_dtypes(include=np.number).columns

# Fixed-width grid; ceil-divide the panel count to get the number of rows.
n_cols = 4
n_rows = -(-len(numeric_cols) // n_cols)

plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(numeric_cols, start=1):
    plt.subplot(n_rows, n_cols, position)
    sns.boxplot(df_24_EYE[col])
    plt.title(f'Boxplot of {col}')
    plt.xlabel(col)

plt.tight_layout()
plt.show()

Replace the placeholder value -1 with NaN (Not a Number) in the DataFrame. This is done before imputation.

In [None]:
# Turn every -1 placeholder in the frame into NaN, in place.
df_24_EYE.replace(to_replace=-1, value=np.nan, inplace=True)

Impute the missing values (NaN) in the numeric columns with the mean of each column.

In [None]:
# Mean-impute the NaNs in every numeric column.
# The filled column is assigned back instead of calling fillna(inplace=True) on df[col]:
# df[col] may be a temporary object, in which case an in-place fill never reaches the frame
# (pandas warns about this pattern, and under Copy-on-Write it has no effect).
numeric_cols = df_24_EYE.select_dtypes(include=np.number).columns

for col in numeric_cols:
    df_24_EYE[col] = df_24_EYE[col].fillna(df_24_EYE[col].mean())

Display the first few rows after imputing missing values.

In [None]:
df_24_EYE.head()

Visualize the distributions of numeric columns again using histograms with KDE plots after imputation. This helps assess the impact of imputation on the data distributions.

In [None]:
# Histogram + KDE for every numeric column except UnixTime.
numeric_cols = df_24_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Fixed-width grid; ceil-divide the panel count to get the number of rows.
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(cols_to_plot, start=1):
    plt.subplot(n_rows, n_cols, position)
    sns.histplot(df_24_EYE[col], kde=True)
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

## Student 25

In [None]:
%load_ext cudf

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
df_25_EYE = pd.read_csv('data/STData/25/25_EYE.csv')

In [None]:
df_25_EYE.head()

In [None]:
df_25_EYE.shape

In [None]:
df_25_EYE.columns

In [None]:
df_25_EYE.info()

In [None]:
df_25_EYE.isnull().sum()

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_25_EYE.isnull(), cmap='viridis')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` column.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  

In [None]:
df_25_EYE['QuestionKey'].unique()

In [None]:
df_25_EYE['Timestamp'] = pd.to_datetime(df_25_EYE['Timestamp'])

In [None]:
df_25_EYE.head(3)

In [None]:
# Replace missing QuestionKey values with the literal string 'None'.
# Assign back rather than calling fillna(..., inplace=True) on the column
# selection: that is chained assignment, which pandas warns about and which
# leaves the frame unchanged under Copy-on-Write.
df_25_EYE['QuestionKey'] = df_25_EYE['QuestionKey'].fillna('None')

In [None]:
df_25_EYE['QuestionKey'].value_counts()

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_25_EYE.isnull(), cmap='viridis')
plt.show()

In [None]:
df_25_EYE.isnull().sum()

In [None]:
df_25_EYE.dropna(inplace=True)

In [None]:
df_25_EYE.head()

In [None]:
df_25_EYE['Row'].unique()

In [None]:
plt.figure(figsize=(8,6))
sns.histplot(df_25_EYE['Row'])
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

In [None]:
df_25_EYE.drop('Row', axis=1, inplace=True)

In [None]:
df_25_EYE['ET_ValidityLeft'].unique()

In [None]:
df_25_EYE['ET_ValidityLeft'].value_counts()

In [None]:
df_25_EYE['ET_ValidityRight'].unique()

In [None]:
df_25_EYE['ET_ValidityRight'].value_counts()

In [None]:
# One bar chart per eye showing how often each validity value occurs
fig = plt.figure(figsize=(12, 6))
panels = (
    ('ET_ValidityLeft', 'Count of ET_ValidityLeft'),
    ('ET_ValidityRight', 'Count of ET_ValidityRight'),
)
for slot, (validity_col, panel_title) in enumerate(panels, start=1):
    code_counts = df_25_EYE[validity_col].value_counts()
    ax = fig.add_subplot(1, 2, slot)
    sns.barplot(x=code_counts.index, y=code_counts.values, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('Validity')
    ax.set_ylabel('Count')

fig.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

In [None]:
validity_map = {4.0: 1.0, 0.0: 0.0}

In [None]:
# Recode both validity columns through validity_map and store them as int8
for validity_col in ('ET_ValidityLeft', 'ET_ValidityRight'):
    recoded = df_25_EYE[validity_col].map(validity_map)
    df_25_EYE[validity_col] = recoded.astype(np.int8)

In [None]:
df_25_EYE.head(3)

In [None]:
df_25_EYE.describe()

In [None]:
df_25_EYE[df_25_EYE['ET_ValidityLeft'] == 1].shape

In [None]:
df_25_EYE[df_25_EYE['ET_ValidityRight'] == 1].shape

In [None]:
# Two boolean masks of the whole frame drawn side by side:
# cells equal to -1 on the left, cells equal to 1 on the right.
fig = plt.figure(figsize=(18, 8))
panels = (
    (-1, 'Heatmap of -1 Values'),
    (1, 'Heatmap of 1 Values'),
)
for slot, (target, panel_title) in enumerate(panels, start=1):
    ax = fig.add_subplot(1, 2, slot)
    sns.heatmap(df_25_EYE == target, cmap='viridis', ax=ax)
    ax.set_title(panel_title)

fig.tight_layout()
plt.show()

In [None]:
df_25_EYE[df_25_EYE['ET_PupilLeft'] == -1].shape

In [None]:
df_25_EYE[df_25_EYE['ET_PupilRight'] == -1].shape

In [None]:
df_25_EYE[df_25_EYE['ET_PupilLeft'] == -1].shape[0] / df_25_EYE.shape[0]

In [None]:
df_25_EYE[df_25_EYE['ET_PupilRight'] == -1].shape[0] / df_25_EYE.shape[0]

In [None]:
# Where do -1 values occur within the rows flagged invalid (validity == 1)?
# Left panel: rows with ET_ValidityLeft == 1; right panel: ET_ValidityRight == 1.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_25_EYE[df_25_EYE['ET_ValidityLeft'] == 1] == -1, cmap='viridis')
# Name the row subset in the title so the two panels can be told apart
plt.title('Heatmap of -1 Values (ET_ValidityLeft == 1)')

plt.subplot(1, 2, 2)
sns.heatmap(df_25_EYE[df_25_EYE['ET_ValidityRight'] == 1] == -1, cmap='viridis')
plt.title('Heatmap of -1 Values (ET_ValidityRight == 1)')

plt.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

In [None]:
df_25_EYE.drop(['ET_PupilLeft', 'ET_PupilRight'], axis=1, inplace=True)

In [None]:
df_25_EYE.head()

In [None]:
# Side-by-side masks of where the frame equals -1 (left) and 1 (right)
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_25_EYE == -1, cmap='viridis')
plt.title('Heatmap of -1 Values')

plt.subplot(1, 2, 2)
sns.heatmap(df_25_EYE == 1, cmap='viridis')
# This panel plots the == 1 mask, so the title must say 1 (it previously said -1)
plt.title('Heatmap of 1 Values')

plt.tight_layout()
plt.show()

In [None]:
valid_left_ratio  = 1 - df_25_EYE['ET_ValidityLeft'].mean()
valid_right_ratio = 1 - df_25_EYE['ET_ValidityRight'].mean()

In [None]:
valid_left_ratio

In [None]:
valid_right_ratio

In [None]:
df_25_EYE.head()

In [None]:
# Histogram + KDE for each numeric column; UnixTime is left out of the grid
numeric_cols = df_25_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Four panels per row; ceiling division gives the number of rows required
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))
for slot, name in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, slot)
    sns.histplot(df_25_EYE[name], kde=True, ax=ax)
    ax.set_title(f'Distribution of {name}')
    ax.set_xlabel(name)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
df_25_EYE.columns

In [None]:
cols = ['ET_GazeLeftx', 'ET_GazeLefty',
       'ET_GazeRightx', 'ET_GazeRighty', 'ET_TimeSignal', 'ET_DistanceLeft',
       'ET_DistanceRight', 'ET_CameraLeftX', 'ET_CameraLeftY',
       'ET_CameraRightX', 'ET_CameraRightY', 'ET_ValidityLeft',
       'ET_ValidityRight']

In [None]:
from IPython.display import Markdown, display

# One full-size line plot per selected column, each preceded by a rendered
# Markdown heading so the figures are easy to tell apart in the output.
timestamps = df_25_EYE['Timestamp']
for series_name in cols:
    display(Markdown(f'### {series_name} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(timestamps, df_25_EYE[series_name])
    ax.set_xlabel("Timestamp")
    ax.set_ylabel(series_name)
    plt.show()

In [None]:
# Boxplots of every numeric column to inspect spread and potential outliers.
# No columns are excluded in this cell.
numeric_cols = df_25_EYE.select_dtypes(include=np.number).columns

# Grid layout: fixed number of columns, as many rows as needed (ceiling division)
n_cols = 4  # You can adjust the number of columns as needed
n_rows = (len(numeric_cols) + n_cols - 1) // n_cols

plt.figure(figsize=(n_cols * 5, n_rows * 4)) # Adjust figure size as needed

for i, col in enumerate(numeric_cols):
    plt.subplot(n_rows, n_cols, i + 1)
    # Pass the values explicitly as x: newer seaborn releases treat the first
    # positional argument as `data`, which can put the values on the y axis
    # while the x axis is the one labelled below.
    sns.boxplot(x=df_25_EYE[col])
    plt.title(f'Boxplot of {col}')
    plt.xlabel(col)

plt.tight_layout()
plt.show()

In [None]:
df_25_EYE.replace({-1: np.nan}, inplace=True)

In [None]:
# Fill the NaNs in each numeric column with that column's mean.
numeric_cols = df_25_EYE.select_dtypes(include=np.number).columns

# Assign the filled column back instead of calling fillna(..., inplace=True)
# on df[col]: the in-place call on a column selection is chained assignment,
# which pandas warns about and which leaves the frame unchanged under
# Copy-on-Write.
for col in numeric_cols:
    df_25_EYE[col] = df_25_EYE[col].fillna(df_25_EYE[col].mean())

In [None]:
df_25_EYE.head()

In [None]:
# Histogram + KDE for each numeric column; UnixTime is left out of the grid
numeric_cols = df_25_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Four panels per row; ceiling division gives the number of rows required
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))
for slot, name in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, slot)
    sns.histplot(df_25_EYE[name], kde=True, ax=ax)
    ax.set_title(f'Distribution of {name}')
    ax.set_xlabel(name)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

# Eye Tracking Data Analysis

This notebook performs exploratory data analysis and cleaning on eye-tracking data.

## Data Loading and Initial Inspection

In [None]:
%load_ext cudf

Import necessary libraries for data manipulation, analysis, and visualization.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Import libraries for datashader, which can be used for visualizing large datasets.

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

Set pandas display options to show all columns.

In [None]:
pd.set_option('display.max_columns', None)

Load the eye-tracking data from the specified CSV file into a pandas DataFrame.

In [None]:
df_25_EYE = pd.read_csv('data/STData/25/25_EYE.csv')

Display the first few rows of the DataFrame to get an initial look at the data structure and content.

In [None]:
df_25_EYE.head()

Check the dimensions (number of rows and columns) of the DataFrame.

In [None]:
df_25_EYE.shape

List the names of all columns in the DataFrame.

In [None]:
df_25_EYE.columns

Display concise information about the DataFrame, including the data types of each column and the number of non-null values. This helps identify columns with missing data.

In [None]:
df_25_EYE.info()

Calculate and display the number of missing values in each column.

In [None]:
df_25_EYE.isnull().sum()

Visualize the distribution of missing values using a heatmap. This provides a visual representation of which columns have missing data and the extent of missingness.

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_25_EYE.isnull(), cmap='viridis')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` column.
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.
- These missing values in `QuestionKey` require additional investigation and context-aware handling.

Display the unique values in the `QuestionKey` column to understand the different types of questions or states recorded.

In [None]:
df_25_EYE['QuestionKey'].unique()

Convert the `Timestamp` column to datetime objects. This is crucial for any time series analysis or manipulation.

In [None]:
df_25_EYE['Timestamp'] = pd.to_datetime(df_25_EYE['Timestamp'])

Display the first few rows again to see the effect of the timestamp conversion.

In [None]:
df_25_EYE.head(3)

Fill the missing values in the `QuestionKey` column with the string 'None'. This helps to explicitly mark periods where no question was active.

In [None]:
# Replace missing QuestionKey values with the literal string 'None'.
# Assign back rather than calling fillna(..., inplace=True) on the column
# selection: that is chained assignment, which pandas warns about and which
# leaves the frame unchanged under Copy-on-Write.
df_25_EYE['QuestionKey'] = df_25_EYE['QuestionKey'].fillna('None')

Count the occurrences of each unique value in the `QuestionKey` column after filling missing values.

In [None]:
df_25_EYE['QuestionKey'].value_counts()

Display the heatmap of missing values again to confirm that the missing values in `QuestionKey` have been handled.

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_25_EYE.isnull(), cmap='viridis')
plt.show()

Recalculate and display the number of missing values per column to confirm the changes after handling `QuestionKey`.

In [None]:
df_25_EYE.isnull().sum()

Drop rows that still contain any missing values. This is done after handling `QuestionKey` separately.

In [None]:
df_25_EYE.dropna(inplace=True)

Display the first few rows after dropping rows with missing values.

In [None]:
df_25_EYE.head()

Display the unique values in the `Row` column.

In [None]:
df_25_EYE['Row'].unique()

Plot a histogram of the `Row` column to visualize its distribution.

In [None]:
plt.figure(figsize=(8,6))
sns.histplot(df_25_EYE['Row'])
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

Drop the `Row` column as it is not relevant for the analysis.

In [None]:
df_25_EYE.drop('Row', axis=1, inplace=True)

Display the unique values in the `ET_ValidityLeft` column.

In [None]:
df_25_EYE['ET_ValidityLeft'].unique()

Count the occurrences of each unique value in the `ET_ValidityLeft` column.

In [None]:
df_25_EYE['ET_ValidityLeft'].value_counts()

Display the unique values in the `ET_ValidityRight` column.

In [None]:
df_25_EYE['ET_ValidityRight'].unique()

Count the occurrences of each unique value in the `ET_ValidityRight` column.

In [None]:
df_25_EYE['ET_ValidityRight'].value_counts()

Visualize the counts of valid and invalid data for both left and right eyes using bar plots.

In [None]:
# One bar chart per eye showing how often each validity value occurs
fig = plt.figure(figsize=(12, 6))
panels = (
    ('ET_ValidityLeft', 'Count of ET_ValidityLeft'),
    ('ET_ValidityRight', 'Count of ET_ValidityRight'),
)
for slot, (validity_col, panel_title) in enumerate(panels, start=1):
    code_counts = df_25_EYE[validity_col].value_counts()
    ax = fig.add_subplot(1, 2, slot)
    sns.barplot(x=code_counts.index, y=code_counts.values, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('Validity')
    ax.set_ylabel('Count')

fig.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

Define a mapping to convert validity values from `0.0` and `4.0` to `0` and `1`.

In [None]:
validity_map = {4.0: 1.0, 0.0: 0.0}

Apply the mapping to the `ET_ValidityLeft` and `ET_ValidityRight` columns and convert the data type to integer.

In [None]:
# Recode both validity columns through validity_map and store them as int8
for validity_col in ('ET_ValidityLeft', 'ET_ValidityRight'):
    recoded = df_25_EYE[validity_col].map(validity_map)
    df_25_EYE[validity_col] = recoded.astype(np.int8)

Display the first few rows to see the updated validity columns.

In [None]:
df_25_EYE.head(3)

Display descriptive statistics for the DataFrame, including count, mean, standard deviation, min, max, and quartiles for numeric columns.

In [None]:
df_25_EYE.describe()

Check the number of rows where `ET_ValidityLeft` is 1 (invalid data for the left eye).

In [None]:
df_25_EYE[df_25_EYE['ET_ValidityLeft'] == 1].shape

Check the number of rows where `ET_ValidityRight` is 1 (invalid data for the right eye).

In [None]:
df_25_EYE[df_25_EYE['ET_ValidityRight'] == 1].shape

Visualize the distribution of -1 and 1 values across columns using heatmaps. This helps identify columns with placeholder or indicator values.

In [None]:
# Two boolean masks of the whole frame drawn side by side:
# cells equal to -1 on the left, cells equal to 1 on the right.
fig = plt.figure(figsize=(18, 8))
panels = (
    (-1, 'Heatmap of -1 Values'),
    (1, 'Heatmap of 1 Values'),
)
for slot, (target, panel_title) in enumerate(panels, start=1):
    ax = fig.add_subplot(1, 2, slot)
    sns.heatmap(df_25_EYE == target, cmap='viridis', ax=ax)
    ax.set_title(panel_title)

fig.tight_layout()
plt.show()

Check the number of rows where `ET_PupilLeft` is -1.

In [None]:
df_25_EYE[df_25_EYE['ET_PupilLeft'] == -1].shape

Check the number of rows where `ET_PupilRight` is -1.

In [None]:
df_25_EYE[df_25_EYE['ET_PupilRight'] == -1].shape

Calculate the ratio of rows with -1 values in the `ET_PupilLeft` column to the total number of rows.

In [None]:
df_25_EYE[df_25_EYE['ET_PupilLeft'] == -1].shape[0] / df_25_EYE.shape[0]

Calculate the ratio of rows with -1 values in the `ET_PupilRight` column to the total number of rows.

In [None]:
df_25_EYE[df_25_EYE['ET_PupilRight'] == -1].shape[0] / df_25_EYE.shape[0]

Visualize the distribution of -1 values specifically for rows where validity is 1 (invalid data).

In [None]:
# Where do -1 values occur within the rows flagged invalid (validity == 1)?
# Left panel: rows with ET_ValidityLeft == 1; right panel: ET_ValidityRight == 1.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_25_EYE[df_25_EYE['ET_ValidityLeft'] == 1] == -1, cmap='viridis')
# Name the row subset in the title so the two panels can be told apart
plt.title('Heatmap of -1 Values (ET_ValidityLeft == 1)')

plt.subplot(1, 2, 2)
sns.heatmap(df_25_EYE[df_25_EYE['ET_ValidityRight'] == 1] == -1, cmap='viridis')
plt.title('Heatmap of -1 Values (ET_ValidityRight == 1)')

plt.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

Drop the `ET_PupilLeft` and `ET_PupilRight` columns due to the high percentage of invalid data.

In [None]:
df_25_EYE.drop(['ET_PupilLeft', 'ET_PupilRight'], axis=1, inplace=True)

Display the first few rows after dropping the pupil columns.

In [None]:
df_25_EYE.head()

Visualize the distribution of -1 and 1 values again after dropping the pupil columns.

In [None]:
# Side-by-side masks of where the frame equals -1 (left) and 1 (right)
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_25_EYE == -1, cmap='viridis')
plt.title('Heatmap of -1 Values')

plt.subplot(1, 2, 2)
sns.heatmap(df_25_EYE == 1, cmap='viridis')
# This panel plots the == 1 mask, so the title must say 1 (it previously said -1)
plt.title('Heatmap of 1 Values')

plt.tight_layout()
plt.show()

Calculate the ratio of valid data points for the left eye.

In [None]:
valid_left_ratio  = 1 - df_25_EYE['ET_ValidityLeft'].mean()

Display the ratio of valid data points for the left eye.

In [None]:
valid_left_ratio

Calculate the ratio of valid data points for the right eye.

In [None]:
valid_right_ratio = 1 - df_25_EYE['ET_ValidityRight'].mean()

Display the ratio of valid data points for the right eye.

In [None]:
valid_right_ratio

Display the first few rows of the DataFrame.

In [None]:
df_25_EYE.head()

Visualize the distributions of numeric columns using histograms with Kernel Density Estimate (KDE) plots. This helps understand the shape and spread of the data in each numeric column.

In [None]:
# Histogram + KDE for each numeric column; UnixTime is left out of the grid
numeric_cols = df_25_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Four panels per row; ceiling division gives the number of rows required
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))
for slot, name in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, slot)
    sns.histplot(df_25_EYE[name], kde=True, ax=ax)
    ax.set_title(f'Distribution of {name}')
    ax.set_xlabel(name)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

List the names of the columns in the DataFrame.

In [None]:
df_25_EYE.columns

Define a list of columns to be plotted over time.

In [None]:
cols = ['ET_GazeLeftx', 'ET_GazeLefty',
       'ET_GazeRightx', 'ET_GazeRighty', 'ET_TimeSignal', 'ET_DistanceLeft',
       'ET_DistanceRight', 'ET_CameraLeftX', 'ET_CameraLeftY',
       'ET_CameraRightX', 'ET_CameraRightY', 'ET_ValidityLeft',
       'ET_ValidityRight']

Plot each of the selected columns against the timestamp to visualize their trends and patterns over time. A Markdown heading is rendered before each plot for better readability.

In [None]:
from IPython.display import Markdown, display

# One full-size line plot per selected column, each preceded by a rendered
# Markdown heading so the figures are easy to tell apart in the output.
timestamps = df_25_EYE['Timestamp']
for series_name in cols:
    display(Markdown(f'### {series_name} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(timestamps, df_25_EYE[series_name])
    ax.set_xlabel("Timestamp")
    ax.set_ylabel(series_name)
    plt.show()

Visualize the distributions of numeric columns using boxplots. This helps identify potential outliers and the spread of the data.

In [None]:
# Boxplots of every numeric column to inspect spread and potential outliers.
# No columns are excluded in this cell.
numeric_cols = df_25_EYE.select_dtypes(include=np.number).columns

# Grid layout: fixed number of columns, as many rows as needed (ceiling division)
n_cols = 4  # You can adjust the number of columns as needed
n_rows = (len(numeric_cols) + n_cols - 1) // n_cols

plt.figure(figsize=(n_cols * 5, n_rows * 4)) # Adjust figure size as needed

for i, col in enumerate(numeric_cols):
    plt.subplot(n_rows, n_cols, i + 1)
    # Pass the values explicitly as x: newer seaborn releases treat the first
    # positional argument as `data`, which can put the values on the y axis
    # while the x axis is the one labelled below.
    sns.boxplot(x=df_25_EYE[col])
    plt.title(f'Boxplot of {col}')
    plt.xlabel(col)

plt.tight_layout()
plt.show()

Replace the placeholder value -1 with NaN (Not a Number) in the DataFrame. This is done before imputation.

In [None]:
df_25_EYE.replace({-1: np.nan}, inplace=True)

Impute the missing values (NaN) in the numeric columns with the mean of each column.

In [None]:
# Fill the NaNs in each numeric column with that column's mean.
numeric_cols = df_25_EYE.select_dtypes(include=np.number).columns

# Assign the filled column back instead of calling fillna(..., inplace=True)
# on df[col]: the in-place call on a column selection is chained assignment,
# which pandas warns about and which leaves the frame unchanged under
# Copy-on-Write.
for col in numeric_cols:
    df_25_EYE[col] = df_25_EYE[col].fillna(df_25_EYE[col].mean())

Display the first few rows after imputing missing values.

In [None]:
df_25_EYE.head()

Visualize the distributions of numeric columns again using histograms with KDE plots after imputation. This helps assess the impact of imputation on the data distributions.

In [None]:
# Histogram + KDE for each numeric column; UnixTime is left out of the grid
numeric_cols = df_25_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Four panels per row; ceiling division gives the number of rows required
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))
for slot, name in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, slot)
    sns.histplot(df_25_EYE[name], kde=True, ax=ax)
    ax.set_title(f'Distribution of {name}')
    ax.set_xlabel(name)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

## Student 26

In [None]:
%load_ext cudf

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
df_26_EYE = pd.read_csv('data/STData/26/26_EYE.csv')

In [None]:
df_26_EYE.head()

In [None]:
df_26_EYE.shape

In [None]:
df_26_EYE.columns

In [None]:
df_26_EYE.info()

In [None]:
df_26_EYE.isnull().sum()

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_26_EYE.isnull(), cmap='viridis')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` column.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  

In [None]:
df_26_EYE['QuestionKey'].unique()

In [None]:
df_26_EYE['Timestamp'] = pd.to_datetime(df_26_EYE['Timestamp'])

In [None]:
df_26_EYE.head(3)

In [None]:
# Replace missing QuestionKey values with the literal string 'None'.
# Assign back rather than calling fillna(..., inplace=True) on the column
# selection: that is chained assignment, which pandas warns about and which
# leaves the frame unchanged under Copy-on-Write.
df_26_EYE['QuestionKey'] = df_26_EYE['QuestionKey'].fillna('None')

In [None]:
df_26_EYE['QuestionKey'].value_counts()

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_26_EYE.isnull(), cmap='viridis')
plt.show()

In [None]:
df_26_EYE.isnull().sum()

In [None]:
df_26_EYE.dropna(inplace=True)

In [None]:
df_26_EYE.head()

In [None]:
df_26_EYE['Row'].unique()

In [None]:
plt.figure(figsize=(8,6))
sns.histplot(df_26_EYE['Row'])
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

In [None]:
df_26_EYE.drop('Row', axis=1, inplace=True)

In [None]:
df_26_EYE['ET_ValidityLeft'].unique()

In [None]:
df_26_EYE['ET_ValidityLeft'].value_counts()

In [None]:
df_26_EYE['ET_ValidityRight'].unique()

In [None]:
df_26_EYE['ET_ValidityRight'].value_counts()

In [None]:
# One bar chart per eye showing how often each validity value occurs
fig = plt.figure(figsize=(12, 6))
panels = (
    ('ET_ValidityLeft', 'Count of ET_ValidityLeft'),
    ('ET_ValidityRight', 'Count of ET_ValidityRight'),
)
for slot, (validity_col, panel_title) in enumerate(panels, start=1):
    code_counts = df_26_EYE[validity_col].value_counts()
    ax = fig.add_subplot(1, 2, slot)
    sns.barplot(x=code_counts.index, y=code_counts.values, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('Validity')
    ax.set_ylabel('Count')

fig.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

In [None]:
validity_map = {4.0: 1.0, 0.0: 0.0}

In [None]:
# Recode both validity columns through validity_map and store them as int8
for validity_col in ('ET_ValidityLeft', 'ET_ValidityRight'):
    recoded = df_26_EYE[validity_col].map(validity_map)
    df_26_EYE[validity_col] = recoded.astype(np.int8)

In [None]:
df_26_EYE.head(3)

In [None]:
df_26_EYE.describe()

In [None]:
df_26_EYE[df_26_EYE['ET_ValidityLeft'] == 1].shape

In [None]:
df_26_EYE[df_26_EYE['ET_ValidityRight'] == 1].shape

In [None]:
# Two boolean masks of the whole frame drawn side by side:
# cells equal to -1 on the left, cells equal to 1 on the right.
fig = plt.figure(figsize=(18, 8))
panels = (
    (-1, 'Heatmap of -1 Values'),
    (1, 'Heatmap of 1 Values'),
)
for slot, (target, panel_title) in enumerate(panels, start=1):
    ax = fig.add_subplot(1, 2, slot)
    sns.heatmap(df_26_EYE == target, cmap='viridis', ax=ax)
    ax.set_title(panel_title)

fig.tight_layout()
plt.show()

In [None]:
df_26_EYE[df_26_EYE['ET_PupilLeft'] == -1].shape

In [None]:
df_26_EYE[df_26_EYE['ET_PupilRight'] == -1].shape

In [None]:
df_26_EYE[df_26_EYE['ET_PupilLeft'] == -1].shape[0] / df_26_EYE.shape[0]

In [None]:
df_26_EYE[df_26_EYE['ET_PupilRight'] == -1].shape[0] / df_26_EYE.shape[0]

In [None]:
# Where do -1 values occur within the rows flagged invalid (validity == 1)?
# Left panel: rows with ET_ValidityLeft == 1; right panel: ET_ValidityRight == 1.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_26_EYE[df_26_EYE['ET_ValidityLeft'] == 1] == -1, cmap='viridis')
# Name the row subset in the title so the two panels can be told apart
plt.title('Heatmap of -1 Values (ET_ValidityLeft == 1)')

plt.subplot(1, 2, 2)
sns.heatmap(df_26_EYE[df_26_EYE['ET_ValidityRight'] == 1] == -1, cmap='viridis')
plt.title('Heatmap of -1 Values (ET_ValidityRight == 1)')

plt.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

In [None]:
df_26_EYE.drop(['ET_PupilLeft', 'ET_PupilRight'], axis=1, inplace=True)

In [None]:
df_26_EYE.head()

In [None]:
# Side-by-side masks of where the frame equals -1 (left) and 1 (right)
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_26_EYE == -1, cmap='viridis')
plt.title('Heatmap of -1 Values')

plt.subplot(1, 2, 2)
sns.heatmap(df_26_EYE == 1, cmap='viridis')
# This panel plots the == 1 mask, so the title must say 1 (it previously said -1)
plt.title('Heatmap of 1 Values')

plt.tight_layout()
plt.show()

In [None]:
valid_left_ratio  = 1 - df_26_EYE['ET_ValidityLeft'].mean()
valid_right_ratio = 1 - df_26_EYE['ET_ValidityRight'].mean()

In [None]:
valid_left_ratio

In [None]:
valid_right_ratio

In [None]:
df_26_EYE.head()

In [None]:
# Histogram + KDE for each numeric column; UnixTime is left out of the grid
numeric_cols = df_26_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Four panels per row; ceiling division gives the number of rows required
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))
for slot, name in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, slot)
    sns.histplot(df_26_EYE[name], kde=True, ax=ax)
    ax.set_title(f'Distribution of {name}')
    ax.set_xlabel(name)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
df_26_EYE.columns

In [None]:
cols = ['ET_GazeLeftx', 'ET_GazeLefty',
       'ET_GazeRightx', 'ET_GazeRighty', 'ET_TimeSignal', 'ET_DistanceLeft',
       'ET_DistanceRight', 'ET_CameraLeftX', 'ET_CameraLeftY',
       'ET_CameraRightX', 'ET_CameraRightY', 'ET_ValidityLeft',
       'ET_ValidityRight']

In [None]:
from IPython.display import display, Markdown

# One full-size line plot per column in `cols`, each against Timestamp.
for col in cols:
    # Render a Markdown heading in the cell output so each figure is labelled.
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_26_EYE['Timestamp'], df_26_EYE[col])
    ax.set_xlabel("Timestamp")
    ax.set_ylabel(col)
    plt.show()

In [None]:
# Select every numeric column for the boxplots.
# NOTE(review): unlike the histogram cells, nothing is filtered out here, so a
# numeric UnixTime column would get a boxplot too — confirm that is intended.
numeric_cols = df_26_EYE.select_dtypes(include=np.number).columns

# Calculate the number of rows and columns for the grid
n_cols = 4  # You can adjust the number of columns as needed
n_rows = (len(numeric_cols) + n_cols - 1) // n_cols

plt.figure(figsize=(n_cols * 5, n_rows * 4)) # Adjust figure size as needed

# One boxplot per numeric column; subplot positions are 1-based, hence i + 1.
for i, col in enumerate(numeric_cols):
    plt.subplot(n_rows, n_cols, i + 1)
    sns.boxplot(df_26_EYE[col])
    plt.title(f'Boxplot of {col}')
    plt.xlabel(col)

plt.tight_layout()
plt.show()

In [None]:
df_26_EYE.replace({-1: np.nan}, inplace=True)

In [None]:
# Mean-impute every numeric column of df_26_EYE.
numeric_cols = df_26_EYE.select_dtypes(include=np.number).columns

for col in numeric_cols:
    # Assign the filled Series back instead of calling fillna(inplace=True) on
    # df[col]: the chained in-place form operates on a temporary object under
    # pandas Copy-on-Write and can leave the DataFrame unchanged.
    df_26_EYE[col] = df_26_EYE[col].fillna(df_26_EYE[col].mean())

In [None]:
df_26_EYE.head()

In [None]:
# Histogram + KDE for every numeric column except UnixTime, laid out on a grid.
numeric_cols = df_26_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [col for col in numeric_cols if col != 'UnixTime']

# Grid shape: fixed width, with just enough rows to hold every plotted column.
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)  # ceiling division

plt.figure(figsize=(n_cols * 5, n_rows * 4))

for slot, col in enumerate(cols_to_plot, start=1):
    ax = plt.subplot(n_rows, n_cols, slot)
    sns.histplot(df_26_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

# Eye Tracking Data Analysis

This notebook performs exploratory data analysis and cleaning on eye-tracking data.

## Data Loading and Initial Inspection

In [None]:
%load_ext cudf

Import necessary libraries for data manipulation, analysis, and visualization.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Import libraries for datashader, which can be used for visualizing large datasets.

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

Set pandas display options to show all columns.

In [None]:
pd.set_option('display.max_columns', None)

Load the eye-tracking data from the specified CSV file into a pandas DataFrame.

In [None]:
df_26_EYE = pd.read_csv('data/STData/26/26_EYE.csv')

Display the first few rows of the DataFrame to get an initial look at the data structure and content.

In [None]:
df_26_EYE.head()

Check the dimensions (number of rows and columns) of the DataFrame.

In [None]:
df_26_EYE.shape

List the names of all columns in the DataFrame.

In [None]:
df_26_EYE.columns

Display concise information about the DataFrame, including the data types of each column and the number of non-null values. This helps identify columns with missing data.

In [None]:
df_26_EYE.info()

Calculate and display the number of missing values in each column.

In [None]:
df_26_EYE.isnull().sum()

Visualize the distribution of missing values using a heatmap. This provides a visual representation of which columns have missing data and the extent of missingness.

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_26_EYE.isnull(), cmap='viridis')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` column.
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.
- These missing values in `QuestionKey` require additional investigation and context-aware handling.

Display the unique values in the `QuestionKey` column to understand the different types of questions or states recorded.

In [None]:
df_26_EYE['QuestionKey'].unique()

Convert the `Timestamp` column to datetime objects. This is crucial for any time series analysis or manipulation.

In [None]:
df_26_EYE['Timestamp'] = pd.to_datetime(df_26_EYE['Timestamp'])

Display the first few rows again to see the effect of the timestamp conversion.

In [None]:
df_26_EYE.head(3)

Fill the missing values in the `QuestionKey` column with the string 'None'. This helps to explicitly mark periods where no question was active.

In [None]:
# Assign the result back rather than calling fillna(inplace=True) on the
# selected column: the chained in-place form is deprecated and has no effect
# on the DataFrame under pandas Copy-on-Write.
df_26_EYE['QuestionKey'] = df_26_EYE['QuestionKey'].fillna('None')

Count the occurrences of each unique value in the `QuestionKey` column after filling missing values.

In [None]:
df_26_EYE['QuestionKey'].value_counts()

Display the heatmap of missing values again to confirm that the missing values in `QuestionKey` have been handled.

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_26_EYE.isnull(), cmap='viridis')
plt.show()

Recalculate and display the number of missing values per column to confirm the changes after handling `QuestionKey`.

In [None]:
df_26_EYE.isnull().sum()

Drop rows that still contain any missing values. This is done after handling `QuestionKey` separately.

In [None]:
df_26_EYE.dropna(inplace=True)

Display the first few rows after dropping rows with missing values.

In [None]:
df_26_EYE.head()

Display the unique values in the `Row` column.

In [None]:
df_26_EYE['Row'].unique()

Plot a histogram of the `Row` column to visualize its distribution.

In [None]:
plt.figure(figsize=(8,6))
sns.histplot(df_26_EYE['Row'])
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

Drop the `Row` column as it is not relevant for the analysis.

In [None]:
df_26_EYE.drop('Row', axis=1, inplace=True)

Display the unique values in the `ET_ValidityLeft` column.

In [None]:
df_26_EYE['ET_ValidityLeft'].unique()

Count the occurrences of each unique value in the `ET_ValidityLeft` column.

In [None]:
df_26_EYE['ET_ValidityLeft'].value_counts()

Display the unique values in the `ET_ValidityRight` column.

In [None]:
df_26_EYE['ET_ValidityRight'].unique()

Count the occurrences of each unique value in the `ET_ValidityRight` column.

In [None]:
df_26_EYE['ET_ValidityRight'].value_counts()

Visualize the counts of valid and invalid data for both left and right eyes using bar plots.

In [None]:
# Bar charts of how often each validity code occurs, one panel per eye.
plt.figure(figsize=(12, 6))

for panel, validity_col in enumerate(('ET_ValidityLeft', 'ET_ValidityRight'), start=1):
    code_counts = df_26_EYE[validity_col].value_counts()
    plt.subplot(1, 2, panel)
    sns.barplot(x=code_counts.index, y=code_counts.values)
    plt.title(f'Count of {validity_col}')
    plt.xlabel('Validity')
    plt.ylabel('Count')

plt.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

Define a mapping to convert validity values from `0.0` and `4.0` to `0` and `1`.

In [None]:
validity_map = {4.0: 1.0, 0.0: 0.0}

Apply the mapping to the `ET_ValidityLeft` and `ET_ValidityRight` columns and convert the data type to integer.

In [None]:
# Recode both validity columns through validity_map and store them as int8.
for validity_col in ('ET_ValidityLeft', 'ET_ValidityRight'):
    recoded = df_26_EYE[validity_col].map(validity_map)
    df_26_EYE[validity_col] = recoded.astype(np.int8)

Display the first few rows to see the updated validity columns.

In [None]:
df_26_EYE.head(3)

Display descriptive statistics for the DataFrame, including count, mean, standard deviation, min, max, and quartiles for numeric columns.

In [None]:
df_26_EYE.describe()

Check the number of rows where `ET_ValidityLeft` is 1 (invalid data for the left eye).

In [None]:
df_26_EYE[df_26_EYE['ET_ValidityLeft'] == 1].shape

Check the number of rows where `ET_ValidityRight` is 1 (invalid data for the right eye).

In [None]:
df_26_EYE[df_26_EYE['ET_ValidityRight'] == 1].shape

Visualize the distribution of -1 and 1 values across columns using heatmaps. This helps identify columns with placeholder or indicator values.

In [None]:
# Two boolean-mask heatmaps of the frame: cells equal to -1, then cells equal to 1.
plt.figure(figsize=(18, 8))

for panel, flagged_value in enumerate((-1, 1), start=1):
    ax = plt.subplot(1, 2, panel)
    sns.heatmap(df_26_EYE == flagged_value, cmap='viridis', ax=ax)
    ax.set_title(f'Heatmap of {flagged_value} Values')

plt.tight_layout()
plt.show()

Check the number of rows where `ET_PupilLeft` is -1.

In [None]:
df_26_EYE[df_26_EYE['ET_PupilLeft'] == -1].shape

Check the number of rows where `ET_PupilRight` is -1.

In [None]:
df_26_EYE[df_26_EYE['ET_PupilRight'] == -1].shape

Calculate the ratio of rows with -1 values in the `ET_PupilLeft` column to the total number of rows.

In [None]:
df_26_EYE[df_26_EYE['ET_PupilLeft'] == -1].shape[0] / df_26_EYE.shape[0]

Calculate the ratio of rows with -1 values in the `ET_PupilRight` column to the total number of rows.

In [None]:
df_26_EYE[df_26_EYE['ET_PupilRight'] == -1].shape[0] / df_26_EYE.shape[0]

Visualize the distribution of -1 values specifically for rows where validity is 1 (invalid data).

In [None]:
# Where do -1 values sit within the rows flagged invalid (validity == 1)?
# Left panel: rows with an invalid left eye; right panel: rows with an invalid right eye.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_26_EYE[df_26_EYE['ET_ValidityLeft'] == 1] == -1, cmap='viridis')
# Titles now name the row filter; previously both panels had the same title.
plt.title('Heatmap of -1 Values (ET_ValidityLeft == 1)')

plt.subplot(1, 2, 2)
sns.heatmap(df_26_EYE[df_26_EYE['ET_ValidityRight'] == 1] == -1, cmap='viridis')
plt.title('Heatmap of -1 Values (ET_ValidityRight == 1)')

plt.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

Drop the `ET_PupilLeft` and `ET_PupilRight` columns due to the high percentage of invalid data.

In [None]:
df_26_EYE.drop(['ET_PupilLeft', 'ET_PupilRight'], axis=1, inplace=True)

Display the first few rows after dropping the pupil columns.

In [None]:
df_26_EYE.head()

Visualize the distribution of -1 and 1 values again after dropping the pupil columns.

In [None]:
# Side-by-side boolean masks of the frame: cells equal to -1 (left panel)
# and cells equal to 1 (right panel).
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_26_EYE == -1, cmap='viridis')
plt.title('Heatmap of -1 Values')

plt.subplot(1, 2, 2)
sns.heatmap(df_26_EYE == 1, cmap='viridis')
# Fixed: this panel masks on == 1, so it must not reuse the "-1" title.
plt.title('Heatmap of 1 Values')

plt.tight_layout()
plt.show()

Calculate the ratio of valid data points for the left eye.

In [None]:
valid_left_ratio  = 1 - df_26_EYE['ET_ValidityLeft'].mean()

Display the ratio of valid data points for the left eye.

In [None]:
valid_left_ratio

Calculate the ratio of valid data points for the right eye.

In [None]:
valid_right_ratio = 1 - df_26_EYE['ET_ValidityRight'].mean()

Display the ratio of valid data points for the right eye.

In [None]:
valid_right_ratio

Display the first few rows of the DataFrame.

In [None]:
df_26_EYE.head()

Visualize the distributions of numeric columns using histograms with Kernel Density Estimate (KDE) plots. This helps understand the shape and spread of the data in each numeric column.

In [None]:
# Histogram + KDE for every numeric column except UnixTime, laid out on a grid.
numeric_cols = df_26_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [col for col in numeric_cols if col != 'UnixTime']

# Grid shape: fixed width, with just enough rows to hold every plotted column.
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)  # ceiling division

plt.figure(figsize=(n_cols * 5, n_rows * 4))

for slot, col in enumerate(cols_to_plot, start=1):
    ax = plt.subplot(n_rows, n_cols, slot)
    sns.histplot(df_26_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

List the names of the columns in the DataFrame.

In [None]:
df_26_EYE.columns

Define a list of columns to be plotted over time.

In [None]:
cols = ['ET_GazeLeftx', 'ET_GazeLefty',
       'ET_GazeRightx', 'ET_GazeRighty', 'ET_TimeSignal', 'ET_DistanceLeft',
       'ET_DistanceRight', 'ET_CameraLeftX', 'ET_CameraLeftY',
       'ET_CameraRightX', 'ET_CameraRightY', 'ET_ValidityLeft',
       'ET_ValidityRight']

Plot each of the selected columns against the timestamp to visualize their trends and patterns over time. A Markdown heading is rendered in the output before each plot for better readability.

In [None]:
from IPython.display import display, Markdown

# One full-size line plot per column in `cols`, each against Timestamp.
for col in cols:
    # Render a Markdown heading in the cell output so each figure is labelled.
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_26_EYE['Timestamp'], df_26_EYE[col])
    ax.set_xlabel("Timestamp")
    ax.set_ylabel(col)
    plt.show()

Visualize the distributions of numeric columns using boxplots. This helps identify potential outliers and the spread of the data.

In [None]:
# Select every numeric column for the boxplots.
# NOTE(review): unlike the histogram cells, nothing is filtered out here, so a
# numeric UnixTime column would get a boxplot too — confirm that is intended.
numeric_cols = df_26_EYE.select_dtypes(include=np.number).columns

# Calculate the number of rows and columns for the grid
n_cols = 4  # You can adjust the number of columns as needed
n_rows = (len(numeric_cols) + n_cols - 1) // n_cols

plt.figure(figsize=(n_cols * 5, n_rows * 4)) # Adjust figure size as needed

# One boxplot per numeric column; subplot positions are 1-based, hence i + 1.
for i, col in enumerate(numeric_cols):
    plt.subplot(n_rows, n_cols, i + 1)
    sns.boxplot(df_26_EYE[col])
    plt.title(f'Boxplot of {col}')
    plt.xlabel(col)

plt.tight_layout()
plt.show()

Replace the placeholder value -1 with NaN (Not a Number) in the DataFrame. This is done before imputation.

In [None]:
df_26_EYE.replace({-1: np.nan}, inplace=True)

Impute the missing values (NaN) in the numeric columns with the mean of each column.

In [None]:
# Mean-impute every numeric column of df_26_EYE.
numeric_cols = df_26_EYE.select_dtypes(include=np.number).columns

for col in numeric_cols:
    # Assign the filled Series back instead of calling fillna(inplace=True) on
    # df[col]: the chained in-place form operates on a temporary object under
    # pandas Copy-on-Write and can leave the DataFrame unchanged.
    df_26_EYE[col] = df_26_EYE[col].fillna(df_26_EYE[col].mean())

Display the first few rows after imputing missing values.

In [None]:
df_26_EYE.head()

Visualize the distributions of numeric columns again using histograms with KDE plots after imputation. This helps assess the impact of imputation on the data distributions.

In [None]:
# Histogram + KDE for every numeric column except UnixTime, laid out on a grid.
numeric_cols = df_26_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [col for col in numeric_cols if col != 'UnixTime']

# Grid shape: fixed width, with just enough rows to hold every plotted column.
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)  # ceiling division

plt.figure(figsize=(n_cols * 5, n_rows * 4))

for slot, col in enumerate(cols_to_plot, start=1):
    ax = plt.subplot(n_rows, n_cols, slot)
    sns.histplot(df_26_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

## Student 27

In [None]:
%load_ext cudf

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
df_27_EYE = pd.read_csv('data/STData/27/27_EYE.csv')

In [None]:
df_27_EYE.head()

In [None]:
df_27_EYE.shape

In [None]:
df_27_EYE.columns

In [None]:
df_27_EYE.info()

In [None]:
df_27_EYE.isnull().sum()

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_27_EYE.isnull(), cmap='viridis')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` column.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  

In [None]:
df_27_EYE['QuestionKey'].unique()

In [None]:
df_27_EYE['Timestamp'] = pd.to_datetime(df_27_EYE['Timestamp'])

In [None]:
df_27_EYE.head(3)

In [None]:
# Assign the result back rather than calling fillna(inplace=True) on the
# selected column: the chained in-place form is deprecated and has no effect
# on the DataFrame under pandas Copy-on-Write.
df_27_EYE['QuestionKey'] = df_27_EYE['QuestionKey'].fillna('None')

In [None]:
df_27_EYE['QuestionKey'].value_counts()

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_27_EYE.isnull(), cmap='viridis')
plt.show()

In [None]:
df_27_EYE.isnull().sum()

In [None]:
df_27_EYE.dropna(inplace=True)

In [None]:
df_27_EYE.head()

In [None]:
df_27_EYE['Row'].unique()

In [None]:
plt.figure(figsize=(8,6))
sns.histplot(df_27_EYE['Row'])
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

In [None]:
df_27_EYE.drop('Row', axis=1, inplace=True)

In [None]:
df_27_EYE['ET_ValidityLeft'].unique()

In [None]:
df_27_EYE['ET_ValidityLeft'].value_counts()

In [None]:
df_27_EYE['ET_ValidityRight'].unique()

In [None]:
df_27_EYE['ET_ValidityRight'].value_counts()

In [None]:
# Bar charts of how often each validity code occurs, one panel per eye.
plt.figure(figsize=(12, 6))

for panel, validity_col in enumerate(('ET_ValidityLeft', 'ET_ValidityRight'), start=1):
    code_counts = df_27_EYE[validity_col].value_counts()
    plt.subplot(1, 2, panel)
    sns.barplot(x=code_counts.index, y=code_counts.values)
    plt.title(f'Count of {validity_col}')
    plt.xlabel('Validity')
    plt.ylabel('Count')

plt.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

In [None]:
validity_map = {4.0: 1.0, 0.0: 0.0}

In [None]:
# Recode both validity columns through validity_map and store them as int8.
for validity_col in ('ET_ValidityLeft', 'ET_ValidityRight'):
    recoded = df_27_EYE[validity_col].map(validity_map)
    df_27_EYE[validity_col] = recoded.astype(np.int8)

In [None]:
df_27_EYE.head(3)

In [None]:
df_27_EYE.describe()

In [None]:
df_27_EYE[df_27_EYE['ET_ValidityLeft'] == 1].shape

In [None]:
df_27_EYE[df_27_EYE['ET_ValidityRight'] == 1].shape

In [None]:
# Two boolean-mask heatmaps of the frame: cells equal to -1, then cells equal to 1.
plt.figure(figsize=(18, 8))

for panel, flagged_value in enumerate((-1, 1), start=1):
    ax = plt.subplot(1, 2, panel)
    sns.heatmap(df_27_EYE == flagged_value, cmap='viridis', ax=ax)
    ax.set_title(f'Heatmap of {flagged_value} Values')

plt.tight_layout()
plt.show()

In [None]:
df_27_EYE[df_27_EYE['ET_PupilLeft'] == -1].shape

In [None]:
df_27_EYE[df_27_EYE['ET_PupilRight'] == -1].shape

In [None]:
df_27_EYE[df_27_EYE['ET_PupilLeft'] == -1].shape[0] / df_27_EYE.shape[0]

In [None]:
df_27_EYE[df_27_EYE['ET_PupilRight'] == -1].shape[0] / df_27_EYE.shape[0]

In [None]:
# Where do -1 values sit within the rows flagged invalid (validity == 1)?
# Left panel: rows with an invalid left eye; right panel: rows with an invalid right eye.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_27_EYE[df_27_EYE['ET_ValidityLeft'] == 1] == -1, cmap='viridis')
# Titles now name the row filter; previously both panels had the same title.
plt.title('Heatmap of -1 Values (ET_ValidityLeft == 1)')

plt.subplot(1, 2, 2)
sns.heatmap(df_27_EYE[df_27_EYE['ET_ValidityRight'] == 1] == -1, cmap='viridis')
plt.title('Heatmap of -1 Values (ET_ValidityRight == 1)')

plt.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

In [None]:
df_27_EYE.drop(['ET_PupilLeft', 'ET_PupilRight'], axis=1, inplace=True)

In [None]:
df_27_EYE.head()

In [None]:
# Side-by-side boolean masks of the frame: cells equal to -1 (left panel)
# and cells equal to 1 (right panel).
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_27_EYE == -1, cmap='viridis')
plt.title('Heatmap of -1 Values')

plt.subplot(1, 2, 2)
sns.heatmap(df_27_EYE == 1, cmap='viridis')
# Fixed: this panel masks on == 1, so it must not reuse the "-1" title.
plt.title('Heatmap of 1 Values')

plt.tight_layout()
plt.show()

In [None]:
valid_left_ratio  = 1 - df_27_EYE['ET_ValidityLeft'].mean()
valid_right_ratio = 1 - df_27_EYE['ET_ValidityRight'].mean()

In [None]:
valid_left_ratio

In [None]:
valid_right_ratio

In [None]:
df_27_EYE.head()

In [None]:
# Histogram + KDE for every numeric column except UnixTime, laid out on a grid.
numeric_cols = df_27_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [col for col in numeric_cols if col != 'UnixTime']

# Grid shape: fixed width, with just enough rows to hold every plotted column.
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)  # ceiling division

plt.figure(figsize=(n_cols * 5, n_rows * 4))

for slot, col in enumerate(cols_to_plot, start=1):
    ax = plt.subplot(n_rows, n_cols, slot)
    sns.histplot(df_27_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
df_27_EYE.columns

In [None]:
cols = ['ET_GazeLeftx', 'ET_GazeLefty',
       'ET_GazeRightx', 'ET_GazeRighty', 'ET_TimeSignal', 'ET_DistanceLeft',
       'ET_DistanceRight', 'ET_CameraLeftX', 'ET_CameraLeftY',
       'ET_CameraRightX', 'ET_CameraRightY', 'ET_ValidityLeft',
       'ET_ValidityRight']

In [None]:
from IPython.display import display, Markdown

# One full-size line plot per column in `cols`, each against Timestamp.
for col in cols:
    # Render a Markdown heading in the cell output so each figure is labelled.
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_27_EYE['Timestamp'], df_27_EYE[col])
    ax.set_xlabel("Timestamp")
    ax.set_ylabel(col)
    plt.show()

In [None]:
# Select every numeric column for the boxplots.
# NOTE(review): unlike the histogram cells, nothing is filtered out here, so a
# numeric UnixTime column would get a boxplot too — confirm that is intended.
numeric_cols = df_27_EYE.select_dtypes(include=np.number).columns

# Calculate the number of rows and columns for the grid
n_cols = 4  # You can adjust the number of columns as needed
n_rows = (len(numeric_cols) + n_cols - 1) // n_cols

plt.figure(figsize=(n_cols * 5, n_rows * 4)) # Adjust figure size as needed

# One boxplot per numeric column; subplot positions are 1-based, hence i + 1.
for i, col in enumerate(numeric_cols):
    plt.subplot(n_rows, n_cols, i + 1)
    sns.boxplot(df_27_EYE[col])
    plt.title(f'Boxplot of {col}')
    plt.xlabel(col)

plt.tight_layout()
plt.show()

In [None]:
df_27_EYE.replace({-1: np.nan}, inplace=True)

In [None]:
# Mean-impute every numeric column of df_27_EYE.
numeric_cols = df_27_EYE.select_dtypes(include=np.number).columns

for col in numeric_cols:
    # Assign the filled Series back instead of calling fillna(inplace=True) on
    # df[col]: the chained in-place form operates on a temporary object under
    # pandas Copy-on-Write and can leave the DataFrame unchanged.
    df_27_EYE[col] = df_27_EYE[col].fillna(df_27_EYE[col].mean())

In [None]:
df_27_EYE.head()

In [None]:
# Histogram + KDE for every numeric column except UnixTime, laid out on a grid.
numeric_cols = df_27_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [col for col in numeric_cols if col != 'UnixTime']

# Grid shape: fixed width, with just enough rows to hold every plotted column.
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)  # ceiling division

plt.figure(figsize=(n_cols * 5, n_rows * 4))

for slot, col in enumerate(cols_to_plot, start=1):
    ax = plt.subplot(n_rows, n_cols, slot)
    sns.histplot(df_27_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

# Eye Tracking Data Analysis

This notebook performs exploratory data analysis and cleaning on eye-tracking data.

## Data Loading and Initial Inspection

In [None]:
%load_ext cudf

Import necessary libraries for data manipulation, analysis, and visualization.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Import libraries for datashader, which can be used for visualizing large datasets.

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

Set pandas display options to show all columns.

In [None]:
pd.set_option('display.max_columns', None)

Load the eye-tracking data from the specified CSV file into a pandas DataFrame.

In [None]:
df_27_EYE = pd.read_csv('data/STData/27/27_EYE.csv')

Display the first few rows of the DataFrame to get an initial look at the data structure and content.

In [None]:
df_27_EYE.head()

Check the dimensions (number of rows and columns) of the DataFrame.

In [None]:
df_27_EYE.shape

List the names of all columns in the DataFrame.

In [None]:
df_27_EYE.columns

Display concise information about the DataFrame, including the data types of each column and the number of non-null values. This helps identify columns with missing data.

In [None]:
df_27_EYE.info()

Calculate and display the number of missing values in each column.

In [None]:
df_27_EYE.isnull().sum()

Visualize the distribution of missing values using a heatmap. This provides a visual representation of which columns have missing data and the extent of missingness.

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_27_EYE.isnull(), cmap='viridis')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` column.
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.
- These missing values in `QuestionKey` require additional investigation and context-aware handling.

Display the unique values in the `QuestionKey` column to understand the different types of questions or states recorded.

In [None]:
df_27_EYE['QuestionKey'].unique()

Convert the `Timestamp` column to datetime objects. This is crucial for any time series analysis or manipulation.

In [None]:
df_27_EYE['Timestamp'] = pd.to_datetime(df_27_EYE['Timestamp'])

Display the first few rows again to see the effect of the timestamp conversion.

In [None]:
df_27_EYE.head(3)

Fill the missing values in the `QuestionKey` column with the string 'None'. This helps to explicitly mark periods where no question was active.

In [None]:
# Assign the result back rather than calling fillna(inplace=True) on the
# selected column: the chained in-place form is deprecated and has no effect
# on the DataFrame under pandas Copy-on-Write.
df_27_EYE['QuestionKey'] = df_27_EYE['QuestionKey'].fillna('None')

Count the occurrences of each unique value in the `QuestionKey` column after filling missing values.

In [None]:
df_27_EYE['QuestionKey'].value_counts()

Display the heatmap of missing values again to confirm that the missing values in `QuestionKey` have been handled.

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_27_EYE.isnull(), cmap='viridis')
plt.show()

Recalculate and display the number of missing values per column to confirm the changes after handling `QuestionKey`.

In [None]:
df_27_EYE.isnull().sum()

Drop rows that still contain any missing values. This is done after handling `QuestionKey` separately.

In [None]:
df_27_EYE.dropna(inplace=True)

Display the first few rows after dropping rows with missing values.

In [None]:
df_27_EYE.head()

Display the unique values in the `Row` column.

In [None]:
df_27_EYE['Row'].unique()

Plot a histogram of the `Row` column to visualize its distribution.

In [None]:
plt.figure(figsize=(8,6))
sns.histplot(df_27_EYE['Row'])
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

Drop the `Row` column as it is not relevant for the analysis.

In [None]:
df_27_EYE.drop('Row', axis=1, inplace=True)

Display the unique values in the `ET_ValidityLeft` column.

In [None]:
df_27_EYE['ET_ValidityLeft'].unique()

Count the occurrences of each unique value in the `ET_ValidityLeft` column.

In [None]:
df_27_EYE['ET_ValidityLeft'].value_counts()

Display the unique values in the `ET_ValidityRight` column.

In [None]:
df_27_EYE['ET_ValidityRight'].unique()

Count the occurrences of each unique value in the `ET_ValidityRight` column.

In [None]:
df_27_EYE['ET_ValidityRight'].value_counts()

Visualize the counts of valid and invalid data for both left and right eyes using bar plots.

In [None]:
# Bar charts of how often each validity code occurs, one panel per eye.
plt.figure(figsize=(12, 6))

for panel, validity_col in enumerate(('ET_ValidityLeft', 'ET_ValidityRight'), start=1):
    code_counts = df_27_EYE[validity_col].value_counts()
    plt.subplot(1, 2, panel)
    sns.barplot(x=code_counts.index, y=code_counts.values)
    plt.title(f'Count of {validity_col}')
    plt.xlabel('Validity')
    plt.ylabel('Count')

plt.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

Define a mapping to convert validity values from `0.0` and `4.0` to `0` and `1`.

In [None]:
validity_map = {4.0: 1.0, 0.0: 0.0}

Apply the mapping to the `ET_ValidityLeft` and `ET_ValidityRight` columns and convert the data type to integer.

In [None]:
# Recode both validity columns through validity_map and store them as int8.
for validity_col in ('ET_ValidityLeft', 'ET_ValidityRight'):
    recoded = df_27_EYE[validity_col].map(validity_map)
    df_27_EYE[validity_col] = recoded.astype(np.int8)

Display the first few rows to see the updated validity columns.

In [None]:
df_27_EYE.head(3)

Display descriptive statistics for the DataFrame, including count, mean, standard deviation, min, max, and quartiles for numeric columns.

In [None]:
df_27_EYE.describe()

Check the number of rows where `ET_ValidityLeft` is 1 (invalid data for the left eye).

In [None]:
df_27_EYE[df_27_EYE['ET_ValidityLeft'] == 1].shape

Check the number of rows where `ET_ValidityRight` is 1 (invalid data for the right eye).

In [None]:
df_27_EYE[df_27_EYE['ET_ValidityRight'] == 1].shape

Visualize the distribution of -1 and 1 values across columns using heatmaps. This helps identify columns with placeholder or indicator values.

In [None]:
# Side-by-side boolean masks: cells equal to -1 (left panel) and to 1 (right panel).
plt.figure(figsize=(18, 8))

for slot, marker in enumerate((-1, 1), start=1):
    plt.subplot(1, 2, slot)
    sns.heatmap(df_27_EYE == marker, cmap='viridis')
    plt.title(f'Heatmap of {marker} Values')

plt.tight_layout()
plt.show()

Check the number of rows where `ET_PupilLeft` is -1.

In [None]:
df_27_EYE[df_27_EYE['ET_PupilLeft'] == -1].shape

Check the number of rows where `ET_PupilRight` is -1.

In [None]:
df_27_EYE[df_27_EYE['ET_PupilRight'] == -1].shape

Calculate the ratio of rows with -1 values in the `ET_PupilLeft` column to the total number of rows.

In [None]:
df_27_EYE[df_27_EYE['ET_PupilLeft'] == -1].shape[0] / df_27_EYE.shape[0]

Calculate the ratio of rows with -1 values in the `ET_PupilRight` column to the total number of rows.

In [None]:
df_27_EYE[df_27_EYE['ET_PupilRight'] == -1].shape[0] / df_27_EYE.shape[0]

Visualize the distribution of -1 values specifically for rows where validity is 1 (invalid data).

In [None]:
# Among the rows flagged invalid (validity == 1) for each eye, show which cells
# hold the -1 placeholder. The two panels filter on different eyes, so each
# title names its filter to keep the panels distinguishable.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_27_EYE[df_27_EYE['ET_ValidityLeft'] == 1] == -1, cmap='viridis')
plt.title('Heatmap of -1 Values (ET_ValidityLeft == 1)')

plt.subplot(1, 2, 2)
sns.heatmap(df_27_EYE[df_27_EYE['ET_ValidityRight'] == 1] == -1, cmap='viridis')
plt.title('Heatmap of -1 Values (ET_ValidityRight == 1)')

plt.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

Drop the `ET_PupilLeft` and `ET_PupilRight` columns due to the high percentage of invalid data.

In [None]:
df_27_EYE.drop(['ET_PupilLeft', 'ET_PupilRight'], axis=1, inplace=True)

Display the first few rows after dropping the pupil columns.

In [None]:
df_27_EYE.head()

Visualize the distribution of -1 and 1 values again after dropping the pupil columns.

In [None]:
# Re-draw the -1 / 1 masks now that the pupil columns are gone.
plt.figure(figsize=(18, 8))

# Left panel: cells equal to the -1 placeholder.
plt.subplot(1, 2, 1)
sns.heatmap(df_27_EYE == -1, cmap='viridis')
plt.title('Heatmap of -1 Values')

# Right panel: cells equal to 1 (the title previously mislabeled this as -1).
plt.subplot(1, 2, 2)
sns.heatmap(df_27_EYE == 1, cmap='viridis')
plt.title('Heatmap of 1 Values')

plt.tight_layout()
plt.show()

Calculate the ratio of valid data points for the left eye.

In [None]:
valid_left_ratio  = 1 - df_27_EYE['ET_ValidityLeft'].mean()

Display the ratio of valid data points for the left eye.

In [None]:
valid_left_ratio

Calculate the ratio of valid data points for the right eye.

In [None]:
valid_right_ratio = 1 - df_27_EYE['ET_ValidityRight'].mean()

Display the ratio of valid data points for the right eye.

In [None]:
valid_right_ratio

Display the first few rows of the DataFrame.

In [None]:
df_27_EYE.head()

Visualize the distributions of numeric columns using histograms with Kernel Density Estimate (KDE) plots. This helps understand the shape and spread of the data in each numeric column.

In [None]:
# Histogram + KDE for every numeric column, excluding the time-related `UnixTime`.
numeric_cols = df_27_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid layout: fixed width, with as many rows as needed (ceiling division).
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for panel, col in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, panel)
    sns.histplot(df_27_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

List the names of the columns in the DataFrame.

In [None]:
df_27_EYE.columns

Define a list of columns to be plotted over time.

In [None]:
cols = ['ET_GazeLeftx', 'ET_GazeLefty',
       'ET_GazeRightx', 'ET_GazeRighty', 'ET_TimeSignal', 'ET_DistanceLeft',
       'ET_DistanceRight', 'ET_CameraLeftX', 'ET_CameraLeftY',
       'ET_CameraRightX', 'ET_CameraRightY', 'ET_ValidityLeft',
       'ET_ValidityRight']

Plot each of the selected columns against the timestamp to visualize their trends and patterns over time. A Markdown heading is displayed before each plot for better readability.

In [None]:
from IPython.display import display, Markdown

# One full-size line chart per entry in `cols`, each preceded by a rendered
# Markdown heading so the outputs are easy to tell apart.
for col in cols:
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_27_EYE['Timestamp'], df_27_EYE[col])
    ax.set_xlabel("Timestamp")
    ax.set_ylabel(col)
    plt.show()

Visualize the distributions of numeric columns using boxplots. This helps identify potential outliers and the spread of the data.

In [None]:
# Boxplot grid over every numeric column. Unlike the histogram cells, nothing
# is filtered out here, so `UnixTime` (if numeric) gets a panel as well.
numeric_cols = df_27_EYE.select_dtypes(include=np.number).columns

# Grid layout: fixed width, with as many rows as needed (ceiling division).
n_cols = 4
n_rows = -(-len(numeric_cols) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for panel, col in enumerate(numeric_cols, start=1):
    ax = fig.add_subplot(n_rows, n_cols, panel)
    sns.boxplot(df_27_EYE[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')
    ax.set_xlabel(col)

fig.tight_layout()
plt.show()

Replace the placeholder value -1 with NaN (Not a Number) in the DataFrame. This is done before imputation.

In [None]:
df_27_EYE.replace({-1: np.nan}, inplace=True)

Impute the missing values (NaN) in the numeric columns with the mean of each column.

In [None]:
# Mean-impute every numeric column. The filled Series is assigned back to the
# DataFrame instead of calling `fillna(..., inplace=True)` on `df[col]`: that
# chained in-place form acts on an intermediate Series, emits a FutureWarning
# in pandas 2.x and leaves the DataFrame unchanged under Copy-on-Write.
numeric_cols = df_27_EYE.select_dtypes(include=np.number).columns

for col in numeric_cols:
    df_27_EYE[col] = df_27_EYE[col].fillna(df_27_EYE[col].mean())

Display the first few rows after imputing missing values.

In [None]:
df_27_EYE.head()

Visualize the distributions of numeric columns again using histograms with KDE plots after imputation. This helps assess the impact of imputation on the data distributions.

In [None]:
# Post-imputation histogram + KDE for every numeric column, excluding the
# time-related `UnixTime`.
numeric_cols = df_27_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid layout: fixed width, with as many rows as needed (ceiling division).
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for panel, col in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, panel)
    sns.histplot(df_27_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

## Student 28

In [None]:
# NOTE(review): RAPIDS documents the pandas accelerator as `%load_ext cudf.pandas`,
# loaded before pandas is first imported; confirm that plain `cudf` registers an
# IPython extension here, and that re-loading it in every section is intended.
%load_ext cudf

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
df_28_EYE = pd.read_csv('data/STData/28/28_EYE.csv')

In [None]:
df_28_EYE.head()

In [None]:
df_28_EYE.shape

In [None]:
df_28_EYE.columns

In [None]:
df_28_EYE.info()

In [None]:
df_28_EYE.isnull().sum()

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_28_EYE.isnull(), cmap='viridis')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` column.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  

In [None]:
df_28_EYE['QuestionKey'].unique()

In [None]:
df_28_EYE['Timestamp'] = pd.to_datetime(df_28_EYE['Timestamp'])

In [None]:
df_28_EYE.head(3)

In [None]:
# Label the gaps in QuestionKey with the string 'None'. The result is assigned
# back rather than using `fillna(..., inplace=True)` on `df['QuestionKey']`:
# that chained in-place form emits a FutureWarning in pandas 2.x and leaves the
# DataFrame unchanged under Copy-on-Write.
df_28_EYE['QuestionKey'] = df_28_EYE['QuestionKey'].fillna('None')

In [None]:
df_28_EYE['QuestionKey'].value_counts()

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_28_EYE.isnull(), cmap='viridis')
plt.show()

In [None]:
df_28_EYE.isnull().sum()

In [None]:
df_28_EYE.dropna(inplace=True)

In [None]:
df_28_EYE.head()

In [None]:
df_28_EYE['Row'].unique()

In [None]:
plt.figure(figsize=(8,6))
sns.histplot(df_28_EYE['Row'])
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

In [None]:
df_28_EYE.drop('Row', axis=1, inplace=True)

In [None]:
df_28_EYE['ET_ValidityLeft'].unique()

In [None]:
df_28_EYE['ET_ValidityLeft'].value_counts()

In [None]:
df_28_EYE['ET_ValidityRight'].unique()

In [None]:
df_28_EYE['ET_ValidityRight'].value_counts()

In [None]:
# One bar chart per eye showing how often each validity code occurs.
plt.figure(figsize=(12, 6))

for slot, validity_col in enumerate(['ET_ValidityLeft', 'ET_ValidityRight'], start=1):
    # Count once and reuse the result for both axes of the bar plot.
    code_counts = df_28_EYE[validity_col].value_counts()
    plt.subplot(1, 2, slot)
    sns.barplot(x=code_counts.index, y=code_counts.values)
    plt.title(f'Count of {validity_col}')
    plt.xlabel('Validity')
    plt.ylabel('Count')

plt.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

In [None]:
validity_map = {4.0: 1.0, 0.0: 0.0}

In [None]:
# Recode both validity flags through `validity_map` and store them as int8.
for eye_col in ('ET_ValidityLeft', 'ET_ValidityRight'):
    df_28_EYE[eye_col] = df_28_EYE[eye_col].map(validity_map).astype(np.int8)

In [None]:
df_28_EYE.head(3)

In [None]:
df_28_EYE.describe()

In [None]:
df_28_EYE[df_28_EYE['ET_ValidityLeft'] == 1].shape

In [None]:
df_28_EYE[df_28_EYE['ET_ValidityRight'] == 1].shape

In [None]:
# Side-by-side boolean masks: cells equal to -1 (left panel) and to 1 (right panel).
plt.figure(figsize=(18, 8))

for slot, marker in enumerate((-1, 1), start=1):
    plt.subplot(1, 2, slot)
    sns.heatmap(df_28_EYE == marker, cmap='viridis')
    plt.title(f'Heatmap of {marker} Values')

plt.tight_layout()
plt.show()

In [None]:
df_28_EYE[df_28_EYE['ET_PupilLeft'] == -1].shape

In [None]:
df_28_EYE[df_28_EYE['ET_PupilRight'] == -1].shape

In [None]:
df_28_EYE[df_28_EYE['ET_PupilLeft'] == -1].shape[0] / df_28_EYE.shape[0]

In [None]:
df_28_EYE[df_28_EYE['ET_PupilRight'] == -1].shape[0] / df_28_EYE.shape[0]

In [None]:
# Among the rows flagged invalid (validity == 1) for each eye, show which cells
# hold the -1 placeholder. The two panels filter on different eyes, so each
# title names its filter to keep the panels distinguishable.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_28_EYE[df_28_EYE['ET_ValidityLeft'] == 1] == -1, cmap='viridis')
plt.title('Heatmap of -1 Values (ET_ValidityLeft == 1)')

plt.subplot(1, 2, 2)
sns.heatmap(df_28_EYE[df_28_EYE['ET_ValidityRight'] == 1] == -1, cmap='viridis')
plt.title('Heatmap of -1 Values (ET_ValidityRight == 1)')

plt.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

In [None]:
df_28_EYE.drop(['ET_PupilLeft', 'ET_PupilRight'], axis=1, inplace=True)

In [None]:
df_28_EYE.head()

In [None]:
# Re-draw the -1 / 1 masks now that the pupil columns are gone.
plt.figure(figsize=(18, 8))

# Left panel: cells equal to the -1 placeholder.
plt.subplot(1, 2, 1)
sns.heatmap(df_28_EYE == -1, cmap='viridis')
plt.title('Heatmap of -1 Values')

# Right panel: cells equal to 1 (the title previously mislabeled this as -1).
plt.subplot(1, 2, 2)
sns.heatmap(df_28_EYE == 1, cmap='viridis')
plt.title('Heatmap of 1 Values')

plt.tight_layout()
plt.show()

In [None]:
valid_left_ratio  = 1 - df_28_EYE['ET_ValidityLeft'].mean()
valid_right_ratio = 1 - df_28_EYE['ET_ValidityRight'].mean()

In [None]:
valid_left_ratio

In [None]:
valid_right_ratio

In [None]:
df_28_EYE.head()

In [None]:
# Histogram + KDE for every numeric column, excluding the time-related `UnixTime`.
numeric_cols = df_28_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid layout: fixed width, with as many rows as needed (ceiling division).
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for panel, col in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, panel)
    sns.histplot(df_28_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
df_28_EYE.columns

In [None]:
cols = ['ET_GazeLeftx', 'ET_GazeLefty',
       'ET_GazeRightx', 'ET_GazeRighty', 'ET_TimeSignal', 'ET_DistanceLeft',
       'ET_DistanceRight', 'ET_CameraLeftX', 'ET_CameraLeftY',
       'ET_CameraRightX', 'ET_CameraRightY', 'ET_ValidityLeft',
       'ET_ValidityRight']

In [None]:
from IPython.display import display, Markdown

# One full-size line chart per entry in `cols`, each preceded by a rendered
# Markdown heading so the outputs are easy to tell apart.
for col in cols:
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_28_EYE['Timestamp'], df_28_EYE[col])
    ax.set_xlabel("Timestamp")
    ax.set_ylabel(col)
    plt.show()

In [None]:
# Boxplot grid over every numeric column. Unlike the histogram cells, nothing
# is filtered out here, so `UnixTime` (if numeric) gets a panel as well.
numeric_cols = df_28_EYE.select_dtypes(include=np.number).columns

# Grid layout: fixed width, with as many rows as needed (ceiling division).
n_cols = 4
n_rows = -(-len(numeric_cols) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for panel, col in enumerate(numeric_cols, start=1):
    ax = fig.add_subplot(n_rows, n_cols, panel)
    sns.boxplot(df_28_EYE[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')
    ax.set_xlabel(col)

fig.tight_layout()
plt.show()

In [None]:
df_28_EYE.replace({-1: np.nan}, inplace=True)

In [None]:
# Mean-impute every numeric column. The filled Series is assigned back to the
# DataFrame instead of calling `fillna(..., inplace=True)` on `df[col]`: that
# chained in-place form acts on an intermediate Series, emits a FutureWarning
# in pandas 2.x and leaves the DataFrame unchanged under Copy-on-Write.
numeric_cols = df_28_EYE.select_dtypes(include=np.number).columns

for col in numeric_cols:
    df_28_EYE[col] = df_28_EYE[col].fillna(df_28_EYE[col].mean())

In [None]:
df_28_EYE.head()

In [None]:
# Post-imputation histogram + KDE for every numeric column, excluding the
# time-related `UnixTime`.
numeric_cols = df_28_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid layout: fixed width, with as many rows as needed (ceiling division).
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for panel, col in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, panel)
    sns.histplot(df_28_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

# Eye Tracking Data Analysis

This notebook performs exploratory data analysis and cleaning on the eye-tracking data for Student 28 (`data/STData/28/28_EYE.csv`), with a short explanation before each step.

## Data Loading and Initial Inspection

In [None]:
# NOTE(review): RAPIDS documents the pandas accelerator as `%load_ext cudf.pandas`,
# loaded before pandas is first imported; confirm that plain `cudf` registers an
# IPython extension here, and that re-loading it in every section is intended.
%load_ext cudf

Import necessary libraries for data manipulation, analysis, and visualization.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Import libraries for datashader, which can be used for visualizing large datasets.

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

Set pandas display options to show all columns.

In [None]:
pd.set_option('display.max_columns', None)

Load the eye-tracking data from the specified CSV file into a pandas DataFrame.

In [None]:
df_28_EYE = pd.read_csv('data/STData/28/28_EYE.csv')

Display the first few rows of the DataFrame to get an initial look at the data structure and content.

In [None]:
df_28_EYE.head()

Check the dimensions (number of rows and columns) of the DataFrame.

In [None]:
df_28_EYE.shape

List the names of all columns in the DataFrame.

In [None]:
df_28_EYE.columns

Display concise information about the DataFrame, including the data types of each column and the number of non-null values. This helps identify columns with missing data.

In [None]:
df_28_EYE.info()

Calculate and display the number of missing values in each column.

In [None]:
df_28_EYE.isnull().sum()

Visualize the distribution of missing values using a heatmap. This provides a visual representation of which columns have missing data and the extent of missingness.

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_28_EYE.isnull(), cmap='viridis')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` column.
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.
- These missing values in `QuestionKey` require additional investigation and context-aware handling.

Display the unique values in the `QuestionKey` column to understand the different types of questions or states recorded.

In [None]:
df_28_EYE['QuestionKey'].unique()

Convert the `Timestamp` column to datetime objects. This is crucial for any time series analysis or manipulation.

In [None]:
df_28_EYE['Timestamp'] = pd.to_datetime(df_28_EYE['Timestamp'])

Display the first few rows again to see the effect of the timestamp conversion.

In [None]:
df_28_EYE.head(3)

Fill the missing values in the `QuestionKey` column with the string 'None'. This helps to explicitly mark periods where no question was active.

In [None]:
# Label the gaps in QuestionKey with the string 'None'. The result is assigned
# back rather than using `fillna(..., inplace=True)` on `df['QuestionKey']`:
# that chained in-place form emits a FutureWarning in pandas 2.x and leaves the
# DataFrame unchanged under Copy-on-Write.
df_28_EYE['QuestionKey'] = df_28_EYE['QuestionKey'].fillna('None')

Count the occurrences of each unique value in the `QuestionKey` column after filling missing values.

In [None]:
df_28_EYE['QuestionKey'].value_counts()

Display the heatmap of missing values again to confirm that the missing values in `QuestionKey` have been handled.

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_28_EYE.isnull(), cmap='viridis')
plt.show()

Recalculate and display the number of missing values per column to confirm the changes after handling `QuestionKey`.

In [None]:
df_28_EYE.isnull().sum()

Drop rows that still contain any missing values. This is done after handling `QuestionKey` separately.

In [None]:
df_28_EYE.dropna(inplace=True)

Display the first few rows after dropping rows with missing values.

In [None]:
df_28_EYE.head()

Display the unique values in the `Row` column.

In [None]:
df_28_EYE['Row'].unique()

Plot a histogram of the `Row` column to visualize its distribution.

In [None]:
plt.figure(figsize=(8,6))
sns.histplot(df_28_EYE['Row'])
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

Drop the `Row` column as it is not relevant for the analysis.

In [None]:
df_28_EYE.drop('Row', axis=1, inplace=True)

Display the unique values in the `ET_ValidityLeft` column.

In [None]:
df_28_EYE['ET_ValidityLeft'].unique()

Count the occurrences of each unique value in the `ET_ValidityLeft` column.

In [None]:
df_28_EYE['ET_ValidityLeft'].value_counts()

Display the unique values in the `ET_ValidityRight` column.

In [None]:
df_28_EYE['ET_ValidityRight'].unique()

Count the occurrences of each unique value in the `ET_ValidityRight` column.

In [None]:
df_28_EYE['ET_ValidityRight'].value_counts()

Visualize the counts of valid and invalid data for both left and right eyes using bar plots.

In [None]:
# One bar chart per eye showing how often each validity code occurs.
plt.figure(figsize=(12, 6))

for slot, validity_col in enumerate(['ET_ValidityLeft', 'ET_ValidityRight'], start=1):
    # Count once and reuse the result for both axes of the bar plot.
    code_counts = df_28_EYE[validity_col].value_counts()
    plt.subplot(1, 2, slot)
    sns.barplot(x=code_counts.index, y=code_counts.values)
    plt.title(f'Count of {validity_col}')
    plt.xlabel('Validity')
    plt.ylabel('Count')

plt.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

Define a mapping to convert validity values from `0.0` and `4.0` to `0` and `1`.

In [None]:
validity_map = {4.0: 1.0, 0.0: 0.0}

Apply the mapping to the `ET_ValidityLeft` and `ET_ValidityRight` columns and convert the data type to integer.

In [None]:
# Recode both validity flags through `validity_map` and store them as int8.
for eye_col in ('ET_ValidityLeft', 'ET_ValidityRight'):
    df_28_EYE[eye_col] = df_28_EYE[eye_col].map(validity_map).astype(np.int8)

Display the first few rows to see the updated validity columns.

In [None]:
df_28_EYE.head(3)

Display descriptive statistics for the DataFrame, including count, mean, standard deviation, min, max, and quartiles for numeric columns.

In [None]:
df_28_EYE.describe()

Check the number of rows where `ET_ValidityLeft` is 1 (invalid data for the left eye).

In [None]:
df_28_EYE[df_28_EYE['ET_ValidityLeft'] == 1].shape

Check the number of rows where `ET_ValidityRight` is 1 (invalid data for the right eye).

In [None]:
df_28_EYE[df_28_EYE['ET_ValidityRight'] == 1].shape

Visualize the distribution of -1 and 1 values across columns using heatmaps. This helps identify columns with placeholder or indicator values.

In [None]:
# Side-by-side boolean masks: cells equal to -1 (left panel) and to 1 (right panel).
plt.figure(figsize=(18, 8))

for slot, marker in enumerate((-1, 1), start=1):
    plt.subplot(1, 2, slot)
    sns.heatmap(df_28_EYE == marker, cmap='viridis')
    plt.title(f'Heatmap of {marker} Values')

plt.tight_layout()
plt.show()

Check the number of rows where `ET_PupilLeft` is -1.

In [None]:
df_28_EYE[df_28_EYE['ET_PupilLeft'] == -1].shape

Check the number of rows where `ET_PupilRight` is -1.

In [None]:
df_28_EYE[df_28_EYE['ET_PupilRight'] == -1].shape

Calculate the ratio of rows with -1 values in the `ET_PupilLeft` column to the total number of rows.

In [None]:
df_28_EYE[df_28_EYE['ET_PupilLeft'] == -1].shape[0] / df_28_EYE.shape[0]

Calculate the ratio of rows with -1 values in the `ET_PupilRight` column to the total number of rows.

In [None]:
df_28_EYE[df_28_EYE['ET_PupilRight'] == -1].shape[0] / df_28_EYE.shape[0]

Visualize the distribution of -1 values specifically for rows where validity is 1 (invalid data).

In [None]:
# Among the rows flagged invalid (validity == 1) for each eye, show which cells
# hold the -1 placeholder. The two panels filter on different eyes, so each
# title names its filter to keep the panels distinguishable.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_28_EYE[df_28_EYE['ET_ValidityLeft'] == 1] == -1, cmap='viridis')
plt.title('Heatmap of -1 Values (ET_ValidityLeft == 1)')

plt.subplot(1, 2, 2)
sns.heatmap(df_28_EYE[df_28_EYE['ET_ValidityRight'] == 1] == -1, cmap='viridis')
plt.title('Heatmap of -1 Values (ET_ValidityRight == 1)')

plt.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

Drop the `ET_PupilLeft` and `ET_PupilRight` columns due to the high percentage of invalid data.

In [None]:
df_28_EYE.drop(['ET_PupilLeft', 'ET_PupilRight'], axis=1, inplace=True)

Display the first few rows after dropping the pupil columns.

In [None]:
df_28_EYE.head()

Visualize the distribution of -1 and 1 values again after dropping the pupil columns.

In [None]:
# Re-draw the -1 / 1 masks now that the pupil columns are gone.
plt.figure(figsize=(18, 8))

# Left panel: cells equal to the -1 placeholder.
plt.subplot(1, 2, 1)
sns.heatmap(df_28_EYE == -1, cmap='viridis')
plt.title('Heatmap of -1 Values')

# Right panel: cells equal to 1 (the title previously mislabeled this as -1).
plt.subplot(1, 2, 2)
sns.heatmap(df_28_EYE == 1, cmap='viridis')
plt.title('Heatmap of 1 Values')

plt.tight_layout()
plt.show()

Calculate the ratio of valid data points for the left eye.

In [None]:
valid_left_ratio  = 1 - df_28_EYE['ET_ValidityLeft'].mean()

Display the ratio of valid data points for the left eye.

In [None]:
valid_left_ratio

Calculate the ratio of valid data points for the right eye.

In [None]:
valid_right_ratio = 1 - df_28_EYE['ET_ValidityRight'].mean()

Display the ratio of valid data points for the right eye.

In [None]:
valid_right_ratio

Display the first few rows of the DataFrame.

In [None]:
df_28_EYE.head()

Visualize the distributions of numeric columns using histograms with Kernel Density Estimate (KDE) plots. This helps understand the shape and spread of the data in each numeric column.

In [None]:
# Histogram + KDE for every numeric column, excluding the time-related `UnixTime`.
numeric_cols = df_28_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid layout: fixed width, with as many rows as needed (ceiling division).
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for panel, col in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, panel)
    sns.histplot(df_28_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

List the names of the columns in the DataFrame.

In [None]:
df_28_EYE.columns

Define a list of columns to be plotted over time.

In [None]:
cols = ['ET_GazeLeftx', 'ET_GazeLefty',
       'ET_GazeRightx', 'ET_GazeRighty', 'ET_TimeSignal', 'ET_DistanceLeft',
       'ET_DistanceRight', 'ET_CameraLeftX', 'ET_CameraLeftY',
       'ET_CameraRightX', 'ET_CameraRightY', 'ET_ValidityLeft',
       'ET_ValidityRight']

Plot each of the selected columns against the timestamp to visualize their trends and patterns over time. A Markdown heading is displayed before each plot for better readability.

In [None]:
from IPython.display import display, Markdown

# One full-size line chart per entry in `cols`, each preceded by a rendered
# Markdown heading so the outputs are easy to tell apart.
for col in cols:
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_28_EYE['Timestamp'], df_28_EYE[col])
    ax.set_xlabel("Timestamp")
    ax.set_ylabel(col)
    plt.show()

Visualize the distributions of numeric columns using boxplots. This helps identify potential outliers and the spread of the data.

In [None]:
# Boxplot grid over every numeric column. Unlike the histogram cells, nothing
# is filtered out here, so `UnixTime` (if numeric) gets a panel as well.
numeric_cols = df_28_EYE.select_dtypes(include=np.number).columns

# Grid layout: fixed width, with as many rows as needed (ceiling division).
n_cols = 4
n_rows = -(-len(numeric_cols) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for panel, col in enumerate(numeric_cols, start=1):
    ax = fig.add_subplot(n_rows, n_cols, panel)
    sns.boxplot(df_28_EYE[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')
    ax.set_xlabel(col)

fig.tight_layout()
plt.show()

Replace the placeholder value -1 with NaN (Not a Number) in the DataFrame. This is done before imputation.

In [None]:
df_28_EYE.replace({-1: np.nan}, inplace=True)

Impute the missing values (NaN) in the numeric columns with the mean of each column.

In [None]:
# Mean-impute every numeric column. The filled Series is assigned back to the
# DataFrame instead of calling `fillna(..., inplace=True)` on `df[col]`: that
# chained in-place form acts on an intermediate Series, emits a FutureWarning
# in pandas 2.x and leaves the DataFrame unchanged under Copy-on-Write.
numeric_cols = df_28_EYE.select_dtypes(include=np.number).columns

for col in numeric_cols:
    df_28_EYE[col] = df_28_EYE[col].fillna(df_28_EYE[col].mean())

Display the first few rows after imputing missing values.

In [None]:
df_28_EYE.head()

Visualize the distributions of numeric columns again using histograms with KDE plots after imputation. This helps assess the impact of imputation on the data distributions.

In [None]:
# Post-imputation histogram + KDE for every numeric column, excluding the
# time-related `UnixTime`.
numeric_cols = df_28_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid layout: fixed width, with as many rows as needed (ceiling division).
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for panel, col in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, panel)
    sns.histplot(df_28_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

## Student 29

In [None]:
# NOTE(review): RAPIDS documents the pandas accelerator as `%load_ext cudf.pandas`,
# loaded before pandas is first imported; confirm that plain `cudf` registers an
# IPython extension here, and that re-loading it in every section is intended.
%load_ext cudf

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
df_29_EYE = pd.read_csv('data/STData/29/29_EYE.csv')

In [None]:
df_29_EYE.head()

In [None]:
df_29_EYE.shape

In [None]:
df_29_EYE.columns

In [None]:
df_29_EYE.info()

In [None]:
df_29_EYE.isnull().sum()

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_29_EYE.isnull(), cmap='viridis')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` column.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  

In [None]:
df_29_EYE['QuestionKey'].unique()

In [None]:
df_29_EYE['Timestamp'] = pd.to_datetime(df_29_EYE['Timestamp'])

In [None]:
df_29_EYE.head(3)

In [None]:
# Label "no question displayed" periods explicitly. Assign the result back: an in-place
# fillna on `df[col]` acts on a temporary under pandas Copy-on-Write, and the later
# dropna would then discard every row without an active question.
df_29_EYE['QuestionKey'] = df_29_EYE['QuestionKey'].fillna('None')

In [None]:
df_29_EYE['QuestionKey'].value_counts()

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_29_EYE.isnull(), cmap='viridis')
plt.show()

In [None]:
df_29_EYE.isnull().sum()

In [None]:
df_29_EYE.dropna(inplace=True)

In [None]:
df_29_EYE.head()

In [None]:
df_29_EYE['Row'].unique()

In [None]:
plt.figure(figsize=(8,6))
sns.histplot(df_29_EYE['Row'])
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

In [None]:
df_29_EYE.drop('Row', axis=1, inplace=True)

In [None]:
df_29_EYE['ET_ValidityLeft'].unique()

In [None]:
df_29_EYE['ET_ValidityLeft'].value_counts()

In [None]:
df_29_EYE['ET_ValidityRight'].unique()

In [None]:
df_29_EYE['ET_ValidityRight'].value_counts()

In [None]:
# Side-by-side bar charts of the validity code counts, one panel per eye
plt.figure(figsize=(12, 6))

for panel, validity_col in enumerate(['ET_ValidityLeft', 'ET_ValidityRight'], start=1):
    # Count once per column and reuse the result for both bar positions and heights
    code_counts = df_29_EYE[validity_col].value_counts()
    plt.subplot(1, 2, panel)
    sns.barplot(x=code_counts.index, y=code_counts.values)
    plt.title(f'Count of {validity_col}')
    plt.xlabel('Validity')
    plt.ylabel('Count')

plt.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

In [None]:
validity_map = {4.0: 1.0, 0.0: 0.0}

In [None]:
df_29_EYE['ET_ValidityLeft'] = df_29_EYE['ET_ValidityLeft'].map(validity_map).astype(np.int8)
df_29_EYE['ET_ValidityRight'] = df_29_EYE['ET_ValidityRight'].map(validity_map).astype(np.int8)

In [None]:
df_29_EYE.head(3)

In [None]:
df_29_EYE.describe()

In [None]:
df_29_EYE[df_29_EYE['ET_ValidityLeft'] == 1].shape

In [None]:
df_29_EYE[df_29_EYE['ET_ValidityRight'] == 1].shape

In [None]:
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_29_EYE == -1, cmap='viridis')
plt.title('Heatmap of -1 Values')

plt.subplot(1, 2, 2)
sns.heatmap(df_29_EYE == 1, cmap='viridis')
plt.title('Heatmap of 1 Values')

plt.tight_layout()
plt.show()

In [None]:
df_29_EYE[df_29_EYE['ET_PupilLeft'] == -1].shape

In [None]:
df_29_EYE[df_29_EYE['ET_PupilRight'] == -1].shape

In [None]:
df_29_EYE[df_29_EYE['ET_PupilLeft'] == -1].shape[0] / df_29_EYE.shape[0]

In [None]:
df_29_EYE[df_29_EYE['ET_PupilRight'] == -1].shape[0] / df_29_EYE.shape[0]

In [None]:
# Where do -1 placeholders occur within the rows whose validity flag is 1?
# Left panel: rows filtered on ET_ValidityLeft; right panel: rows filtered on ET_ValidityRight.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_29_EYE[df_29_EYE['ET_ValidityLeft'] == 1] == -1, cmap='viridis')
# Each title names its row filter so the two panels can be told apart
plt.title('Heatmap of -1 Values (ET_ValidityLeft == 1)')

plt.subplot(1, 2, 2)
sns.heatmap(df_29_EYE[df_29_EYE['ET_ValidityRight'] == 1] == -1, cmap='viridis')
plt.title('Heatmap of -1 Values (ET_ValidityRight == 1)')

plt.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

In [None]:
df_29_EYE.drop(['ET_PupilLeft', 'ET_PupilRight'], axis=1, inplace=True)

In [None]:
df_29_EYE.head()

In [None]:
# Re-check where the values -1 (left panel) and 1 (right panel) occur
# now that the pupil columns have been dropped.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_29_EYE == -1, cmap='viridis')
plt.title('Heatmap of -1 Values')

plt.subplot(1, 2, 2)
sns.heatmap(df_29_EYE == 1, cmap='viridis')
# This panel shows cells equal to 1, so it must not reuse the "-1" title
plt.title('Heatmap of 1 Values')

plt.tight_layout()
plt.show()

In [None]:
valid_left_ratio  = 1 - df_29_EYE['ET_ValidityLeft'].mean()
valid_right_ratio = 1 - df_29_EYE['ET_ValidityRight'].mean()

In [None]:
valid_left_ratio

In [None]:
valid_right_ratio

In [None]:
df_29_EYE.head()

In [None]:
# Histogram + KDE for each numeric column except the UnixTime time axis
numeric_cols = df_29_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid layout: fixed width of 4 panels, row count is the ceiling of len / n_cols
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

plt.figure(figsize=(5 * n_cols, 4 * n_rows))

for slot, col in enumerate(cols_to_plot, start=1):
    # Draw on the axes returned by subplot instead of relying on the "current axes" state
    ax = plt.subplot(n_rows, n_cols, slot)
    sns.histplot(df_29_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
df_29_EYE.columns

In [None]:
cols = ['ET_GazeLeftx', 'ET_GazeLefty',
       'ET_GazeRightx', 'ET_GazeRighty', 'ET_TimeSignal', 'ET_DistanceLeft',
       'ET_DistanceRight', 'ET_CameraLeftX', 'ET_CameraLeftY',
       'ET_CameraRightX', 'ET_CameraRightY', 'ET_ValidityLeft',
       'ET_ValidityRight']

In [None]:
from IPython.display import display, Markdown

# One full-size line plot per entry of `cols`, each preceded by a rendered Markdown heading
timestamps = df_29_EYE['Timestamp']
for col in cols:
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(timestamps, df_29_EYE[col])
    ax.set_xlabel("Timestamp")
    ax.set_ylabel(col)
    plt.show()

In [None]:
# Boxplots for every numeric column; no column (e.g. UnixTime) is filtered out here
numeric_cols = df_29_EYE.select_dtypes(include=np.number).columns

# Grid layout: fixed width of 4 panels, row count is the ceiling of len / n_cols
n_cols = 4
n_rows = -(-len(numeric_cols) // n_cols)

plt.figure(figsize=(5 * n_cols, 4 * n_rows))

for slot, col in enumerate(numeric_cols, start=1):
    # Draw on the axes returned by subplot instead of relying on the "current axes" state
    ax = plt.subplot(n_rows, n_cols, slot)
    sns.boxplot(df_29_EYE[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')
    ax.set_xlabel(col)

plt.tight_layout()
plt.show()

In [None]:
df_29_EYE.replace({-1: np.nan}, inplace=True)

In [None]:
# Mean-impute the NaNs (former -1 placeholders) in every numeric column.
numeric_cols = df_29_EYE.select_dtypes(include=np.number).columns

for col in numeric_cols:
    # Assign the filled Series back to the frame: `df[col].fillna(..., inplace=True)`
    # operates on a temporary under pandas Copy-on-Write and would leave the NaNs in place.
    df_29_EYE[col] = df_29_EYE[col].fillna(df_29_EYE[col].mean())

In [None]:
df_29_EYE.head()

In [None]:
# Histogram + KDE for each numeric column except the UnixTime time axis
numeric_cols = df_29_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid layout: fixed width of 4 panels, row count is the ceiling of len / n_cols
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

plt.figure(figsize=(5 * n_cols, 4 * n_rows))

for slot, col in enumerate(cols_to_plot, start=1):
    # Draw on the axes returned by subplot instead of relying on the "current axes" state
    ax = plt.subplot(n_rows, n_cols, slot)
    sns.histplot(df_29_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

# Eye Tracking Data Analysis

This notebook performs exploratory data analysis and cleaning on eye-tracking data.

## Data Loading and Initial Inspection

In [None]:
%load_ext cudf

Import necessary libraries for data manipulation, analysis, and visualization.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Import libraries for datashader, which can be used for visualizing large datasets.

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

Set pandas display options to show all columns.

In [None]:
pd.set_option('display.max_columns', None)

Load the eye-tracking data from the specified CSV file into a pandas DataFrame.

In [None]:
df_29_EYE = pd.read_csv('data/STData/29/29_EYE.csv')

Display the first few rows of the DataFrame to get an initial look at the data structure and content.

In [None]:
df_29_EYE.head()

Check the dimensions (number of rows and columns) of the DataFrame.

In [None]:
df_29_EYE.shape

List the names of all columns in the DataFrame.

In [None]:
df_29_EYE.columns

Display concise information about the DataFrame, including the data types of each column and the number of non-null values. This helps identify columns with missing data.

In [None]:
df_29_EYE.info()

Calculate and display the number of missing values in each column.

In [None]:
df_29_EYE.isnull().sum()

Visualize the distribution of missing values using a heatmap. This provides a visual representation of which columns have missing data and the extent of missingness.

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_29_EYE.isnull(), cmap='viridis')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` column.
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.
- These missing values in `QuestionKey` require additional investigation and context-aware handling.

Display the unique values in the `QuestionKey` column to understand the different types of questions or states recorded.

In [None]:
df_29_EYE['QuestionKey'].unique()

Convert the `Timestamp` column to datetime objects. This is crucial for any time series analysis or manipulation.

In [None]:
df_29_EYE['Timestamp'] = pd.to_datetime(df_29_EYE['Timestamp'])

Display the first few rows again to see the effect of the timestamp conversion.

In [None]:
df_29_EYE.head(3)

Fill the missing values in the `QuestionKey` column with the string 'None'. This helps to explicitly mark periods where no question was active.

In [None]:
# Assign the result back: an in-place fillna on `df[col]` acts on a temporary under
# pandas Copy-on-Write, and the later dropna would then discard every row without
# an active question.
df_29_EYE['QuestionKey'] = df_29_EYE['QuestionKey'].fillna('None')

Count the occurrences of each unique value in the `QuestionKey` column after filling missing values.

In [None]:
df_29_EYE['QuestionKey'].value_counts()

Display the heatmap of missing values again to confirm that the missing values in `QuestionKey` have been handled.

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_29_EYE.isnull(), cmap='viridis')
plt.show()

Recalculate and display the number of missing values per column to confirm the changes after handling `QuestionKey`.

In [None]:
df_29_EYE.isnull().sum()

Drop rows that still contain any missing values. This is done after handling `QuestionKey` separately.

In [None]:
df_29_EYE.dropna(inplace=True)

Display the first few rows after dropping rows with missing values.

In [None]:
df_29_EYE.head()

Display the unique values in the `Row` column.

In [None]:
df_29_EYE['Row'].unique()

Plot a histogram of the `Row` column to visualize its distribution.

In [None]:
plt.figure(figsize=(8,6))
sns.histplot(df_29_EYE['Row'])
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

Drop the `Row` column as it is not relevant for the analysis.

In [None]:
df_29_EYE.drop('Row', axis=1, inplace=True)

Display the unique values in the `ET_ValidityLeft` column.

In [None]:
df_29_EYE['ET_ValidityLeft'].unique()

Count the occurrences of each unique value in the `ET_ValidityLeft` column.

In [None]:
df_29_EYE['ET_ValidityLeft'].value_counts()

Display the unique values in the `ET_ValidityRight` column.

In [None]:
df_29_EYE['ET_ValidityRight'].unique()

Count the occurrences of each unique value in the `ET_ValidityRight` column.

In [None]:
df_29_EYE['ET_ValidityRight'].value_counts()

Visualize the counts of valid and invalid data for both left and right eyes using bar plots.

In [None]:
# Side-by-side bar charts of the validity code counts, one panel per eye
plt.figure(figsize=(12, 6))

for panel, validity_col in enumerate(['ET_ValidityLeft', 'ET_ValidityRight'], start=1):
    # Count once per column and reuse the result for both bar positions and heights
    code_counts = df_29_EYE[validity_col].value_counts()
    plt.subplot(1, 2, panel)
    sns.barplot(x=code_counts.index, y=code_counts.values)
    plt.title(f'Count of {validity_col}')
    plt.xlabel('Validity')
    plt.ylabel('Count')

plt.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

Define a mapping to convert validity values from `0.0` and `4.0` to `0` and `1`.

In [None]:
validity_map = {4.0: 1.0, 0.0: 0.0}

Apply the mapping to the `ET_ValidityLeft` and `ET_ValidityRight` columns and convert the data type to integer.

In [None]:
df_29_EYE['ET_ValidityLeft'] = df_29_EYE['ET_ValidityLeft'].map(validity_map).astype(np.int8)
df_29_EYE['ET_ValidityRight'] = df_29_EYE['ET_ValidityRight'].map(validity_map).astype(np.int8)

Display the first few rows to see the updated validity columns.

In [None]:
df_29_EYE.head(3)

Display descriptive statistics for the DataFrame, including count, mean, standard deviation, min, max, and quartiles for numeric columns.

In [None]:
df_29_EYE.describe()

Check the number of rows where `ET_ValidityLeft` is 1 (invalid data for the left eye).

In [None]:
df_29_EYE[df_29_EYE['ET_ValidityLeft'] == 1].shape

Check the number of rows where `ET_ValidityRight` is 1 (invalid data for the right eye).

In [None]:
df_29_EYE[df_29_EYE['ET_ValidityRight'] == 1].shape

Visualize the distribution of -1 and 1 values across columns using heatmaps. This helps identify columns with placeholder or indicator values.

In [None]:
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_29_EYE == -1, cmap='viridis')
plt.title('Heatmap of -1 Values')

plt.subplot(1, 2, 2)
sns.heatmap(df_29_EYE == 1, cmap='viridis')
plt.title('Heatmap of 1 Values')

plt.tight_layout()
plt.show()

Check the number of rows where `ET_PupilLeft` is -1.

In [None]:
df_29_EYE[df_29_EYE['ET_PupilLeft'] == -1].shape

Check the number of rows where `ET_PupilRight` is -1.

In [None]:
df_29_EYE[df_29_EYE['ET_PupilRight'] == -1].shape

Calculate the ratio of rows with -1 values in the `ET_PupilLeft` column to the total number of rows.

In [None]:
df_29_EYE[df_29_EYE['ET_PupilLeft'] == -1].shape[0] / df_29_EYE.shape[0]

Calculate the ratio of rows with -1 values in the `ET_PupilRight` column to the total number of rows.

In [None]:
df_29_EYE[df_29_EYE['ET_PupilRight'] == -1].shape[0] / df_29_EYE.shape[0]

Visualize the distribution of -1 values specifically for rows where validity is 1 (invalid data).

In [None]:
# Left panel: rows filtered on ET_ValidityLeft == 1; right panel: rows filtered on
# ET_ValidityRight == 1. Highlighted cells are those equal to -1.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_29_EYE[df_29_EYE['ET_ValidityLeft'] == 1] == -1, cmap='viridis')
# Each title names its row filter so the two panels can be told apart
plt.title('Heatmap of -1 Values (ET_ValidityLeft == 1)')

plt.subplot(1, 2, 2)
sns.heatmap(df_29_EYE[df_29_EYE['ET_ValidityRight'] == 1] == -1, cmap='viridis')
plt.title('Heatmap of -1 Values (ET_ValidityRight == 1)')

plt.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

Drop the `ET_PupilLeft` and `ET_PupilRight` columns due to the high percentage of invalid data.

In [None]:
df_29_EYE.drop(['ET_PupilLeft', 'ET_PupilRight'], axis=1, inplace=True)

Display the first few rows after dropping the pupil columns.

In [None]:
df_29_EYE.head()

Visualize the distribution of -1 and 1 values again after dropping the pupil columns.

In [None]:
# Left panel highlights cells equal to -1, right panel highlights cells equal to 1.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_29_EYE == -1, cmap='viridis')
plt.title('Heatmap of -1 Values')

plt.subplot(1, 2, 2)
sns.heatmap(df_29_EYE == 1, cmap='viridis')
# This panel shows cells equal to 1, so it must not reuse the "-1" title
plt.title('Heatmap of 1 Values')

plt.tight_layout()
plt.show()

Calculate the ratio of valid data points for the left eye.

In [None]:
valid_left_ratio  = 1 - df_29_EYE['ET_ValidityLeft'].mean()

Display the ratio of valid data points for the left eye.

In [None]:
valid_left_ratio

Calculate the ratio of valid data points for the right eye.

In [None]:
valid_right_ratio = 1 - df_29_EYE['ET_ValidityRight'].mean()

Display the ratio of valid data points for the right eye.

In [None]:
valid_right_ratio

Display the first few rows of the DataFrame.

In [None]:
df_29_EYE.head()

Visualize the distributions of numeric columns using histograms with Kernel Density Estimate (KDE) plots. This helps understand the shape and spread of the data in each numeric column.

In [None]:
# Histogram + KDE for each numeric column except the UnixTime time axis
numeric_cols = df_29_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid layout: fixed width of 4 panels, row count is the ceiling of len / n_cols
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

plt.figure(figsize=(5 * n_cols, 4 * n_rows))

for slot, col in enumerate(cols_to_plot, start=1):
    # Draw on the axes returned by subplot instead of relying on the "current axes" state
    ax = plt.subplot(n_rows, n_cols, slot)
    sns.histplot(df_29_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

List the names of the columns in the DataFrame.

In [None]:
df_29_EYE.columns

Define a list of columns to be plotted over time.

In [None]:
cols = ['ET_GazeLeftx', 'ET_GazeLefty',
       'ET_GazeRightx', 'ET_GazeRighty', 'ET_TimeSignal', 'ET_DistanceLeft',
       'ET_DistanceRight', 'ET_CameraLeftX', 'ET_CameraLeftY',
       'ET_CameraRightX', 'ET_CameraRightY', 'ET_ValidityLeft',
       'ET_ValidityRight']

Plot each of the selected columns against the timestamp to visualize their trends and patterns over time. A Markdown heading is rendered before each plot for better readability.

In [None]:
from IPython.display import display, Markdown

# One full-size line plot per entry of `cols`, each preceded by a rendered Markdown heading
timestamps = df_29_EYE['Timestamp']
for col in cols:
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(timestamps, df_29_EYE[col])
    ax.set_xlabel("Timestamp")
    ax.set_ylabel(col)
    plt.show()

Visualize the distributions of numeric columns using boxplots. This helps identify potential outliers and the spread of the data.

In [None]:
# Boxplots for every numeric column; no column (e.g. UnixTime) is filtered out here
numeric_cols = df_29_EYE.select_dtypes(include=np.number).columns

# Grid layout: fixed width of 4 panels, row count is the ceiling of len / n_cols
n_cols = 4
n_rows = -(-len(numeric_cols) // n_cols)

plt.figure(figsize=(5 * n_cols, 4 * n_rows))

for slot, col in enumerate(numeric_cols, start=1):
    # Draw on the axes returned by subplot instead of relying on the "current axes" state
    ax = plt.subplot(n_rows, n_cols, slot)
    sns.boxplot(df_29_EYE[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')
    ax.set_xlabel(col)

plt.tight_layout()
plt.show()

Replace the placeholder value -1 with NaN (Not a Number) in the DataFrame. This is done before imputation.

In [None]:
df_29_EYE.replace({-1: np.nan}, inplace=True)

Impute the missing values (NaN) in the numeric columns with the mean of each column.

In [None]:
numeric_cols = df_29_EYE.select_dtypes(include=np.number).columns

for col in numeric_cols:
    # Assign the filled Series back to the frame: `df[col].fillna(..., inplace=True)`
    # operates on a temporary under pandas Copy-on-Write and would leave the NaNs in place.
    df_29_EYE[col] = df_29_EYE[col].fillna(df_29_EYE[col].mean())

Display the first few rows after imputing missing values.

In [None]:
df_29_EYE.head()

Visualize the distributions of numeric columns again using histograms with KDE plots after imputation. This helps assess the impact of imputation on the data distributions.

In [None]:
# Histogram + KDE for each numeric column except the UnixTime time axis
numeric_cols = df_29_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid layout: fixed width of 4 panels, row count is the ceiling of len / n_cols
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

plt.figure(figsize=(5 * n_cols, 4 * n_rows))

for slot, col in enumerate(cols_to_plot, start=1):
    # Draw on the axes returned by subplot instead of relying on the "current axes" state
    ax = plt.subplot(n_rows, n_cols, slot)
    sns.histplot(df_29_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

## Student 30

In [None]:
%load_ext cudf

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
df_30_EYE = pd.read_csv('data/STData/30/30_EYE.csv')

In [None]:
df_30_EYE.head()

In [None]:
df_30_EYE.shape

In [None]:
df_30_EYE.columns

In [None]:
df_30_EYE.info()

In [None]:
df_30_EYE.isnull().sum()

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_30_EYE.isnull(), cmap='viridis')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` column.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  

In [None]:
df_30_EYE['QuestionKey'].unique()

In [None]:
df_30_EYE['Timestamp'] = pd.to_datetime(df_30_EYE['Timestamp'])

In [None]:
df_30_EYE.head(3)

In [None]:
# Label "no question displayed" periods explicitly. Assign the result back: an in-place
# fillna on `df[col]` acts on a temporary under pandas Copy-on-Write, and the later
# dropna would then discard every row without an active question.
df_30_EYE['QuestionKey'] = df_30_EYE['QuestionKey'].fillna('None')

In [None]:
df_30_EYE['QuestionKey'].value_counts()

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_30_EYE.isnull(), cmap='viridis')
plt.show()

In [None]:
df_30_EYE.isnull().sum()

In [None]:
df_30_EYE.dropna(inplace=True)

In [None]:
df_30_EYE.head()

In [None]:
df_30_EYE['Row'].unique()

In [None]:
plt.figure(figsize=(8,6))
sns.histplot(df_30_EYE['Row'])
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

In [None]:
df_30_EYE.drop('Row', axis=1, inplace=True)

In [None]:
df_30_EYE['ET_ValidityLeft'].unique()

In [None]:
df_30_EYE['ET_ValidityLeft'].value_counts()

In [None]:
df_30_EYE['ET_ValidityRight'].unique()

In [None]:
df_30_EYE['ET_ValidityRight'].value_counts()

In [None]:
# Side-by-side bar charts of the validity code counts, one panel per eye
plt.figure(figsize=(12, 6))

for panel, validity_col in enumerate(['ET_ValidityLeft', 'ET_ValidityRight'], start=1):
    # Count once per column and reuse the result for both bar positions and heights
    code_counts = df_30_EYE[validity_col].value_counts()
    plt.subplot(1, 2, panel)
    sns.barplot(x=code_counts.index, y=code_counts.values)
    plt.title(f'Count of {validity_col}')
    plt.xlabel('Validity')
    plt.ylabel('Count')

plt.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

In [None]:
validity_map = {4.0: 1.0, 0.0: 0.0}

In [None]:
df_30_EYE['ET_ValidityLeft'] = df_30_EYE['ET_ValidityLeft'].map(validity_map).astype(np.int8)
df_30_EYE['ET_ValidityRight'] = df_30_EYE['ET_ValidityRight'].map(validity_map).astype(np.int8)

In [None]:
df_30_EYE.head(3)

In [None]:
df_30_EYE.describe()

In [None]:
df_30_EYE[df_30_EYE['ET_ValidityLeft'] == 1].shape

In [None]:
df_30_EYE[df_30_EYE['ET_ValidityRight'] == 1].shape

In [None]:
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_30_EYE == -1, cmap='viridis')
plt.title('Heatmap of -1 Values')

plt.subplot(1, 2, 2)
sns.heatmap(df_30_EYE == 1, cmap='viridis')
plt.title('Heatmap of 1 Values')

plt.tight_layout()
plt.show()

In [None]:
df_30_EYE[df_30_EYE['ET_PupilLeft'] == -1].shape

In [None]:
df_30_EYE[df_30_EYE['ET_PupilRight'] == -1].shape

In [None]:
df_30_EYE[df_30_EYE['ET_PupilLeft'] == -1].shape[0] / df_30_EYE.shape[0]

In [None]:
df_30_EYE[df_30_EYE['ET_PupilRight'] == -1].shape[0] / df_30_EYE.shape[0]

In [None]:
# Where do -1 placeholders occur within the rows whose validity flag is 1?
# Left panel: rows filtered on ET_ValidityLeft; right panel: rows filtered on ET_ValidityRight.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_30_EYE[df_30_EYE['ET_ValidityLeft'] == 1] == -1, cmap='viridis')
# Each title names its row filter so the two panels can be told apart
plt.title('Heatmap of -1 Values (ET_ValidityLeft == 1)')

plt.subplot(1, 2, 2)
sns.heatmap(df_30_EYE[df_30_EYE['ET_ValidityRight'] == 1] == -1, cmap='viridis')
plt.title('Heatmap of -1 Values (ET_ValidityRight == 1)')

plt.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

In [None]:
df_30_EYE.drop(['ET_PupilLeft', 'ET_PupilRight'], axis=1, inplace=True)

In [None]:
df_30_EYE.head()

In [None]:
# Re-check where the values -1 (left panel) and 1 (right panel) occur
# now that the pupil columns have been dropped.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_30_EYE == -1, cmap='viridis')
plt.title('Heatmap of -1 Values')

plt.subplot(1, 2, 2)
sns.heatmap(df_30_EYE == 1, cmap='viridis')
# This panel shows cells equal to 1, so it must not reuse the "-1" title
plt.title('Heatmap of 1 Values')

plt.tight_layout()
plt.show()

In [None]:
valid_left_ratio  = 1 - df_30_EYE['ET_ValidityLeft'].mean()
valid_right_ratio = 1 - df_30_EYE['ET_ValidityRight'].mean()

In [None]:
valid_left_ratio

In [None]:
valid_right_ratio

In [None]:
df_30_EYE.head()

In [None]:
# Histogram + KDE for each numeric column except the UnixTime time axis
numeric_cols = df_30_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid layout: fixed width of 4 panels, row count is the ceiling of len / n_cols
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

plt.figure(figsize=(5 * n_cols, 4 * n_rows))

for slot, col in enumerate(cols_to_plot, start=1):
    # Draw on the axes returned by subplot instead of relying on the "current axes" state
    ax = plt.subplot(n_rows, n_cols, slot)
    sns.histplot(df_30_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
df_30_EYE.columns

In [None]:
cols = ['ET_GazeLeftx', 'ET_GazeLefty',
       'ET_GazeRightx', 'ET_GazeRighty', 'ET_TimeSignal', 'ET_DistanceLeft',
       'ET_DistanceRight', 'ET_CameraLeftX', 'ET_CameraLeftY',
       'ET_CameraRightX', 'ET_CameraRightY', 'ET_ValidityLeft',
       'ET_ValidityRight']

In [None]:
from IPython.display import display, Markdown

# One full-size line chart per entry in `cols`, each preceded by a rendered
# Markdown heading so the figures are easy to tell apart when scrolling.
for signal in cols:
    display(Markdown(f'### {signal} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_30_EYE['Timestamp'], df_30_EYE[signal])
    ax.set_xlabel("Timestamp")
    ax.set_ylabel(signal)
    plt.show()

In [None]:
# Boxplots for every numeric column (no column is excluded here), laid out on
# a 4-wide grid, to show spread and potential outliers.
numeric_cols = df_30_EYE.select_dtypes(include=np.number).columns

# Grid geometry: fixed width, as many rows as needed to fit all columns.
n_cols = 4
n_rows = (len(numeric_cols) + n_cols - 1) // n_cols

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for i, col in enumerate(numeric_cols):
    ax = fig.add_subplot(n_rows, n_cols, i + 1)
    # Pass the values explicitly as `x=`: the meaning of the first positional
    # argument of sns.boxplot differs between seaborn releases, which would
    # otherwise make the box orientation (and the x-label below) inconsistent.
    sns.boxplot(x=df_30_EYE[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')
    ax.set_xlabel(col)

fig.tight_layout()
plt.show()

In [None]:
# Turn the -1 placeholder into NaN, in place. The replacement is applied to the
# whole DataFrame, not only to the eye-tracking measurement columns.
df_30_EYE.replace({-1: np.nan}, inplace=True)

In [None]:
# Mean-impute NaNs in every numeric column.
numeric_cols = df_30_EYE.select_dtypes(include=np.number).columns

for col in numeric_cols:
    # Assign the filled Series back to the column. Calling
    # `df[col].fillna(..., inplace=True)` is chained assignment: it emits a
    # FutureWarning on pandas 2.x and does not modify the DataFrame at all
    # once Copy-on-Write is enabled.
    df_30_EYE[col] = df_30_EYE[col].fillna(df_30_EYE[col].mean())

In [None]:
df_30_EYE.head()

In [None]:
# Histogram + KDE for every numeric column except UnixTime, on a 4-wide grid.
numeric_cols = df_30_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid geometry: fixed width, as many rows as needed (ceiling division).
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, name in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df_30_EYE[name], kde=True, ax=ax)
    ax.set_title(f'Distribution of {name}')
    ax.set_xlabel(name)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

# Eye Tracking Data Analysis

This notebook performs exploratory data analysis and cleaning on eye-tracking data.

## Data Loading and Initial Inspection

In [None]:
# NOTE(review): RAPIDS documents the pandas accelerator as `%load_ext cudf.pandas`,
# loaded before pandas is imported — confirm that `%load_ext cudf` enables anything here.
%load_ext cudf

Import necessary libraries for data manipulation, analysis, and visualization.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Import libraries for datashader, which can be used for visualizing large datasets.

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

Set pandas display options to show all columns.

In [None]:
pd.set_option('display.max_columns', None)

Load the eye-tracking data from the specified CSV file into a pandas DataFrame.

In [None]:
df_30_EYE = pd.read_csv('data/STData/30/30_EYE.csv')

Display the first few rows of the DataFrame to get an initial look at the data structure and content.

In [None]:
df_30_EYE.head()

Check the dimensions (number of rows and columns) of the DataFrame.

In [None]:
df_30_EYE.shape

List the names of all columns in the DataFrame.

In [None]:
df_30_EYE.columns

Display concise information about the DataFrame, including the data types of each column and the number of non-null values. This helps identify columns with missing data.

In [None]:
df_30_EYE.info()

Calculate and display the number of missing values in each column.

In [None]:
df_30_EYE.isnull().sum()

Visualize the distribution of missing values using a heatmap. This provides a visual representation of which columns have missing data and the extent of missingness.

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_30_EYE.isnull(), cmap='viridis')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` column.
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.
- These missing values in `QuestionKey` require additional investigation and context-aware handling.

Display the unique values in the `QuestionKey` column to understand the different types of questions or states recorded.

In [None]:
df_30_EYE['QuestionKey'].unique()

Convert the `Timestamp` column to datetime objects. This is crucial for any time series analysis or manipulation.

In [None]:
df_30_EYE['Timestamp'] = pd.to_datetime(df_30_EYE['Timestamp'])

Display the first few rows again to see the effect of the timestamp conversion.

In [None]:
df_30_EYE.head(3)

Fill the missing values in the `QuestionKey` column with the string 'None'. This helps to explicitly mark periods where no question was active.

In [None]:
# Label rows without a question as the literal string 'None'. The result is
# assigned back to the column: `df[col].fillna(..., inplace=True)` is chained
# assignment, which warns on pandas 2.x and is a no-op under Copy-on-Write.
df_30_EYE['QuestionKey'] = df_30_EYE['QuestionKey'].fillna('None')

Count the occurrences of each unique value in the `QuestionKey` column after filling missing values.

In [None]:
df_30_EYE['QuestionKey'].value_counts()

Display the heatmap of missing values again to confirm that the missing values in `QuestionKey` have been handled.

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_30_EYE.isnull(), cmap='viridis')
plt.show()

Recalculate and display the number of missing values per column to confirm the changes after handling `QuestionKey`.

In [None]:
df_30_EYE.isnull().sum()

Drop rows that still contain any missing values. This is done after handling `QuestionKey` separately.

In [None]:
df_30_EYE.dropna(inplace=True)

Display the first few rows after dropping rows with missing values.

In [None]:
df_30_EYE.head()

Display the unique values in the `Row` column.

In [None]:
df_30_EYE['Row'].unique()

Plot a histogram of the `Row` column to visualize its distribution.

In [None]:
plt.figure(figsize=(8,6))
sns.histplot(df_30_EYE['Row'])
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

Drop the `Row` column as it is not relevant for the analysis.

In [None]:
# Discard the `Row` column in place; the notes above judge it to be a plain row index.
df_30_EYE.drop(columns='Row', inplace=True)

Display the unique values in the `ET_ValidityLeft` column.

In [None]:
df_30_EYE['ET_ValidityLeft'].unique()

Count the occurrences of each unique value in the `ET_ValidityLeft` column.

In [None]:
df_30_EYE['ET_ValidityLeft'].value_counts()

Display the unique values in the `ET_ValidityRight` column.

In [None]:
df_30_EYE['ET_ValidityRight'].unique()

Count the occurrences of each unique value in the `ET_ValidityRight` column.

In [None]:
df_30_EYE['ET_ValidityRight'].value_counts()

Visualize the counts of valid and invalid data for both left and right eyes using bar plots.

In [None]:
# One bar chart per eye showing how often each validity code occurs.
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

for ax, flag_col in zip(axes, ('ET_ValidityLeft', 'ET_ValidityRight')):
    counts = df_30_EYE[flag_col].value_counts()
    sns.barplot(x=counts.index, y=counts.values, ax=ax)
    ax.set_title(f'Count of {flag_col}')
    ax.set_xlabel('Validity')
    ax.set_ylabel('Count')

fig.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

Define a mapping to convert validity values from `0.0` and `4.0` to `0` and `1`.

In [None]:
# Recode the raw validity codes into a binary flag.
validity_map = {
    4.0: 1.0,  # raw code 4.0 -> 1 (treated as invalid in the notes above)
    0.0: 0.0,  # raw code 0.0 -> 0 (treated as valid)
}

Apply the mapping to the `ET_ValidityLeft` and `ET_ValidityRight` columns and convert the data type to integer.

In [None]:
# Recode both validity columns through `validity_map` and store them as int8.
# NOTE(review): a code missing from `validity_map` would presumably become NaN
# and make the int8 cast fail — confirm that only 0.0 and 4.0 occur.
for flag_col in ('ET_ValidityLeft', 'ET_ValidityRight'):
    df_30_EYE[flag_col] = df_30_EYE[flag_col].map(validity_map).astype(np.int8)

Display the first few rows to see the updated validity columns.

In [None]:
df_30_EYE.head(3)

Display descriptive statistics for the DataFrame, including count, mean, standard deviation, min, max, and quartiles for numeric columns.

In [None]:
df_30_EYE.describe()

Check the number of rows where `ET_ValidityLeft` is 1 (invalid data for the left eye).

In [None]:
df_30_EYE[df_30_EYE['ET_ValidityLeft'] == 1].shape

Check the number of rows where `ET_ValidityRight` is 1 (invalid data for the right eye).

In [None]:
df_30_EYE[df_30_EYE['ET_ValidityRight'] == 1].shape

Visualize the distribution of -1 and 1 values across columns using heatmaps. This helps identify columns with placeholder or indicator values.

In [None]:
# Two boolean masks side by side: cells equal to -1 (left panel) and cells
# equal to 1 (right panel).
fig, axes = plt.subplots(1, 2, figsize=(18, 8))

for ax, value in zip(axes, (-1, 1)):
    sns.heatmap(df_30_EYE == value, cmap='viridis', ax=ax)
    ax.set_title(f'Heatmap of {value} Values')

fig.tight_layout()
plt.show()

Check the number of rows where `ET_PupilLeft` is -1.

In [None]:
df_30_EYE[df_30_EYE['ET_PupilLeft'] == -1].shape

Check the number of rows where `ET_PupilRight` is -1.

In [None]:
df_30_EYE[df_30_EYE['ET_PupilRight'] == -1].shape

Calculate the ratio of rows with -1 values in the `ET_PupilLeft` column to the total number of rows.

In [None]:
df_30_EYE[df_30_EYE['ET_PupilLeft'] == -1].shape[0] / df_30_EYE.shape[0]

Calculate the ratio of rows with -1 values in the `ET_PupilRight` column to the total number of rows.

In [None]:
df_30_EYE[df_30_EYE['ET_PupilRight'] == -1].shape[0] / df_30_EYE.shape[0]

Visualize the distribution of -1 values specifically for rows where validity is 1 (invalid data).

In [None]:
# For each eye, keep only the rows whose validity flag is 1 and show where the
# -1 placeholder occurs within them. Each title names the filter that was
# applied, so the two panels can be told apart.
fig, axes = plt.subplots(1, 2, figsize=(18, 8))

for ax, flag_col in zip(axes, ('ET_ValidityLeft', 'ET_ValidityRight')):
    flagged_rows = df_30_EYE[df_30_EYE[flag_col] == 1]
    sns.heatmap(flagged_rows == -1, cmap='viridis', ax=ax)
    ax.set_title(f'Heatmap of -1 Values ({flag_col} == 1)')

fig.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

Drop the `ET_PupilLeft` and `ET_PupilRight` columns due to the high percentage of invalid data.

In [None]:
# Remove both pupil columns, which the notes above flag as mostly -1 placeholders.
df_30_EYE.drop(columns=['ET_PupilLeft', 'ET_PupilRight'], inplace=True)

Display the first few rows after dropping the pupil columns.

In [None]:
df_30_EYE.head()

Visualize the distribution of -1 and 1 values again after dropping the pupil columns.

In [None]:
# Two boolean masks side by side: cells equal to -1 (left panel) and cells
# equal to 1 (right panel). Each panel's title names the value it actually shows.
fig, axes = plt.subplots(1, 2, figsize=(18, 8))

for ax, value in zip(axes, (-1, 1)):
    sns.heatmap(df_30_EYE == value, cmap='viridis', ax=ax)
    ax.set_title(f'Heatmap of {value} Values')

fig.tight_layout()
plt.show()

Calculate the ratio of valid data points for the left eye.

In [None]:
# The flag was remapped to 0/1 above with 1 meaning invalid, so its mean is the
# invalid share and 1 - mean is the share of valid left-eye samples.
valid_left_ratio  = 1 - df_30_EYE['ET_ValidityLeft'].mean()

Display the ratio of valid data points for the left eye.

In [None]:
valid_left_ratio

Calculate the ratio of valid data points for the right eye.

In [None]:
# The flag was remapped to 0/1 above with 1 meaning invalid, so its mean is the
# invalid share and 1 - mean is the share of valid right-eye samples.
valid_right_ratio = 1 - df_30_EYE['ET_ValidityRight'].mean()

Display the ratio of valid data points for the right eye.

In [None]:
valid_right_ratio

Display the first few rows of the DataFrame.

In [None]:
df_30_EYE.head()

Visualize the distributions of numeric columns using histograms with Kernel Density Estimate (KDE) plots. This helps understand the shape and spread of the data in each numeric column.

In [None]:
# Histogram + KDE for every numeric column except UnixTime, on a 4-wide grid.
numeric_cols = df_30_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid geometry: fixed width, as many rows as needed (ceiling division).
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, name in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df_30_EYE[name], kde=True, ax=ax)
    ax.set_title(f'Distribution of {name}')
    ax.set_xlabel(name)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

List the names of the columns in the DataFrame.

In [None]:
df_30_EYE.columns

Define a list of columns to be plotted over time.

In [None]:
# Columns plotted against Timestamp further down, grouped by name prefix.
cols = [
    'ET_GazeLeftx',
    'ET_GazeLefty',
    'ET_GazeRightx',
    'ET_GazeRighty',
    'ET_TimeSignal',
    'ET_DistanceLeft',
    'ET_DistanceRight',
    'ET_CameraLeftX',
    'ET_CameraLeftY',
    'ET_CameraRightX',
    'ET_CameraRightY',
    'ET_ValidityLeft',
    'ET_ValidityRight',
]

Plot each of the selected columns against the timestamp to visualize their trends and patterns over time. A Markdown heading is rendered before each plot for better readability.

In [None]:
from IPython.display import display, Markdown

# One full-size line chart per entry in `cols`, each preceded by a rendered
# Markdown heading so the figures are easy to tell apart when scrolling.
for signal in cols:
    display(Markdown(f'### {signal} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_30_EYE['Timestamp'], df_30_EYE[signal])
    ax.set_xlabel("Timestamp")
    ax.set_ylabel(signal)
    plt.show()

Visualize the distributions of numeric columns using boxplots. This helps identify potential outliers and the spread of the data.

In [None]:
# Boxplots for every numeric column (no column is excluded here), laid out on
# a 4-wide grid, to show spread and potential outliers.
numeric_cols = df_30_EYE.select_dtypes(include=np.number).columns

# Grid geometry: fixed width, as many rows as needed to fit all columns.
n_cols = 4
n_rows = (len(numeric_cols) + n_cols - 1) // n_cols

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for i, col in enumerate(numeric_cols):
    ax = fig.add_subplot(n_rows, n_cols, i + 1)
    # Pass the values explicitly as `x=`: the meaning of the first positional
    # argument of sns.boxplot differs between seaborn releases, which would
    # otherwise make the box orientation (and the x-label below) inconsistent.
    sns.boxplot(x=df_30_EYE[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')
    ax.set_xlabel(col)

fig.tight_layout()
plt.show()

Replace the placeholder value -1 with NaN (Not a Number) in the DataFrame. This is done before imputation.

In [None]:
# Turn the -1 placeholder into NaN, in place. The replacement is applied to the
# whole DataFrame, not only to the eye-tracking measurement columns.
df_30_EYE.replace({-1: np.nan}, inplace=True)

Impute the missing values (NaN) in the numeric columns with the mean of each column.

In [None]:
# Mean-impute NaNs in every numeric column.
numeric_cols = df_30_EYE.select_dtypes(include=np.number).columns

for col in numeric_cols:
    # Assign the filled Series back to the column. Calling
    # `df[col].fillna(..., inplace=True)` is chained assignment: it emits a
    # FutureWarning on pandas 2.x and does not modify the DataFrame at all
    # once Copy-on-Write is enabled.
    df_30_EYE[col] = df_30_EYE[col].fillna(df_30_EYE[col].mean())

Display the first few rows after imputing missing values.

In [None]:
df_30_EYE.head()

Visualize the distributions of numeric columns again using histograms with KDE plots after imputation. This helps assess the impact of imputation on the data distributions.

In [None]:
# Histogram + KDE for every numeric column except UnixTime, on a 4-wide grid.
numeric_cols = df_30_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid geometry: fixed width, as many rows as needed (ceiling division).
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, name in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df_30_EYE[name], kde=True, ax=ax)
    ax.set_title(f'Distribution of {name}')
    ax.set_xlabel(name)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()