## Student 31

In [None]:
%load_ext cudf

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
df_31_EYE = pd.read_csv('data/STData/31/31_EYE.csv')

In [None]:
df_31_EYE.head()

In [None]:
df_31_EYE.shape

In [None]:
df_31_EYE.columns

In [None]:
df_31_EYE.info()

In [None]:
df_31_EYE.isnull().sum()

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_31_EYE.isnull(), cmap='viridis')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` column.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  

In [None]:
df_31_EYE['QuestionKey'].unique()

In [None]:
df_31_EYE['Timestamp'] = pd.to_datetime(df_31_EYE['Timestamp'])

In [None]:
df_31_EYE.head(3)

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the selection:
# the chained in-place form warns in pandas 2.x and does not update the frame under Copy-on-Write.
df_31_EYE['QuestionKey'] = df_31_EYE['QuestionKey'].fillna('None')

In [None]:
df_31_EYE['QuestionKey'].value_counts()

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_31_EYE.isnull(), cmap='viridis')
plt.show()

In [None]:
df_31_EYE.isnull().sum()

In [None]:
df_31_EYE.dropna(inplace=True)

In [None]:
df_31_EYE.head()

In [None]:
df_31_EYE['Row'].unique()

In [None]:
plt.figure(figsize=(8,6))
sns.histplot(df_31_EYE['Row'])
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

In [None]:
df_31_EYE.drop('Row', axis=1, inplace=True)

In [None]:
df_31_EYE['ET_ValidityLeft'].unique()

In [None]:
df_31_EYE['ET_ValidityLeft'].value_counts()

In [None]:
df_31_EYE['ET_ValidityRight'].unique()

In [None]:
df_31_EYE['ET_ValidityRight'].value_counts()

In [None]:
# Bar charts of how often each validity code occurs, one panel per eye.
plt.figure(figsize=(12, 6))

for panel, validity_col in enumerate(['ET_ValidityLeft', 'ET_ValidityRight'], start=1):
    # Count once per column and reuse the result for both the x and y arguments.
    validity_counts = df_31_EYE[validity_col].value_counts()

    plt.subplot(1, 2, panel)
    sns.barplot(x=validity_counts.index, y=validity_counts.values)
    plt.title(f'Count of {validity_col}')
    plt.xlabel('Validity')
    plt.ylabel('Count')

plt.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

In [None]:
validity_map = {4.0: 1.0, 0.0: 0.0}

In [None]:
# Recode both validity columns through validity_map and store them as int8.
# Series.map yields NaN for any value missing from the mapping and astype(np.int8) raises on NaN,
# so this cell fails if a column holds a code that validity_map does not list
# (this includes re-running it on already-recoded data when the map has no entry for 1).
df_31_EYE['ET_ValidityLeft'] = df_31_EYE['ET_ValidityLeft'].map(validity_map).astype(np.int8)
df_31_EYE['ET_ValidityRight'] = df_31_EYE['ET_ValidityRight'].map(validity_map).astype(np.int8)

In [None]:
df_31_EYE.head(3)

In [None]:
df_31_EYE.describe()

In [None]:
df_31_EYE[df_31_EYE['ET_ValidityLeft'] == 1].shape

In [None]:
df_31_EYE[df_31_EYE['ET_ValidityRight'] == 1].shape

In [None]:
# Side-by-side boolean masks of the frame: cells equal to -1 (left) and cells equal to 1 (right).
fig, (ax_neg_one, ax_one) = plt.subplots(1, 2, figsize=(18, 8))

sns.heatmap(df_31_EYE == -1, cmap='viridis', ax=ax_neg_one)
ax_neg_one.set_title('Heatmap of -1 Values')

sns.heatmap(df_31_EYE == 1, cmap='viridis', ax=ax_one)
ax_one.set_title('Heatmap of 1 Values')

fig.tight_layout()
plt.show()

In [None]:
df_31_EYE[df_31_EYE['ET_PupilLeft'] == -1].shape

In [None]:
df_31_EYE[df_31_EYE['ET_PupilRight'] == -1].shape

In [None]:
df_31_EYE[df_31_EYE['ET_PupilLeft'] == -1].shape[0] / df_31_EYE.shape[0]

In [None]:
df_31_EYE[df_31_EYE['ET_PupilRight'] == -1].shape[0] / df_31_EYE.shape[0]

In [None]:
# Where do -1 values occur among rows flagged invalid (validity == 1)?
# Left panel: rows with ET_ValidityLeft == 1; right panel: rows with ET_ValidityRight == 1.
# The titles name the filter so the two panels can be told apart.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_31_EYE[df_31_EYE['ET_ValidityLeft'] == 1] == -1, cmap='viridis')
plt.title('Heatmap of -1 Values (ET_ValidityLeft == 1)')

plt.subplot(1, 2, 2)
sns.heatmap(df_31_EYE[df_31_EYE['ET_ValidityRight'] == 1] == -1, cmap='viridis')
plt.title('Heatmap of -1 Values (ET_ValidityRight == 1)')

plt.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

In [None]:
df_31_EYE.drop(['ET_PupilLeft', 'ET_PupilRight'], axis=1, inplace=True)

In [None]:
df_31_EYE.head()

In [None]:
# Same -1 / 1 masks as before, re-plotted on the frame as it stands at this point.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_31_EYE == -1, cmap='viridis')
plt.title('Heatmap of -1 Values')

plt.subplot(1, 2, 2)
sns.heatmap(df_31_EYE == 1, cmap='viridis')
# This panel shows cells equal to 1, so the title must say 1 (it previously repeated "-1").
plt.title('Heatmap of 1 Values')

plt.tight_layout()
plt.show()

In [None]:
# The validity columns were recoded earlier to 0/1 flags with 1 marking invalid samples,
# so the column mean is the invalid fraction and 1 - mean is the share of valid samples.
valid_left_ratio  = 1 - df_31_EYE['ET_ValidityLeft'].mean()
valid_right_ratio = 1 - df_31_EYE['ET_ValidityRight'].mean()

In [None]:
valid_left_ratio

In [None]:
valid_right_ratio

In [None]:
df_31_EYE.head()

In [None]:
# Histogram + KDE grid for every numeric column except UnixTime.
numeric_cols = df_31_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Fixed grid width; use as many rows as needed (ceiling division).
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

plt.figure(figsize=(5 * n_cols, 4 * n_rows))

for position, col in enumerate(cols_to_plot, start=1):
    plt.subplot(n_rows, n_cols, position)
    sns.histplot(df_31_EYE[col], kde=True)
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
df_31_EYE.columns

In [None]:
cols = ['ET_GazeLeftx', 'ET_GazeLefty',
       'ET_GazeRightx', 'ET_GazeRighty', 'ET_TimeSignal', 'ET_DistanceLeft',
       'ET_DistanceRight', 'ET_CameraLeftX', 'ET_CameraLeftY',
       'ET_CameraRightX', 'ET_CameraRightY', 'ET_ValidityLeft',
       'ET_ValidityRight']

In [None]:
from IPython.display import display, Markdown

# One large line plot per signal in `cols`, each preceded by a rendered Markdown heading.
for col in cols:
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_31_EYE['Timestamp'], df_31_EYE[col])
    ax.set_xlabel("Timestamp")
    ax.set_ylabel(col)
    plt.show()

In [None]:
# Boxplot grid for the numeric columns, excluding the time-related UnixTime column
# (the same selection the histogram grids use; previously the comment promised the
# exclusion but the code plotted every numeric column).
numeric_cols = df_31_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [col for col in numeric_cols if col not in ['UnixTime']]

# Fixed grid width; use as many rows as needed (ceiling division).
n_cols = 4
n_rows = (len(cols_to_plot) + n_cols - 1) // n_cols

plt.figure(figsize=(n_cols * 5, n_rows * 4))

for i, col in enumerate(cols_to_plot):
    plt.subplot(n_rows, n_cols, i + 1)
    sns.boxplot(df_31_EYE[col])
    plt.title(f'Boxplot of {col}')
    plt.xlabel(col)

plt.tight_layout()
plt.show()

In [None]:
df_31_EYE.replace({-1: np.nan}, inplace=True)

In [None]:
numeric_cols = df_31_EYE.select_dtypes(include=np.number).columns

# Mean-impute every numeric column. The filled column is assigned back because
# df[col].fillna(..., inplace=True) is chained assignment: it warns in pandas 2.x and
# leaves the frame unchanged under Copy-on-Write.
for col in numeric_cols:
    df_31_EYE[col] = df_31_EYE[col].fillna(df_31_EYE[col].mean())

In [None]:
df_31_EYE.head()

In [None]:
# Histogram + KDE grid for every numeric column except UnixTime, drawn after imputation.
numeric_cols = df_31_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Fixed grid width; use as many rows as needed (ceiling division).
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

plt.figure(figsize=(5 * n_cols, 4 * n_rows))

for position, col in enumerate(cols_to_plot, start=1):
    plt.subplot(n_rows, n_cols, position)
    sns.histplot(df_31_EYE[col], kde=True)
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

# Eye Tracking Data Analysis

This notebook performs exploratory data analysis and cleaning on eye-tracking data.

## Data Loading and Initial Inspection

In [None]:
%load_ext cudf

Import necessary libraries for data manipulation, analysis, and visualization.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Import libraries for datashader, which can be used for visualizing large datasets.

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

Set pandas display options to show all columns.

In [None]:
pd.set_option('display.max_columns', None)

Load the eye-tracking data from the specified CSV file into a pandas DataFrame.

In [None]:
df_31_EYE = pd.read_csv('data/STData/31/31_EYE.csv')

Display the first few rows of the DataFrame to get an initial look at the data structure and content.

In [None]:
df_31_EYE.head()

Check the dimensions (number of rows and columns) of the DataFrame.

In [None]:
df_31_EYE.shape

List the names of all columns in the DataFrame.

In [None]:
df_31_EYE.columns

Display concise information about the DataFrame, including the data types of each column and the number of non-null values. This helps identify columns with missing data.

In [None]:
df_31_EYE.info()

Calculate and display the number of missing values in each column.

In [None]:
df_31_EYE.isnull().sum()

Visualize the distribution of missing values using a heatmap. This provides a visual representation of which columns have missing data and the extent of missingness.

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_31_EYE.isnull(), cmap='viridis')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` column.
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.
- These missing values in `QuestionKey` require additional investigation and context-aware handling.

Display the unique values in the `QuestionKey` column to understand the different types of questions or states recorded.

In [None]:
df_31_EYE['QuestionKey'].unique()

Convert the `Timestamp` column to datetime objects. This is crucial for any time series analysis or manipulation.

In [None]:
df_31_EYE['Timestamp'] = pd.to_datetime(df_31_EYE['Timestamp'])

Display the first few rows again to see the effect of the timestamp conversion.

In [None]:
df_31_EYE.head(3)

Fill the missing values in the `QuestionKey` column with the string 'None'. This helps to explicitly mark periods where no question was active.

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the selection:
# the chained in-place form warns in pandas 2.x and does not update the frame under Copy-on-Write.
df_31_EYE['QuestionKey'] = df_31_EYE['QuestionKey'].fillna('None')

Count the occurrences of each unique value in the `QuestionKey` column after filling missing values.

In [None]:
df_31_EYE['QuestionKey'].value_counts()

Display the heatmap of missing values again to confirm that the missing values in `QuestionKey` have been handled.

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_31_EYE.isnull(), cmap='viridis')
plt.show()

Recalculate and display the number of missing values per column to confirm the changes after handling `QuestionKey`.

In [None]:
df_31_EYE.isnull().sum()

Drop rows that still contain any missing values. This is done after handling `QuestionKey` separately.

In [None]:
df_31_EYE.dropna(inplace=True)

Display the first few rows after dropping rows with missing values.

In [None]:
df_31_EYE.head()

Display the unique values in the `Row` column.

In [None]:
df_31_EYE['Row'].unique()

Plot a histogram of the `Row` column to visualize its distribution.

In [None]:
plt.figure(figsize=(8,6))
sns.histplot(df_31_EYE['Row'])
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

Drop the `Row` column as it is not relevant for the analysis.

In [None]:
df_31_EYE.drop('Row', axis=1, inplace=True)

Display the unique values in the `ET_ValidityLeft` column.

In [None]:
df_31_EYE['ET_ValidityLeft'].unique()

Count the occurrences of each unique value in the `ET_ValidityLeft` column.

In [None]:
df_31_EYE['ET_ValidityLeft'].value_counts()

Display the unique values in the `ET_ValidityRight` column.

In [None]:
df_31_EYE['ET_ValidityRight'].unique()

Count the occurrences of each unique value in the `ET_ValidityRight` column.

In [None]:
df_31_EYE['ET_ValidityRight'].value_counts()

Visualize the counts of valid and invalid data for both left and right eyes using bar plots.

In [None]:
# Bar charts of how often each validity code occurs, one panel per eye.
plt.figure(figsize=(12, 6))

for panel, validity_col in enumerate(['ET_ValidityLeft', 'ET_ValidityRight'], start=1):
    # Count once per column and reuse the result for both the x and y arguments.
    validity_counts = df_31_EYE[validity_col].value_counts()

    plt.subplot(1, 2, panel)
    sns.barplot(x=validity_counts.index, y=validity_counts.values)
    plt.title(f'Count of {validity_col}')
    plt.xlabel('Validity')
    plt.ylabel('Count')

plt.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

Define a mapping to convert validity values from `0.0` and `4.0` to `0` and `1`.

In [None]:
validity_map = {4.0: 1.0, 0.0: 0.0}

Apply the mapping to the `ET_ValidityLeft` and `ET_ValidityRight` columns and convert the data type to integer.

In [None]:
# Recode both validity columns through validity_map and store them as int8.
# Series.map yields NaN for any value missing from the mapping and astype(np.int8) raises on NaN,
# so this cell fails if a column holds a code that validity_map does not list
# (this includes re-running it on already-recoded data when the map has no entry for 1).
df_31_EYE['ET_ValidityLeft'] = df_31_EYE['ET_ValidityLeft'].map(validity_map).astype(np.int8)
df_31_EYE['ET_ValidityRight'] = df_31_EYE['ET_ValidityRight'].map(validity_map).astype(np.int8)

Display the first few rows to see the updated validity columns.

In [None]:
df_31_EYE.head(3)

Display descriptive statistics for the DataFrame, including count, mean, standard deviation, min, max, and quartiles for numeric columns.

In [None]:
df_31_EYE.describe()

Check the number of rows where `ET_ValidityLeft` is 1 (invalid data for the left eye).

In [None]:
df_31_EYE[df_31_EYE['ET_ValidityLeft'] == 1].shape

Check the number of rows where `ET_ValidityRight` is 1 (invalid data for the right eye).

In [None]:
df_31_EYE[df_31_EYE['ET_ValidityRight'] == 1].shape

Visualize the distribution of -1 and 1 values across columns using heatmaps. This helps identify columns with placeholder or indicator values.

In [None]:
# Side-by-side boolean masks of the frame: cells equal to -1 (left) and cells equal to 1 (right).
fig, (ax_neg_one, ax_one) = plt.subplots(1, 2, figsize=(18, 8))

sns.heatmap(df_31_EYE == -1, cmap='viridis', ax=ax_neg_one)
ax_neg_one.set_title('Heatmap of -1 Values')

sns.heatmap(df_31_EYE == 1, cmap='viridis', ax=ax_one)
ax_one.set_title('Heatmap of 1 Values')

fig.tight_layout()
plt.show()

Check the number of rows where `ET_PupilLeft` is -1.

In [None]:
df_31_EYE[df_31_EYE['ET_PupilLeft'] == -1].shape

Check the number of rows where `ET_PupilRight` is -1.

In [None]:
df_31_EYE[df_31_EYE['ET_PupilRight'] == -1].shape

Calculate the ratio of rows with -1 values in the `ET_PupilLeft` column to the total number of rows.

In [None]:
df_31_EYE[df_31_EYE['ET_PupilLeft'] == -1].shape[0] / df_31_EYE.shape[0]

Calculate the ratio of rows with -1 values in the `ET_PupilRight` column to the total number of rows.

In [None]:
df_31_EYE[df_31_EYE['ET_PupilRight'] == -1].shape[0] / df_31_EYE.shape[0]

Visualize the distribution of -1 values specifically for rows where validity is 1 (invalid data).

In [None]:
# Where do -1 values occur among rows flagged invalid (validity == 1)?
# Left panel: rows with ET_ValidityLeft == 1; right panel: rows with ET_ValidityRight == 1.
# The titles name the filter so the two panels can be told apart.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_31_EYE[df_31_EYE['ET_ValidityLeft'] == 1] == -1, cmap='viridis')
plt.title('Heatmap of -1 Values (ET_ValidityLeft == 1)')

plt.subplot(1, 2, 2)
sns.heatmap(df_31_EYE[df_31_EYE['ET_ValidityRight'] == 1] == -1, cmap='viridis')
plt.title('Heatmap of -1 Values (ET_ValidityRight == 1)')

plt.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

Drop the `ET_PupilLeft` and `ET_PupilRight` columns due to the high percentage of invalid data.

In [None]:
df_31_EYE.drop(['ET_PupilLeft', 'ET_PupilRight'], axis=1, inplace=True)

Display the first few rows after dropping the pupil columns.

In [None]:
df_31_EYE.head()

Visualize the distribution of -1 and 1 values again after dropping the pupil columns.

In [None]:
# Same -1 / 1 masks as before, re-plotted on the frame as it stands at this point.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_31_EYE == -1, cmap='viridis')
plt.title('Heatmap of -1 Values')

plt.subplot(1, 2, 2)
sns.heatmap(df_31_EYE == 1, cmap='viridis')
# This panel shows cells equal to 1, so the title must say 1 (it previously repeated "-1").
plt.title('Heatmap of 1 Values')

plt.tight_layout()
plt.show()

Calculate the ratio of valid data points for the left eye.

In [None]:
valid_left_ratio  = 1 - df_31_EYE['ET_ValidityLeft'].mean()

Display the ratio of valid data points for the left eye.

In [None]:
valid_left_ratio

Calculate the ratio of valid data points for the right eye.

In [None]:
valid_right_ratio = 1 - df_31_EYE['ET_ValidityRight'].mean()

Display the ratio of valid data points for the right eye.

In [None]:
valid_right_ratio

Display the first few rows of the DataFrame.

In [None]:
df_31_EYE.head()

Visualize the distributions of numeric columns using histograms with Kernel Density Estimate (KDE) plots. This helps understand the shape and spread of the data in each numeric column.

In [None]:
# Histogram + KDE grid for every numeric column except UnixTime.
numeric_cols = df_31_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Fixed grid width; use as many rows as needed (ceiling division).
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

plt.figure(figsize=(5 * n_cols, 4 * n_rows))

for position, col in enumerate(cols_to_plot, start=1):
    plt.subplot(n_rows, n_cols, position)
    sns.histplot(df_31_EYE[col], kde=True)
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

List the names of the columns in the DataFrame.

In [None]:
df_31_EYE.columns

Define a list of columns to be plotted over time.

In [None]:
cols = ['ET_GazeLeftx', 'ET_GazeLefty',
       'ET_GazeRightx', 'ET_GazeRighty', 'ET_TimeSignal', 'ET_DistanceLeft',
       'ET_DistanceRight', 'ET_CameraLeftX', 'ET_CameraLeftY',
       'ET_CameraRightX', 'ET_CameraRightY', 'ET_ValidityLeft',
       'ET_ValidityRight']

Plot each of the selected columns against the timestamp to visualize their trends and patterns over time. A Markdown heading is displayed before each plot for better readability.

In [None]:
from IPython.display import display, Markdown

# One large line plot per signal in `cols`, each preceded by a rendered Markdown heading.
for col in cols:
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_31_EYE['Timestamp'], df_31_EYE[col])
    ax.set_xlabel("Timestamp")
    ax.set_ylabel(col)
    plt.show()

Visualize the distributions of numeric columns using boxplots. This helps identify potential outliers and the spread of the data.

In [None]:
# Boxplot grid for the numeric columns, excluding the time-related UnixTime column
# (the same selection the histogram grids use; previously the comment promised the
# exclusion but the code plotted every numeric column).
numeric_cols = df_31_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [col for col in numeric_cols if col not in ['UnixTime']]

# Fixed grid width; use as many rows as needed (ceiling division).
n_cols = 4
n_rows = (len(cols_to_plot) + n_cols - 1) // n_cols

plt.figure(figsize=(n_cols * 5, n_rows * 4))

for i, col in enumerate(cols_to_plot):
    plt.subplot(n_rows, n_cols, i + 1)
    sns.boxplot(df_31_EYE[col])
    plt.title(f'Boxplot of {col}')
    plt.xlabel(col)

plt.tight_layout()
plt.show()

Replace the placeholder value -1 with NaN (Not a Number) in the DataFrame. This is done before imputation.

In [None]:
df_31_EYE.replace({-1: np.nan}, inplace=True)

Impute the missing values (NaN) in the numeric columns with the mean of each column.

In [None]:
numeric_cols = df_31_EYE.select_dtypes(include=np.number).columns

# Mean-impute every numeric column. The filled column is assigned back because
# df[col].fillna(..., inplace=True) is chained assignment: it warns in pandas 2.x and
# leaves the frame unchanged under Copy-on-Write.
for col in numeric_cols:
    df_31_EYE[col] = df_31_EYE[col].fillna(df_31_EYE[col].mean())

Display the first few rows after imputing missing values.

In [None]:
df_31_EYE.head()

Visualize the distributions of numeric columns again using histograms with KDE plots after imputation. This helps assess the impact of imputation on the data distributions.

In [None]:
# Histogram + KDE grid for every numeric column except UnixTime, drawn after imputation.
numeric_cols = df_31_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Fixed grid width; use as many rows as needed (ceiling division).
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

plt.figure(figsize=(5 * n_cols, 4 * n_rows))

for position, col in enumerate(cols_to_plot, start=1):
    plt.subplot(n_rows, n_cols, position)
    sns.histplot(df_31_EYE[col], kde=True)
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

## Student 32

In [None]:
%load_ext cudf

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
df_32_EYE = pd.read_csv('data/STData/32/32_EYE.csv')

In [None]:
df_32_EYE.head()

In [None]:
df_32_EYE.shape

In [None]:
df_32_EYE.columns

In [None]:
df_32_EYE.info()

In [None]:
df_32_EYE.isnull().sum()

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_32_EYE.isnull(), cmap='viridis')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` column.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  

In [None]:
df_32_EYE['QuestionKey'].unique()

In [None]:
df_32_EYE['Timestamp'] = pd.to_datetime(df_32_EYE['Timestamp'])

In [None]:
df_32_EYE.head(3)

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the selection:
# the chained in-place form warns in pandas 2.x and does not update the frame under Copy-on-Write.
df_32_EYE['QuestionKey'] = df_32_EYE['QuestionKey'].fillna('None')

In [None]:
df_32_EYE['QuestionKey'].value_counts()

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_32_EYE.isnull(), cmap='viridis')
plt.show()

In [None]:
df_32_EYE.isnull().sum()

In [None]:
df_32_EYE.dropna(inplace=True)

In [None]:
df_32_EYE.head()

In [None]:
df_32_EYE['Row'].unique()

In [None]:
plt.figure(figsize=(8,6))
sns.histplot(df_32_EYE['Row'])
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

In [None]:
df_32_EYE.drop('Row', axis=1, inplace=True)

In [None]:
df_32_EYE['ET_ValidityLeft'].unique()

In [None]:
df_32_EYE['ET_ValidityLeft'].value_counts()

In [None]:
df_32_EYE['ET_ValidityRight'].unique()

In [None]:
df_32_EYE['ET_ValidityRight'].value_counts()

In [None]:
# Bar charts of how often each validity code occurs, one panel per eye.
plt.figure(figsize=(12, 6))

for panel, validity_col in enumerate(['ET_ValidityLeft', 'ET_ValidityRight'], start=1):
    # Count once per column and reuse the result for both the x and y arguments.
    validity_counts = df_32_EYE[validity_col].value_counts()

    plt.subplot(1, 2, panel)
    sns.barplot(x=validity_counts.index, y=validity_counts.values)
    plt.title(f'Count of {validity_col}')
    plt.xlabel('Validity')
    plt.ylabel('Count')

plt.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

In [None]:
validity_map = {4.0: 1.0, 0.0: 0.0}

In [None]:
# Recode both validity columns through validity_map and store them as int8.
# Series.map yields NaN for any value missing from the mapping and astype(np.int8) raises on NaN,
# so this cell fails if a column holds a code that validity_map does not list
# (this includes re-running it on already-recoded data when the map has no entry for 1).
df_32_EYE['ET_ValidityLeft'] = df_32_EYE['ET_ValidityLeft'].map(validity_map).astype(np.int8)
df_32_EYE['ET_ValidityRight'] = df_32_EYE['ET_ValidityRight'].map(validity_map).astype(np.int8)

In [None]:
df_32_EYE.head(3)

In [None]:
df_32_EYE.describe()

In [None]:
df_32_EYE[df_32_EYE['ET_ValidityLeft'] == 1].shape

In [None]:
df_32_EYE[df_32_EYE['ET_ValidityRight'] == 1].shape

In [None]:
# Side-by-side boolean masks of the frame: cells equal to -1 (left) and cells equal to 1 (right).
fig, (ax_neg_one, ax_one) = plt.subplots(1, 2, figsize=(18, 8))

sns.heatmap(df_32_EYE == -1, cmap='viridis', ax=ax_neg_one)
ax_neg_one.set_title('Heatmap of -1 Values')

sns.heatmap(df_32_EYE == 1, cmap='viridis', ax=ax_one)
ax_one.set_title('Heatmap of 1 Values')

fig.tight_layout()
plt.show()

In [None]:
df_32_EYE[df_32_EYE['ET_PupilLeft'] == -1].shape

In [None]:
df_32_EYE[df_32_EYE['ET_PupilRight'] == -1].shape

In [None]:
df_32_EYE[df_32_EYE['ET_PupilLeft'] == -1].shape[0] / df_32_EYE.shape[0]

In [None]:
df_32_EYE[df_32_EYE['ET_PupilRight'] == -1].shape[0] / df_32_EYE.shape[0]

In [None]:
# Where do -1 values occur among rows flagged invalid (validity == 1)?
# Left panel: rows with ET_ValidityLeft == 1; right panel: rows with ET_ValidityRight == 1.
# The titles name the filter so the two panels can be told apart.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_32_EYE[df_32_EYE['ET_ValidityLeft'] == 1] == -1, cmap='viridis')
plt.title('Heatmap of -1 Values (ET_ValidityLeft == 1)')

plt.subplot(1, 2, 2)
sns.heatmap(df_32_EYE[df_32_EYE['ET_ValidityRight'] == 1] == -1, cmap='viridis')
plt.title('Heatmap of -1 Values (ET_ValidityRight == 1)')

plt.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

In [None]:
df_32_EYE.drop(['ET_PupilLeft', 'ET_PupilRight'], axis=1, inplace=True)

In [None]:
df_32_EYE.head()

In [None]:
# Same -1 / 1 masks as before, re-plotted on the frame as it stands at this point.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_32_EYE == -1, cmap='viridis')
plt.title('Heatmap of -1 Values')

plt.subplot(1, 2, 2)
sns.heatmap(df_32_EYE == 1, cmap='viridis')
# This panel shows cells equal to 1, so the title must say 1 (it previously repeated "-1").
plt.title('Heatmap of 1 Values')

plt.tight_layout()
plt.show()

In [None]:
# The validity columns were recoded earlier to 0/1 flags with 1 marking invalid samples,
# so the column mean is the invalid fraction and 1 - mean is the share of valid samples.
valid_left_ratio  = 1 - df_32_EYE['ET_ValidityLeft'].mean()
valid_right_ratio = 1 - df_32_EYE['ET_ValidityRight'].mean()

In [None]:
valid_left_ratio

In [None]:
valid_right_ratio

In [None]:
df_32_EYE.head()

In [None]:
# Histogram + KDE grid for every numeric column except UnixTime.
numeric_cols = df_32_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Fixed grid width; use as many rows as needed (ceiling division).
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

plt.figure(figsize=(5 * n_cols, 4 * n_rows))

for position, col in enumerate(cols_to_plot, start=1):
    plt.subplot(n_rows, n_cols, position)
    sns.histplot(df_32_EYE[col], kde=True)
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
df_32_EYE.columns

In [None]:
cols = ['ET_GazeLeftx', 'ET_GazeLefty',
       'ET_GazeRightx', 'ET_GazeRighty', 'ET_TimeSignal', 'ET_DistanceLeft',
       'ET_DistanceRight', 'ET_CameraLeftX', 'ET_CameraLeftY',
       'ET_CameraRightX', 'ET_CameraRightY', 'ET_ValidityLeft',
       'ET_ValidityRight']

In [None]:
from IPython.display import display, Markdown

# For every entry in `cols`: render a Markdown heading in the cell output,
# then draw that column against the Timestamp column on its own figure.
for signal in cols:
    display(Markdown(f'### {signal} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_32_EYE['Timestamp'], df_32_EYE[signal])
    ax.set_xlabel("Timestamp")
    ax.set_ylabel(signal)
    plt.show()

In [None]:
# Boxplot grid over every numeric column (no columns are filtered out here).
numeric_cols = df_32_EYE.select_dtypes(include=np.number).columns

# Grid layout: fixed number of columns, ceiling division for the row count
n_cols = 4  # You can adjust the number of columns as needed
n_rows = (len(numeric_cols) + n_cols - 1) // n_cols

plt.figure(figsize=(n_cols * 5, n_rows * 4)) # Adjust figure size as needed

for i, col in enumerate(numeric_cols):
    plt.subplot(n_rows, n_cols, i + 1)
    # Pass the series as `x=` explicitly: a positional argument is interpreted
    # differently across seaborn versions, while `x=` always puts the values on
    # the x-axis, which is the axis labelled with the column name below.
    sns.boxplot(x=df_32_EYE[col])
    plt.title(f'Boxplot of {col}')
    plt.xlabel(col)

plt.tight_layout()
plt.show()

In [None]:
df_32_EYE.replace({-1: np.nan}, inplace=True)

In [None]:
# Mean-impute the remaining NaNs in every numeric column.
numeric_cols = df_32_EYE.select_dtypes(include=np.number).columns

for col in numeric_cols:
    # Assign the filled series back to the frame. Calling
    # fillna(..., inplace=True) on the `df[col]` selection is chained
    # assignment: pandas warns about it and, with Copy-on-Write enabled,
    # it leaves the DataFrame unchanged.
    df_32_EYE[col] = df_32_EYE[col].fillna(df_32_EYE[col].mean())

In [None]:
df_32_EYE.head()

In [None]:
# Numeric columns to chart, leaving out 'UnixTime'.
numeric_cols = df_32_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Four panels per row; ceiling division gives the number of rows needed.
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

# One histogram with a KDE overlay per column, filling the grid row by row.
for position, name in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df_32_EYE[name], kde=True, ax=ax)
    ax.set_title(f'Distribution of {name}')
    ax.set_xlabel(name)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

# Eye Tracking Data Analysis

This notebook performs exploratory data analysis and cleaning on eye-tracking data.

## Data Loading and Initial Inspection

In [None]:
%load_ext cudf

Import necessary libraries for data manipulation, analysis, and visualization.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Import libraries for datashader, which can be used for visualizing large datasets.

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

Set pandas display options to show all columns.

In [None]:
pd.set_option('display.max_columns', None)

Load the eye-tracking data from the specified CSV file into a pandas DataFrame.

In [None]:
df_32_EYE = pd.read_csv('data/STData/32/32_EYE.csv')

Display the first few rows of the DataFrame to get an initial look at the data structure and content.

In [None]:
df_32_EYE.head()

Check the dimensions (number of rows and columns) of the DataFrame.

In [None]:
df_32_EYE.shape

List the names of all columns in the DataFrame.

In [None]:
df_32_EYE.columns

Display concise information about the DataFrame, including the data types of each column and the number of non-null values. This helps identify columns with missing data.

In [None]:
df_32_EYE.info()

Calculate and display the number of missing values in each column.

In [None]:
df_32_EYE.isnull().sum()

Visualize the distribution of missing values using a heatmap. This provides a visual representation of which columns have missing data and the extent of missingness.

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_32_EYE.isnull(), cmap='viridis')
plt.show()

# Notes & Observations

- We observe many **null** (missing) values in the `QuestionKey` column.
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.
- These missing values in `QuestionKey` require additional investigation and context-aware handling.

Display the unique values in the `QuestionKey` column to understand the different types of questions or states recorded.

In [None]:
df_32_EYE['QuestionKey'].unique()

Convert the `Timestamp` column to datetime objects. This is crucial for any time series analysis or manipulation.

In [None]:
df_32_EYE['Timestamp'] = pd.to_datetime(df_32_EYE['Timestamp'])

Display the first few rows again to see the effect of the timestamp conversion.

In [None]:
df_32_EYE.head(3)

Fill the missing values in the `QuestionKey` column with the string 'None'. This helps to explicitly mark periods where no question was active.

In [None]:
# Label "no question shown" periods with the string 'None'. The result is
# assigned back because fillna(inplace=True) on a `df[col]` selection is
# chained assignment (warns in recent pandas, no effect under Copy-on-Write).
df_32_EYE['QuestionKey'] = df_32_EYE['QuestionKey'].fillna('None')

Count the occurrences of each unique value in the `QuestionKey` column after filling missing values.

In [None]:
df_32_EYE['QuestionKey'].value_counts()

Display the heatmap of missing values again to confirm that the missing values in `QuestionKey` have been handled.

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_32_EYE.isnull(), cmap='viridis')
plt.show()

Recalculate and display the number of missing values per column to confirm the changes after handling `QuestionKey`.

In [None]:
df_32_EYE.isnull().sum()

Drop rows that still contain any missing values. This is done after handling `QuestionKey` separately.

In [None]:
df_32_EYE.dropna(inplace=True)

Display the first few rows after dropping rows with missing values.

In [None]:
df_32_EYE.head()

Display the unique values in the `Row` column.

In [None]:
df_32_EYE['Row'].unique()

Plot a histogram of the `Row` column to visualize its distribution.

In [None]:
plt.figure(figsize=(8,6))
sns.histplot(df_32_EYE['Row'])
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

Drop the `Row` column as it is not relevant for the analysis.

In [None]:
df_32_EYE.drop('Row', axis=1, inplace=True)

Display the unique values in the `ET_ValidityLeft` column.

In [None]:
df_32_EYE['ET_ValidityLeft'].unique()

Count the occurrences of each unique value in the `ET_ValidityLeft` column.

In [None]:
df_32_EYE['ET_ValidityLeft'].value_counts()

Display the unique values in the `ET_ValidityRight` column.

In [None]:
df_32_EYE['ET_ValidityRight'].unique()

Count the occurrences of each unique value in the `ET_ValidityRight` column.

In [None]:
df_32_EYE['ET_ValidityRight'].value_counts()

Visualize the counts of valid and invalid data for both left and right eyes using bar plots.

In [None]:
# Bar chart of value counts for each validity flag, left eye then right eye.
panels = [
    ('ET_ValidityLeft', 'Count of ET_ValidityLeft'),
    ('ET_ValidityRight', 'Count of ET_ValidityRight'),
]

fig = plt.figure(figsize=(12, 6))

for position, (column, panel_title) in enumerate(panels, start=1):
    ax = fig.add_subplot(1, 2, position)
    # Count once and reuse for both the categories and the bar heights
    code_counts = df_32_EYE[column].value_counts()
    sns.barplot(x=code_counts.index, y=code_counts.values, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('Validity')
    ax.set_ylabel('Count')

fig.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

Define a mapping to convert validity values from `0.0` and `4.0` to `0` and `1`.

In [None]:
validity_map = {4.0: 1.0, 0.0: 0.0}

Apply the mapping to the `ET_ValidityLeft` and `ET_ValidityRight` columns and convert the data type to integer.

In [None]:
# Recode both validity flags through `validity_map`, then store them as int8.
for flag_column in ('ET_ValidityLeft', 'ET_ValidityRight'):
    recoded = df_32_EYE[flag_column].map(validity_map)
    df_32_EYE[flag_column] = recoded.astype(np.int8)

Display the first few rows to see the updated validity columns.

In [None]:
df_32_EYE.head(3)

Display descriptive statistics for the DataFrame, including count, mean, standard deviation, min, max, and quartiles for numeric columns.

In [None]:
df_32_EYE.describe()

Check the number of rows where `ET_ValidityLeft` is 1 (invalid data for the left eye).

In [None]:
df_32_EYE[df_32_EYE['ET_ValidityLeft'] == 1].shape

Check the number of rows where `ET_ValidityRight` is 1 (invalid data for the right eye).

In [None]:
df_32_EYE[df_32_EYE['ET_ValidityRight'] == 1].shape

Visualize the distribution of -1 and 1 values across columns using heatmaps. This helps identify columns with placeholder or indicator values.

In [None]:
# Side-by-side boolean masks of the frame: cells equal to -1, and cells equal to 1.
fig, (ax_minus_one, ax_one) = plt.subplots(1, 2, figsize=(18, 8))

sns.heatmap(df_32_EYE == -1, cmap='viridis', ax=ax_minus_one)
ax_minus_one.set_title('Heatmap of -1 Values')

sns.heatmap(df_32_EYE == 1, cmap='viridis', ax=ax_one)
ax_one.set_title('Heatmap of 1 Values')

fig.tight_layout()
plt.show()

Check the number of rows where `ET_PupilLeft` is -1.

In [None]:
df_32_EYE[df_32_EYE['ET_PupilLeft'] == -1].shape

Check the number of rows where `ET_PupilRight` is -1.

In [None]:
df_32_EYE[df_32_EYE['ET_PupilRight'] == -1].shape

Calculate the ratio of rows with -1 values in the `ET_PupilLeft` column to the total number of rows.

In [None]:
df_32_EYE[df_32_EYE['ET_PupilLeft'] == -1].shape[0] / df_32_EYE.shape[0]

Calculate the ratio of rows with -1 values in the `ET_PupilRight` column to the total number of rows.

In [None]:
df_32_EYE[df_32_EYE['ET_PupilRight'] == -1].shape[0] / df_32_EYE.shape[0]

Visualize the distribution of -1 values specifically for rows where validity is 1 (invalid data).

In [None]:
# Where do -1 placeholders occur within the rows flagged invalid (validity == 1)?
# Left panel: rows with an invalid left eye; right panel: invalid right eye.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_32_EYE[df_32_EYE['ET_ValidityLeft'] == 1] == -1, cmap='viridis')
# Name the row subset in the title so the two panels can be told apart
plt.title('Heatmap of -1 Values (ET_ValidityLeft == 1)')

plt.subplot(1, 2, 2)
sns.heatmap(df_32_EYE[df_32_EYE['ET_ValidityRight'] == 1] == -1, cmap='viridis')
plt.title('Heatmap of -1 Values (ET_ValidityRight == 1)')

plt.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

Drop the `ET_PupilLeft` and `ET_PupilRight` columns due to the high percentage of invalid data.

In [None]:
df_32_EYE.drop(['ET_PupilLeft', 'ET_PupilRight'], axis=1, inplace=True)

Display the first few rows after dropping the pupil columns.

In [None]:
df_32_EYE.head()

Visualize the distribution of -1 and 1 values again after dropping the pupil columns.

In [None]:
# Two boolean masks of the whole frame, side by side:
# left  -> cells equal to the -1 placeholder
# right -> cells equal to 1 (the value the validity flags use for invalid samples)
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_32_EYE == -1, cmap='viridis')
plt.title('Heatmap of -1 Values')

plt.subplot(1, 2, 2)
sns.heatmap(df_32_EYE == 1, cmap='viridis')
# This panel masks `== 1`, so it must not reuse the "-1" title of the left panel.
plt.title('Heatmap of 1 Values')

plt.tight_layout()
plt.show()

Calculate the ratio of valid data points for the left eye.

In [None]:
valid_left_ratio  = 1 - df_32_EYE['ET_ValidityLeft'].mean()

Display the ratio of valid data points for the left eye.

In [None]:
valid_left_ratio

Calculate the ratio of valid data points for the right eye.

In [None]:
valid_right_ratio = 1 - df_32_EYE['ET_ValidityRight'].mean()

Display the ratio of valid data points for the right eye.

In [None]:
valid_right_ratio

Display the first few rows of the DataFrame.

In [None]:
df_32_EYE.head()

Visualize the distributions of numeric columns using histograms with Kernel Density Estimate (KDE) plots. This helps understand the shape and spread of the data in each numeric column.

In [None]:
# Numeric columns to chart, leaving out 'UnixTime'.
numeric_cols = df_32_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Four panels per row; ceiling division gives the number of rows needed.
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

# One histogram with a KDE overlay per column, filling the grid row by row.
for position, name in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df_32_EYE[name], kde=True, ax=ax)
    ax.set_title(f'Distribution of {name}')
    ax.set_xlabel(name)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

List the names of the columns in the DataFrame.

In [None]:
df_32_EYE.columns

Define a list of columns to be plotted over time.

In [None]:
# Columns to be plotted against the timestamp, grouped by name prefix.
cols = [
    # gaze columns
    'ET_GazeLeftx',
    'ET_GazeLefty',
    'ET_GazeRightx',
    'ET_GazeRighty',
    # time signal
    'ET_TimeSignal',
    # distance columns
    'ET_DistanceLeft',
    'ET_DistanceRight',
    # camera columns
    'ET_CameraLeftX',
    'ET_CameraLeftY',
    'ET_CameraRightX',
    'ET_CameraRightY',
    # validity flags
    'ET_ValidityLeft',
    'ET_ValidityRight',
]

Plot each of the selected columns against the timestamp to visualize their trends and patterns over time. A Markdown heading is rendered in the output before each plot for better readability.

In [None]:
from IPython.display import display, Markdown

# For every entry in `cols`: render a Markdown heading in the cell output,
# then draw that column against the Timestamp column on its own figure.
for signal in cols:
    display(Markdown(f'### {signal} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_32_EYE['Timestamp'], df_32_EYE[signal])
    ax.set_xlabel("Timestamp")
    ax.set_ylabel(signal)
    plt.show()

Visualize the distributions of numeric columns using boxplots. This helps identify potential outliers and the spread of the data.

In [None]:
# Boxplot grid over every numeric column (no columns are filtered out here).
numeric_cols = df_32_EYE.select_dtypes(include=np.number).columns

# Grid layout: fixed number of columns, ceiling division for the row count
n_cols = 4  # You can adjust the number of columns as needed
n_rows = (len(numeric_cols) + n_cols - 1) // n_cols

plt.figure(figsize=(n_cols * 5, n_rows * 4)) # Adjust figure size as needed

for i, col in enumerate(numeric_cols):
    plt.subplot(n_rows, n_cols, i + 1)
    # Pass the series as `x=` explicitly: a positional argument is interpreted
    # differently across seaborn versions, while `x=` always puts the values on
    # the x-axis, which is the axis labelled with the column name below.
    sns.boxplot(x=df_32_EYE[col])
    plt.title(f'Boxplot of {col}')
    plt.xlabel(col)

plt.tight_layout()
plt.show()

Replace the placeholder value -1 with NaN (Not a Number) in the DataFrame. This is done before imputation.

In [None]:
df_32_EYE.replace({-1: np.nan}, inplace=True)

Impute the missing values (NaN) in the numeric columns with the mean of each column.

In [None]:
# Mean-impute the remaining NaNs in every numeric column.
numeric_cols = df_32_EYE.select_dtypes(include=np.number).columns

for col in numeric_cols:
    # Assign the filled series back to the frame. Calling
    # fillna(..., inplace=True) on the `df[col]` selection is chained
    # assignment: pandas warns about it and, with Copy-on-Write enabled,
    # it leaves the DataFrame unchanged.
    df_32_EYE[col] = df_32_EYE[col].fillna(df_32_EYE[col].mean())

Display the first few rows after imputing missing values.

In [None]:
df_32_EYE.head()

Visualize the distributions of numeric columns again using histograms with KDE plots after imputation. This helps assess the impact of imputation on the data distributions.

In [None]:
# Numeric columns to chart, leaving out 'UnixTime'.
numeric_cols = df_32_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Four panels per row; ceiling division gives the number of rows needed.
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

# One histogram with a KDE overlay per column, filling the grid row by row.
for position, name in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df_32_EYE[name], kde=True, ax=ax)
    ax.set_title(f'Distribution of {name}')
    ax.set_xlabel(name)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

## Student 33

In [None]:
%load_ext cudf

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
df_33_EYE = pd.read_csv('data/STData/33/33_EYE.csv')

In [None]:
df_33_EYE.head()

In [None]:
df_33_EYE.shape

In [None]:
df_33_EYE.columns

In [None]:
df_33_EYE.info()

In [None]:
df_33_EYE.isnull().sum()

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_33_EYE.isnull(), cmap='viridis')
plt.show()

# Notes & Observations

- We observe many **null** (missing) values in the `QuestionKey` column.
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  

In [None]:
df_33_EYE['QuestionKey'].unique()

In [None]:
df_33_EYE['Timestamp'] = pd.to_datetime(df_33_EYE['Timestamp'])

In [None]:
df_33_EYE.head(3)

In [None]:
# Label "no question shown" periods with the string 'None'. The result is
# assigned back because fillna(inplace=True) on a `df[col]` selection is
# chained assignment (warns in recent pandas, no effect under Copy-on-Write).
df_33_EYE['QuestionKey'] = df_33_EYE['QuestionKey'].fillna('None')

In [None]:
df_33_EYE['QuestionKey'].value_counts()

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_33_EYE.isnull(), cmap='viridis')
plt.show()

In [None]:
df_33_EYE.isnull().sum()

In [None]:
df_33_EYE.dropna(inplace=True)

In [None]:
df_33_EYE.head()

In [None]:
df_33_EYE['Row'].unique()

In [None]:
plt.figure(figsize=(8,6))
sns.histplot(df_33_EYE['Row'])
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

In [None]:
df_33_EYE.drop('Row', axis=1, inplace=True)

In [None]:
df_33_EYE['ET_ValidityLeft'].unique()

In [None]:
df_33_EYE['ET_ValidityLeft'].value_counts()

In [None]:
df_33_EYE['ET_ValidityRight'].unique()

In [None]:
df_33_EYE['ET_ValidityRight'].value_counts()

In [None]:
# Bar chart of value counts for each validity flag, left eye then right eye.
panels = [
    ('ET_ValidityLeft', 'Count of ET_ValidityLeft'),
    ('ET_ValidityRight', 'Count of ET_ValidityRight'),
]

fig = plt.figure(figsize=(12, 6))

for position, (column, panel_title) in enumerate(panels, start=1):
    ax = fig.add_subplot(1, 2, position)
    # Count once and reuse for both the categories and the bar heights
    code_counts = df_33_EYE[column].value_counts()
    sns.barplot(x=code_counts.index, y=code_counts.values, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('Validity')
    ax.set_ylabel('Count')

fig.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

In [None]:
validity_map = {4.0: 1.0, 0.0: 0.0}

In [None]:
# Recode both validity flags through `validity_map`, then store them as int8.
for flag_column in ('ET_ValidityLeft', 'ET_ValidityRight'):
    recoded = df_33_EYE[flag_column].map(validity_map)
    df_33_EYE[flag_column] = recoded.astype(np.int8)

In [None]:
df_33_EYE.head(3)

In [None]:
df_33_EYE.describe()

In [None]:
df_33_EYE[df_33_EYE['ET_ValidityLeft'] == 1].shape

In [None]:
df_33_EYE[df_33_EYE['ET_ValidityRight'] == 1].shape

In [None]:
# Side-by-side boolean masks of the frame: cells equal to -1, and cells equal to 1.
fig, (ax_minus_one, ax_one) = plt.subplots(1, 2, figsize=(18, 8))

sns.heatmap(df_33_EYE == -1, cmap='viridis', ax=ax_minus_one)
ax_minus_one.set_title('Heatmap of -1 Values')

sns.heatmap(df_33_EYE == 1, cmap='viridis', ax=ax_one)
ax_one.set_title('Heatmap of 1 Values')

fig.tight_layout()
plt.show()

In [None]:
df_33_EYE[df_33_EYE['ET_PupilLeft'] == -1].shape

In [None]:
df_33_EYE[df_33_EYE['ET_PupilRight'] == -1].shape

In [None]:
df_33_EYE[df_33_EYE['ET_PupilLeft'] == -1].shape[0] / df_33_EYE.shape[0]

In [None]:
df_33_EYE[df_33_EYE['ET_PupilRight'] == -1].shape[0] / df_33_EYE.shape[0]

In [None]:
# Where do -1 placeholders occur within the rows flagged invalid (validity == 1)?
# Left panel: rows with an invalid left eye; right panel: invalid right eye.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_33_EYE[df_33_EYE['ET_ValidityLeft'] == 1] == -1, cmap='viridis')
# Name the row subset in the title so the two panels can be told apart
plt.title('Heatmap of -1 Values (ET_ValidityLeft == 1)')

plt.subplot(1, 2, 2)
sns.heatmap(df_33_EYE[df_33_EYE['ET_ValidityRight'] == 1] == -1, cmap='viridis')
plt.title('Heatmap of -1 Values (ET_ValidityRight == 1)')

plt.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

In [None]:
df_33_EYE.drop(['ET_PupilLeft', 'ET_PupilRight'], axis=1, inplace=True)

In [None]:
df_33_EYE.head()

In [None]:
# Two boolean masks of the whole frame, side by side:
# left  -> cells equal to the -1 placeholder
# right -> cells equal to 1 (the value the validity flags use for invalid samples)
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_33_EYE == -1, cmap='viridis')
plt.title('Heatmap of -1 Values')

plt.subplot(1, 2, 2)
sns.heatmap(df_33_EYE == 1, cmap='viridis')
# This panel masks `== 1`, so it must not reuse the "-1" title of the left panel.
plt.title('Heatmap of 1 Values')

plt.tight_layout()
plt.show()

In [None]:
# Mean of each validity flag column, then its complement. With the flags
# recoded to 0 / 1 via validity_map (4.0 -> 1, 0.0 -> 0), the complement is
# the share of samples whose flag is 0.
invalid_left_share = df_33_EYE['ET_ValidityLeft'].mean()
invalid_right_share = df_33_EYE['ET_ValidityRight'].mean()

valid_left_ratio = 1 - invalid_left_share
valid_right_ratio = 1 - invalid_right_share

In [None]:
valid_left_ratio

In [None]:
valid_right_ratio

In [None]:
df_33_EYE.head()

In [None]:
# Numeric columns to chart, leaving out 'UnixTime'.
numeric_cols = df_33_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Four panels per row; ceiling division gives the number of rows needed.
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

# One histogram with a KDE overlay per column, filling the grid row by row.
for position, name in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df_33_EYE[name], kde=True, ax=ax)
    ax.set_title(f'Distribution of {name}')
    ax.set_xlabel(name)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
df_33_EYE.columns

In [None]:
# Columns to be plotted against the timestamp, grouped by name prefix.
cols = [
    # gaze columns
    'ET_GazeLeftx',
    'ET_GazeLefty',
    'ET_GazeRightx',
    'ET_GazeRighty',
    # time signal
    'ET_TimeSignal',
    # distance columns
    'ET_DistanceLeft',
    'ET_DistanceRight',
    # camera columns
    'ET_CameraLeftX',
    'ET_CameraLeftY',
    'ET_CameraRightX',
    'ET_CameraRightY',
    # validity flags
    'ET_ValidityLeft',
    'ET_ValidityRight',
]

In [None]:
from IPython.display import display, Markdown

# For every entry in `cols`: render a Markdown heading in the cell output,
# then draw that column against the Timestamp column on its own figure.
for signal in cols:
    display(Markdown(f'### {signal} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_33_EYE['Timestamp'], df_33_EYE[signal])
    ax.set_xlabel("Timestamp")
    ax.set_ylabel(signal)
    plt.show()

In [None]:
# Boxplot grid over every numeric column (no columns are filtered out here).
numeric_cols = df_33_EYE.select_dtypes(include=np.number).columns

# Grid layout: fixed number of columns, ceiling division for the row count
n_cols = 4  # You can adjust the number of columns as needed
n_rows = (len(numeric_cols) + n_cols - 1) // n_cols

plt.figure(figsize=(n_cols * 5, n_rows * 4)) # Adjust figure size as needed

for i, col in enumerate(numeric_cols):
    plt.subplot(n_rows, n_cols, i + 1)
    # Pass the series as `x=` explicitly: a positional argument is interpreted
    # differently across seaborn versions, while `x=` always puts the values on
    # the x-axis, which is the axis labelled with the column name below.
    sns.boxplot(x=df_33_EYE[col])
    plt.title(f'Boxplot of {col}')
    plt.xlabel(col)

plt.tight_layout()
plt.show()

In [None]:
df_33_EYE.replace({-1: np.nan}, inplace=True)

In [None]:
# Mean-impute the remaining NaNs in every numeric column.
numeric_cols = df_33_EYE.select_dtypes(include=np.number).columns

for col in numeric_cols:
    # Assign the filled series back to the frame. Calling
    # fillna(..., inplace=True) on the `df[col]` selection is chained
    # assignment: pandas warns about it and, with Copy-on-Write enabled,
    # it leaves the DataFrame unchanged.
    df_33_EYE[col] = df_33_EYE[col].fillna(df_33_EYE[col].mean())

In [None]:
df_33_EYE.head()

In [None]:
# Numeric columns to chart, leaving out 'UnixTime'.
numeric_cols = df_33_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Four panels per row; ceiling division gives the number of rows needed.
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

# One histogram with a KDE overlay per column, filling the grid row by row.
for position, name in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df_33_EYE[name], kde=True, ax=ax)
    ax.set_title(f'Distribution of {name}')
    ax.set_xlabel(name)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

# Eye Tracking Data Analysis

This notebook performs exploratory data analysis and cleaning on eye-tracking data.

## Data Loading and Initial Inspection

In [None]:
%load_ext cudf

Import necessary libraries for data manipulation, analysis, and visualization.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Import libraries for datashader, which can be used for visualizing large datasets.

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

Set pandas display options to show all columns.

In [None]:
pd.set_option('display.max_columns', None)

Load the eye-tracking data from the specified CSV file into a pandas DataFrame.

In [None]:
df_33_EYE = pd.read_csv('data/STData/33/33_EYE.csv')

Display the first few rows of the DataFrame to get an initial look at the data structure and content.

In [None]:
df_33_EYE.head()

Check the dimensions (number of rows and columns) of the DataFrame.

In [None]:
df_33_EYE.shape

List the names of all columns in the DataFrame.

In [None]:
df_33_EYE.columns

Display concise information about the DataFrame, including the data types of each column and the number of non-null values. This helps identify columns with missing data.

In [None]:
df_33_EYE.info()

Calculate and display the number of missing values in each column.

In [None]:
df_33_EYE.isnull().sum()

Visualize the distribution of missing values using a heatmap. This provides a visual representation of which columns have missing data and the extent of missingness.

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_33_EYE.isnull(), cmap='viridis')
plt.show()

# Notes & Observations

- We observe many **null** (missing) values in the `QuestionKey` column.
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.
- These missing values in `QuestionKey` require additional investigation and context-aware handling.

Display the unique values in the `QuestionKey` column to understand the different types of questions or states recorded.

In [None]:
df_33_EYE['QuestionKey'].unique()

Convert the `Timestamp` column to datetime objects. This is crucial for any time series analysis or manipulation.

In [None]:
df_33_EYE['Timestamp'] = pd.to_datetime(df_33_EYE['Timestamp'])

Display the first few rows again to see the effect of the timestamp conversion.

In [None]:
df_33_EYE.head(3)

Fill the missing values in the `QuestionKey` column with the string 'None'. This helps to explicitly mark periods where no question was active.

In [None]:
# Label "no question shown" periods with the string 'None'. The result is
# assigned back because fillna(inplace=True) on a `df[col]` selection is
# chained assignment (warns in recent pandas, no effect under Copy-on-Write).
df_33_EYE['QuestionKey'] = df_33_EYE['QuestionKey'].fillna('None')

Count the occurrences of each unique value in the `QuestionKey` column after filling missing values.

In [None]:
df_33_EYE['QuestionKey'].value_counts()

Display the heatmap of missing values again to confirm that the missing values in `QuestionKey` have been handled.

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_33_EYE.isnull(), cmap='viridis')
plt.show()

Recalculate and display the number of missing values per column to confirm the changes after handling `QuestionKey`.

In [None]:
df_33_EYE.isnull().sum()

Drop rows that still contain any missing values. This is done after handling `QuestionKey` separately.

In [None]:
df_33_EYE.dropna(inplace=True)

Display the first few rows after dropping rows with missing values.

In [None]:
df_33_EYE.head()

Display the unique values in the `Row` column.

In [None]:
df_33_EYE['Row'].unique()

Plot a histogram of the `Row` column to visualize its distribution.

In [None]:
plt.figure(figsize=(8,6))
sns.histplot(df_33_EYE['Row'])
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

Drop the `Row` column as it is not relevant for the analysis.

In [None]:
df_33_EYE.drop('Row', axis=1, inplace=True)

Display the unique values in the `ET_ValidityLeft` column.

In [None]:
df_33_EYE['ET_ValidityLeft'].unique()

Count the occurrences of each unique value in the `ET_ValidityLeft` column.

In [None]:
df_33_EYE['ET_ValidityLeft'].value_counts()

Display the unique values in the `ET_ValidityRight` column.

In [None]:
df_33_EYE['ET_ValidityRight'].unique()

Count the occurrences of each unique value in the `ET_ValidityRight` column.

In [None]:
df_33_EYE['ET_ValidityRight'].value_counts()

Visualize the counts of valid and invalid data for both left and right eyes using bar plots.

In [None]:
# Bar chart of value counts for each validity flag, left eye then right eye.
panels = [
    ('ET_ValidityLeft', 'Count of ET_ValidityLeft'),
    ('ET_ValidityRight', 'Count of ET_ValidityRight'),
]

fig = plt.figure(figsize=(12, 6))

for position, (column, panel_title) in enumerate(panels, start=1):
    ax = fig.add_subplot(1, 2, position)
    # Count once and reuse for both the categories and the bar heights
    code_counts = df_33_EYE[column].value_counts()
    sns.barplot(x=code_counts.index, y=code_counts.values, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('Validity')
    ax.set_ylabel('Count')

fig.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

Define a mapping to convert validity values from `0.0` and `4.0` to `0` and `1`.

In [None]:
validity_map = {4.0: 1.0, 0.0: 0.0}

Apply the mapping to the `ET_ValidityLeft` and `ET_ValidityRight` columns and convert the data type to integer.

In [None]:
# Recode both validity flags through `validity_map`, then store them as int8.
for flag_column in ('ET_ValidityLeft', 'ET_ValidityRight'):
    recoded = df_33_EYE[flag_column].map(validity_map)
    df_33_EYE[flag_column] = recoded.astype(np.int8)

Display the first few rows to see the updated validity columns.

In [None]:
df_33_EYE.head(3)

Display descriptive statistics for the DataFrame, including count, mean, standard deviation, min, max, and quartiles for numeric columns.

In [None]:
df_33_EYE.describe()

Check the number of rows where `ET_ValidityLeft` is 1 (invalid data for the left eye).

In [None]:
df_33_EYE[df_33_EYE['ET_ValidityLeft'] == 1].shape

Check the number of rows where `ET_ValidityRight` is 1 (invalid data for the right eye).

In [None]:
df_33_EYE[df_33_EYE['ET_ValidityRight'] == 1].shape

Visualize the distribution of -1 and 1 values across columns using heatmaps. This helps identify columns with placeholder or indicator values.

In [None]:
# One heatmap per flag value: cells equal to -1 on the left, cells equal to 1 on the right.
plt.figure(figsize=(18, 8))

for panel, flag in enumerate((-1, 1), start=1):
    plt.subplot(1, 2, panel)
    sns.heatmap(df_33_EYE == flag, cmap='viridis')
    plt.title(f'Heatmap of {flag} Values')

plt.tight_layout()
plt.show()

Check the number of rows where `ET_PupilLeft` is -1.

In [None]:
df_33_EYE[df_33_EYE['ET_PupilLeft'] == -1].shape

Check the number of rows where `ET_PupilRight` is -1.

In [None]:
df_33_EYE[df_33_EYE['ET_PupilRight'] == -1].shape

Calculate the ratio of rows with -1 values in the `ET_PupilLeft` column to the total number of rows.

In [None]:
df_33_EYE[df_33_EYE['ET_PupilLeft'] == -1].shape[0] / df_33_EYE.shape[0]

Calculate the ratio of rows with -1 values in the `ET_PupilRight` column to the total number of rows.

In [None]:
df_33_EYE[df_33_EYE['ET_PupilRight'] == -1].shape[0] / df_33_EYE.shape[0]

Visualize the distribution of -1 values specifically for rows where validity is 1 (invalid data).

In [None]:
# Where do the -1 placeholders fall within the rows flagged invalid?
# Left panel: rows with ET_ValidityLeft == 1; right panel: rows with ET_ValidityRight == 1.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_33_EYE[df_33_EYE['ET_ValidityLeft'] == 1] == -1, cmap='viridis')
# Name the row subset in the title; both panels previously carried the same title.
plt.title('Heatmap of -1 Values (ET_ValidityLeft == 1)')

plt.subplot(1, 2, 2)
sns.heatmap(df_33_EYE[df_33_EYE['ET_ValidityRight'] == 1] == -1, cmap='viridis')
plt.title('Heatmap of -1 Values (ET_ValidityRight == 1)')

plt.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

Drop the `ET_PupilLeft` and `ET_PupilRight` columns due to the high percentage of invalid data.

In [None]:
df_33_EYE.drop(['ET_PupilLeft', 'ET_PupilRight'], axis=1, inplace=True)

Display the first few rows after dropping the pupil columns.

In [None]:
df_33_EYE.head()

Visualize the distribution of -1 and 1 values again after dropping the pupil columns.

In [None]:
# Re-check the -1 placeholders and the 1 indicators after the pupil columns were dropped.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_33_EYE == -1, cmap='viridis')
plt.title('Heatmap of -1 Values')

plt.subplot(1, 2, 2)
sns.heatmap(df_33_EYE == 1, cmap='viridis')
# This panel marks cells equal to 1, so it must not reuse the -1 title.
plt.title('Heatmap of 1 Values')

plt.tight_layout()
plt.show()

Calculate the ratio of valid data points for the left eye.

In [None]:
valid_left_ratio  = 1 - df_33_EYE['ET_ValidityLeft'].mean()

Display the ratio of valid data points for the left eye.

In [None]:
valid_left_ratio

Calculate the ratio of valid data points for the right eye.

In [None]:
valid_right_ratio = 1 - df_33_EYE['ET_ValidityRight'].mean()

Display the ratio of valid data points for the right eye.

In [None]:
valid_right_ratio

Display the first few rows of the DataFrame.

In [None]:
df_33_EYE.head()

Visualize the distributions of numeric columns using histograms with Kernel Density Estimate (KDE) plots. This helps understand the shape and spread of the data in each numeric column.

In [None]:
# Histogram + KDE for every numeric column except `UnixTime`, on a grid four panels wide.
numeric_cols = df_33_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)  # ceiling division: enough rows for every panel

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df_33_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

List the names of the columns in the DataFrame.

In [None]:
df_33_EYE.columns

Define a list of columns to be plotted over time.

In [None]:
cols = ['ET_GazeLeftx', 'ET_GazeLefty',
       'ET_GazeRightx', 'ET_GazeRighty', 'ET_TimeSignal', 'ET_DistanceLeft',
       'ET_DistanceRight', 'ET_CameraLeftX', 'ET_CameraLeftY',
       'ET_CameraRightX', 'ET_CameraRightY', 'ET_ValidityLeft',
       'ET_ValidityRight']

Plot each of the selected columns against the timestamp to visualize their trends and patterns over time. A Markdown heading is displayed before each plot for better readability.

In [None]:
from IPython.display import display, Markdown

# One large line chart per entry of `cols`, plotted against `Timestamp`.
for col in cols:
    # Render a Markdown heading in the output so consecutive figures are easy to tell apart.
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_33_EYE['Timestamp'], df_33_EYE[col])
    ax.set_xlabel("Timestamp")
    ax.set_ylabel(col)
    plt.show()

Visualize the distributions of numeric columns using boxplots. This helps identify potential outliers and the spread of the data.

In [None]:
# Boxplots of every numeric column (no column is excluded here), four panels per row.
numeric_cols = df_33_EYE.select_dtypes(include=np.number).columns

n_cols = 4
n_rows = -(-len(numeric_cols) // n_cols)  # ceiling division: enough rows for every panel

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(numeric_cols, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.boxplot(df_33_EYE[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')
    ax.set_xlabel(col)

fig.tight_layout()
plt.show()

Replace the placeholder value -1 with NaN (Not a Number) in the DataFrame. This is done before imputation.

In [None]:
df_33_EYE.replace({-1: np.nan}, inplace=True)

Impute the missing values (NaN) in the numeric columns with the mean of each column.

In [None]:
# Mean-impute the NaNs (former -1 placeholders) in every numeric column.
numeric_cols = df_33_EYE.select_dtypes(include=np.number).columns

for col in numeric_cols:
    # Assign the filled Series back instead of `df[col].fillna(..., inplace=True)`:
    # that chained in-place call acts on a temporary column selection, is
    # deprecated in pandas 2.x, and leaves the frame unchanged under Copy-on-Write.
    df_33_EYE[col] = df_33_EYE[col].fillna(df_33_EYE[col].mean())

Display the first few rows after imputing missing values.

In [None]:
df_33_EYE.head()

Visualize the distributions of numeric columns again using histograms with KDE plots after imputation. This helps assess the impact of imputation on the data distributions.

In [None]:
# Same histogram + KDE grid as before imputation (numeric columns except `UnixTime`),
# redrawn so the effect of the mean fill on each distribution is visible.
numeric_cols = df_33_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)  # ceiling division: enough rows for every panel

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df_33_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

## Student 34

In [None]:
%load_ext cudf

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
df_34_EYE = pd.read_csv('data/STData/34/34_EYE.csv')

In [None]:
df_34_EYE.head()

In [None]:
df_34_EYE.shape

In [None]:
df_34_EYE.columns

In [None]:
df_34_EYE.info()

In [None]:
df_34_EYE.isnull().sum()

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_34_EYE.isnull(), cmap='viridis')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` column.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  

In [None]:
df_34_EYE['QuestionKey'].unique()

In [None]:
df_34_EYE['Timestamp'] = pd.to_datetime(df_34_EYE['Timestamp'])

In [None]:
df_34_EYE.head(3)

In [None]:
# Label the gaps in QuestionKey with the string 'None'. The result is assigned
# back because `df[col].fillna(..., inplace=True)` acts on a temporary column
# selection: it is deprecated in pandas 2.x and a no-op under Copy-on-Write.
df_34_EYE['QuestionKey'] = df_34_EYE['QuestionKey'].fillna('None')

In [None]:
df_34_EYE['QuestionKey'].value_counts()

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_34_EYE.isnull(), cmap='viridis')
plt.show()

In [None]:
df_34_EYE.isnull().sum()

In [None]:
df_34_EYE.dropna(inplace=True)

In [None]:
df_34_EYE.head()

In [None]:
df_34_EYE['Row'].unique()

In [None]:
plt.figure(figsize=(8,6))
sns.histplot(df_34_EYE['Row'])
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

In [None]:
df_34_EYE.drop('Row', axis=1, inplace=True)

In [None]:
df_34_EYE['ET_ValidityLeft'].unique()

In [None]:
df_34_EYE['ET_ValidityLeft'].value_counts()

In [None]:
df_34_EYE['ET_ValidityRight'].unique()

In [None]:
df_34_EYE['ET_ValidityRight'].value_counts()

In [None]:
# Side-by-side bar charts of how often each validity code occurs per eye.
plt.figure(figsize=(12, 6))

for panel, eye_col in enumerate(['ET_ValidityLeft', 'ET_ValidityRight'], start=1):
    # Count once and reuse the result for both the bar positions and heights.
    code_counts = df_34_EYE[eye_col].value_counts()
    plt.subplot(1, 2, panel)
    sns.barplot(x=code_counts.index, y=code_counts.values)
    plt.title(f'Count of {eye_col}')
    plt.xlabel('Validity')
    plt.ylabel('Count')

plt.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

In [None]:
validity_map = {4.0: 1.0, 0.0: 0.0}

In [None]:
# Recode both validity columns through `validity_map` and store them as int8.
# Any code that is not a key of `validity_map` becomes NaN in `.map`, which
# makes the integer cast raise -- so this cell fails if it is run a second time
# on already-recoded data containing 1s.
for eye_col in ('ET_ValidityLeft', 'ET_ValidityRight'):
    recoded = df_34_EYE[eye_col].map(validity_map)
    df_34_EYE[eye_col] = recoded.astype(np.int8)

In [None]:
df_34_EYE.head(3)

In [None]:
df_34_EYE.describe()

In [None]:
df_34_EYE[df_34_EYE['ET_ValidityLeft'] == 1].shape

In [None]:
df_34_EYE[df_34_EYE['ET_ValidityRight'] == 1].shape

In [None]:
# One heatmap per flag value: cells equal to -1 on the left, cells equal to 1 on the right.
plt.figure(figsize=(18, 8))

for panel, flag in enumerate((-1, 1), start=1):
    plt.subplot(1, 2, panel)
    sns.heatmap(df_34_EYE == flag, cmap='viridis')
    plt.title(f'Heatmap of {flag} Values')

plt.tight_layout()
plt.show()

In [None]:
df_34_EYE[df_34_EYE['ET_PupilLeft'] == -1].shape

In [None]:
df_34_EYE[df_34_EYE['ET_PupilRight'] == -1].shape

In [None]:
df_34_EYE[df_34_EYE['ET_PupilLeft'] == -1].shape[0] / df_34_EYE.shape[0]

In [None]:
df_34_EYE[df_34_EYE['ET_PupilRight'] == -1].shape[0] / df_34_EYE.shape[0]

In [None]:
# Where do the -1 placeholders fall within the rows flagged invalid?
# Left panel: rows with ET_ValidityLeft == 1; right panel: rows with ET_ValidityRight == 1.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_34_EYE[df_34_EYE['ET_ValidityLeft'] == 1] == -1, cmap='viridis')
# Name the row subset in the title; both panels previously carried the same title.
plt.title('Heatmap of -1 Values (ET_ValidityLeft == 1)')

plt.subplot(1, 2, 2)
sns.heatmap(df_34_EYE[df_34_EYE['ET_ValidityRight'] == 1] == -1, cmap='viridis')
plt.title('Heatmap of -1 Values (ET_ValidityRight == 1)')

plt.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

In [None]:
df_34_EYE.drop(['ET_PupilLeft', 'ET_PupilRight'], axis=1, inplace=True)

In [None]:
df_34_EYE.head()

In [None]:
# Re-check the -1 placeholders and the 1 indicators after the pupil columns were dropped.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_34_EYE == -1, cmap='viridis')
plt.title('Heatmap of -1 Values')

plt.subplot(1, 2, 2)
sns.heatmap(df_34_EYE == 1, cmap='viridis')
# This panel marks cells equal to 1, so it must not reuse the -1 title.
plt.title('Heatmap of 1 Values')

plt.tight_layout()
plt.show()

In [None]:
valid_left_ratio  = 1 - df_34_EYE['ET_ValidityLeft'].mean()
valid_right_ratio = 1 - df_34_EYE['ET_ValidityRight'].mean()

In [None]:
valid_left_ratio

In [None]:
valid_right_ratio

In [None]:
df_34_EYE.head()

In [None]:
# Histogram + KDE for every numeric column except `UnixTime`, on a grid four panels wide.
numeric_cols = df_34_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)  # ceiling division: enough rows for every panel

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df_34_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
df_34_EYE.columns

In [None]:
cols = ['ET_GazeLeftx', 'ET_GazeLefty',
       'ET_GazeRightx', 'ET_GazeRighty', 'ET_TimeSignal', 'ET_DistanceLeft',
       'ET_DistanceRight', 'ET_CameraLeftX', 'ET_CameraLeftY',
       'ET_CameraRightX', 'ET_CameraRightY', 'ET_ValidityLeft',
       'ET_ValidityRight']

In [None]:
from IPython.display import display, Markdown

# One large line chart per entry of `cols`, plotted against `Timestamp`.
for col in cols:
    # Render a Markdown heading in the output so consecutive figures are easy to tell apart.
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_34_EYE['Timestamp'], df_34_EYE[col])
    ax.set_xlabel("Timestamp")
    ax.set_ylabel(col)
    plt.show()

In [None]:
# Boxplots of every numeric column (no column is excluded here), four panels per row.
numeric_cols = df_34_EYE.select_dtypes(include=np.number).columns

n_cols = 4
n_rows = -(-len(numeric_cols) // n_cols)  # ceiling division: enough rows for every panel

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(numeric_cols, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.boxplot(df_34_EYE[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')
    ax.set_xlabel(col)

fig.tight_layout()
plt.show()

In [None]:
df_34_EYE.replace({-1: np.nan}, inplace=True)

In [None]:
# Mean-impute the NaNs (former -1 placeholders) in every numeric column.
numeric_cols = df_34_EYE.select_dtypes(include=np.number).columns

for col in numeric_cols:
    # Assign the filled Series back instead of `df[col].fillna(..., inplace=True)`:
    # that chained in-place call acts on a temporary column selection, is
    # deprecated in pandas 2.x, and leaves the frame unchanged under Copy-on-Write.
    df_34_EYE[col] = df_34_EYE[col].fillna(df_34_EYE[col].mean())

In [None]:
df_34_EYE.head()

In [None]:
# Same histogram + KDE grid as before imputation (numeric columns except `UnixTime`),
# redrawn so the effect of the mean fill on each distribution is visible.
numeric_cols = df_34_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)  # ceiling division: enough rows for every panel

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df_34_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

# Eye Tracking Data Analysis

This section repeats the exploratory data analysis and cleaning of the Student 34 eye-tracking data, with an explanatory note before each step.

## Data Loading and Initial Inspection

In [None]:
%load_ext cudf

Import necessary libraries for data manipulation, analysis, and visualization.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Import libraries for datashader, which can be used for visualizing large datasets.

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

Set pandas display options to show all columns.

In [None]:
pd.set_option('display.max_columns', None)

Load the eye-tracking data from the specified CSV file into a pandas DataFrame.

In [None]:
df_34_EYE = pd.read_csv('data/STData/34/34_EYE.csv')

Display the first few rows of the DataFrame to get an initial look at the data structure and content.

In [None]:
df_34_EYE.head()

Check the dimensions (number of rows and columns) of the DataFrame.

In [None]:
df_34_EYE.shape

List the names of all columns in the DataFrame.

In [None]:
df_34_EYE.columns

Display concise information about the DataFrame, including the data types of each column and the number of non-null values. This helps identify columns with missing data.

In [None]:
df_34_EYE.info()

Calculate and display the number of missing values in each column.

In [None]:
df_34_EYE.isnull().sum()

Visualize the distribution of missing values using a heatmap. This provides a visual representation of which columns have missing data and the extent of missingness.

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_34_EYE.isnull(), cmap='viridis')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` column.
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.
- These missing values in `QuestionKey` require additional investigation and context-aware handling.

Display the unique values in the `QuestionKey` column to understand the different types of questions or states recorded.

In [None]:
df_34_EYE['QuestionKey'].unique()

Convert the `Timestamp` column to datetime objects. This is crucial for any time series analysis or manipulation.

In [None]:
df_34_EYE['Timestamp'] = pd.to_datetime(df_34_EYE['Timestamp'])

Display the first few rows again to see the effect of the timestamp conversion.

In [None]:
df_34_EYE.head(3)

Fill the missing values in the `QuestionKey` column with the string 'None'. This helps to explicitly mark periods where no question was active.

In [None]:
# Label the gaps in QuestionKey with the string 'None'. The result is assigned
# back because `df[col].fillna(..., inplace=True)` acts on a temporary column
# selection: it is deprecated in pandas 2.x and a no-op under Copy-on-Write.
df_34_EYE['QuestionKey'] = df_34_EYE['QuestionKey'].fillna('None')

Count the occurrences of each unique value in the `QuestionKey` column after filling missing values.

In [None]:
df_34_EYE['QuestionKey'].value_counts()

Display the heatmap of missing values again to confirm that the missing values in `QuestionKey` have been handled.

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_34_EYE.isnull(), cmap='viridis')
plt.show()

Recalculate and display the number of missing values per column to confirm the changes after handling `QuestionKey`.

In [None]:
df_34_EYE.isnull().sum()

Drop rows that still contain any missing values. This is done after handling `QuestionKey` separately.

In [None]:
df_34_EYE.dropna(inplace=True)

Display the first few rows after dropping rows with missing values.

In [None]:
df_34_EYE.head()

Display the unique values in the `Row` column.

In [None]:
df_34_EYE['Row'].unique()

Plot a histogram of the `Row` column to visualize its distribution.

In [None]:
plt.figure(figsize=(8,6))
sns.histplot(df_34_EYE['Row'])
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

Drop the `Row` column as it is not relevant for the analysis.

In [None]:
df_34_EYE.drop('Row', axis=1, inplace=True)

Display the unique values in the `ET_ValidityLeft` column.

In [None]:
df_34_EYE['ET_ValidityLeft'].unique()

Count the occurrences of each unique value in the `ET_ValidityLeft` column.

In [None]:
df_34_EYE['ET_ValidityLeft'].value_counts()

Display the unique values in the `ET_ValidityRight` column.

In [None]:
df_34_EYE['ET_ValidityRight'].unique()

Count the occurrences of each unique value in the `ET_ValidityRight` column.

In [None]:
df_34_EYE['ET_ValidityRight'].value_counts()

Visualize the counts of valid and invalid data for both left and right eyes using bar plots.

In [None]:
# Side-by-side bar charts of how often each validity code occurs per eye.
plt.figure(figsize=(12, 6))

for panel, eye_col in enumerate(['ET_ValidityLeft', 'ET_ValidityRight'], start=1):
    # Count once and reuse the result for both the bar positions and heights.
    code_counts = df_34_EYE[eye_col].value_counts()
    plt.subplot(1, 2, panel)
    sns.barplot(x=code_counts.index, y=code_counts.values)
    plt.title(f'Count of {eye_col}')
    plt.xlabel('Validity')
    plt.ylabel('Count')

plt.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

Define a mapping to convert validity values from `0.0` and `4.0` to `0` and `1`.

In [None]:
validity_map = {4.0: 1.0, 0.0: 0.0}

Apply the mapping to the `ET_ValidityLeft` and `ET_ValidityRight` columns and convert the data type to integer.

In [None]:
# Recode both validity columns through `validity_map` and store them as int8.
# Any code that is not a key of `validity_map` becomes NaN in `.map`, which
# makes the integer cast raise -- so this cell fails if it is run a second time
# on already-recoded data containing 1s.
for eye_col in ('ET_ValidityLeft', 'ET_ValidityRight'):
    recoded = df_34_EYE[eye_col].map(validity_map)
    df_34_EYE[eye_col] = recoded.astype(np.int8)

Display the first few rows to see the updated validity columns.

In [None]:
df_34_EYE.head(3)

Display descriptive statistics for the DataFrame, including count, mean, standard deviation, min, max, and quartiles for numeric columns.

In [None]:
df_34_EYE.describe()

Check the number of rows where `ET_ValidityLeft` is 1 (invalid data for the left eye).

In [None]:
df_34_EYE[df_34_EYE['ET_ValidityLeft'] == 1].shape

Check the number of rows where `ET_ValidityRight` is 1 (invalid data for the right eye).

In [None]:
df_34_EYE[df_34_EYE['ET_ValidityRight'] == 1].shape

Visualize the distribution of -1 and 1 values across columns using heatmaps. This helps identify columns with placeholder or indicator values.

In [None]:
# One heatmap per flag value: cells equal to -1 on the left, cells equal to 1 on the right.
plt.figure(figsize=(18, 8))

for panel, flag in enumerate((-1, 1), start=1):
    plt.subplot(1, 2, panel)
    sns.heatmap(df_34_EYE == flag, cmap='viridis')
    plt.title(f'Heatmap of {flag} Values')

plt.tight_layout()
plt.show()

Check the number of rows where `ET_PupilLeft` is -1.

In [None]:
df_34_EYE[df_34_EYE['ET_PupilLeft'] == -1].shape

Check the number of rows where `ET_PupilRight` is -1.

In [None]:
df_34_EYE[df_34_EYE['ET_PupilRight'] == -1].shape

Calculate the ratio of rows with -1 values in the `ET_PupilLeft` column to the total number of rows.

In [None]:
df_34_EYE[df_34_EYE['ET_PupilLeft'] == -1].shape[0] / df_34_EYE.shape[0]

Calculate the ratio of rows with -1 values in the `ET_PupilRight` column to the total number of rows.

In [None]:
df_34_EYE[df_34_EYE['ET_PupilRight'] == -1].shape[0] / df_34_EYE.shape[0]

Visualize the distribution of -1 values specifically for rows where validity is 1 (invalid data).

In [None]:
# Where do the -1 placeholders fall within the rows flagged invalid?
# Left panel: rows with ET_ValidityLeft == 1; right panel: rows with ET_ValidityRight == 1.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_34_EYE[df_34_EYE['ET_ValidityLeft'] == 1] == -1, cmap='viridis')
# Name the row subset in the title; both panels previously carried the same title.
plt.title('Heatmap of -1 Values (ET_ValidityLeft == 1)')

plt.subplot(1, 2, 2)
sns.heatmap(df_34_EYE[df_34_EYE['ET_ValidityRight'] == 1] == -1, cmap='viridis')
plt.title('Heatmap of -1 Values (ET_ValidityRight == 1)')

plt.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

Drop the `ET_PupilLeft` and `ET_PupilRight` columns due to the high percentage of invalid data.

In [None]:
df_34_EYE.drop(['ET_PupilLeft', 'ET_PupilRight'], axis=1, inplace=True)

Display the first few rows after dropping the pupil columns.

In [None]:
df_34_EYE.head()

Visualize the distribution of -1 and 1 values again after dropping the pupil columns.

In [None]:
# Re-check the -1 placeholders and the 1 indicators after the pupil columns were dropped.
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.heatmap(df_34_EYE == -1, cmap='viridis')
plt.title('Heatmap of -1 Values')

plt.subplot(1, 2, 2)
sns.heatmap(df_34_EYE == 1, cmap='viridis')
# This panel marks cells equal to 1, so it must not reuse the -1 title.
plt.title('Heatmap of 1 Values')

plt.tight_layout()
plt.show()

Calculate the ratio of valid data points for the left eye.

In [None]:
valid_left_ratio  = 1 - df_34_EYE['ET_ValidityLeft'].mean()

Display the ratio of valid data points for the left eye.

In [None]:
valid_left_ratio

Calculate the ratio of valid data points for the right eye.

In [None]:
valid_right_ratio = 1 - df_34_EYE['ET_ValidityRight'].mean()

Display the ratio of valid data points for the right eye.

In [None]:
valid_right_ratio

Display the first few rows of the DataFrame.

In [None]:
df_34_EYE.head()

Visualize the distributions of numeric columns using histograms with Kernel Density Estimate (KDE) plots. This helps understand the shape and spread of the data in each numeric column.

In [None]:
# Histogram + KDE for every numeric column except `UnixTime`, on a grid four panels wide.
numeric_cols = df_34_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)  # ceiling division: enough rows for every panel

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df_34_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

List the names of the columns in the DataFrame.

In [None]:
df_34_EYE.columns

Define a list of columns to be plotted over time.

In [None]:
cols = ['ET_GazeLeftx', 'ET_GazeLefty',
       'ET_GazeRightx', 'ET_GazeRighty', 'ET_TimeSignal', 'ET_DistanceLeft',
       'ET_DistanceRight', 'ET_CameraLeftX', 'ET_CameraLeftY',
       'ET_CameraRightX', 'ET_CameraRightY', 'ET_ValidityLeft',
       'ET_ValidityRight']

Plot each of the selected columns against the timestamp to visualize their trends and patterns over time. A Markdown heading is displayed before each plot for better readability.

In [None]:
from IPython.display import display, Markdown

# One large line chart per entry of `cols`, plotted against `Timestamp`.
for col in cols:
    # Render a Markdown heading in the output so consecutive figures are easy to tell apart.
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_34_EYE['Timestamp'], df_34_EYE[col])
    ax.set_xlabel("Timestamp")
    ax.set_ylabel(col)
    plt.show()

Visualize the distributions of numeric columns using boxplots. This helps identify potential outliers and the spread of the data.

In [None]:
# Boxplots of every numeric column (no column is excluded here), four panels per row.
numeric_cols = df_34_EYE.select_dtypes(include=np.number).columns

n_cols = 4
n_rows = -(-len(numeric_cols) // n_cols)  # ceiling division: enough rows for every panel

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(numeric_cols, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.boxplot(df_34_EYE[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')
    ax.set_xlabel(col)

fig.tight_layout()
plt.show()

Replace the placeholder value -1 with NaN (Not a Number) in the DataFrame. This is done before imputation.

In [None]:
df_34_EYE.replace({-1: np.nan}, inplace=True)

Impute the missing values (NaN) in the numeric columns with the mean of each column.

In [None]:
# Mean-impute the NaNs (former -1 placeholders) in every numeric column.
numeric_cols = df_34_EYE.select_dtypes(include=np.number).columns

for col in numeric_cols:
    # Assign the filled Series back instead of `df[col].fillna(..., inplace=True)`:
    # that chained in-place call acts on a temporary column selection, is
    # deprecated in pandas 2.x, and leaves the frame unchanged under Copy-on-Write.
    df_34_EYE[col] = df_34_EYE[col].fillna(df_34_EYE[col].mean())

Display the first few rows after imputing missing values.

In [None]:
df_34_EYE.head()

Visualize the distributions of numeric columns again using histograms with KDE plots after imputation. This helps assess the impact of imputation on the data distributions.

In [None]:
# Same histogram + KDE grid as before imputation (numeric columns except `UnixTime`),
# redrawn so the effect of the mean fill on each distribution is visible.
numeric_cols = df_34_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)  # ceiling division: enough rows for every panel

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

for position, col in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df_34_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

## Student 35

In [None]:
%load_ext cudf

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
df_35_EYE = pd.read_csv('data/STData/35/35_EYE.csv')

In [None]:
df_35_EYE.head()

In [None]:
df_35_EYE.shape

In [None]:
df_35_EYE.columns

In [None]:
df_35_EYE.info()

In [None]:
df_35_EYE.isnull().sum()

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_35_EYE.isnull(), cmap='viridis')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` column.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  

In [None]:
df_35_EYE['QuestionKey'].unique()

In [None]:
df_35_EYE['Timestamp'] = pd.to_datetime(df_35_EYE['Timestamp'])

In [None]:
df_35_EYE.head(3)

In [None]:
# Mark "no question displayed" rows with the literal string 'None'. The result is assigned back
# because fillna(inplace=True) on df[col] is chained assignment: it warns on pandas 2.x and
# leaves the DataFrame unchanged when Copy-on-Write is enabled.
df_35_EYE['QuestionKey'] = df_35_EYE['QuestionKey'].fillna('None')

In [None]:
df_35_EYE['QuestionKey'].value_counts()

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_35_EYE.isnull(), cmap='viridis')
plt.show()

In [None]:
df_35_EYE.isnull().sum()

In [None]:
df_35_EYE.dropna(inplace=True)

In [None]:
df_35_EYE.head()

In [None]:
df_35_EYE['Row'].unique()

In [None]:
plt.figure(figsize=(8,6))
sns.histplot(df_35_EYE['Row'])
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

In [None]:
df_35_EYE.drop('Row', axis=1, inplace=True)

In [None]:
df_35_EYE['ET_ValidityLeft'].unique()

In [None]:
df_35_EYE['ET_ValidityLeft'].value_counts()

In [None]:
df_35_EYE['ET_ValidityRight'].unique()

In [None]:
df_35_EYE['ET_ValidityRight'].value_counts()

In [None]:
plt.figure(figsize=(12, 6))

# One bar-chart panel per eye; the value counts are computed once per column and reused.
for panel, validity_col in enumerate(['ET_ValidityLeft', 'ET_ValidityRight'], start=1):
    counts = df_35_EYE[validity_col].value_counts()
    plt.subplot(1, 2, panel)
    sns.barplot(x=counts.index, y=counts.values)
    plt.title(f'Count of {validity_col}')
    plt.xlabel('Validity')
    plt.ylabel('Count')

plt.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

In [None]:
validity_map = {4.0: 1.0, 0.0: 0.0}

In [None]:
df_35_EYE['ET_ValidityLeft'] = df_35_EYE['ET_ValidityLeft'].map(validity_map).astype(np.int8)
df_35_EYE['ET_ValidityRight'] = df_35_EYE['ET_ValidityRight'].map(validity_map).astype(np.int8)

In [None]:
df_35_EYE.head(3)

In [None]:
df_35_EYE.describe()

In [None]:
df_35_EYE[df_35_EYE['ET_ValidityLeft'] == 1].shape

In [None]:
df_35_EYE[df_35_EYE['ET_ValidityRight'] == 1].shape

In [None]:
plt.figure(figsize=(18, 8))

# Side-by-side boolean heatmaps: left panel marks cells equal to -1, right panel cells equal to 1.
for panel, sentinel in enumerate((-1, 1), start=1):
    plt.subplot(1, 2, panel)
    sns.heatmap(df_35_EYE == sentinel, cmap='viridis')
    plt.title(f'Heatmap of {sentinel} Values')

plt.tight_layout()
plt.show()

In [None]:
df_35_EYE[df_35_EYE['ET_PupilLeft'] == -1].shape

In [None]:
df_35_EYE[df_35_EYE['ET_PupilRight'] == -1].shape

In [None]:
df_35_EYE[df_35_EYE['ET_PupilLeft'] == -1].shape[0] / df_35_EYE.shape[0]

In [None]:
df_35_EYE[df_35_EYE['ET_PupilRight'] == -1].shape[0] / df_35_EYE.shape[0]

In [None]:
plt.figure(figsize=(18, 8))

# For each eye, keep only the rows whose validity flag is 1, then mark the cells equal to -1.
for panel, validity_col in enumerate(['ET_ValidityLeft', 'ET_ValidityRight'], start=1):
    flagged_rows = df_35_EYE.loc[df_35_EYE[validity_col] == 1]
    plt.subplot(1, 2, panel)
    sns.heatmap(flagged_rows == -1, cmap='viridis')
    plt.title('Heatmap of -1 Values')

plt.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

In [None]:
df_35_EYE.drop(['ET_PupilLeft', 'ET_PupilRight'], axis=1, inplace=True)

In [None]:
df_35_EYE.head()

In [None]:
# Re-check the placeholder (-1) and flag (1) patterns after the pupil columns were dropped.
plt.figure(figsize=(18, 8))

# Left panel: cells equal to -1.
plt.subplot(1, 2, 1)
sns.heatmap(df_35_EYE == -1, cmap='viridis')
plt.title('Heatmap of -1 Values')

# Right panel: cells equal to 1 (the title previously mislabelled this panel as -1 values).
plt.subplot(1, 2, 2)
sns.heatmap(df_35_EYE == 1, cmap='viridis')
plt.title('Heatmap of 1 Values')

plt.tight_layout()
plt.show()

In [None]:
valid_left_ratio  = 1 - df_35_EYE['ET_ValidityLeft'].mean()
valid_right_ratio = 1 - df_35_EYE['ET_ValidityRight'].mean()

In [None]:
valid_left_ratio

In [None]:
valid_right_ratio

In [None]:
df_35_EYE.head()

In [None]:
# Numeric columns only; the time-related UnixTime column is left out of the grid.
numeric_cols = df_35_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid geometry: 4 panels per row, with enough rows (ceiling division) to fit every column.
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(5 * n_cols, 4 * n_rows))

# One histogram + KDE panel per column, drawn on an explicitly created axes.
for i, col in enumerate(cols_to_plot):
    ax = fig.add_subplot(n_rows, n_cols, i + 1)
    sns.histplot(df_35_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
df_35_EYE.columns

In [None]:
cols = ['ET_GazeLeftx', 'ET_GazeLefty',
       'ET_GazeRightx', 'ET_GazeRighty', 'ET_TimeSignal', 'ET_DistanceLeft',
       'ET_DistanceRight', 'ET_CameraLeftX', 'ET_CameraLeftY',
       'ET_CameraRightX', 'ET_CameraRightY', 'ET_ValidityLeft',
       'ET_ValidityRight']

In [None]:
from IPython.display import Markdown, display

# One full-width line plot per selected column, each preceded by a rendered Markdown heading.
timestamps = df_35_EYE['Timestamp']
for col in cols:
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(timestamps, df_35_EYE[col])
    ax.set_xlabel("Timestamp")
    ax.set_ylabel(col)
    plt.show()

In [None]:
# Boxplot grid over every numeric column of the frame (UnixTime is included here).
numeric_cols = df_35_EYE.select_dtypes(include=np.number).columns

# Grid geometry: 4 panels per row, with enough rows (ceiling division) to fit every column.
n_cols = 4  # You can adjust the number of columns as needed
n_rows = (len(numeric_cols) + n_cols - 1) // n_cols

plt.figure(figsize=(n_cols * 5, n_rows * 4)) # Adjust figure size as needed

for i, col in enumerate(numeric_cols):
    plt.subplot(n_rows, n_cols, i + 1)
    # Pass the values explicitly as `x`: a bare positional argument is interpreted differently
    # across seaborn releases, and `x=` keeps the values on the axis that is labelled below.
    sns.boxplot(x=df_35_EYE[col])
    plt.title(f'Boxplot of {col}')
    plt.xlabel(col)

plt.tight_layout()
plt.show()

In [None]:
df_35_EYE.replace({-1: np.nan}, inplace=True)

In [None]:
# Mean-impute every numeric column (the -1 placeholders were turned into NaN beforehand).
numeric_cols = df_35_EYE.select_dtypes(include=np.number).columns

for col in numeric_cols:
    # Assign the filled Series back to the frame. Calling fillna(inplace=True) on the Series
    # returned by df[col] is chained assignment: pandas 2.x emits a FutureWarning and, with
    # Copy-on-Write enabled, the DataFrame is left unchanged.
    df_35_EYE[col] = df_35_EYE[col].fillna(df_35_EYE[col].mean())

In [None]:
df_35_EYE.head()

In [None]:
# Numeric columns only; the time-related UnixTime column is left out of the grid.
numeric_cols = df_35_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid geometry: 4 panels per row, with enough rows (ceiling division) to fit every column.
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(5 * n_cols, 4 * n_rows))

# One histogram + KDE panel per column, drawn on an explicitly created axes.
for i, col in enumerate(cols_to_plot):
    ax = fig.add_subplot(n_rows, n_cols, i + 1)
    sns.histplot(df_35_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

# Eye Tracking Data Analysis

This notebook performs exploratory data analysis and cleaning on eye-tracking data.

## Data Loading and Initial Inspection

In [None]:
%load_ext cudf

Import necessary libraries for data manipulation, analysis, and visualization.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Import libraries for datashader, which can be used for visualizing large datasets.

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

Set pandas display options to show all columns.

In [None]:
pd.set_option('display.max_columns', None)

Load the eye-tracking data from the specified CSV file into a pandas DataFrame.

In [None]:
df_35_EYE = pd.read_csv('data/STData/35/35_EYE.csv')

Display the first few rows of the DataFrame to get an initial look at the data structure and content.

In [None]:
df_35_EYE.head()

Check the dimensions (number of rows and columns) of the DataFrame.

In [None]:
df_35_EYE.shape

List the names of all columns in the DataFrame.

In [None]:
df_35_EYE.columns

Display concise information about the DataFrame, including the data types of each column and the number of non-null values. This helps identify columns with missing data.

In [None]:
df_35_EYE.info()

Calculate and display the number of missing values in each column.

In [None]:
df_35_EYE.isnull().sum()

Visualize the distribution of missing values using a heatmap. This provides a visual representation of which columns have missing data and the extent of missingness.

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_35_EYE.isnull(), cmap='viridis')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` column.
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.
- These missing values in `QuestionKey` require additional investigation and context-aware handling.

Display the unique values in the `QuestionKey` column to understand the different types of questions or states recorded.

In [None]:
df_35_EYE['QuestionKey'].unique()

Convert the `Timestamp` column to datetime objects. This is crucial for any time series analysis or manipulation.

In [None]:
df_35_EYE['Timestamp'] = pd.to_datetime(df_35_EYE['Timestamp'])

Display the first few rows again to see the effect of the timestamp conversion.

In [None]:
df_35_EYE.head(3)

Fill the missing values in the `QuestionKey` column with the string 'None'. This helps to explicitly mark periods where no question was active.

In [None]:
# Mark "no question displayed" rows with the literal string 'None'. The result is assigned back
# because fillna(inplace=True) on df[col] is chained assignment: it warns on pandas 2.x and
# leaves the DataFrame unchanged when Copy-on-Write is enabled.
df_35_EYE['QuestionKey'] = df_35_EYE['QuestionKey'].fillna('None')

Count the occurrences of each unique value in the `QuestionKey` column after filling missing values.

In [None]:
df_35_EYE['QuestionKey'].value_counts()

Display the heatmap of missing values again to confirm that the missing values in `QuestionKey` have been handled.

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_35_EYE.isnull(), cmap='viridis')
plt.show()

Recalculate and display the number of missing values per column to confirm the changes after handling `QuestionKey`.

In [None]:
df_35_EYE.isnull().sum()

Drop rows that still contain any missing values. This is done after handling `QuestionKey` separately.

In [None]:
df_35_EYE.dropna(inplace=True)

Display the first few rows after dropping rows with missing values.

In [None]:
df_35_EYE.head()

Display the unique values in the `Row` column.

In [None]:
df_35_EYE['Row'].unique()

Plot a histogram of the `Row` column to visualize its distribution.

In [None]:
plt.figure(figsize=(8,6))
sns.histplot(df_35_EYE['Row'])
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

Drop the `Row` column as it is not relevant for the analysis.

In [None]:
df_35_EYE.drop('Row', axis=1, inplace=True)

Display the unique values in the `ET_ValidityLeft` column.

In [None]:
df_35_EYE['ET_ValidityLeft'].unique()

Count the occurrences of each unique value in the `ET_ValidityLeft` column.

In [None]:
df_35_EYE['ET_ValidityLeft'].value_counts()

Display the unique values in the `ET_ValidityRight` column.

In [None]:
df_35_EYE['ET_ValidityRight'].unique()

Count the occurrences of each unique value in the `ET_ValidityRight` column.

In [None]:
df_35_EYE['ET_ValidityRight'].value_counts()

Visualize the counts of valid and invalid data for both left and right eyes using bar plots.

In [None]:
plt.figure(figsize=(12, 6))

# One bar-chart panel per eye; the value counts are computed once per column and reused.
for panel, validity_col in enumerate(['ET_ValidityLeft', 'ET_ValidityRight'], start=1):
    counts = df_35_EYE[validity_col].value_counts()
    plt.subplot(1, 2, panel)
    sns.barplot(x=counts.index, y=counts.values)
    plt.title(f'Count of {validity_col}')
    plt.xlabel('Validity')
    plt.ylabel('Count')

plt.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

Define a mapping to convert validity values from `0.0` and `4.0` to `0` and `1`.

In [None]:
validity_map = {4.0: 1.0, 0.0: 0.0}

Apply the mapping to the `ET_ValidityLeft` and `ET_ValidityRight` columns and convert the data type to integer.

In [None]:
df_35_EYE['ET_ValidityLeft'] = df_35_EYE['ET_ValidityLeft'].map(validity_map).astype(np.int8)
df_35_EYE['ET_ValidityRight'] = df_35_EYE['ET_ValidityRight'].map(validity_map).astype(np.int8)

Display the first few rows to see the updated validity columns.

In [None]:
df_35_EYE.head(3)

Display descriptive statistics for the DataFrame, including count, mean, standard deviation, min, max, and quartiles for numeric columns.

In [None]:
df_35_EYE.describe()

Check the number of rows where `ET_ValidityLeft` is 1 (invalid data for the left eye).

In [None]:
df_35_EYE[df_35_EYE['ET_ValidityLeft'] == 1].shape

Check the number of rows where `ET_ValidityRight` is 1 (invalid data for the right eye).

In [None]:
df_35_EYE[df_35_EYE['ET_ValidityRight'] == 1].shape

Visualize the distribution of -1 and 1 values across columns using heatmaps. This helps identify columns with placeholder or indicator values.

In [None]:
plt.figure(figsize=(18, 8))

# Side-by-side boolean heatmaps: left panel marks cells equal to -1, right panel cells equal to 1.
for panel, sentinel in enumerate((-1, 1), start=1):
    plt.subplot(1, 2, panel)
    sns.heatmap(df_35_EYE == sentinel, cmap='viridis')
    plt.title(f'Heatmap of {sentinel} Values')

plt.tight_layout()
plt.show()

Check the number of rows where `ET_PupilLeft` is -1.

In [None]:
df_35_EYE[df_35_EYE['ET_PupilLeft'] == -1].shape

Check the number of rows where `ET_PupilRight` is -1.

In [None]:
df_35_EYE[df_35_EYE['ET_PupilRight'] == -1].shape

Calculate the ratio of rows with -1 values in the `ET_PupilLeft` column to the total number of rows.

In [None]:
df_35_EYE[df_35_EYE['ET_PupilLeft'] == -1].shape[0] / df_35_EYE.shape[0]

Calculate the ratio of rows with -1 values in the `ET_PupilRight` column to the total number of rows.

In [None]:
df_35_EYE[df_35_EYE['ET_PupilRight'] == -1].shape[0] / df_35_EYE.shape[0]

Visualize the distribution of -1 values specifically for rows where validity is 1 (invalid data).

In [None]:
plt.figure(figsize=(18, 8))

# For each eye, keep only the rows whose validity flag is 1, then mark the cells equal to -1.
for panel, validity_col in enumerate(['ET_ValidityLeft', 'ET_ValidityRight'], start=1):
    flagged_rows = df_35_EYE.loc[df_35_EYE[validity_col] == 1]
    plt.subplot(1, 2, panel)
    sns.heatmap(flagged_rows == -1, cmap='viridis')
    plt.title('Heatmap of -1 Values')

plt.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

Drop the `ET_PupilLeft` and `ET_PupilRight` columns due to the high percentage of invalid data.

In [None]:
df_35_EYE.drop(['ET_PupilLeft', 'ET_PupilRight'], axis=1, inplace=True)

Display the first few rows after dropping the pupil columns.

In [None]:
df_35_EYE.head()

Visualize the distribution of -1 and 1 values again after dropping the pupil columns.

In [None]:
# Re-check the placeholder (-1) and flag (1) patterns after the pupil columns were dropped.
plt.figure(figsize=(18, 8))

# Left panel: cells equal to -1.
plt.subplot(1, 2, 1)
sns.heatmap(df_35_EYE == -1, cmap='viridis')
plt.title('Heatmap of -1 Values')

# Right panel: cells equal to 1 (the title previously mislabelled this panel as -1 values).
plt.subplot(1, 2, 2)
sns.heatmap(df_35_EYE == 1, cmap='viridis')
plt.title('Heatmap of 1 Values')

plt.tight_layout()
plt.show()

Calculate the ratio of valid data points for the left eye.

In [None]:
valid_left_ratio  = 1 - df_35_EYE['ET_ValidityLeft'].mean()

Display the ratio of valid data points for the left eye.

In [None]:
valid_left_ratio

Calculate the ratio of valid data points for the right eye.

In [None]:
valid_right_ratio = 1 - df_35_EYE['ET_ValidityRight'].mean()

Display the ratio of valid data points for the right eye.

In [None]:
valid_right_ratio

Display the first few rows of the DataFrame.

In [None]:
df_35_EYE.head()

Visualize the distributions of numeric columns using histograms with Kernel Density Estimate (KDE) plots. This helps understand the shape and spread of the data in each numeric column.

In [None]:
# Numeric columns only; the time-related UnixTime column is left out of the grid.
numeric_cols = df_35_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid geometry: 4 panels per row, with enough rows (ceiling division) to fit every column.
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(5 * n_cols, 4 * n_rows))

# One histogram + KDE panel per column, drawn on an explicitly created axes.
for i, col in enumerate(cols_to_plot):
    ax = fig.add_subplot(n_rows, n_cols, i + 1)
    sns.histplot(df_35_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

List the names of the columns in the DataFrame.

In [None]:
df_35_EYE.columns

Define a list of columns to be plotted over time.

In [None]:
cols = ['ET_GazeLeftx', 'ET_GazeLefty',
       'ET_GazeRightx', 'ET_GazeRighty', 'ET_TimeSignal', 'ET_DistanceLeft',
       'ET_DistanceRight', 'ET_CameraLeftX', 'ET_CameraLeftY',
       'ET_CameraRightX', 'ET_CameraRightY', 'ET_ValidityLeft',
       'ET_ValidityRight']

Plot each of the selected columns against the timestamp to visualize their trends and patterns over time. A Markdown heading is rendered before each plot for better readability.

In [None]:
from IPython.display import Markdown, display

# One full-width line plot per selected column, each preceded by a rendered Markdown heading.
timestamps = df_35_EYE['Timestamp']
for col in cols:
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(timestamps, df_35_EYE[col])
    ax.set_xlabel("Timestamp")
    ax.set_ylabel(col)
    plt.show()

Visualize the distributions of numeric columns using boxplots. This helps identify potential outliers and the spread of the data.

In [None]:
# Boxplot grid over every numeric column of the frame (UnixTime is included here).
numeric_cols = df_35_EYE.select_dtypes(include=np.number).columns

# Grid geometry: 4 panels per row, with enough rows (ceiling division) to fit every column.
n_cols = 4  # You can adjust the number of columns as needed
n_rows = (len(numeric_cols) + n_cols - 1) // n_cols

plt.figure(figsize=(n_cols * 5, n_rows * 4)) # Adjust figure size as needed

for i, col in enumerate(numeric_cols):
    plt.subplot(n_rows, n_cols, i + 1)
    # Pass the values explicitly as `x`: a bare positional argument is interpreted differently
    # across seaborn releases, and `x=` keeps the values on the axis that is labelled below.
    sns.boxplot(x=df_35_EYE[col])
    plt.title(f'Boxplot of {col}')
    plt.xlabel(col)

plt.tight_layout()
plt.show()

Replace the placeholder value -1 with NaN (Not a Number) in the DataFrame. This is done before imputation.

In [None]:
df_35_EYE.replace({-1: np.nan}, inplace=True)

Impute the missing values (NaN) in the numeric columns with the mean of each column.

In [None]:
# Mean-impute every numeric column (the -1 placeholders were turned into NaN beforehand).
numeric_cols = df_35_EYE.select_dtypes(include=np.number).columns

for col in numeric_cols:
    # Assign the filled Series back to the frame. Calling fillna(inplace=True) on the Series
    # returned by df[col] is chained assignment: pandas 2.x emits a FutureWarning and, with
    # Copy-on-Write enabled, the DataFrame is left unchanged.
    df_35_EYE[col] = df_35_EYE[col].fillna(df_35_EYE[col].mean())

Display the first few rows after imputing missing values.

In [None]:
df_35_EYE.head()

Visualize the distributions of numeric columns again using histograms with KDE plots after imputation. This helps assess the impact of imputation on the data distributions.

In [None]:
# Numeric columns only; the time-related UnixTime column is left out of the grid.
numeric_cols = df_35_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid geometry: 4 panels per row, with enough rows (ceiling division) to fit every column.
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(5 * n_cols, 4 * n_rows))

# One histogram + KDE panel per column, drawn on an explicitly created axes.
for i, col in enumerate(cols_to_plot):
    ax = fig.add_subplot(n_rows, n_cols, i + 1)
    sns.histplot(df_35_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

## Student 36

In [None]:
%load_ext cudf

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
df_36_EYE = pd.read_csv('data/STData/36/36_EYE.csv')

In [None]:
df_36_EYE.head()

In [None]:
df_36_EYE.shape

In [None]:
df_36_EYE.columns

In [None]:
df_36_EYE.info()

In [None]:
df_36_EYE.isnull().sum()

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_36_EYE.isnull(), cmap='viridis')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` column.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  

In [None]:
df_36_EYE['QuestionKey'].unique()

In [None]:
df_36_EYE['Timestamp'] = pd.to_datetime(df_36_EYE['Timestamp'])

In [None]:
df_36_EYE.head(3)

In [None]:
# Mark "no question displayed" rows with the literal string 'None'. The result is assigned back
# because fillna(inplace=True) on df[col] is chained assignment: it warns on pandas 2.x and
# leaves the DataFrame unchanged when Copy-on-Write is enabled.
df_36_EYE['QuestionKey'] = df_36_EYE['QuestionKey'].fillna('None')

In [None]:
df_36_EYE['QuestionKey'].value_counts()

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_36_EYE.isnull(), cmap='viridis')
plt.show()

In [None]:
df_36_EYE.isnull().sum()

In [None]:
df_36_EYE.dropna(inplace=True)

In [None]:
df_36_EYE.head()

In [None]:
df_36_EYE['Row'].unique()

In [None]:
plt.figure(figsize=(8,6))
sns.histplot(df_36_EYE['Row'])
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

In [None]:
df_36_EYE.drop('Row', axis=1, inplace=True)

In [None]:
df_36_EYE['ET_ValidityLeft'].unique()

In [None]:
df_36_EYE['ET_ValidityLeft'].value_counts()

In [None]:
df_36_EYE['ET_ValidityRight'].unique()

In [None]:
df_36_EYE['ET_ValidityRight'].value_counts()

In [None]:
plt.figure(figsize=(12, 6))

# One bar-chart panel per eye; the value counts are computed once per column and reused.
for panel, validity_col in enumerate(['ET_ValidityLeft', 'ET_ValidityRight'], start=1):
    counts = df_36_EYE[validity_col].value_counts()
    plt.subplot(1, 2, panel)
    sns.barplot(x=counts.index, y=counts.values)
    plt.title(f'Count of {validity_col}')
    plt.xlabel('Validity')
    plt.ylabel('Count')

plt.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

In [None]:
validity_map = {4.0: 1.0, 0.0: 0.0}

In [None]:
df_36_EYE['ET_ValidityLeft'] = df_36_EYE['ET_ValidityLeft'].map(validity_map).astype(np.int8)
df_36_EYE['ET_ValidityRight'] = df_36_EYE['ET_ValidityRight'].map(validity_map).astype(np.int8)

In [None]:
df_36_EYE.head(3)

In [None]:
df_36_EYE.describe()

In [None]:
df_36_EYE[df_36_EYE['ET_ValidityLeft'] == 1].shape

In [None]:
df_36_EYE[df_36_EYE['ET_ValidityRight'] == 1].shape

In [None]:
plt.figure(figsize=(18, 8))

# Side-by-side boolean heatmaps: left panel marks cells equal to -1, right panel cells equal to 1.
for panel, sentinel in enumerate((-1, 1), start=1):
    plt.subplot(1, 2, panel)
    sns.heatmap(df_36_EYE == sentinel, cmap='viridis')
    plt.title(f'Heatmap of {sentinel} Values')

plt.tight_layout()
plt.show()

In [None]:
df_36_EYE[df_36_EYE['ET_PupilLeft'] == -1].shape

In [None]:
df_36_EYE[df_36_EYE['ET_PupilRight'] == -1].shape

In [None]:
df_36_EYE[df_36_EYE['ET_PupilLeft'] == -1].shape[0] / df_36_EYE.shape[0]

In [None]:
df_36_EYE[df_36_EYE['ET_PupilRight'] == -1].shape[0] / df_36_EYE.shape[0]

In [None]:
plt.figure(figsize=(18, 8))

# For each eye, keep only the rows whose validity flag is 1, then mark the cells equal to -1.
for panel, validity_col in enumerate(['ET_ValidityLeft', 'ET_ValidityRight'], start=1):
    flagged_rows = df_36_EYE.loc[df_36_EYE[validity_col] == 1]
    plt.subplot(1, 2, panel)
    sns.heatmap(flagged_rows == -1, cmap='viridis')
    plt.title('Heatmap of -1 Values')

plt.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

In [None]:
df_36_EYE.drop(['ET_PupilLeft', 'ET_PupilRight'], axis=1, inplace=True)

In [None]:
df_36_EYE.head()

In [None]:
# Re-check the placeholder (-1) and flag (1) patterns after the pupil columns were dropped.
plt.figure(figsize=(18, 8))

# Left panel: cells equal to -1.
plt.subplot(1, 2, 1)
sns.heatmap(df_36_EYE == -1, cmap='viridis')
plt.title('Heatmap of -1 Values')

# Right panel: cells equal to 1 (the title previously mislabelled this panel as -1 values).
plt.subplot(1, 2, 2)
sns.heatmap(df_36_EYE == 1, cmap='viridis')
plt.title('Heatmap of 1 Values')

plt.tight_layout()
plt.show()

In [None]:
valid_left_ratio  = 1 - df_36_EYE['ET_ValidityLeft'].mean()
valid_right_ratio = 1 - df_36_EYE['ET_ValidityRight'].mean()

In [None]:
valid_left_ratio

In [None]:
valid_right_ratio

In [None]:
df_36_EYE.head()

In [None]:
# Numeric columns only; the time-related UnixTime column is left out of the grid.
numeric_cols = df_36_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid geometry: 4 panels per row, with enough rows (ceiling division) to fit every column.
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

fig = plt.figure(figsize=(5 * n_cols, 4 * n_rows))

# One histogram + KDE panel per column, drawn on an explicitly created axes.
for i, col in enumerate(cols_to_plot):
    ax = fig.add_subplot(n_rows, n_cols, i + 1)
    sns.histplot(df_36_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
df_36_EYE.columns

In [None]:
cols = ['ET_GazeLeftx', 'ET_GazeLefty',
       'ET_GazeRightx', 'ET_GazeRighty', 'ET_TimeSignal', 'ET_DistanceLeft',
       'ET_DistanceRight', 'ET_CameraLeftX', 'ET_CameraLeftY',
       'ET_CameraRightX', 'ET_CameraRightY', 'ET_ValidityLeft',
       'ET_ValidityRight']

In [None]:
from IPython.display import Markdown, display

# One full-size line plot per column listed in `cols`, each preceded by a
# rendered Markdown heading so the outputs are easy to tell apart.
for col in cols:
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_36_EYE['Timestamp'], df_36_EYE[col])
    ax.set_xlabel("Timestamp")
    ax.set_ylabel(col)
    plt.show()

In [None]:
# Boxplots of every numeric column (nothing is excluded here), on a 4-wide grid.
numeric_cols = df_36_EYE.select_dtypes(include=np.number).columns

# Grid shape: fixed width, with enough rows (ceiling division) to fit every column
n_cols = 4
n_rows = (len(numeric_cols) + n_cols - 1) // n_cols

plt.figure(figsize=(n_cols * 5, n_rows * 4))

for i, col in enumerate(numeric_cols):
    plt.subplot(n_rows, n_cols, i + 1)
    # Pass the series explicitly as `x=`: seaborn deprecated positional data
    # vectors, and the keyword keeps the box horizontal so the value axis is
    # the one labelled with the column name below.
    sns.boxplot(x=df_36_EYE[col])
    plt.title(f'Boxplot of {col}')
    plt.xlabel(col)

plt.tight_layout()
plt.show()

In [None]:
# Turn every -1 placeholder in the frame into NaN, in place.
df_36_EYE.replace(to_replace=-1, value=np.nan, inplace=True)

In [None]:
# Mean-impute the NaNs of every numeric column.
numeric_cols = df_36_EYE.select_dtypes(include=np.number).columns

for col in numeric_cols:
    # Assign the result back instead of calling fillna(inplace=True) on the
    # `df[col]` selection: that form is chained assignment, which pandas
    # deprecates and which leaves the frame unchanged under Copy-on-Write.
    df_36_EYE[col] = df_36_EYE[col].fillna(df_36_EYE[col].mean())

In [None]:
df_36_EYE.head()

In [None]:
# Histogram + KDE of every numeric column except UnixTime, on a 4-wide grid.
n_cols = 4
numeric_cols = df_36_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [col for col in numeric_cols if col != 'UnixTime']
n_rows = -(-len(cols_to_plot) // n_cols)  # ceiling division: rows needed to fit every panel

fig = plt.figure(figsize=(5 * n_cols, 4 * n_rows))
for i, col in enumerate(cols_to_plot):
    # Panels are created one by one so unused grid slots stay blank.
    ax = fig.add_subplot(n_rows, n_cols, i + 1)
    sns.histplot(df_36_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

# Eye Tracking Data Analysis

This section repeats the exploratory data analysis and cleaning of the Student 36 eye-tracking data, with an explanatory note before each step.

## Data Loading and Initial Inspection

In [None]:
%load_ext cudf

Import necessary libraries for data manipulation, analysis, and visualization.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Import libraries for datashader, which can be used for visualizing large datasets.

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

Set pandas display options to show all columns.

In [None]:
pd.set_option('display.max_columns', None)

Load the eye-tracking data from the specified CSV file into a pandas DataFrame.

In [None]:
df_36_EYE = pd.read_csv('data/STData/36/36_EYE.csv')

Display the first few rows of the DataFrame to get an initial look at the data structure and content.

In [None]:
df_36_EYE.head()

Check the dimensions (number of rows and columns) of the DataFrame.

In [None]:
df_36_EYE.shape

List the names of all columns in the DataFrame.

In [None]:
df_36_EYE.columns

Display concise information about the DataFrame, including the data types of each column and the number of non-null values. This helps identify columns with missing data.

In [None]:
df_36_EYE.info()

Calculate and display the number of missing values in each column.

In [None]:
df_36_EYE.isnull().sum()

Visualize the distribution of missing values using a heatmap. This provides a visual representation of which columns have missing data and the extent of missingness.

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_36_EYE.isnull(), cmap='viridis')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` column.
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.
- These missing values in `QuestionKey` require additional investigation and context-aware handling.

Display the unique values in the `QuestionKey` column to understand the different types of questions or states recorded.

In [None]:
df_36_EYE['QuestionKey'].unique()

Convert the `Timestamp` column to datetime objects. This is crucial for any time series analysis or manipulation.

In [None]:
df_36_EYE['Timestamp'] = pd.to_datetime(df_36_EYE['Timestamp'])

Display the first few rows again to see the effect of the timestamp conversion.

In [None]:
df_36_EYE.head(3)

Fill the missing values in the `QuestionKey` column with the string 'None'. This helps to explicitly mark periods where no question was active.

In [None]:
# Mark rows without a question with the literal string 'None'.
# Assign the filled column back rather than calling fillna(inplace=True) on
# the `df[...]` selection: that is chained assignment, which pandas deprecates
# and which leaves the frame unchanged under Copy-on-Write.
df_36_EYE['QuestionKey'] = df_36_EYE['QuestionKey'].fillna('None')

Count the occurrences of each unique value in the `QuestionKey` column after filling missing values.

In [None]:
df_36_EYE['QuestionKey'].value_counts()

Display the heatmap of missing values again to confirm that the missing values in `QuestionKey` have been handled.

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_36_EYE.isnull(), cmap='viridis')
plt.show()

Recalculate and display the number of missing values per column to confirm the changes after handling `QuestionKey`.

In [None]:
df_36_EYE.isnull().sum()

Drop rows that still contain any missing values. This is done after handling `QuestionKey` separately.

In [None]:
# Discard, in place, every row that still has at least one missing value.
df_36_EYE.dropna(axis=0, how='any', inplace=True)

Display the first few rows after dropping rows with missing values.

In [None]:
df_36_EYE.head()

Display the unique values in the `Row` column.

In [None]:
df_36_EYE['Row'].unique()

Plot a histogram of the `Row` column to visualize its distribution.

In [None]:
plt.figure(figsize=(8,6))
sns.histplot(df_36_EYE['Row'])
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

Drop the `Row` column as it is not relevant for the analysis.

In [None]:
# Remove the `Row` column from the frame in place.
df_36_EYE.drop(columns='Row', inplace=True)

Display the unique values in the `ET_ValidityLeft` column.

In [None]:
df_36_EYE['ET_ValidityLeft'].unique()

Count the occurrences of each unique value in the `ET_ValidityLeft` column.

In [None]:
df_36_EYE['ET_ValidityLeft'].value_counts()

Display the unique values in the `ET_ValidityRight` column.

In [None]:
df_36_EYE['ET_ValidityRight'].unique()

Count the occurrences of each unique value in the `ET_ValidityRight` column.

In [None]:
df_36_EYE['ET_ValidityRight'].value_counts()

Visualize the counts of valid and invalid data for both left and right eyes using bar plots.

In [None]:
# Bar chart of the value counts of each validity flag: left eye, then right eye.
fig = plt.figure(figsize=(12, 6))

for panel, validity_col in enumerate(('ET_ValidityLeft', 'ET_ValidityRight'), start=1):
    counts = df_36_EYE[validity_col].value_counts()  # computed once per panel
    ax = fig.add_subplot(1, 2, panel)
    sns.barplot(x=counts.index, y=counts.values, ax=ax)
    ax.set_title(f'Count of {validity_col}')
    ax.set_xlabel('Validity')
    ax.set_ylabel('Count')

fig.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

Define a mapping to convert validity values from `0.0` and `4.0` to `0` and `1`.

In [None]:
validity_map = {4.0: 1.0, 0.0: 0.0}

Apply the mapping to the `ET_ValidityLeft` and `ET_ValidityRight` columns and convert the data type to integer.

In [None]:
# Recode both validity flags through `validity_map` and store them as int8.
for validity_col in ('ET_ValidityLeft', 'ET_ValidityRight'):
    remapped = df_36_EYE[validity_col].map(validity_map)
    df_36_EYE[validity_col] = remapped.astype(np.int8)

Display the first few rows to see the updated validity columns.

In [None]:
df_36_EYE.head(3)

Display descriptive statistics for the DataFrame, including count, mean, standard deviation, min, max, and quartiles for numeric columns.

In [None]:
df_36_EYE.describe()

Check the number of rows where `ET_ValidityLeft` is 1 (invalid data for the left eye).

In [None]:
df_36_EYE[df_36_EYE['ET_ValidityLeft'] == 1].shape

Check the number of rows where `ET_ValidityRight` is 1 (invalid data for the right eye).

In [None]:
df_36_EYE[df_36_EYE['ET_ValidityRight'] == 1].shape

Visualize the distribution of -1 and 1 values across columns using heatmaps. This helps identify columns with placeholder or indicator values.

In [None]:
# Two side-by-side boolean heatmaps: cells equal to -1 (left) and cells equal to 1 (right).
plt.figure(figsize=(18, 8))

for panel, sentinel in enumerate((-1, 1), start=1):
    plt.subplot(1, 2, panel)
    sns.heatmap(df_36_EYE == sentinel, cmap='viridis')
    plt.title(f'Heatmap of {sentinel} Values')

plt.tight_layout()
plt.show()

Check the number of rows where `ET_PupilLeft` is -1.

In [None]:
df_36_EYE[df_36_EYE['ET_PupilLeft'] == -1].shape

Check the number of rows where `ET_PupilRight` is -1.

In [None]:
df_36_EYE[df_36_EYE['ET_PupilRight'] == -1].shape

Calculate the ratio of rows with -1 values in the `ET_PupilLeft` column to the total number of rows.

In [None]:
df_36_EYE[df_36_EYE['ET_PupilLeft'] == -1].shape[0] / df_36_EYE.shape[0]

Calculate the ratio of rows with -1 values in the `ET_PupilRight` column to the total number of rows.

In [None]:
df_36_EYE[df_36_EYE['ET_PupilRight'] == -1].shape[0] / df_36_EYE.shape[0]

Visualize the distribution of -1 values specifically for rows where validity is 1 (invalid data).

In [None]:
# For each eye, keep only the rows whose validity flag is 1 and show which
# cells of those rows hold the -1 placeholder.
plt.figure(figsize=(18, 8))

for panel, validity_col in enumerate(('ET_ValidityLeft', 'ET_ValidityRight'), start=1):
    plt.subplot(1, 2, panel)
    sns.heatmap(df_36_EYE[df_36_EYE[validity_col] == 1] == -1, cmap='viridis')
    # Name the row filter in the title; with identical titles the left-eye
    # and right-eye panels could not be told apart.
    plt.title(f'Heatmap of -1 Values ({validity_col} == 1)')

plt.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

Drop the `ET_PupilLeft` and `ET_PupilRight` columns due to the high percentage of invalid data.

In [None]:
# Remove both pupil-size columns from the frame in place.
df_36_EYE.drop(columns=['ET_PupilLeft', 'ET_PupilRight'], inplace=True)

Display the first few rows after dropping the pupil columns.

In [None]:
df_36_EYE.head()

Visualize the distribution of -1 and 1 values again after dropping the pupil columns.

In [None]:
# Two side-by-side boolean heatmaps: cells equal to the -1 placeholder (left panel)
# and cells equal to 1 (right panel).
plt.figure(figsize=(18, 8))

for panel, sentinel in enumerate((-1, 1), start=1):
    plt.subplot(1, 2, panel)
    sns.heatmap(df_36_EYE == sentinel, cmap='viridis')
    # Build the title from the value actually plotted; the right-hand panel
    # used to be labelled "-1" although it shows the cells equal to 1.
    plt.title(f'Heatmap of {sentinel} Values')

plt.tight_layout()
plt.show()

Calculate the ratio of valid data points for the left eye.

In [None]:
valid_left_ratio  = 1 - df_36_EYE['ET_ValidityLeft'].mean()

Display the ratio of valid data points for the left eye.

In [None]:
valid_left_ratio

Calculate the ratio of valid data points for the right eye.

In [None]:
valid_right_ratio = 1 - df_36_EYE['ET_ValidityRight'].mean()

Display the ratio of valid data points for the right eye.

In [None]:
valid_right_ratio

Display the first few rows of the DataFrame.

In [None]:
df_36_EYE.head()

Visualize the distributions of numeric columns using histograms with Kernel Density Estimate (KDE) plots. This helps understand the shape and spread of the data in each numeric column.

In [None]:
# Histogram + KDE of every numeric column except UnixTime, on a 4-wide grid.
n_cols = 4
numeric_cols = df_36_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [col for col in numeric_cols if col != 'UnixTime']
n_rows = -(-len(cols_to_plot) // n_cols)  # ceiling division: rows needed to fit every panel

fig = plt.figure(figsize=(5 * n_cols, 4 * n_rows))
for i, col in enumerate(cols_to_plot):
    # Panels are created one by one so unused grid slots stay blank.
    ax = fig.add_subplot(n_rows, n_cols, i + 1)
    sns.histplot(df_36_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

List the names of the columns in the DataFrame.

In [None]:
df_36_EYE.columns

Define a list of columns to be plotted over time.

In [None]:
cols = ['ET_GazeLeftx', 'ET_GazeLefty',
       'ET_GazeRightx', 'ET_GazeRighty', 'ET_TimeSignal', 'ET_DistanceLeft',
       'ET_DistanceRight', 'ET_CameraLeftX', 'ET_CameraLeftY',
       'ET_CameraRightX', 'ET_CameraRightY', 'ET_ValidityLeft',
       'ET_ValidityRight']

Plot each of the selected columns against the timestamp to visualize their trends and patterns over time. A Markdown heading is rendered in the cell output before each plot for better readability.

In [None]:
from IPython.display import Markdown, display

# One full-size line plot per column listed in `cols`, each preceded by a
# rendered Markdown heading so the outputs are easy to tell apart.
for col in cols:
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_36_EYE['Timestamp'], df_36_EYE[col])
    ax.set_xlabel("Timestamp")
    ax.set_ylabel(col)
    plt.show()

Visualize the distributions of numeric columns using boxplots. This helps identify potential outliers and the spread of the data.

In [None]:
# Boxplots of every numeric column (nothing is excluded here), on a 4-wide grid.
numeric_cols = df_36_EYE.select_dtypes(include=np.number).columns

# Grid shape: fixed width, with enough rows (ceiling division) to fit every column
n_cols = 4
n_rows = (len(numeric_cols) + n_cols - 1) // n_cols

plt.figure(figsize=(n_cols * 5, n_rows * 4))

for i, col in enumerate(numeric_cols):
    plt.subplot(n_rows, n_cols, i + 1)
    # Pass the series explicitly as `x=`: seaborn deprecated positional data
    # vectors, and the keyword keeps the box horizontal so the value axis is
    # the one labelled with the column name below.
    sns.boxplot(x=df_36_EYE[col])
    plt.title(f'Boxplot of {col}')
    plt.xlabel(col)

plt.tight_layout()
plt.show()

Replace the placeholder value -1 with NaN (Not a Number) in the DataFrame. This is done before imputation.

In [None]:
# Turn every -1 placeholder in the frame into NaN, in place.
df_36_EYE.replace(to_replace=-1, value=np.nan, inplace=True)

Impute the missing values (NaN) in the numeric columns with the mean of each column.

In [None]:
# Mean-impute the NaNs of every numeric column.
numeric_cols = df_36_EYE.select_dtypes(include=np.number).columns

for col in numeric_cols:
    # Assign the result back instead of calling fillna(inplace=True) on the
    # `df[col]` selection: that form is chained assignment, which pandas
    # deprecates and which leaves the frame unchanged under Copy-on-Write.
    df_36_EYE[col] = df_36_EYE[col].fillna(df_36_EYE[col].mean())

Display the first few rows after imputing missing values.

In [None]:
df_36_EYE.head()

Visualize the distributions of numeric columns again using histograms with KDE plots after imputation. This helps assess the impact of imputation on the data distributions.

In [None]:
# Histogram + KDE of every numeric column except UnixTime, on a 4-wide grid.
n_cols = 4
numeric_cols = df_36_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [col for col in numeric_cols if col != 'UnixTime']
n_rows = -(-len(cols_to_plot) // n_cols)  # ceiling division: rows needed to fit every panel

fig = plt.figure(figsize=(5 * n_cols, 4 * n_rows))
for i, col in enumerate(cols_to_plot):
    # Panels are created one by one so unused grid slots stay blank.
    ax = fig.add_subplot(n_rows, n_cols, i + 1)
    sns.histplot(df_36_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

## Student 37

In [None]:
%load_ext cudf

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
df_37_EYE = pd.read_csv('data/STData/37/37_EYE.csv')

In [None]:
df_37_EYE.head()

In [None]:
df_37_EYE.shape

In [None]:
df_37_EYE.columns

In [None]:
df_37_EYE.info()

In [None]:
df_37_EYE.isnull().sum()

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_37_EYE.isnull(), cmap='viridis')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` column.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  

In [None]:
df_37_EYE['QuestionKey'].unique()

In [None]:
df_37_EYE['Timestamp'] = pd.to_datetime(df_37_EYE['Timestamp'])

In [None]:
df_37_EYE.head(3)

In [None]:
# Mark rows without a question with the literal string 'None'.
# Assign the filled column back rather than calling fillna(inplace=True) on
# the `df[...]` selection: that is chained assignment, which pandas deprecates
# and which leaves the frame unchanged under Copy-on-Write.
df_37_EYE['QuestionKey'] = df_37_EYE['QuestionKey'].fillna('None')

In [None]:
df_37_EYE['QuestionKey'].value_counts()

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_37_EYE.isnull(), cmap='viridis')
plt.show()

In [None]:
df_37_EYE.isnull().sum()

In [None]:
# Discard, in place, every row that still has at least one missing value.
df_37_EYE.dropna(axis=0, how='any', inplace=True)

In [None]:
df_37_EYE.head()

In [None]:
df_37_EYE['Row'].unique()

In [None]:
plt.figure(figsize=(8,6))
sns.histplot(df_37_EYE['Row'])
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

In [None]:
# Remove the `Row` column from the frame in place.
df_37_EYE.drop(columns='Row', inplace=True)

In [None]:
df_37_EYE['ET_ValidityLeft'].unique()

In [None]:
df_37_EYE['ET_ValidityLeft'].value_counts()

In [None]:
df_37_EYE['ET_ValidityRight'].unique()

In [None]:
df_37_EYE['ET_ValidityRight'].value_counts()

In [None]:
# Bar chart of the value counts of each validity flag: left eye, then right eye.
fig = plt.figure(figsize=(12, 6))

for panel, validity_col in enumerate(('ET_ValidityLeft', 'ET_ValidityRight'), start=1):
    counts = df_37_EYE[validity_col].value_counts()  # computed once per panel
    ax = fig.add_subplot(1, 2, panel)
    sns.barplot(x=counts.index, y=counts.values, ax=ax)
    ax.set_title(f'Count of {validity_col}')
    ax.set_xlabel('Validity')
    ax.set_ylabel('Count')

fig.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

In [None]:
validity_map = {4.0: 1.0, 0.0: 0.0}

In [None]:
# Recode both validity flags through `validity_map` and store them as int8.
for validity_col in ('ET_ValidityLeft', 'ET_ValidityRight'):
    remapped = df_37_EYE[validity_col].map(validity_map)
    df_37_EYE[validity_col] = remapped.astype(np.int8)

In [None]:
df_37_EYE.head(3)

In [None]:
df_37_EYE.describe()

In [None]:
df_37_EYE[df_37_EYE['ET_ValidityLeft'] == 1].shape

In [None]:
df_37_EYE[df_37_EYE['ET_ValidityRight'] == 1].shape

In [None]:
# Two side-by-side boolean heatmaps: cells equal to -1 (left) and cells equal to 1 (right).
plt.figure(figsize=(18, 8))

for panel, sentinel in enumerate((-1, 1), start=1):
    plt.subplot(1, 2, panel)
    sns.heatmap(df_37_EYE == sentinel, cmap='viridis')
    plt.title(f'Heatmap of {sentinel} Values')

plt.tight_layout()
plt.show()

In [None]:
df_37_EYE[df_37_EYE['ET_PupilLeft'] == -1].shape

In [None]:
df_37_EYE[df_37_EYE['ET_PupilRight'] == -1].shape

In [None]:
df_37_EYE[df_37_EYE['ET_PupilLeft'] == -1].shape[0] / df_37_EYE.shape[0]

In [None]:
df_37_EYE[df_37_EYE['ET_PupilRight'] == -1].shape[0] / df_37_EYE.shape[0]

In [None]:
# For each eye, keep only the rows whose validity flag is 1 and show which
# cells of those rows hold the -1 placeholder.
plt.figure(figsize=(18, 8))

for panel, validity_col in enumerate(('ET_ValidityLeft', 'ET_ValidityRight'), start=1):
    plt.subplot(1, 2, panel)
    sns.heatmap(df_37_EYE[df_37_EYE[validity_col] == 1] == -1, cmap='viridis')
    # Name the row filter in the title; with identical titles the left-eye
    # and right-eye panels could not be told apart.
    plt.title(f'Heatmap of -1 Values ({validity_col} == 1)')

plt.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

In [None]:
# Remove both pupil-size columns from the frame in place.
df_37_EYE.drop(columns=['ET_PupilLeft', 'ET_PupilRight'], inplace=True)

In [None]:
df_37_EYE.head()

In [None]:
# Two side-by-side boolean heatmaps: cells equal to the -1 placeholder (left panel)
# and cells equal to 1 (right panel).
plt.figure(figsize=(18, 8))

for panel, sentinel in enumerate((-1, 1), start=1):
    plt.subplot(1, 2, panel)
    sns.heatmap(df_37_EYE == sentinel, cmap='viridis')
    # Build the title from the value actually plotted; the right-hand panel
    # used to be labelled "-1" although it shows the cells equal to 1.
    plt.title(f'Heatmap of {sentinel} Values')

plt.tight_layout()
plt.show()

In [None]:
valid_left_ratio  = 1 - df_37_EYE['ET_ValidityLeft'].mean()
valid_right_ratio = 1 - df_37_EYE['ET_ValidityRight'].mean()

In [None]:
valid_left_ratio

In [None]:
valid_right_ratio

In [None]:
df_37_EYE.head()

In [None]:
# Histogram + KDE of every numeric column except UnixTime, on a 4-wide grid.
n_cols = 4
numeric_cols = df_37_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [col for col in numeric_cols if col != 'UnixTime']
n_rows = -(-len(cols_to_plot) // n_cols)  # ceiling division: rows needed to fit every panel

fig = plt.figure(figsize=(5 * n_cols, 4 * n_rows))
for i, col in enumerate(cols_to_plot):
    # Panels are created one by one so unused grid slots stay blank.
    ax = fig.add_subplot(n_rows, n_cols, i + 1)
    sns.histplot(df_37_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
df_37_EYE.columns

In [None]:
cols = ['ET_GazeLeftx', 'ET_GazeLefty',
       'ET_GazeRightx', 'ET_GazeRighty', 'ET_TimeSignal', 'ET_DistanceLeft',
       'ET_DistanceRight', 'ET_CameraLeftX', 'ET_CameraLeftY',
       'ET_CameraRightX', 'ET_CameraRightY', 'ET_ValidityLeft',
       'ET_ValidityRight']

In [None]:
from IPython.display import Markdown, display

# One full-size line plot per column listed in `cols`, each preceded by a
# rendered Markdown heading so the outputs are easy to tell apart.
for col in cols:
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_37_EYE['Timestamp'], df_37_EYE[col])
    ax.set_xlabel("Timestamp")
    ax.set_ylabel(col)
    plt.show()

In [None]:
# Boxplots of every numeric column (nothing is excluded here), on a 4-wide grid.
numeric_cols = df_37_EYE.select_dtypes(include=np.number).columns

# Grid shape: fixed width, with enough rows (ceiling division) to fit every column
n_cols = 4
n_rows = (len(numeric_cols) + n_cols - 1) // n_cols

plt.figure(figsize=(n_cols * 5, n_rows * 4))

for i, col in enumerate(numeric_cols):
    plt.subplot(n_rows, n_cols, i + 1)
    # Pass the series explicitly as `x=`: seaborn deprecated positional data
    # vectors, and the keyword keeps the box horizontal so the value axis is
    # the one labelled with the column name below.
    sns.boxplot(x=df_37_EYE[col])
    plt.title(f'Boxplot of {col}')
    plt.xlabel(col)

plt.tight_layout()
plt.show()

In [None]:
# Turn every -1 placeholder in the frame into NaN, in place.
df_37_EYE.replace(to_replace=-1, value=np.nan, inplace=True)

In [None]:
# Mean-impute the NaNs of every numeric column.
numeric_cols = df_37_EYE.select_dtypes(include=np.number).columns

for col in numeric_cols:
    # Assign the result back instead of calling fillna(inplace=True) on the
    # `df[col]` selection: that form is chained assignment, which pandas
    # deprecates and which leaves the frame unchanged under Copy-on-Write.
    df_37_EYE[col] = df_37_EYE[col].fillna(df_37_EYE[col].mean())

In [None]:
df_37_EYE.head()

In [None]:
# Histogram + KDE of every numeric column except UnixTime, on a 4-wide grid.
n_cols = 4
numeric_cols = df_37_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [col for col in numeric_cols if col != 'UnixTime']
n_rows = -(-len(cols_to_plot) // n_cols)  # ceiling division: rows needed to fit every panel

fig = plt.figure(figsize=(5 * n_cols, 4 * n_rows))
for i, col in enumerate(cols_to_plot):
    # Panels are created one by one so unused grid slots stay blank.
    ax = fig.add_subplot(n_rows, n_cols, i + 1)
    sns.histplot(df_37_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

# Eye Tracking Data Analysis

This section repeats the exploratory data analysis and cleaning of the Student 37 eye-tracking data, with an explanatory note before each step.

## Data Loading and Initial Inspection

In [None]:
%load_ext cudf

Import necessary libraries for data manipulation, analysis, and visualization.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Import libraries for datashader, which can be used for visualizing large datasets.

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

Set pandas display options to show all columns.

In [None]:
pd.set_option('display.max_columns', None)

Load the eye-tracking data from the specified CSV file into a pandas DataFrame.

In [None]:
df_37_EYE = pd.read_csv('data/STData/37/37_EYE.csv')

Display the first few rows of the DataFrame to get an initial look at the data structure and content.

In [None]:
df_37_EYE.head()

Check the dimensions (number of rows and columns) of the DataFrame.

In [None]:
df_37_EYE.shape

List the names of all columns in the DataFrame.

In [None]:
df_37_EYE.columns

Display concise information about the DataFrame, including the data types of each column and the number of non-null values. This helps identify columns with missing data.

In [None]:
df_37_EYE.info()

Calculate and display the number of missing values in each column.

In [None]:
df_37_EYE.isnull().sum()

Visualize the distribution of missing values using a heatmap. This provides a visual representation of which columns have missing data and the extent of missingness.

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_37_EYE.isnull(), cmap='viridis')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` column.
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.
- These missing values in `QuestionKey` require additional investigation and context-aware handling.

Display the unique values in the `QuestionKey` column to understand the different types of questions or states recorded.

In [None]:
df_37_EYE['QuestionKey'].unique()

Convert the `Timestamp` column to datetime objects. This is crucial for any time series analysis or manipulation.

In [None]:
df_37_EYE['Timestamp'] = pd.to_datetime(df_37_EYE['Timestamp'])

Display the first few rows again to see the effect of the timestamp conversion.

In [None]:
df_37_EYE.head(3)

Fill the missing values in the `QuestionKey` column with the string 'None'. This helps to explicitly mark periods where no question was active.

In [None]:
# Mark rows without a question with the literal string 'None'.
# Assign the filled column back rather than calling fillna(inplace=True) on
# the `df[...]` selection: that is chained assignment, which pandas deprecates
# and which leaves the frame unchanged under Copy-on-Write.
df_37_EYE['QuestionKey'] = df_37_EYE['QuestionKey'].fillna('None')

Count the occurrences of each unique value in the `QuestionKey` column after filling missing values.

In [None]:
df_37_EYE['QuestionKey'].value_counts()

Display the heatmap of missing values again to confirm that the missing values in `QuestionKey` have been handled.

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_37_EYE.isnull(), cmap='viridis')
plt.show()

Recalculate and display the number of missing values per column to confirm the changes after handling `QuestionKey`.

In [None]:
df_37_EYE.isnull().sum()

Drop rows that still contain any missing values. This is done after handling `QuestionKey` separately.

In [None]:
# Discard, in place, every row that still has at least one missing value.
df_37_EYE.dropna(axis=0, how='any', inplace=True)

Display the first few rows after dropping rows with missing values.

In [None]:
df_37_EYE.head()

Display the unique values in the `Row` column.

In [None]:
df_37_EYE['Row'].unique()

Plot a histogram of the `Row` column to visualize its distribution.

In [None]:
plt.figure(figsize=(8,6))
sns.histplot(df_37_EYE['Row'])
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

Drop the `Row` column as it is not relevant for the analysis.

In [None]:
# Remove the `Row` column from the frame in place.
df_37_EYE.drop(columns='Row', inplace=True)

Display the unique values in the `ET_ValidityLeft` column.

In [None]:
df_37_EYE['ET_ValidityLeft'].unique()

Count the occurrences of each unique value in the `ET_ValidityLeft` column.

In [None]:
df_37_EYE['ET_ValidityLeft'].value_counts()

Display the unique values in the `ET_ValidityRight` column.

In [None]:
df_37_EYE['ET_ValidityRight'].unique()

Count the occurrences of each unique value in the `ET_ValidityRight` column.

In [None]:
df_37_EYE['ET_ValidityRight'].value_counts()

Visualize the counts of valid and invalid data for both left and right eyes using bar plots.

In [None]:
# Bar chart of the value counts of each validity flag: left eye, then right eye.
fig = plt.figure(figsize=(12, 6))

for panel, validity_col in enumerate(('ET_ValidityLeft', 'ET_ValidityRight'), start=1):
    counts = df_37_EYE[validity_col].value_counts()  # computed once per panel
    ax = fig.add_subplot(1, 2, panel)
    sns.barplot(x=counts.index, y=counts.values, ax=ax)
    ax.set_title(f'Count of {validity_col}')
    ax.set_xlabel('Validity')
    ax.set_ylabel('Count')

fig.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

Define a mapping to convert validity values from `0.0` and `4.0` to `0` and `1`.

In [None]:
validity_map = {4.0: 1.0, 0.0: 0.0}

Apply the mapping to the `ET_ValidityLeft` and `ET_ValidityRight` columns and convert the data type to integer.

In [None]:
# Recode both validity flags through `validity_map` and store them as int8.
for validity_col in ('ET_ValidityLeft', 'ET_ValidityRight'):
    remapped = df_37_EYE[validity_col].map(validity_map)
    df_37_EYE[validity_col] = remapped.astype(np.int8)

Display the first few rows to see the updated validity columns.

In [None]:
df_37_EYE.head(3)

Display descriptive statistics for the DataFrame, including count, mean, standard deviation, min, max, and quartiles for numeric columns.

In [None]:
df_37_EYE.describe()

Check the number of rows where `ET_ValidityLeft` is 1 (invalid data for the left eye).

In [None]:
df_37_EYE[df_37_EYE['ET_ValidityLeft'] == 1].shape

Check the number of rows where `ET_ValidityRight` is 1 (invalid data for the right eye).

In [None]:
df_37_EYE[df_37_EYE['ET_ValidityRight'] == 1].shape

Visualize the distribution of -1 and 1 values across columns using heatmaps. This helps identify columns with placeholder or indicator values.

In [None]:
# One panel per marker value: cells equal to -1 on the left, cells equal to 1 on the right.
plt.figure(figsize=(18, 8))

for position, marker in enumerate((-1, 1), start=1):
    plt.subplot(1, 2, position)
    sns.heatmap(df_37_EYE == marker, cmap='viridis')
    plt.title(f'Heatmap of {marker} Values')

plt.tight_layout()
plt.show()

Check the number of rows where `ET_PupilLeft` is -1.

In [None]:
df_37_EYE[df_37_EYE['ET_PupilLeft'] == -1].shape

Check the number of rows where `ET_PupilRight` is -1.

In [None]:
df_37_EYE[df_37_EYE['ET_PupilRight'] == -1].shape

Calculate the ratio of rows with -1 values in the `ET_PupilLeft` column to the total number of rows.

In [None]:
df_37_EYE[df_37_EYE['ET_PupilLeft'] == -1].shape[0] / df_37_EYE.shape[0]

Calculate the ratio of rows with -1 values in the `ET_PupilRight` column to the total number of rows.

In [None]:
df_37_EYE[df_37_EYE['ET_PupilRight'] == -1].shape[0] / df_37_EYE.shape[0]

Visualize the distribution of -1 values specifically for rows where validity is 1 (invalid data).

In [None]:
# Where do -1 values sit within the rows flagged invalid (validity == 1) for each eye?
plt.figure(figsize=(18, 8))

for position, validity_col in enumerate(('ET_ValidityLeft', 'ET_ValidityRight'), start=1):
    invalid_rows = df_37_EYE[df_37_EYE[validity_col] == 1]
    plt.subplot(1, 2, position)
    sns.heatmap(invalid_rows == -1, cmap='viridis')
    # Name the subset in the title; the two panels were previously titled identically.
    plt.title(f'Heatmap of -1 Values ({validity_col} == 1)')

plt.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

Drop the `ET_PupilLeft` and `ET_PupilRight` columns due to the high percentage of invalid data.

In [None]:
df_37_EYE.drop(['ET_PupilLeft', 'ET_PupilRight'], axis=1, inplace=True)

Display the first few rows after dropping the pupil columns.

In [None]:
df_37_EYE.head()

Visualize the distribution of -1 and 1 values again after dropping the pupil columns.

In [None]:
# Re-draw the marker heatmaps now that the pupil columns are gone:
# cells equal to -1 on the left, cells equal to 1 on the right.
plt.figure(figsize=(18, 8))

for position, marker in enumerate((-1, 1), start=1):
    plt.subplot(1, 2, position)
    sns.heatmap(df_37_EYE == marker, cmap='viridis')
    # Title is derived from the marker so the "== 1" panel is no longer mislabelled as -1.
    plt.title(f'Heatmap of {marker} Values')

plt.tight_layout()
plt.show()

Calculate the ratio of valid data points for the left eye.

In [None]:
valid_left_ratio  = 1 - df_37_EYE['ET_ValidityLeft'].mean()

Display the ratio of valid data points for the left eye.

In [None]:
valid_left_ratio

Calculate the ratio of valid data points for the right eye.

In [None]:
valid_right_ratio = 1 - df_37_EYE['ET_ValidityRight'].mean()

Display the ratio of valid data points for the right eye.

In [None]:
valid_right_ratio

Display the first few rows of the DataFrame.

In [None]:
df_37_EYE.head()

Visualize the distributions of numeric columns using histograms with Kernel Density Estimate (KDE) plots. This helps understand the shape and spread of the data in each numeric column.

In [None]:
# Numeric columns to chart; 'UnixTime' is left out of the grid.
numeric_cols = df_37_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [col for col in numeric_cols if col != 'UnixTime']

# Grid geometry: four panels per row, with the row count rounded up (ceiling division).
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

plt.figure(figsize=(n_cols * 5, n_rows * 4))

# Subplot positions are 1-based, so enumerate from 1.
for i, col in enumerate(cols_to_plot, start=1):
    plt.subplot(n_rows, n_cols, i)
    sns.histplot(df_37_EYE[col], kde=True)
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

List the names of the columns in the DataFrame.

In [None]:
df_37_EYE.columns

Define a list of columns to be plotted over time.

In [None]:
cols = ['ET_GazeLeftx', 'ET_GazeLefty',
       'ET_GazeRightx', 'ET_GazeRighty', 'ET_TimeSignal', 'ET_DistanceLeft',
       'ET_DistanceRight', 'ET_CameraLeftX', 'ET_CameraLeftY',
       'ET_CameraRightX', 'ET_CameraRightY', 'ET_ValidityLeft',
       'ET_ValidityRight']

Plot each of the selected columns against the timestamp to visualize their trends and patterns over time. A Markdown heading is rendered before each plot for better readability.

In [None]:
from IPython.display import display, Markdown

# One full-size figure per signal, each introduced by a rendered heading.
for col in cols:
    # Render a Markdown heading in the cell output so each figure is clearly labelled
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_37_EYE['Timestamp'], df_37_EYE[col])
    ax.set_xlabel("Timestamp")
    ax.set_ylabel(col)
    plt.show()

Visualize the distributions of numeric columns using boxplots. This helps identify potential outliers and the spread of the data.

In [None]:
# Box plots for every numeric column (nothing is excluded in this grid).
numeric_cols = df_37_EYE.select_dtypes(include=np.number).columns

# Grid geometry: four panels per row, with the row count rounded up.
n_cols = 4
n_rows = (len(numeric_cols) + n_cols - 1) // n_cols

plt.figure(figsize=(n_cols * 5, n_rows * 4))

for i, col in enumerate(numeric_cols):
    plt.subplot(n_rows, n_cols, i + 1)
    # Pass the values explicitly as x= so they run along the axis labelled below;
    # a bare positional Series is interpreted differently across seaborn versions.
    sns.boxplot(x=df_37_EYE[col])
    plt.title(f'Boxplot of {col}')
    plt.xlabel(col)

plt.tight_layout()
plt.show()

Replace the placeholder value -1 with NaN (Not a Number) in the DataFrame. This is done before imputation.

In [None]:
df_37_EYE.replace({-1: np.nan}, inplace=True)

Impute the missing values (NaN) in the numeric columns with the mean of each column.

In [None]:
# Fill the NaNs in every numeric column with that column's mean.
numeric_cols = df_37_EYE.select_dtypes(include=np.number).columns

for col in numeric_cols:
    # Assign the result back rather than calling fillna(inplace=True) on df[col]:
    # the chained in-place form is deprecated in pandas 2.x and, with Copy-on-Write,
    # only modifies a temporary and leaves the DataFrame unchanged.
    df_37_EYE[col] = df_37_EYE[col].fillna(df_37_EYE[col].mean())

Display the first few rows after imputing missing values.

In [None]:
df_37_EYE.head()

Visualize the distributions of numeric columns again using histograms with KDE plots after imputation. This helps assess the impact of imputation on the data distributions.

In [None]:
# Same histogram grid as before imputation: numeric columns, with 'UnixTime' left out.
numeric_cols = df_37_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [col for col in numeric_cols if col != 'UnixTime']

# Grid geometry: four panels per row, with the row count rounded up (ceiling division).
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

plt.figure(figsize=(n_cols * 5, n_rows * 4))

# Subplot positions are 1-based, so enumerate from 1.
for i, col in enumerate(cols_to_plot, start=1):
    plt.subplot(n_rows, n_cols, i)
    sns.histplot(df_37_EYE[col], kde=True)
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

## Student 38

In [None]:
# NOTE(review): RAPIDS documents the pandas accelerator as `%load_ext cudf.pandas`, loaded
# before pandas is first imported — confirm that plain `cudf` registers an extension here.
%load_ext cudf

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
df_38_EYE = pd.read_csv('data/STData/38/38_EYE.csv')

In [None]:
df_38_EYE.head()

In [None]:
df_38_EYE.shape

In [None]:
df_38_EYE.columns

In [None]:
df_38_EYE.info()

In [None]:
df_38_EYE.isnull().sum()

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_38_EYE.isnull(), cmap='viridis')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` column.  
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.  
- These missing values in `QuestionKey` require additional investigation and context-aware handling.  

In [None]:
df_38_EYE['QuestionKey'].unique()

In [None]:
df_38_EYE['Timestamp'] = pd.to_datetime(df_38_EYE['Timestamp'])

In [None]:
df_38_EYE.head(3)

In [None]:
# Mark "no question on screen" explicitly. Assign back instead of using inplace=True on the
# column selection, which is deprecated in pandas 2.x and has no effect under Copy-on-Write.
df_38_EYE['QuestionKey'] = df_38_EYE['QuestionKey'].fillna('None')

In [None]:
df_38_EYE['QuestionKey'].value_counts()

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_38_EYE.isnull(), cmap='viridis')
plt.show()

In [None]:
df_38_EYE.isnull().sum()

In [None]:
df_38_EYE.dropna(inplace=True)

In [None]:
df_38_EYE.head()

In [None]:
df_38_EYE['Row'].unique()

In [None]:
plt.figure(figsize=(8,6))
sns.histplot(df_38_EYE['Row'])
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

In [None]:
df_38_EYE.drop('Row', axis=1, inplace=True)

In [None]:
df_38_EYE['ET_ValidityLeft'].unique()

In [None]:
df_38_EYE['ET_ValidityLeft'].value_counts()

In [None]:
df_38_EYE['ET_ValidityRight'].unique()

In [None]:
df_38_EYE['ET_ValidityRight'].value_counts()

In [None]:
# Side-by-side bar charts of how often each validity code occurs per eye.
plt.figure(figsize=(12, 6))

for position, validity_col in enumerate(['ET_ValidityLeft', 'ET_ValidityRight'], start=1):
    # Count once and reuse the result for both the categories and the bar heights.
    counts = df_38_EYE[validity_col].value_counts()
    plt.subplot(1, 2, position)
    sns.barplot(x=counts.index, y=counts.values)
    plt.title(f'Count of {validity_col}')
    plt.xlabel('Validity')
    plt.ylabel('Count')

plt.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

In [None]:
validity_map = {4.0: 1.0, 0.0: 0.0}

In [None]:
# Recode both validity columns through validity_map, then store them as compact int8 flags.
for validity_col in ('ET_ValidityLeft', 'ET_ValidityRight'):
    recoded = df_38_EYE[validity_col].map(validity_map)
    df_38_EYE[validity_col] = recoded.astype(np.int8)

In [None]:
df_38_EYE.head(3)

In [None]:
df_38_EYE.describe()

In [None]:
df_38_EYE[df_38_EYE['ET_ValidityLeft'] == 1].shape

In [None]:
df_38_EYE[df_38_EYE['ET_ValidityRight'] == 1].shape

In [None]:
# One panel per marker value: cells equal to -1 on the left, cells equal to 1 on the right.
plt.figure(figsize=(18, 8))

for position, marker in enumerate((-1, 1), start=1):
    plt.subplot(1, 2, position)
    sns.heatmap(df_38_EYE == marker, cmap='viridis')
    plt.title(f'Heatmap of {marker} Values')

plt.tight_layout()
plt.show()

In [None]:
df_38_EYE[df_38_EYE['ET_PupilLeft'] == -1].shape

In [None]:
df_38_EYE[df_38_EYE['ET_PupilRight'] == -1].shape

In [None]:
df_38_EYE[df_38_EYE['ET_PupilLeft'] == -1].shape[0] / df_38_EYE.shape[0]

In [None]:
df_38_EYE[df_38_EYE['ET_PupilRight'] == -1].shape[0] / df_38_EYE.shape[0]

In [None]:
# Where do -1 values sit within the rows flagged invalid (validity == 1) for each eye?
plt.figure(figsize=(18, 8))

for position, validity_col in enumerate(('ET_ValidityLeft', 'ET_ValidityRight'), start=1):
    invalid_rows = df_38_EYE[df_38_EYE[validity_col] == 1]
    plt.subplot(1, 2, position)
    sns.heatmap(invalid_rows == -1, cmap='viridis')
    # Name the subset in the title; the two panels were previously titled identically.
    plt.title(f'Heatmap of -1 Values ({validity_col} == 1)')

plt.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

In [None]:
df_38_EYE.drop(['ET_PupilLeft', 'ET_PupilRight'], axis=1, inplace=True)

In [None]:
df_38_EYE.head()

In [None]:
# Re-draw the marker heatmaps now that the pupil columns are gone:
# cells equal to -1 on the left, cells equal to 1 on the right.
plt.figure(figsize=(18, 8))

for position, marker in enumerate((-1, 1), start=1):
    plt.subplot(1, 2, position)
    sns.heatmap(df_38_EYE == marker, cmap='viridis')
    # Title is derived from the marker so the "== 1" panel is no longer mislabelled as -1.
    plt.title(f'Heatmap of {marker} Values')

plt.tight_layout()
plt.show()

In [None]:
valid_left_ratio  = 1 - df_38_EYE['ET_ValidityLeft'].mean()
valid_right_ratio = 1 - df_38_EYE['ET_ValidityRight'].mean()

In [None]:
valid_left_ratio

In [None]:
valid_right_ratio

In [None]:
df_38_EYE.head()

In [None]:
# Numeric columns to chart; 'UnixTime' is left out of the grid.
numeric_cols = df_38_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [col for col in numeric_cols if col != 'UnixTime']

# Grid geometry: four panels per row, with the row count rounded up (ceiling division).
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

plt.figure(figsize=(n_cols * 5, n_rows * 4))

# Subplot positions are 1-based, so enumerate from 1.
for i, col in enumerate(cols_to_plot, start=1):
    plt.subplot(n_rows, n_cols, i)
    sns.histplot(df_38_EYE[col], kde=True)
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
df_38_EYE.columns

In [None]:
cols = ['ET_GazeLeftx', 'ET_GazeLefty',
       'ET_GazeRightx', 'ET_GazeRighty', 'ET_TimeSignal', 'ET_DistanceLeft',
       'ET_DistanceRight', 'ET_CameraLeftX', 'ET_CameraLeftY',
       'ET_CameraRightX', 'ET_CameraRightY', 'ET_ValidityLeft',
       'ET_ValidityRight']

In [None]:
from IPython.display import display, Markdown

# One full-size figure per signal, each introduced by a rendered heading.
for col in cols:
    # Render a Markdown heading in the cell output so each figure is clearly labelled
    display(Markdown(f'### {col} over Time'))
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_38_EYE['Timestamp'], df_38_EYE[col])
    ax.set_xlabel("Timestamp")
    ax.set_ylabel(col)
    plt.show()

In [None]:
# Box plots for every numeric column (nothing is excluded in this grid).
numeric_cols = df_38_EYE.select_dtypes(include=np.number).columns

# Grid geometry: four panels per row, with the row count rounded up.
n_cols = 4
n_rows = (len(numeric_cols) + n_cols - 1) // n_cols

plt.figure(figsize=(n_cols * 5, n_rows * 4))

for i, col in enumerate(numeric_cols):
    plt.subplot(n_rows, n_cols, i + 1)
    # Pass the values explicitly as x= so they run along the axis labelled below;
    # a bare positional Series is interpreted differently across seaborn versions.
    sns.boxplot(x=df_38_EYE[col])
    plt.title(f'Boxplot of {col}')
    plt.xlabel(col)

plt.tight_layout()
plt.show()

In [None]:
df_38_EYE.replace({-1: np.nan}, inplace=True)

In [None]:
# Fill the NaNs in every numeric column with that column's mean.
numeric_cols = df_38_EYE.select_dtypes(include=np.number).columns

for col in numeric_cols:
    # Assign the result back rather than calling fillna(inplace=True) on df[col]:
    # the chained in-place form is deprecated in pandas 2.x and, with Copy-on-Write,
    # only modifies a temporary and leaves the DataFrame unchanged.
    df_38_EYE[col] = df_38_EYE[col].fillna(df_38_EYE[col].mean())

In [None]:
df_38_EYE.head()

In [None]:
# Same histogram grid as before imputation: numeric columns, with 'UnixTime' left out.
numeric_cols = df_38_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [col for col in numeric_cols if col != 'UnixTime']

# Grid geometry: four panels per row, with the row count rounded up (ceiling division).
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

plt.figure(figsize=(n_cols * 5, n_rows * 4))

# Subplot positions are 1-based, so enumerate from 1.
for i, col in enumerate(cols_to_plot, start=1):
    plt.subplot(n_rows, n_cols, i)
    sns.histplot(df_38_EYE[col], kde=True)
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

# Eye Tracking Data Analysis

This notebook performs exploratory data analysis and cleaning on eye-tracking data.

## Data Loading and Initial Inspection

In [None]:
# NOTE(review): RAPIDS documents the pandas accelerator as `%load_ext cudf.pandas`, loaded
# before pandas is first imported — confirm that plain `cudf` registers an extension here.
%load_ext cudf

Import necessary libraries for data manipulation, analysis, and visualization.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Import libraries for datashader, which can be used for visualizing large datasets.

In [None]:
import datashader as ds
import datashader.transfer_functions as tf

Set pandas display options to show all columns.

In [None]:
pd.set_option('display.max_columns', None)

Load the eye-tracking data from the specified CSV file into a pandas DataFrame.

In [None]:
df_38_EYE = pd.read_csv('data/STData/38/38_EYE.csv')

Display the first few rows of the DataFrame to get an initial look at the data structure and content.

In [None]:
df_38_EYE.head()

Check the dimensions (number of rows and columns) of the DataFrame.

In [None]:
df_38_EYE.shape

List the names of all columns in the DataFrame.

In [None]:
df_38_EYE.columns

Display concise information about the DataFrame, including the data types of each column and the number of non-null values. This helps identify columns with missing data.

In [None]:
df_38_EYE.info()

Calculate and display the number of missing values in each column.

In [None]:
df_38_EYE.isnull().sum()

Visualize the distribution of missing values using a heatmap. This provides a visual representation of which columns have missing data and the extent of missingness.

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_38_EYE.isnull(), cmap='viridis')
plt.show()

# Notes & Observations

- We observe many **null** (or missing) values in the `QuestionKey` column.
- The nulls in the `QuestionKey` column may not represent “true” nulls. Rather, they follow interval patterns, suggesting that during those periods no question was displayed.
- These missing values in `QuestionKey` require additional investigation and context-aware handling.

Display the unique values in the `QuestionKey` column to understand the different types of questions or states recorded.

In [None]:
df_38_EYE['QuestionKey'].unique()

Convert the `Timestamp` column to datetime objects. This is crucial for any time series analysis or manipulation.

In [None]:
df_38_EYE['Timestamp'] = pd.to_datetime(df_38_EYE['Timestamp'])

Display the first few rows again to see the effect of the timestamp conversion.

In [None]:
df_38_EYE.head(3)

Fill the missing values in the `QuestionKey` column with the string 'None'. This helps to explicitly mark periods where no question was active.

In [None]:
# Mark "no question on screen" explicitly. Assign back instead of using inplace=True on the
# column selection, which is deprecated in pandas 2.x and has no effect under Copy-on-Write.
df_38_EYE['QuestionKey'] = df_38_EYE['QuestionKey'].fillna('None')

Count the occurrences of each unique value in the `QuestionKey` column after filling missing values.

In [None]:
df_38_EYE['QuestionKey'].value_counts()

Display the heatmap of missing values again to confirm that the missing values in `QuestionKey` have been handled.

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df_38_EYE.isnull(), cmap='viridis')
plt.show()

Recalculate and display the number of missing values per column to confirm the changes after handling `QuestionKey`.

In [None]:
df_38_EYE.isnull().sum()

Drop rows that still contain any missing values. This is done after handling `QuestionKey` separately.

In [None]:
df_38_EYE.dropna(inplace=True)

Display the first few rows after dropping rows with missing values.

In [None]:
df_38_EYE.head()

Display the unique values in the `Row` column.

In [None]:
df_38_EYE['Row'].unique()

Plot a histogram of the `Row` column to visualize its distribution.

In [None]:
plt.figure(figsize=(8,6))
sns.histplot(df_38_EYE['Row'])
plt.show()

# Notes & Observations

- The `Row` column appears to be a simple row index and does not provide meaningful information relevant to the eye-tracking data itself. Therefore, it can be dropped.

Drop the `Row` column as it is not relevant for the analysis.

In [None]:
df_38_EYE.drop('Row', axis=1, inplace=True)

Display the unique values in the `ET_ValidityLeft` column.

In [None]:
df_38_EYE['ET_ValidityLeft'].unique()

Count the occurrences of each unique value in the `ET_ValidityLeft` column.

In [None]:
df_38_EYE['ET_ValidityLeft'].value_counts()

Display the unique values in the `ET_ValidityRight` column.

In [None]:
df_38_EYE['ET_ValidityRight'].unique()

Count the occurrences of each unique value in the `ET_ValidityRight` column.

In [None]:
df_38_EYE['ET_ValidityRight'].value_counts()

Visualize the counts of valid and invalid data for both left and right eyes using bar plots.

In [None]:
# Side-by-side bar charts of how often each validity code occurs per eye.
plt.figure(figsize=(12, 6))

for position, validity_col in enumerate(['ET_ValidityLeft', 'ET_ValidityRight'], start=1):
    # Count once and reuse the result for both the categories and the bar heights.
    counts = df_38_EYE[validity_col].value_counts()
    plt.subplot(1, 2, position)
    sns.barplot(x=counts.index, y=counts.values)
    plt.title(f'Count of {validity_col}')
    plt.xlabel('Validity')
    plt.ylabel('Count')

plt.tight_layout()
plt.show()

# Notes & Observations

- The `ET_ValidityLeft` and `ET_ValidityRight` columns indicate the validity of the eye-tracking data for the left and right eye, respectively.
- Based on the value counts and the bar plots, it appears that a value of `0.0` represents valid eye-tracking data, while a value of `4.0` represents invalid data.
- Although the amount of invalid data is relatively small, removing these rows could introduce unwanted patterns or gaps in the time series data.
- Therefore, we will keep the data and replace the value `4.0` with `1.0` in both `ET_ValidityLeft` and `ET_ValidityRight` columns. This will indicate to a machine learning model that the eye tracker had invalid data at those specific points in time while maintaining the integrity of the time series.

Define a mapping to convert validity values from `0.0` and `4.0` to `0` and `1`.

In [None]:
validity_map = {4.0: 1.0, 0.0: 0.0}

Apply the mapping to the `ET_ValidityLeft` and `ET_ValidityRight` columns and convert the data type to integer.

In [None]:
# Recode both validity columns through validity_map, then store them as compact int8 flags.
for validity_col in ('ET_ValidityLeft', 'ET_ValidityRight'):
    recoded = df_38_EYE[validity_col].map(validity_map)
    df_38_EYE[validity_col] = recoded.astype(np.int8)

Display the first few rows to see the updated validity columns.

In [None]:
df_38_EYE.head(3)

Display descriptive statistics for the DataFrame, including count, mean, standard deviation, min, max, and quartiles for numeric columns.

In [None]:
df_38_EYE.describe()

Check the number of rows where `ET_ValidityLeft` is 1 (invalid data for the left eye).

In [None]:
df_38_EYE[df_38_EYE['ET_ValidityLeft'] == 1].shape

Check the number of rows where `ET_ValidityRight` is 1 (invalid data for the right eye).

In [None]:
df_38_EYE[df_38_EYE['ET_ValidityRight'] == 1].shape

Visualize the distribution of -1 and 1 values across columns using heatmaps. This helps identify columns with placeholder or indicator values.

In [None]:
# One panel per marker value: cells equal to -1 on the left, cells equal to 1 on the right.
plt.figure(figsize=(18, 8))

for position, marker in enumerate((-1, 1), start=1):
    plt.subplot(1, 2, position)
    sns.heatmap(df_38_EYE == marker, cmap='viridis')
    plt.title(f'Heatmap of {marker} Values')

plt.tight_layout()
plt.show()

Check the number of rows where `ET_PupilLeft` is -1.

In [None]:
df_38_EYE[df_38_EYE['ET_PupilLeft'] == -1].shape

Check the number of rows where `ET_PupilRight` is -1.

In [None]:
df_38_EYE[df_38_EYE['ET_PupilRight'] == -1].shape

Calculate the ratio of rows with -1 values in the `ET_PupilLeft` column to the total number of rows.

In [None]:
df_38_EYE[df_38_EYE['ET_PupilLeft'] == -1].shape[0] / df_38_EYE.shape[0]

Calculate the ratio of rows with -1 values in the `ET_PupilRight` column to the total number of rows.

In [None]:
df_38_EYE[df_38_EYE['ET_PupilRight'] == -1].shape[0] / df_38_EYE.shape[0]

Visualize the distribution of -1 values specifically for rows where validity is 1 (invalid data).

In [None]:
# Where do -1 values sit within the rows flagged invalid (validity == 1) for each eye?
plt.figure(figsize=(18, 8))

for position, validity_col in enumerate(('ET_ValidityLeft', 'ET_ValidityRight'), start=1):
    invalid_rows = df_38_EYE[df_38_EYE[validity_col] == 1]
    plt.subplot(1, 2, position)
    sns.heatmap(invalid_rows == -1, cmap='viridis')
    # Name the subset in the title; the two panels were previously titled identically.
    plt.title(f'Heatmap of -1 Values ({validity_col} == 1)')

plt.tight_layout()
plt.show()

# Notes & Observations

- The heatmaps reveal the distribution of -1 values across different columns.
- It is evident that the `-1` values are not randomly scattered but appear in specific columns, notably `ET_GazeLeftx`, `ET_GazeLefty`, `ET_GazeRightx`, `ET_GazeRighty`, `ET_PupilLeft`, `ET_PupilRight`, `ET_DistanceLeft`, `ET_DistanceRight`, `ET_CameraLeftX`, `ET_CameraLeftY`, `ET_CameraRightX`, and `ET_CameraRightY`.
- These `-1` values often coincide with instances where `ET_ValidityLeft` or `ET_ValidityRight` is 1, indicating invalid eye-tracking data. This suggests that `-1` is used as a placeholder for missing or invalid measurements in these columns when the eye tracker is not providing valid data for a particular eye.
- Given that over 70% of the data in the `ET_PupilLeft` and `ET_PupilRight` columns is marked as invalid (-1), these columns may not be suitable for analysis and could be considered for dropping.

Drop the `ET_PupilLeft` and `ET_PupilRight` columns due to the high percentage of invalid data.

In [None]:
df_38_EYE.drop(['ET_PupilLeft', 'ET_PupilRight'], axis=1, inplace=True)

Display the first few rows after dropping the pupil columns.

In [None]:
df_38_EYE.head()

Visualize the distribution of -1 and 1 values again after dropping the pupil columns.

In [None]:
# Re-draw the marker heatmaps now that the pupil columns are gone:
# cells equal to -1 on the left, cells equal to 1 on the right.
plt.figure(figsize=(18, 8))

for position, marker in enumerate((-1, 1), start=1):
    plt.subplot(1, 2, position)
    sns.heatmap(df_38_EYE == marker, cmap='viridis')
    # Title is derived from the marker so the "== 1" panel is no longer mislabelled as -1.
    plt.title(f'Heatmap of {marker} Values')

plt.tight_layout()
plt.show()

Calculate the ratio of valid data points for the left eye.

In [None]:
valid_left_ratio  = 1 - df_38_EYE['ET_ValidityLeft'].mean()

Display the ratio of valid data points for the left eye.

In [None]:
valid_left_ratio

Calculate the ratio of valid data points for the right eye.

In [None]:
valid_right_ratio = 1 - df_38_EYE['ET_ValidityRight'].mean()

Display the ratio of valid data points for the right eye.

In [None]:
valid_right_ratio

Display the first few rows of the DataFrame.

In [None]:
df_38_EYE.head()

Visualize the distributions of numeric columns using histograms with Kernel Density Estimate (KDE) plots. This helps understand the shape and spread of the data in each numeric column.

In [None]:
# Numeric columns to chart; 'UnixTime' is left out of the grid.
numeric_cols = df_38_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [col for col in numeric_cols if col != 'UnixTime']

# Grid geometry: four panels per row, with the row count rounded up (ceiling division).
n_cols = 4
n_rows = -(-len(cols_to_plot) // n_cols)

plt.figure(figsize=(n_cols * 5, n_rows * 4))

# Subplot positions are 1-based, so enumerate from 1.
for i, col in enumerate(cols_to_plot, start=1):
    plt.subplot(n_rows, n_cols, i)
    sns.histplot(df_38_EYE[col], kde=True)
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

List the names of the columns in the DataFrame.

In [None]:
df_38_EYE.columns

Define a list of columns to be plotted over time.

In [None]:
cols = ['ET_GazeLeftx', 'ET_GazeLefty',
       'ET_GazeRightx', 'ET_GazeRighty', 'ET_TimeSignal', 'ET_DistanceLeft',
       'ET_DistanceRight', 'ET_CameraLeftX', 'ET_CameraLeftY',
       'ET_CameraRightX', 'ET_CameraRightY', 'ET_ValidityLeft',
       'ET_ValidityRight']

Plot each of the selected columns against the timestamp to visualize their trends and patterns over time. A Markdown heading is rendered before each plot for better readability.

In [None]:
from IPython.display import Markdown, display

for col in cols:
    # Render a Markdown heading in the cell output ahead of each figure so the
    # plots are clearly separated and labelled
    display(Markdown(f'### {col} over Time'))

    # One full-size figure per column, drawn through explicit figure/axes objects
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.plot(df_38_EYE['Timestamp'], df_38_EYE[col])
    ax.set_xlabel("Timestamp")
    ax.set_ylabel(col)
    plt.show()

Visualize the distributions of numeric columns using boxplots. This helps identify potential outliers and the spread of the data.

In [None]:
# Select only the numeric columns for the boxplots, excluding time-related
# columns (same 'UnixTime' filter the histogram cells use)
numeric_cols = df_38_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [col for col in numeric_cols if col not in ['UnixTime']]

# Calculate the number of rows and columns for the grid
n_cols = 4  # You can adjust the number of columns as needed
n_rows = (len(cols_to_plot) + n_cols - 1) // n_cols  # ceiling division

plt.figure(figsize=(n_cols * 5, n_rows * 4)) # Adjust figure size as needed

for i, col in enumerate(cols_to_plot):
    plt.subplot(n_rows, n_cols, i + 1)
    # Pass the values explicitly as x=. A positional Series is bound to `x` by
    # seaborn 0.11 but to `data` by seaborn >= 0.12, which changes the box
    # orientation; x= keeps the box horizontal so the x-axis label below
    # describes the value axis on every version.
    sns.boxplot(x=df_38_EYE[col])
    plt.title(f'Boxplot of {col}')
    plt.xlabel(col)

plt.tight_layout()
plt.show()

Replace the placeholder value -1 with NaN (Not a Number) in the DataFrame. This is done before imputation.

In [None]:
# Turn every -1 placeholder, in any column, into NaN so the imputation step
# treats it as missing. The frame is modified in place.
df_38_EYE.replace(to_replace=-1, value=np.nan, inplace=True)

Impute the missing values (NaN) in the numeric columns with the mean of each column.

In [None]:
# Numeric columns of the frame in its current state
numeric_cols = df_38_EYE.select_dtypes(include=np.number).columns

for col in numeric_cols:
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # df[col]: that form is chained assignment on a column selection, which
    # raises a FutureWarning on pandas 2.x and leaves the DataFrame unchanged
    # under Copy-on-Write. mean() skips NaN, so each gap gets the mean of the
    # observed values in that column.
    df_38_EYE[col] = df_38_EYE[col].fillna(df_38_EYE[col].mean())

Display the first few rows after imputing missing values.

In [None]:
# Preview the first five rows (the default size of head()) after imputation.
df_38_EYE.head(n=5)

Visualize the distributions of numeric columns again using histograms with KDE plots after imputation. This helps assess the impact of imputation on the data distributions.

In [None]:
# Numeric columns only; the 'UnixTime' column is left out of the histograms
numeric_cols = df_38_EYE.select_dtypes(include=np.number).columns
cols_to_plot = [name for name in numeric_cols if name != 'UnixTime']

# Grid layout: fixed number of columns, as many rows as needed (ceiling division)
n_cols = 4  # You can adjust the number of columns as needed
n_rows = -(-len(cols_to_plot) // n_cols)

# One 5x4-inch slot per subplot
fig = plt.figure(figsize=(5 * n_cols, 4 * n_rows))

for position, col in enumerate(cols_to_plot, start=1):
    # Subplot positions are 1-based, hence enumerate(..., start=1)
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.histplot(df_38_EYE[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()