# Feature Engineering

## 1: Locate and Collect All Eye-Tracking Files

In [None]:
# Colab helper module for attaching Google Drive to the runtime.
# NOTE(review): presumably importable only inside a Google Colab runtime — confirm
# before running this notebook elsewhere.
from google.colab import drive

# Mount Google Drive at /content/drive so that files stored in Drive can be
# read through ordinary file paths under that directory.
drive.mount('/content/drive')


In [None]:
import glob
import os
import random

# Folder containing all eye-tracking text files.
# Each .txt file corresponds to one recording session.
DATA_DIR = "/content/drive/MyDrive/RF_eye_tracking/RAN"

# Collect the full path of every .txt file directly inside DATA_DIR.
# glob returns its matches in arbitrary (filesystem-dependent) order, so the
# list is sorted to keep the file order, and everything derived from it,
# identical from one run to the next.
all_files = sorted(glob.glob(os.path.join(DATA_DIR, "*.txt")))

# Report how many recordings were found (an empty list means nothing matched)
print("Total files found:", len(all_files))


## 2: Group Files by User ID

In [None]:
# Group the recording paths by user.
# Key   → user ID as an int (e.g., 1, 2, 3)
# Value → list of file paths belonging to that user
user_files = {}

for f in all_files:
    # File format example: ID_001_1.txt
    # The user ID is the second "_"-separated field of the file name (001 → 1)
    try:
        uid = int(os.path.basename(f).split("_")[1])
    except (IndexError, ValueError):
        # A .txt file that does not follow the naming scheme has no "_" field
        # (IndexError) or a non-numeric one (ValueError). Report it and keep
        # going instead of aborting the whole grouping step.
        print("Warning: Skipping file with unexpected name:", f)
        continue

    # Append to this user's list, creating the list on first sight of the user
    user_files.setdefault(uid, []).append(f)

# Print number of unique users found
print("Total users found:", len(user_files))

## 3: Select a Subset of Users

In [None]:
# How many users to keep for the demonstration
NUM_USERS = 10

# How many distinct users the grouping step produced
num_available_users = len(user_files)

if num_available_users >= NUM_USERS:
    # Enough users: draw NUM_USERS of them at random, then sort for display
    SELECTED_USERS = sorted(random.sample(list(user_files), NUM_USERS))

elif num_available_users > 0:
    # Fewer users than requested: fall back to every available user
    print(f"Warning: Only {num_available_users} users found. Using all.")
    SELECTED_USERS = sorted(user_files)

else:
    # Nothing was found at all: continue with an empty selection
    print("Warning: No user files found. Check the folder path.")
    SELECTED_USERS = []

print("Selected users:", SELECTED_USERS)

## 4: Collect Files for Selected Users

In [None]:
# Flatten the per-user file lists into a single list of files to analyze,
# following the order of SELECTED_USERS and, within a user, the stored order
selected_files = [path for user in SELECTED_USERS for path in user_files[user]]

print("Total selected files:", len(selected_files))


## 5: Load and Clean Eye-Tracking Files

In [None]:
import pandas as pd
import numpy as np

def load_file(path):
    """
    Read one BioEye eye-tracking recording and keep only its valid samples.

    The first line of the file is skipped and the remainder is parsed as
    whitespace-separated columns named SAMPLE, X, Y, VALID, XS and YS.

    Returns a DataFrame holding only the rows with VALID == 1, with the
    row index renumbered from 0.
    """
    column_names = ["SAMPLE", "X", "Y", "VALID", "XS", "YS"]

    # The file's own first row is header text, so skip it and name the
    # columns ourselves
    recording = pd.read_csv(
        path,
        sep=r"\s+",
        skiprows=1,
        header=None,
        names=column_names,
    )

    # VALID == 1 means the eye tracker successfully detected gaze;
    # every other row is discarded
    is_valid = recording["VALID"] == 1

    # Renumber the surviving rows so positions and index labels agree again
    return recording.loc[is_valid].reset_index(drop=True)


## 6: Windowing (Temporal Segmentation)

### Split Long Signals into Time Windows

In [None]:
def window_by_index(df, win=6000, step=3000):
    """
    Break a long eye-tracking signal into fixed-length windows by row position.

    Consecutive windows start `step` rows apart, so they overlap whenever
    step < win. Only complete windows are produced: a trailing remainder
    shorter than `win` is dropped, and a signal with fewer than `win` rows
    yields an empty list.

    Parameters
    ----------
    df   : object supporting len() and .iloc slicing (e.g. a DataFrame)
    win  : number of samples per window (must be positive)
    step : how far to move forward for the next window (must be positive)

    Returns
    -------
    list of `df.iloc` slices, each exactly `win` rows long, in start order.

    Raises
    ------
    ValueError if `win` or `step` is not positive.
    """
    # A non-positive step would either make range() fail or silently produce
    # no windows, and a non-positive win would produce empty or nonsensical
    # slices; reject both up front with a clear message
    if win <= 0:
        raise ValueError(f"win must be a positive number of samples, got {win}")
    if step <= 0:
        raise ValueError(f"step must be a positive number of samples, got {step}")

    # Last row position at which a full window still fits (negative if none)
    last_start = len(df) - win

    return [df.iloc[start:start + win] for start in range(0, last_start + 1, step)]


## 6: Feature Engineering (Turning Motion into Numbers)

In [None]:
def extract_features(w):
    """
    Extract numerical features from one gaze window.

    Parameters
    ----------
    w : window of gaze samples with "X" and "Y" columns and at least 2 rows

    Returns
    -------
    list of 8 values, in this order:
        mean of X, std of X, mean of Y, std of Y,
        mean speed, std of speed, 75th and 90th percentile of speed.

    "Speed" is the Euclidean distance between consecutive rows; it is not
    divided by any time step, so it is a per-sample displacement.

    Raises
    ------
    ValueError if the window has fewer than 2 rows, because no movement
    between samples can be computed from it.
    """
    # With fewer than 2 rows the displacement array is empty, which would
    # yield NaNs or version-dependent numpy errors; fail with a clear message
    if len(w) < 2:
        raise ValueError(
            f"extract_features needs a window of at least 2 samples, got {len(w)}"
        )

    gaze_x = w["X"]
    gaze_y = w["Y"]

    # Row-to-row change in horizontal and vertical gaze position
    dx = np.diff(gaze_x)
    dy = np.diff(gaze_y)

    # Magnitude of the movement between consecutive rows.
    # NOTE(review): if rows were removed upstream (e.g. invalid samples),
    # neighbouring rows may not be adjacent in time — confirm this is acceptable.
    vel = np.sqrt(dx**2 + dy**2)

    # Both upper percentiles in one pass over the data
    vel_p75, vel_p90 = np.percentile(vel, [75, 90])

    # NOTE: when w is a pandas DataFrame, the position std below uses pandas'
    # default (ddof=1) while vel.std() uses numpy's default (ddof=0). This is
    # kept as-is so that feature values do not change.
    return [
        gaze_x.mean(),   # Average horizontal gaze position
        gaze_x.std(),    # Variability in horizontal gaze
        gaze_y.mean(),   # Average vertical gaze position
        gaze_y.std(),    # Variability in vertical gaze
        vel.mean(),      # Average eye movement speed
        vel.std(),       # Speed variability
        vel_p75,         # Fast movements (upper quartile)
        vel_p90,         # Very fast movements (presumably dominated by saccades)
    ]


## 7: Build the Feature Matrix (X) and Labels (y)

In [None]:
# Accumulators for the dataset:
#   X → one feature vector per window (input to machine learning model)
#   y → the user ID each window came from (label)
X = []
y = []

for f in selected_files:
    # The user ID is the second "_"-separated field of the file name
    uid = int(os.path.basename(f).split("_")[1])

    # Load the cleaned gaze data and cut it into time windows
    df = load_file(f)
    windows = window_by_index(df)

    # One feature vector per window, each labelled with this file's user ID
    X.extend(extract_features(w) for w in windows)
    y.extend([uid] * len(windows))

## 8: Convert to NumPy Arrays

In [None]:
# Convert the accumulated lists to NumPy arrays:
# X becomes (samples, features) and y a 1-D array of user IDs
X = np.array(X)
y = np.array(y)

if X.shape[0] == 0:
    # Nothing was extracted (no files selected, or every recording was shorter
    # than one window); say so instead of failing on the summary below
    print("Warning: No feature windows were extracted. Check the selected files and window size.")

print("Total samples:", X.shape[0])
# An empty list converts to a 1-D array of shape (0,), which has no second
# axis, so X.shape[1] would raise IndexError in that case; report 0 features
print("Features per sample:", X.shape[1] if X.ndim == 2 else 0)
print("Number of unique users:", len(np.unique(y)))
