# Random Forest Model

## 1: Mount Google Drive

In [None]:
# Import Colab drive utility
from google.colab import drive

# Mount Google Drive at /content/drive so that files stored in Drive can be
# read through ordinary filesystem paths. The dataset directory used later
# in this notebook (DATA_DIR) lives under this mount point.
drive.mount('/content/drive')


## 2: Locate Dataset Files

In [None]:
import glob
import os
import random

# Path to the directory containing BioEye .txt files
DATA_DIR = "/content/drive/MyDrive/RF_eye_tracking/RAN"

# Get full paths to all .txt files in the directory.
# glob.glob() returns matches in arbitrary (filesystem-dependent) order, so
# the list is sorted: everything built by iterating over it then comes out
# in the same order on every run and on every machine.
all_files = sorted(glob.glob(os.path.join(DATA_DIR, "*.txt")))

print(f"Total files found: {len(all_files)}")

## 3: Group Files by User ID

In [None]:
# Mapping from numeric user ID to the list of file paths for that user
user_files = {}

for f in all_files:
    # File names look like ID_001_1.txt: the second "_"-separated token
    # ("001") is the user ID, stored as an int (1)
    name_parts = os.path.basename(f).split("_")
    uid = int(name_parts[1])

    # First file seen for this user: start an empty list
    if uid not in user_files:
        user_files[uid] = []
    user_files[uid].append(f)

print(f"Total users found: {len(user_files)}")


## 4: Select a Subset of Users

In [None]:
# Number of users to sample for demonstration
NUM_USERS = 10

# Fixed seed so that the same users are drawn on every run
RANDOM_SEED = 42

num_available_users = len(user_files)

if num_available_users == 0:
    print("No users found. Check directory path.")
    SELECTED_USERS = []

elif num_available_users < NUM_USERS:
    print(f"Only {num_available_users} users available. Using all.")
    SELECTED_USERS = sorted(user_files.keys())

else:
    # Reproducible sample for the teaching demo:
    #  - a dedicated random.Random(seed) instance is used, so the draw does
    #    not depend on (or disturb) the global random state;
    #  - the population is sorted first, because the dict's key order follows
    #    the order in which files were discovered, which is not guaranteed.
    rng = random.Random(RANDOM_SEED)
    SELECTED_USERS = sorted(rng.sample(sorted(user_files), NUM_USERS))

print("Selected users:", SELECTED_USERS)


## 5: Collect Files for Selected Users

In [None]:
# Flatten the per-user file lists into one list, keeping the order of
# SELECTED_USERS and, within each user, the order stored in user_files
selected_files = [
    path
    for u in SELECTED_USERS
    for path in user_files[u]
]

print(f"Total selected files: {len(selected_files)}")


## 6: Load and Clean Eye-Tracking Files

In [None]:
import pandas as pd
import numpy as np

def load_file(path):
    """
    Read one BioEye .txt recording and return only its valid gaze samples.

    The file is parsed as whitespace-separated columns. Its first line is
    skipped (presumably a header row) and the columns are named
    SAMPLE, X, Y, VALID, XS, YS. Rows whose VALID value is not 1 are
    dropped, and the remaining rows are re-indexed from 0.
    """
    column_names = ["SAMPLE", "X", "Y", "VALID", "XS", "YS"]

    raw = pd.read_csv(
        path,
        sep=r"\s+",
        skiprows=1,
        header=None,
        names=column_names,
    )

    # Boolean mask selecting the rows flagged as valid
    is_valid = raw["VALID"] == 1

    return raw[is_valid].reset_index(drop=True)


## 7: Windowing (Temporal Segmentation)


In [None]:
def window_by_index(df, win=6000, step=3000):
    """
    Cut a gaze recording into fixed-length windows of consecutive rows.

    win  = number of rows in each window
    step = number of rows between the starts of consecutive windows
           (with the defaults, neighbouring windows share half their rows)

    Only full windows are returned: trailing rows that cannot fill a
    window are left out, and a recording shorter than `win` yields an
    empty list.
    """
    last_start = len(df) - win

    segments = []
    for start in range(0, last_start + 1, step):
        segments.append(df.iloc[start:start + win])

    return segments


## 8: Feature Extraction

In [None]:
def extract_features(w):
    """
    Summarise one gaze window as a list of eight statistics.

    Order of the returned values:
    X mean, X std, Y mean, Y std, then mean, std, 75th percentile and
    90th percentile of the sample-to-sample displacement magnitude.
    """
    gaze_x = w["X"]
    gaze_y = w["Y"]

    # Displacement between consecutive samples along each axis
    step_x = np.diff(gaze_x)
    step_y = np.diff(gaze_y)

    # Euclidean length of each displacement (velocity in pixels/sample)
    speed = np.sqrt(step_x**2 + step_y**2)

    # Gaze position statistics
    position_stats = [
        gaze_x.mean(),
        gaze_x.std(),
        gaze_y.mean(),
        gaze_y.std(),
    ]

    # Velocity statistics; the upper percentiles capture fast movements
    speed_stats = [
        speed.mean(),
        speed.std(),
        np.percentile(speed, 75),
        np.percentile(speed, 90),
    ]

    return position_stats + speed_stats


## 9: Build Feature Matrix (X) and Labels (y)

In [None]:
X = []  # Feature vectors
y = []  # Corresponding user IDs
files_without_windows = []  # Files too short to yield a single window

for f in selected_files:
    # Extract user ID from filename
    uid = int(os.path.basename(f).split("_")[1])

    # Load gaze data
    df = load_file(f)

    # Split into windows
    windows = window_by_index(df)

    # A recording with fewer valid samples than one window contributes
    # nothing; remember it so the omission is visible rather than silent
    if not windows:
        files_without_windows.append(f)
        continue

    for w in windows:
        X.append(extract_features(w))
        y.append(uid)

if files_without_windows:
    print(
        f"Note: {len(files_without_windows)} file(s) produced no windows "
        "and were skipped."
    )

# With no samples, np.array([]) is 1-D and X.shape[1] below would fail with
# an unhelpful IndexError, so stop here with an actionable message instead
if not X:
    raise ValueError(
        "No feature windows were produced. Check DATA_DIR, the selected "
        "users, and that the recordings are at least one window long."
    )

# Convert lists to NumPy arrays
X = np.array(X)
y = np.array(y)

print("Total samples:", X.shape[0])
print("Feature dimension:", X.shape[1])
print("Number of users:", len(np.unique(y)))


## 10: Train–Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the windows for testing. Stratifying on y keeps each
# user's share of windows the same in both partitions, and the fixed
# random_state makes the shuffle repeatable.
# NOTE(review): windows are created with the default 50% overlap and are
# split at random here, so a test window can share samples with a training
# window from the same recording - the reported accuracy may be optimistic.
split_parts = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)
X_train, X_test, y_train, y_test = split_parts

print("Training samples:", X_train.shape[0])
print("Testing samples:", X_test.shape[0])


## 11: Train Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Forest configuration: 300 trees, with a fixed seed for repeatable training
forest_params = {"n_estimators": 300, "random_state": 42}
rf = RandomForestClassifier(**forest_params)

# Fit on the training windows
rf.fit(X_train, y_train)

# Predicted user ID for every held-out window
y_pred = rf.predict(X_test)


## 12: Evaluation

In [None]:
# Overall fraction of test windows assigned to the correct user
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

# Per-user breakdown of the predictions
print("\nClassification Report:")
report_text = classification_report(y_test, y_pred)
print(report_text)


## 13: Inspect Individual Predictions

In [None]:
# Position of the test window to inspect (0-based index into the test set)
i = 1

# Look up the actual and the predicted user for that window
true_user, predicted_user = y_test[i], y_pred[i]

print("True user ID:", true_user)
print("Predicted user ID:", predicted_user)


## 14: Tabular View of Predictions

In [None]:
# Display the first predictions (up to 5) in a table.
# The row count is clamped to the size of the test set: with fewer than 5
# test samples a hard-coded range(5) would not match the length of the
# sliced arrays and the DataFrame constructor would raise.
n_show = min(5, len(y_test))

results_df = pd.DataFrame({
    "Index": range(n_show),
    "True_User_ID": y_test[:n_show],
    "Predicted_User_ID": y_pred[:n_show],
})

results_df
