# Random Forest Model

## 1: Locate Dataset Files

In [1]:
import glob
import os
import random

# Path to the directory containing BioEye .txt files.
# The BIOEYE_DATA_DIR environment variable, when set, overrides the default
# below so the notebook can run on machines other than the original one.
DATA_DIR = os.environ.get(
    "BIOEYE_DATA_DIR",
    "C:\\Users\\admin\\Desktop\\HPC_ML Course\\RAN",
)

# Get full paths to all .txt files in the directory.
# glob returns matches in a filesystem-dependent order, so sort them to make
# every later step that iterates over this list deterministic.
all_files = sorted(glob.glob(os.path.join(DATA_DIR, "*.txt")))

print(f"Total files found: {len(all_files)}")

Total files found: 306


## 2: Group Files by User ID

In [2]:
# Mapping from user ID to every recording file of that user:
# { user_id : [file1, file2, ...] }
user_files = {}

for f in all_files:
    # Filenames look like ID_001_1.txt; the second "_"-separated field,
    # converted to int, is the user ID (1 in this example).
    name_parts = os.path.basename(f).split("_")
    uid = int(name_parts[1])

    # First file seen for this user: start a fresh list before appending
    if uid not in user_files:
        user_files[uid] = []
    user_files[uid].append(f)

print(f"Total users found: {len(user_files)}")


Total users found: 153


## 3: Select a Subset of Users

In [3]:
# Number of users to sample for demonstration
NUM_USERS = 10

# Fixed seed so the same users are drawn on every run
USER_SAMPLE_SEED = 42

num_available_users = len(user_files)

if num_available_users == 0:
    print("No users found. Check directory path.")
    SELECTED_USERS = []

elif num_available_users < NUM_USERS:
    print(f"Only {num_available_users} users available. Using all.")
    SELECTED_USERS = sorted(user_files.keys())

else:
    # Randomly sample users (reproducible teaching demo).
    # A dedicated seeded generator drawing from the *sorted* IDs gives the
    # same selection on every run, independent of the order in which files
    # were discovered, and leaves the global `random` state untouched.
    user_rng = random.Random(USER_SAMPLE_SEED)
    SELECTED_USERS = sorted(user_rng.sample(sorted(user_files.keys()), NUM_USERS))

print("Selected users:", SELECTED_USERS)


Selected users: [42, 80, 142, 192, 195, 198, 215, 259, 283, 288]


## 4: Collect Files for Selected Users

In [4]:
# Flatten the per-user file lists into a single list, following the order
# of SELECTED_USERS
selected_files = [path for u in SELECTED_USERS for path in user_files[u]]

print(f"Total selected files: {len(selected_files)}")


Total selected files: 20


## 5: Load and Clean Eye-Tracking Files

In [5]:
import pandas as pd
import numpy as np

def load_file(path):
    """
    Read one BioEye .txt recording and return only its valid gaze samples.

    The file is parsed as whitespace-separated text. Its first line is
    skipped (presumably a header row - confirm against the data files) and
    the columns are named SAMPLE, X, Y, VALID, XS, YS.

    Returns a DataFrame holding the rows whose VALID value equals 1,
    re-indexed from 0.
    """
    column_names = ["SAMPLE", "X", "Y", "VALID", "XS", "YS"]
    raw = pd.read_csv(path, sep=r"\s+", skiprows=1, header=None, names=column_names)

    # Boolean mask of the rows flagged as valid
    is_valid = raw["VALID"] == 1

    # Drop the invalid rows and renumber so positions are contiguous
    return raw.loc[is_valid].reset_index(drop=True)


## 6: Windowing (Temporal Segmentation)


In [6]:
def window_by_index(df, win=6000, step=3000):
    """
    Split a gaze signal into fixed-length, possibly overlapping windows.

    win  = window size (samples)
    step = stride between the starts of consecutive windows (samples);
           windows overlap whenever step < win (the defaults overlap by half)

    Returns a list of row slices of `df`, each exactly `win` rows long.
    Trailing rows that do not fill a complete window are dropped, so a
    signal shorter than `win` yields an empty list.

    Raises ValueError if `win` or `step` is not positive.
    """
    # Without this check a non-positive `win` would return empty or
    # wrong-length slices and a negative `step` would silently return nothing.
    if win <= 0 or step <= 0:
        raise ValueError(
            f"win and step must be positive, got win={win}, step={step}"
        )

    # Last row position at which a full window still fits
    last_start = len(df) - win

    return [
        df.iloc[start:start + win]
        for start in range(0, last_start + 1, step)
    ]


## 7: Feature Extraction

In [7]:
def extract_features(w):
    """
    Summarize one gaze window as a list of eight statistics.

    Returned in order: mean and std of X, mean and std of Y, then the mean,
    std, 75th percentile and 90th percentile of the sample-to-sample
    displacement magnitude.
    """
    gaze_x = w["X"]
    gaze_y = w["Y"]

    # Displacement between consecutive samples along each axis
    step_x = np.diff(gaze_x)
    step_y = np.diff(gaze_y)

    # Euclidean length of each step, i.e. velocity magnitude per sample
    speed = np.sqrt(step_x**2 + step_y**2)

    # Position statistics come from the column objects themselves ...
    position_stats = [gaze_x.mean(), gaze_x.std(), gaze_y.mean(), gaze_y.std()]

    # ... while the velocity statistics are computed on the NumPy array
    speed_stats = [
        speed.mean(),
        speed.std(),
        np.percentile(speed, 75),
        np.percentile(speed, 90),
    ]

    return position_stats + speed_stats


## 8: Build Feature Matrix (X) and Labels (y)

In [8]:
X = []  # Feature vectors (one row per window)
y = []  # User ID for each row of X

for f in selected_files:
    # Extract user ID from filename (second "_"-separated field)
    uid = int(os.path.basename(f).split("_")[1])

    # Load gaze data
    df = load_file(f)

    # Split into windows
    windows = window_by_index(df)

    # A recording with too few valid samples yields no window at all;
    # report it rather than letting its user silently drop out of the data.
    if not windows:
        print(f"Skipping {os.path.basename(f)}: {len(df)} valid samples, no full window")
        continue

    for w in windows:
        X.append(extract_features(w))
        y.append(uid)

# Fail early with a clear message: with no rows, X would become a 1-D array
# and the X.shape[1] lookup below would raise an unrelated IndexError.
if not X:
    raise ValueError("No feature windows were extracted from the selected files.")

# Convert lists to NumPy arrays
X = np.array(X)
y = np.array(y)

print("Total samples:", X.shape[0])
print("Feature dimension:", X.shape[1])
print("Number of users:", len(np.unique(y)))


Total samples: 134
Feature dimension: 8
Number of users: 10


## 9: Train–Test Split

The dataset is represented as:

$$
\mathcal{D} = \{(\mathbf{x}_i, y_i)\}_{i=1}^{N}
$$

The dataset is split into two disjoint sets:

$$
\mathcal{D}_{train} \cup \mathcal{D}_{test} = \mathcal{D},
\quad
\mathcal{D}_{train} \cap \mathcal{D}_{test} = \varnothing
$$

with the approximate proportions (split sizes are rounded to whole samples):
$$
|\mathcal{D}_{train}| \approx 0.7N,
\quad
|\mathcal{D}_{test}| \approx 0.3N
$$

Stratification preserves the class proportions in both subsets.
For each class $k$:

$$
P_{train}(y = k) \approx P_{test}(y = k)
$$


In [9]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the windows for testing. Stratifying on y keeps each
# user's share of windows roughly equal in both parts, and the fixed
# random_state makes the split repeatable.
# NOTE(review): window_by_index defaults to win=6000, step=3000, so
# neighbouring windows of one recording share half their samples. A random
# per-window split can place such neighbours on both sides, which likely
# inflates the test score - consider splitting by recording instead.
split_options = {"test_size": 0.30, "stratify": y, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Training samples:", X_train.shape[0])
print("Testing samples:", X_test.shape[0])


Training samples: 93
Testing samples: 41


Given a feature vector $\mathbf{x}_i$, we want to predict the user identity $y_i$:

$$
f(\mathbf{x}_i) = y_i
$$

At each node, a decision tree splits the data using a threshold:

$$
x_j \le \theta
$$


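Conceptually, the model parameters $\Theta$ are chosen to minimize the classification error on the training set (in practice each tree is grown greedily, picking at every node the split that most reduces an impurity measure, which serves as a tractable surrogate for this objective):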

$$
\Theta^* = \arg\min_{\Theta}
\sum_{(\mathbf{x}_i, y_i) \in \mathcal{D}_{train}}
\mathbb{I}(f_\Theta(\mathbf{x}_i) \neq y_i)
$$


For each test sample:

$$
\hat{y}_i = f(\mathbf{x}_i),
\quad
(\mathbf{x}_i, y_i) \in \mathcal{D}_{test}
$$


## 10: Train Random Forest Classifier

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Build a 300-tree forest with a fixed seed and fit it on the training
# split in one expression; fit() returns the fitted estimator itself.
rf = RandomForestClassifier(n_estimators=300, random_state=42).fit(X_train, y_train)

# Predicted user ID for every held-out sample
y_pred = rf.predict(X_test)


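The Random Forest prediction is, conceptually, the majority vote of its $T$ trees $h_1, \dots, h_T$ (scikit-learn's implementation averages the trees' class-probability estimates and picks the most probable class, which usually yields the same label):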

$$
\hat{y} = \operatorname{mode}
\left(
h_1(\mathbf{x}), h_2(\mathbf{x}), \dots, h_T(\mathbf{x})
\right)
$$


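Classification accuracy is the fraction of evaluated samples that are predicted correctly (here $N$ denotes the number of test samples, $|\mathcal{D}_{test}|$, not the size of the full dataset):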

$$
\text{Accuracy} =
\frac{1}{N}
\sum_{i=1}^{N}
\mathbb{I}(\hat{y}_i = y_i)
$$


## 11: Evaluation

In [11]:
# Overall fraction of test samples assigned to the right user
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

# Per-user precision / recall / F1 breakdown
print("\nClassification Report:")
report_text = classification_report(y_test, y_pred)
print(report_text)


Accuracy: 0.8048780487804879

Classification Report:
              precision    recall  f1-score   support

          42       0.80      1.00      0.89         4
          80       1.00      1.00      1.00         4
         142       0.80      1.00      0.89         4
         192       1.00      1.00      1.00         4
         195       0.57      1.00      0.73         4
         198       1.00      0.80      0.89         5
         215       0.80      1.00      0.89         4
         259       0.33      0.25      0.29         4
         283       1.00      0.75      0.86         4
         288       1.00      0.25      0.40         4

    accuracy                           0.80        41
   macro avg       0.83      0.81      0.78        41
weighted avg       0.83      0.80      0.79        41



Precision:
$$
\text{Precision} = \frac{TP}{TP + FP}
$$

Recall:
$$
\text{Recall} = \frac{TP}{TP + FN}
$$

The F1-score is the harmonic mean of precision and recall:

$$
\text{F1} =
2 \cdot \frac{\text{Precision} \cdot \text{Recall}}
{\text{Precision} + \text{Recall}}
$$



## 12: Inspect Individual Predictions

For a single test example:

$$
\mathbf{x}_i \rightarrow \hat{y}_i
$$

Correct prediction if:

$$
\hat{y}_i = y_i
$$


In [12]:
i = 1  # position of the test sample to inspect

# Look up the ground-truth and predicted labels at that position
true_uid, predicted_uid = y_test[i], y_pred[i]

print("True user ID:", true_uid)
print("Predicted user ID:", predicted_uid)


True user ID: 283
Predicted user ID: 283


## 13: Tabular View of Predictions

In [13]:
# Display the first few (at most 5) predictions in a table.
# Capping the preview at the test-set size keeps all three columns the same
# length, which pd.DataFrame requires, even with fewer than 5 test samples.
n_preview = min(5, len(y_test))

results_df = pd.DataFrame({
    "Index": range(n_preview),
    "True_User_ID": y_test[:n_preview],
    "Predicted_User_ID": y_pred[:n_preview]
})

results_df


Unnamed: 0,Index,True_User_ID,Predicted_User_ID
0,0,259,195
1,1,283,283
2,2,288,288
3,3,198,198
4,4,142,142
