In [4]:
# Cell 1: Scan Sapimouse dataset and robustly read sessions
import os, glob
import pandas as pd
from pathlib import Path
from tqdm import tqdm

BASE_PATH = "data/unprocessed/sapimouse/sapimouse"   # <<-- adjust to your path

# Per-user folders found directly under BASE_PATH.
# - Only real directories are kept: a stray *file* whose name starts with
#   "user" would otherwise be counted as a user in the summary printout even
#   though it can never contain session CSVs.
# - The sort is lexicographic (user1, user10, user100, ...), which keeps the
#   processing order deterministic from run to run.
user_dirs = sorted(
    entry
    for entry in os.listdir(BASE_PATH)
    if entry.lower().startswith("user")
    and os.path.isdir(os.path.join(BASE_PATH, entry))
)

def read_session_csv(path, user_id):
    """
    Read a Sapimouse session CSV into a normalized DataFrame.

    Parameters
    ----------
    path : str or path-like
        CSV file path.
    user_id : str
        Folder name like 'user1'; stored verbatim in the 'user_id' column.

    Returns
    -------
    pandas.DataFrame
        Columns, in this order: user_id, timestamp (ms), button, state, x, y.
        timestamp, x and y are converted with ``pd.to_numeric(errors="coerce")``,
        so unparseable values become NaN instead of raising.

    Notes
    -----
    * Column labels are compared after ``strip().lower()``.
    * The timestamp column may be labelled 'client timestamp',
      'client_timestamp' or 'timestamp'; otherwise the first column is used.
    * Any of button/state/x/y that is not present by name is taken from
      column positions 1-4 respectively.
    * If the first column label looks like a number, the file is treated as
      headerless and re-read with ``header=None`` so that the first data row
      is not silently consumed as a header.
    """
    def _looks_numeric(label):
        # True for labels such as "15585", "-3" or "12.5" (i.e. a data value
        # that pandas mistook for a column name).
        text = str(label).strip().lstrip("+-")
        return text.replace(".", "", 1).isdigit()

    read_opts = dict(skipinitialspace=True, engine="python")

    try:
        df = pd.read_csv(path, **read_opts)
        headerless = len(df.columns) > 0 and _looks_numeric(df.columns[0])
    except Exception:
        # Best-effort fallback kept from the original: retry without a header
        # row. If this also fails the error propagates to the caller.
        headerless = True

    if headerless:
        df = pd.read_csv(path, header=None, **read_opts)

    # Standardize column names. str() is required because header=None yields
    # integer labels, which have no .strip().
    df.columns = [str(c).strip().lower() for c in df.columns]

    # Resolve the timestamp column: known aliases first, then position 0.
    for alias in ("client timestamp", "client_timestamp"):
        if alias in df.columns:
            df = df.rename(columns={alias: "timestamp"})
            break
    else:
        if "timestamp" not in df.columns:
            df = df.rename(columns={df.columns[0]: "timestamp"})

    # Fill in any remaining expected columns by position.
    positional = {"button": 1, "state": 2, "x": 3, "y": 4}
    rename_map = {
        df.columns[pos]: name
        for name, pos in positional.items()
        if name not in df.columns
    }
    df = df.rename(columns=rename_map)

    # Cast numeric
    for col in ("timestamp", "x", "y"):
        df[col] = pd.to_numeric(df[col], errors="coerce")

    # Add user_id
    df["user_id"] = user_id

    return df[["user_id", "timestamp", "button", "state", "x", "y"]]

# Collect file lists: one (user, csv_path) tuple per session, split by the
# "1min" / "3min" marker in the file name. Files with neither marker are skipped.
all_1min, all_3min = [], []
for user in user_dirs:
    user_folder = os.path.join(BASE_PATH, user)
    # glob returns matches in arbitrary, OS-dependent order; sort them so the
    # session lists (and every table derived from them) are reproducible.
    csv_files = sorted(glob.glob(os.path.join(user_folder, "*.csv")))
    for fpath in csv_files:
        fname = os.path.basename(fpath).lower()
        if "1min" in fname:
            all_1min.append((user, fpath))
        elif "3min" in fname:
            all_3min.append((user, fpath))

print(f"Found {len(user_dirs)} users, {len(all_1min)} 1-min sessions, {len(all_3min)} 3-min sessions")

# Example read (guarded so a wrong BASE_PATH gives a hint instead of IndexError)
if all_1min:
    test_user, test_path = all_1min[0]
    print("Example file:", test_path)
    print(read_session_csv(test_path, test_user).head())
else:
    print(f"No 1-min sessions found under {BASE_PATH!r}; check BASE_PATH")


Found 120 users, 122 1-min sessions, 122 3-min sessions
Example file: data/unprocessed/sapimouse/sapimouse\user1\session_2020_05_14_1min.csv
  user_id  timestamp    button state    x    y
0   user1      15585  NoButton  Move  967  706
1   user1      15601  NoButton  Move  958  700
2   user1      15617  NoButton  Move  901  679
3   user1      15634  NoButton  Move  818  647
4   user1      15651  NoButton  Move  690  599


In [6]:
# Cell 2: feature extraction and build df_1min and df_3min
import numpy as np

def extract_basic_features(raw_df, user_id_from_folder, idle_threshold_ms=500):
    """
    Compute per-session summary features from raw mouse events.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Must contain the columns 'timestamp', 'state', 'x' and 'y'
        (the frame returned by read_session_csv satisfies this).
    user_id_from_folder : str
        Folder name like 'user12'; returned unchanged as 'user_id'.
    idle_threshold_ms : float, default 500
        An inter-event gap strictly greater than this counts as idle time.

    Returns
    -------
    dict
        user_id, n_events, total_moves, clicks, total_drags,
        avg_speed_px_per_ms, path_length_px, idle_fraction.

    Notes
    -----
    * 'clicks' is pressed + released events, so a complete press/release
      pair contributes 2 (kept as the original "simple proxy").
    * Zero time gaps are replaced by 1 ms to avoid division by zero.
    * Rows with a NaN timestamp are still counted as events, but are ignored
      when measuring the session duration.
    """
    # enforce user_id
    user_id = user_id_from_folder

    # Stable sort: events that share a timestamp keep their recorded order,
    # otherwise dx/dy between them (and hence path length) could change.
    df = raw_df.sort_values('timestamp', kind='mergesort').reset_index(drop=True)

    # counts -- normalize the state labels once; astype(str) keeps this from
    # raising when the column holds no strings at all (e.g. all NaN).
    state = df['state'].astype(str).str.strip().str.lower()
    total_events = len(df)
    total_moves = (state == 'move').sum()
    total_pressed = (state == 'pressed').sum()
    total_released = (state == 'released').sum()
    total_drags = (state == 'drag').sum()
    clicks = total_pressed + total_released  # simple proxy

    # distances & speeds
    dx = df['x'].diff().fillna(0).astype(float)
    dy = df['y'].diff().fillna(0).astype(float)
    dt = df['timestamp'].diff().fillna(1).astype(float)  # ms
    dt[dt == 0] = 1.0  # avoid division by zero
    dist = np.sqrt(dx**2 + dy**2)
    speed = dist / dt

    if total_events:
        avg_speed = float(speed.replace([np.inf, -np.inf], np.nan).fillna(0).mean())
    else:
        avg_speed = 0.0  # mean of an empty series would be NaN
    path_length = float(dist.sum())

    # idle time fraction: gaps above the threshold count as idle
    idle_time = float(dt[dt > idle_threshold_ms].sum())
    # NaN timestamps sort last; drop them so the last valid timestamp is used
    # instead of NaN (which would silently force idle_fraction to 0).
    valid_ts = df['timestamp'].dropna()
    total_time = float(valid_ts.iloc[-1] - valid_ts.iloc[0]) if len(valid_ts) > 1 else 0.0
    idle_fraction = float(idle_time / total_time) if total_time > 0 else 0.0

    return {
        "user_id": user_id,
        "n_events": total_events,
        "total_moves": int(total_moves),
        "clicks": int(clicks),
        "total_drags": int(total_drags),
        "avg_speed_px_per_ms": avg_speed,
        "path_length_px": path_length,
        "idle_fraction": idle_fraction
    }

# Build feature tables
def build_feature_rows(sessions, desc):
    """
    Extract one feature dict per session.

    sessions: iterable of (user, csv_path) tuples
    desc: label shown on the tqdm progress bar
    Returns a list of dicts as produced by extract_basic_features, each with
    an extra 'session_file' key holding the CSV's base name.
    """
    rows = []
    for user, path in tqdm(sessions, desc=desc):
        raw = read_session_csv(path, user)
        feats = extract_basic_features(raw, user)
        feats['session_file'] = os.path.basename(path)
        rows.append(feats)
    return rows

# Same pipeline for both session lengths; only the input list and label differ.
rows_1min = build_feature_rows(all_1min, "1min sessions")
rows_3min = build_feature_rows(all_3min, "3min sessions")

df_1min = pd.DataFrame(rows_1min)
df_3min = pd.DataFrame(rows_3min)

print("1min shape:", df_1min.shape)
print("3min shape:", df_3min.shape)
display(df_1min.head())
display(df_3min.head())


1min sessions:   0%|          | 0/122 [00:00<?, ?it/s]

1min sessions: 100%|██████████| 122/122 [00:03<00:00, 32.20it/s]
3min sessions: 100%|██████████| 122/122 [00:07<00:00, 17.06it/s]

1min shape: (122, 9)
3min shape: (122, 9)





Unnamed: 0,user_id,n_events,total_moves,clicks,total_drags,avg_speed_px_per_ms,path_length_px,idle_fraction,session_file
0,user1,2055,1591,102,362,1.12443,38617.726413,0.0,session_2020_05_14_1min.csv
1,user10,1760,1333,118,309,1.280491,37444.983442,0.093976,session_2020_05_14_1min.csv
2,user100,2512,2119,168,225,1.293277,54045.074824,0.0,session_2020_03_31_1min.csv
3,user101,1691,1417,67,207,0.616227,16718.778541,0.260148,session_2020_06_08_1min.csv
4,user102,2236,1742,168,326,1.120358,41592.445983,0.03715,session_2020_03_31_1min.csv


Unnamed: 0,user_id,n_events,total_moves,clicks,total_drags,avg_speed_px_per_ms,path_length_px,idle_fraction,session_file
0,user1,6293,4680,312,1301,1.179172,123674.594335,0.015372,session_2020_05_14_3min.csv
1,user10,6629,5095,489,1045,1.51699,167392.541949,0.107184,session_2020_05_14_3min.csv
2,user100,7881,6496,626,759,1.280644,168105.726338,0.003352,session_2020_03_31_3min.csv
3,user101,6725,5287,394,1044,0.984115,104725.463919,0.013842,session_2020_06_08_3min.csv
4,user102,6988,5390,453,1145,1.0091,119308.752369,0.030959,session_2020_03_31_3min.csv


In [None]:
# Cell 3: quick standardization (z-score) for numeric columns and save versioned CSVs
from sklearn.preprocessing import StandardScaler
import os

# Feature columns that are z-scored below; the order here is the column order
# handed to the scaler.
NUMERIC_COLS = [
    "n_events",
    "total_moves",
    "clicks",
    "total_drags",
    "avg_speed_px_per_ms",
    "path_length_px",
    "idle_fraction",
]

def get_versioned_filename(base_name):
    """
    Pick the first unused "<base_name>_v<N>.csv" path, counting N up from 1.

    base_name: path prefix without the version suffix or extension,
               e.g. "sapimouse_1min_features"
    Returns the candidate name for the lowest N whose file does not exist yet
    (sapimouse_1min_features_v1.csv, then ..._v2.csv, and so on).
    The file itself is not created here.
    """
    attempt = 1
    candidate = f"{base_name}_v{attempt}.csv"
    # Step past every version that is already on disk.
    while os.path.exists(candidate):
        attempt += 1
        candidate = f"{base_name}_v{attempt}.csv"
    return candidate

def scale_and_save(features, scaler, base_name):
    """
    Z-score the NUMERIC_COLS of a feature table and write it to a versioned CSV.

    features: DataFrame containing at least the NUMERIC_COLS columns
    scaler: scaler object exposing fit_transform; it is (re)fitted on this table
    base_name: prefix passed to get_versioned_filename
    Returns (scaled_df, out_path). Missing numeric values are filled with 0
    before fitting; non-numeric columns are copied through unchanged.
    """
    numeric = features[NUMERIC_COLS].fillna(0)
    scaled = features.copy()
    scaled[NUMERIC_COLS] = scaler.fit_transform(numeric)
    out_path = get_versioned_filename(base_name)
    scaled.to_csv(out_path, index=False)
    print(f"Saved {out_path}")
    return scaled, out_path

# One scaler per table. Previously scaler_1 was re-fitted on the 3-min data,
# so after this cell it held 3-min statistics and the 1-min fit was lost.
scaler_1 = StandardScaler()
scaler_3 = StandardScaler()

if not df_1min.empty:
    df_1min_scaled, out1 = scale_and_save(df_1min, scaler_1, "sapimouse_1min_features")

if not df_3min.empty:
    df_3min_scaled, out3 = scale_and_save(df_3min, scaler_3, "sapimouse_3min_features")

# Quick sanity: means should be ~0. The reported std is slightly above 1
# because describe() uses the sample std (ddof=1) while StandardScaler divides
# by the population std.
if not df_1min.empty:
    print(df_1min_scaled.describe().loc[['mean','std']])


Saved sapimouse_1min_features_v1.csv
Saved sapimouse_3min_features_v1.csv
          n_events   total_moves        clicks   total_drags  \
mean -2.352399e-16 -7.689659e-17  2.775558e-16  5.460113e-17   
std   1.004124e+00  1.004124e+00  1.004124e+00  1.004124e+00   

      avg_speed_px_per_ms  path_length_px  idle_fraction  
mean         2.466151e-16   -1.879189e-16   7.189149e-17  
std          1.004124e+00    1.004124e+00   1.004124e+00  
