In [18]:
import os
import pandas as pd
import random


In [None]:
# --- Configuration -----------------------------------------------------------
# Folder with the raw keystroke logs (presumably one .txt file per participant
# -- TODO confirm against the dataset layout).
DATA_DIR = "D:\\ML_project\\Keystrokes_dataset\\files"
SAMPLES_PER_USER = 150  # keystrokes sampled from each user file
NUM_USERS = 30          # user files drawn at random

random.seed(42)  # reproducibility

# Per-user samples are appended to this list by the processing step.
all_users = []

# Step 1: Get all .txt files
# os.listdir() returns entries in arbitrary order, so the listing is sorted;
# otherwise a seeded random draw over this list could still pick different
# files on another machine or filesystem.
all_files = sorted(
    file for file in os.listdir(DATA_DIR)
    if file.endswith(".txt")
)

In [20]:
# Step 2: Randomly select 30 user files
# Fail early with a readable message instead of random.sample's generic
# "Sample larger than population" error.
if len(all_files) < NUM_USERS:
    raise ValueError(
        f"Need {NUM_USERS} user files but found only {len(all_files)} in {DATA_DIR}"
    )

# Draw from a dedicated generator seeded with 42 so that re-running this cell
# always yields the same selection. The module-level generator would have
# advanced after the first draw, so a second run would silently pick other
# users. A fresh Random(42) produces the same sequence as the module-level
# generator does immediately after random.seed(42).
selected_files = random.Random(42).sample(all_files, NUM_USERS)

print(f"Selected {len(selected_files)} user files")

Selected 30 user files


In [21]:
# Step 3: Process selected users only
# Start from an empty list so that re-running this cell does not append every
# user's sample a second time.
all_users = []

for file in selected_files:
    file_path = os.path.join(DATA_DIR, file)

    # Tab-separated log; malformed lines are dropped instead of aborting the load.
    # NOTE(review): default CSV quoting is active here -- if the logs can contain
    # literal double-quote characters, rows may be merged or skipped; confirm.
    df = pd.read_csv(
        file_path,
        sep="\t",
        engine="python",
        on_bad_lines="skip"
    )

    # Remove duplicate keystroke entries
    df = df.drop_duplicates(
        subset=[
            "PARTICIPANT_ID",
            "KEYSTROKE_ID",
            "PRESS_TIME",
            "RELEASE_TIME"
        ]
    )

    # Keep only rows where LETTER is not null
    df = df[df["LETTER"].notna()]

    # Sample exactly SAMPLES_PER_USER valid keystrokes; users with fewer valid
    # rows are skipped entirely so that every kept user contributes equally.
    if len(df) >= SAMPLES_PER_USER:
        df_sampled = df.sample(n=SAMPLES_PER_USER, random_state=42)
        all_users.append(df_sampled)
    else:
        print(f"Skipped {file} (only {len(df)} rows after deduplication)")

# NOTE(review): skipped users are not replaced by other files, so the combined
# dataset can contain fewer than NUM_USERS participants. The summary below
# makes that visible.
print(f"Kept {len(all_users)} of {len(selected_files)} users")


Skipped 179016_keystrokes.txt (only 68 rows after deduplication)
Skipped 403193_keystrokes.txt (only 29 rows after deduplication)
Skipped 70903_keystrokes.txt (only 36 rows after deduplication)
Skipped 61412_keystrokes.txt (only 80 rows after deduplication)


In [22]:
# Step 4: Combine all users
# Columns that together identify a single keystroke event.
dedup_keys = ["PARTICIPANT_ID", "KEYSTROKE_ID", "PRESS_TIME", "RELEASE_TIME"]

# Stack the per-user samples into one frame with a fresh 0..n-1 index, then
# drop any keystroke that appears more than once.
final_df = pd.concat(all_users, ignore_index=True).drop_duplicates(subset=dedup_keys)

# Keep only keystrokes that have a recorded LETTER.
has_letter = final_df["LETTER"].notna()
final_df = final_df[has_letter]

print("Final dataset shape after deduplication:", final_df.shape)

Final dataset shape after deduplication: (3900, 9)


In [23]:
# Persist the combined per-user samples as a CSV file (row index omitted).
OUTPUT_PATH = "D:\\ML_project\\Keystrokes_dataset\\final_dataset.csv"

final_df.to_csv(OUTPUT_PATH, index=False)

print("CSV file saved successfully!")


CSV file saved successfully!


In [24]:
# Number of missing values in each column of the combined dataset.
final_df.isna().sum()

PARTICIPANT_ID     0
TEST_SECTION_ID    0
SENTENCE           0
USER_INPUT         0
KEYSTROKE_ID       0
PRESS_TIME         0
RELEASE_TIME       0
LETTER             0
KEYCODE            0
dtype: int64

In [25]:
# Ensure timestamps are numeric; values that cannot be parsed become NaN.
for time_col in ("PRESS_TIME", "RELEASE_TIME"):
    final_df[time_col] = pd.to_numeric(final_df[time_col], errors="coerce")

# Create HOLD_TIME feature: how long each key stayed pressed (release minus press).
final_df["HOLD_TIME"] = final_df["RELEASE_TIME"].sub(final_df["PRESS_TIME"])

In [26]:
# Count rows whose HOLD_TIME could not be computed (NaN after numeric coercion).
print(final_df["HOLD_TIME"].isna().sum())


0


In [27]:
# Count rows where the release timestamp precedes the press timestamp.
print(final_df["HOLD_TIME"].lt(0).sum())


0


In [28]:
# Summary statistics (count, mean, std, quartiles, min/max) of the hold times.
hold_time_summary = final_df["HOLD_TIME"].describe()
print(hold_time_summary)


count    3900.000000
mean      126.944872
std       104.731410
min         8.000000
25%        89.000000
50%       113.000000
75%       143.000000
max      2949.000000
Name: HOLD_TIME, dtype: float64


In [29]:
# Save the dataset again, now including the HOLD_TIME column (row index omitted).
hold_time_csv_path = "D:\\ML_project\\Keystrokes_dataset\\keystrokes_with_hold_time.csv"
final_df.to_csv(hold_time_csv_path, index=False)


In [31]:
!pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Downloading scikit_learn-1.8.0-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.10.0 (from scikit-learn)
  Downloading scipy-1.17.0-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.3.0 (from scikit-learn)
  Downloading joblib-1.5.3-py3-none-any.whl.metadata (5.5 kB)
Collecting threadpoolctl>=3.2.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.8.0-cp312-cp312-win_amd64.whl (8.0 MB)
   ---------------------------------------- 0.0/8.0 MB ? eta -:--:--
   - -------------------------------------- 0.3/8.0 MB ? eta -:--:--
   ----- ---------------------------------- 1.0/8.0 MB 3.6 MB/s eta 0:00:02
   --------- ------------------------------ 1.8/8.0 MB 3.6 MB/s eta 0:00:02
   ------------- -------------------------- 2.6/8.0 MB 3.7 MB/s eta 0:00:02
   ---------------- ----------------------- 3.4/


[notice] A new release of pip is available: 25.0.1 -> 26.0
[notice] To update, run: C:\Users\USER\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [32]:
import numpy as np
from sklearn.model_selection import train_test_split

# Step 1: Select features and target.
# Hold time is the only predictor for now; the participant id is the class label.
feature_columns = ["HOLD_TIME"]    # you can add more features later
X = final_df[feature_columns]
y = final_df["PARTICIPANT_ID"]

# Step 2: Train-test split (80% train, 20% test).
# Stratifying on y keeps each participant's share the same in both partitions.
split_options = {"test_size": 0.20, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Step 3: Check shapes
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)


Training set shape: (3120, 1)
Testing set shape: (780, 1)


In [33]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training data only, so nothing about
# the test set influences the transformation.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted scaler to both partitions.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [34]:
# Sanity check: after standardisation the training features should have a
# mean close to 0 and a standard deviation close to 1.
print("Training features mean:", X_train_scaled.mean())
print("Training features std:", X_train_scaled.std())

Training features mean: 1.1386902816668272e-18
Training features std: 1.0000000000000002


In [35]:
# Example: peek at the first 5 scaled training samples.
preview_rows = X_train_scaled[:5]
print(preview_rows)

[[-0.07660121]
 [ 0.17127859]
 [ 0.17127859]
 [-0.4632937 ]
 [ 0.24068493]]
