In [3]:
import os
import pandas as pd
import random


In [4]:
# Folder holding the user keystroke files (*.txt).
DATA_DIR = "D:\\ML_project\\Keystrokes_dataset\\files"
SAMPLES_PER_USER = 150  # keystroke rows kept per user
NUM_USERS = 30          # number of user files drawn at random

random.seed(42)  # reproducibility

# Per-user sampled frames are collected in this list by the processing step.
all_users = []

# Step 1: Get all .txt files
# os.listdir() returns entries in arbitrary, platform-dependent order. The
# names are sorted so that the seeded random.sample() on this list picks the
# same files on every machine; without sorting the fixed seed guarantees
# nothing.
all_files = sorted(
    file for file in os.listdir(DATA_DIR)
    if file.endswith(".txt")
)

In [5]:
# Step 2: Randomly select NUM_USERS user files
# A dedicated generator seeded with 42 draws exactly what random.seed(42)
# followed by random.sample() draws, but it does not depend on (or advance)
# the shared global generator, so re-running this cell always yields the same
# selection.
selected_files = random.Random(42).sample(all_files, NUM_USERS)

print(f"Selected {len(selected_files)} user files")

Selected 30 user files


In [6]:
# Step 3: Process selected users only
for file in selected_files:
    file_path = os.path.join(DATA_DIR, file)

    df = pd.read_csv(
        file_path,
        sep="\t",
        engine="python",
        on_bad_lines="skip"
    )

# Remove duplicate keystroke entries
    df = df.drop_duplicates(
        subset=[
            "PARTICIPANT_ID",
            "KEYSTROKE_ID",
            "PRESS_TIME",
            "RELEASE_TIME"
        ]
    )

    #keep only rows where LETTER is not null
    df = df[df["LETTER"].notna()]

    #sample exactly 150 valid keystrokes

    if len(df) >= SAMPLES_PER_USER:
        df_sampled = df.sample(n=SAMPLES_PER_USER, random_state=42)
        all_users.append(df_sampled)
    else:
        print(f"Skipped {file} (only {len(df)} rows after deduplication)")


Skipped 179016_keystrokes.txt (only 68 rows after deduplication)
Skipped 403193_keystrokes.txt (only 29 rows after deduplication)
Skipped 70903_keystrokes.txt (only 36 rows after deduplication)
Skipped 61412_keystrokes.txt (only 80 rows after deduplication)


In [7]:
# Step 4: Combine all users
final_df = pd.concat(all_users, ignore_index=True)

final_df = final_df.drop_duplicates(
    subset=[
        "PARTICIPANT_ID",
        "KEYSTROKE_ID",
        "PRESS_TIME",
        "RELEASE_TIME"
    ]
)

final_df = final_df[final_df["LETTER"].notna()]

print("Final dataset shape after deduplication:", final_df.shape)

Final dataset shape after deduplication: (3900, 9)


In [8]:
# Destination file for the combined, de-duplicated sample.
OUTPUT_PATH = "D:\\ML_project\\Keystrokes_dataset\\final_dataset.csv"

# index=False: the row index is not written out as an extra column.
final_df.to_csv(path_or_buf=OUTPUT_PATH, index=False)

print("CSV file saved successfully!")


CSV file saved successfully!


In [9]:
# Number of missing values in each column (isna is the same check as isnull).
final_df.isna().sum()

PARTICIPANT_ID     0
TEST_SECTION_ID    0
SENTENCE           0
USER_INPUT         0
KEYSTROKE_ID       0
PRESS_TIME         0
RELEASE_TIME       0
LETTER             0
KEYCODE            0
dtype: int64

In [23]:
# Ensure timestamps are numeric
final_df["PRESS_TIME"] = pd.to_numeric(final_df["PRESS_TIME"], errors="coerce")
final_df["RELEASE_TIME"] = pd.to_numeric(final_df["RELEASE_TIME"], errors="coerce")

# Create HOLD_TIME feature
final_df["HOLD_TIME"] = final_df["RELEASE_TIME"] - final_df["PRESS_TIME"]
final_df["PRESS_PRESS"] = final_df.groupby("PARTICIPANT_ID")["PRESS_TIME"].diff()
final_df["RELEASE_RELEASE"] = final_df.groupby("PARTICIPANT_ID")["RELEASE_TIME"].diff()

In [24]:
# Discard every row that has a missing value in any column.
final_df = final_df.dropna(axis=0, how="any")

In [26]:
# Sanity check: count of missing HOLD_TIME values.
print(final_df["HOLD_TIME"].isna().sum())


0


In [27]:
# Sanity check: count of rows whose release precedes the press (negative hold).
print(final_df["HOLD_TIME"].lt(0).sum())


0


In [28]:
# Summary statistics (count, mean, std, quartiles, min/max) of HOLD_TIME.
print(final_df.loc[:, "HOLD_TIME"].describe())





count    3874.000000
mean      126.957666
std       104.973956
min         8.000000
25%        89.000000
50%       113.000000
75%       143.000000
max      2949.000000
Name: HOLD_TIME, dtype: float64


In [29]:
# Persist the frame including the derived timing columns; the row index is
# not written.
final_df.to_csv(
    path_or_buf="D:\\ML_project\\Keystrokes_dataset\\keystrokes_with_hold_time.csv",
    index=False,
)


In [30]:
!pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 25.2 -> 26.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [37]:
import numpy as np
from sklearn.model_selection import train_test_split

# Step 1: target is the participant label, features are the three timing columns
y = final_df["PARTICIPANT_ID"]
X = final_df[["HOLD_TIME", "PRESS_PRESS", "RELEASE_RELEASE"]]

# Step 2: 80/20 hold-out split. stratify=y keeps each participant's share of
# rows the same in both parts; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# Step 3: report (rows, columns) of both feature matrices
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)


Training set shape: (3099, 3)
Testing set shape: (775, 3)


In [38]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training split
# only, so that no statistics of the test split enter the scaling.
scaler = StandardScaler().fit(X_train)

# Apply that one fitted transformation to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [39]:
# Mean and standard deviation taken over all scaled training values at once
# (every feature pooled together).
print("Training features mean:", X_train_scaled.mean())
print("Training features std:", X_train_scaled.std())

Training features mean: -1.490328422859197e-17
Training features std: 1.0


In [40]:
# Preview: the first five rows of the scaled training matrix, all columns.
print(X_train_scaled[0:5, :])

[[-0.27977284 -0.09757644 -0.09776457]
 [-0.48367231  0.17909714  0.17889328]
 [ 0.98794998 -1.29391326 -1.29415469]
 [-0.30636842  0.88763801  0.88708465]
 [-0.56345906 -0.15328024 -0.15401885]]


In [41]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
import numpy as np

# Choose k by 5-fold cross-validation on the training split only. Scoring
# every candidate on the test split would tune k on the very data later used
# to report the final accuracy, which makes that figure optimistically biased.
# NOTE(review): the scaler was fitted on the whole training split before the
# folds are formed; wrap scaler + classifier in a Pipeline for stricter CV.
k_values = range(1, 16)
accuracies = []  # mean cross-validated accuracy per k, in k_values order

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    fold_scores = cross_val_score(
        knn, X_train_scaled, y_train, cv=5, scoring="accuracy"
    )
    acc = fold_scores.mean()
    accuracies.append(acc)

# np.argmax returns the first maximum, so ties resolve to the smallest k.
best_k = k_values[np.argmax(accuracies)]
print("Best k:", best_k)


Best k: 15


In [42]:
# Fit the classifier with the selected k on the scaled training split
# (fit() returns the estimator itself, so the call can be chained).
knn_final = KNeighborsClassifier(n_neighbors=best_k).fit(X_train_scaled, y_train)

# Predict the held-out rows and compute the fraction labelled correctly.
y_final_pred = knn_final.predict(X_test_scaled)
final_accuracy = accuracy_score(y_true=y_test, y_pred=y_final_pred)

print("Final accuracy:", final_accuracy)


Final accuracy: 0.11483870967741935
