In [18]:
import os
import pandas as pd
import random


In [19]:
# --- Configuration ---
# Folder whose *.txt files are treated as per-user keystroke logs.
DATA_DIR = "D:\\ML_project\\Keystrokes_dataset\\files"
SAMPLES_PER_USER = 150  # rows sampled from each user that has enough data
NUM_USERS = 30          # number of user files drawn at random

random.seed(42)  # reproducibility

# Per-user sampled frames are collected in this list by the processing cell.
all_users = []

# Step 1: Get all .txt files
# os.listdir() returns entries in arbitrary order. Sort them so that a seeded
# random.sample() over this list picks the same files on every run/machine.
all_files = sorted(
    file for file in os.listdir(DATA_DIR)
    if file.endswith(".txt")
)

In [20]:
# Step 2: Randomly select NUM_USERS user files
# Draw from a dedicated, freshly seeded generator instead of the global one.
# On a fresh kernel random.Random(42) produces the same selection as
# random.seed(42) followed by random.sample(), but re-running this cell no
# longer depends on (or advances) shared RNG state, so it always yields the
# same set of files.
selected_files = random.Random(42).sample(all_files, NUM_USERS)

print(f"Selected {len(selected_files)} user files")

Selected 30 user files


In [21]:
# Step 3: Process selected users only
# Start from an empty list every time this cell runs; otherwise re-executing
# it (e.g. after drawing a new selection) keeps appending to the frames
# collected by the previous run.
all_users = []

# Columns used as the duplicate-detection key for keystroke rows.
dedup_columns = [
    "PARTICIPANT_ID",
    "KEYSTROKE_ID",
    "PRESS_TIME",
    "RELEASE_TIME",
]

for file in selected_files:
    file_path = os.path.join(DATA_DIR, file)

    # Tab-separated file; malformed lines are skipped instead of raising.
    df = pd.read_csv(
        file_path,
        sep="\t",
        engine="python",
        on_bad_lines="skip"
    )

    # Remove duplicate keystroke entries
    df = df.drop_duplicates(subset=dedup_columns)

    # Keep only rows where LETTER is not null
    df = df[df["LETTER"].notna()]

    # Sample exactly SAMPLES_PER_USER valid keystrokes.
    # NOTE: users with fewer valid rows are skipped and not replaced, so the
    # combined dataset can contain fewer users than len(selected_files).
    if len(df) >= SAMPLES_PER_USER:
        # Fixed random_state keeps the per-user sample reproducible.
        df_sampled = df.sample(n=SAMPLES_PER_USER, random_state=42)
        all_users.append(df_sampled)
    else:
        print(f"Skipped {file} (only {len(df)} rows after deduplication)")


Skipped 179016_keystrokes.txt (only 68 rows after deduplication)
Skipped 403193_keystrokes.txt (only 29 rows after deduplication)
Skipped 70903_keystrokes.txt (only 36 rows after deduplication)
Skipped 61412_keystrokes.txt (only 80 rows after deduplication)


In [22]:
# Step 4: Combine all users
# Stack the per-user samples, drop rows that repeat the same keystroke key,
# then keep only rows that have a LETTER value.
final_df = (
    pd.concat(all_users, ignore_index=True)
    .drop_duplicates(
        subset=["PARTICIPANT_ID", "KEYSTROKE_ID", "PRESS_TIME", "RELEASE_TIME"]
    )
    .loc[lambda frame: frame["LETTER"].notna()]
)

print("Final dataset shape after deduplication:", final_df.shape)

Final dataset shape after deduplication: (3900, 9)


In [23]:
# Destination file for the combined dataset.
OUTPUT_PATH = "D:\\ML_project\\Keystrokes_dataset\\final_dataset.csv"

# index=False: do not write the DataFrame index as a column.
final_df.to_csv(OUTPUT_PATH, index=False)

print("CSV file saved successfully!")


CSV file saved successfully!


In [24]:
# Per-column count of missing values in the final dataset.
final_df.isna().sum()

PARTICIPANT_ID     0
TEST_SECTION_ID    0
SENTENCE           0
USER_INPUT         0
KEYSTROKE_ID       0
PRESS_TIME         0
RELEASE_TIME       0
LETTER             0
KEYCODE            0
dtype: int64