In [1]:
import os
import pandas as pd
import random


In [2]:
# Configuration: input folder and sampling sizes used by the cells below.
DATA_DIR = "D:\\ML_project\\Keystrokes_dataset\\files"
SAMPLES_PER_USER = 150  # rows kept per selected file
NUM_USERS = 30          # number of files drawn at random

random.seed(42)  # reproducibility

all_users = []

# Step 1: Get all .txt files
# os.listdir() returns names in arbitrary, platform-dependent order, so the
# seeded random selection is only reproducible if the candidate list itself
# has a stable order. Sorting fixes that order.
all_files = sorted(
    file for file in os.listdir(DATA_DIR)
    if file.endswith(".txt")
)

In [3]:
# Step 2: Randomly select 30 user files
# Fail early with a readable message instead of random.sample's generic
# "Sample larger than population" error.
if len(all_files) < NUM_USERS:
    raise ValueError(
        f"Need {NUM_USERS} user files but only {len(all_files)} found in {DATA_DIR}"
    )

# Draw from a dedicated generator seeded right here. On a fresh run this picks
# the same files as seeding the global generator with 42 and sampling once,
# but re-running this cell on its own no longer yields a different selection.
selected_files = random.Random(42).sample(all_files, NUM_USERS)

print(f"Selected {len(selected_files)} user files")

Selected 30 user files


In [10]:
# Step 3: Process selected users only
# Start from an empty list on every run of this cell; without the reset,
# re-running the cell appends each file's sample again on top of the
# results already collected.
all_users = []

# Columns used to detect duplicate keystroke rows.
dedup_keys = ["PARTICIPANT_ID", "KEYSTROKE_ID", "PRESS_TIME", "RELEASE_TIME"]

for file in selected_files:
    file_path = os.path.join(DATA_DIR, file)

    # on_bad_lines="skip" drops malformed rows (too many fields) without
    # raising, so the row count may be lower than the file's line count.
    df = pd.read_csv(
        file_path,
        sep="\t",
        engine="python",
        on_bad_lines="skip"
    )

    # Remove duplicate keystroke entries
    df = df.drop_duplicates(subset=dedup_keys)

    if len(df) >= SAMPLES_PER_USER:
        # Fixed random_state keeps the per-file sample reproducible.
        df_sampled = df.sample(n=SAMPLES_PER_USER, random_state=42)
        all_users.append(df_sampled)
    else:
        # Files with too few rows are left out of the final dataset.
        print(f"Skipped {file} (only {len(df)} rows after deduplication)")


In [12]:
# Step 4: Combine all users
# Stack the per-user samples into one frame with a fresh 0..n-1 index, then
# drop any rows that repeat the same participant/keystroke/timing key.
final_df = pd.concat(all_users, ignore_index=True).drop_duplicates(
    subset=["PARTICIPANT_ID", "KEYSTROKE_ID", "PRESS_TIME", "RELEASE_TIME"]
)

print("Final dataset shape after deduplication:", final_df.shape)

Final dataset shape after deduplication: (4500, 9)


In [13]:
# Destination of the combined dataset.
OUTPUT_PATH = "D:\\ML_project\\Keystrokes_dataset\\final_dataset.csv"

# index=False: write only the data columns, not the DataFrame's row index.
final_df.to_csv(OUTPUT_PATH, index=False)

print("CSV file saved successfully!")


CSV file saved successfully!
