In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
from pandas.errors import EmptyDataError
import os
from functools import partial
import re
from collections import Counter
from scipy.stats import skew, kurtosis

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_validate, cross_val_score, train_test_split, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.base import clone, BaseEstimator, TransformerMixin

# Plot defaults: 10x6 default figure size, then the ggplot style sheet
plt.rc('figure', figsize=(10, 6))
style.use('ggplot')

# Update these paths.
# They are assembled with os.path.join so the platform's own separator is
# used: a literal backslash is only a path separator on Windows, so the
# hard-coded "..\data\..." form cannot be resolved on Linux/macOS.
user_root = os.path.join("..", "data", "Archived-users", "Archived users")
keys_root = os.path.join("..", "data", "Archived-Data", "Tappy Data")

# Read user files
def read_one_file(fn, root):
    """Parse one "Key: value" text file into a dict.

    Each line containing ": " is split at the FIRST occurrence only, so a
    value that itself contains ": " is kept whole instead of raising on
    unpacking. Lines without ": " are ignored.

    Args:
        fn: File name inside ``root``. The record ID is taken from the first
            ``_<word chars>.`` segment of this name.
        root: Directory that contains the file.

    Returns:
        dict mapping each key to its whitespace-stripped value, plus an
        ``'ID'`` entry derived from ``fn`` (it overrides any ``ID`` key read
        from the file itself).

    Raises:
        IndexError: if ``fn`` contains no ``_<id>.`` segment.
        OSError: if the file cannot be opened.
    """
    out = dict()
    with open(os.path.join(root, fn), 'r', encoding='utf-8') as f:
        # Iterate the file lazily; there is no need to materialise all lines.
        for line in f:
            key, sep, value = line.partition(": ")
            if sep:
                out[key] = value.strip()
    out['ID'] = re.findall(r'_(\w+)\.', fn)[0]
    return out

# Only *.txt entries are user records. Anything else in the folder must be
# skipped: a sidecar named like "User_<ID>.txtZone.Identifier" (such sidecars
# show up next to the key logs) still matches the ID pattern used by
# read_one_file, so it would add a second row for the same ID whose flags all
# end up False and which then duplicates that user in the later merge on 'ID'.
# Sorting makes the row order independent of the OS directory order.
user_fn_list = sorted(fn for fn in os.listdir(user_root) if fn.lower().endswith('.txt'))
users_list = [read_one_file(fn, root=user_root) for fn in user_fn_list]
users = pd.DataFrame(users_list)
# '------' and '' are treated as missing values
users.replace(['------', ''], np.nan, inplace=True)

# Convert boolean columns: True only where the text is exactly 'True'
# (missing values therefore become False)
bool_cols = ['Levadopa', 'MAOB', 'Parkinsons', 'Tremors', 'Other']
for col in bool_cols:
    users[col] = users[col] == 'True'

# Read keypress files
def read_one_key_file(fn, root):
    """Load one tab-separated keypress log into a cleaned DataFrame.

    The file is read without a header. A 9th column (left behind by a
    trailing tab) is dropped, and the remaining 8 columns are named
    ID, Date, TS, Hand, HoldTime, Direction, LatencyTime, FlightTime.
    The three timing columns are converted to numbers and rows where any of
    them fails to convert are removed.

    Args:
        fn: File name inside ``root``.
        root: Directory that contains the file.

    Returns:
        The cleaned DataFrame, or an empty DataFrame when the file does not
        have 8 (or 9) columns or cannot be read at all. Problems are reported
        with ``print`` rather than raised, so one bad file does not abort a
        bulk load.
    """
    # Positions of the text columns (ID, Date, TS, Hand, Direction). Forcing
    # them to str while parsing keeps the raw text intact; letting pandas
    # infer a numeric type first and casting to str afterwards would turn an
    # all-digit value such as "0012345678" into "12345678".
    text_cols = {0: str, 1: str, 2: str, 3: str, 5: str}
    try:
        # Read the file
        df = pd.read_csv(os.path.join(root, fn), delimiter='\t', header=None,
                         on_bad_lines='skip', low_memory=False, dtype=text_cols)

        # Drop the last column if the file has 9 columns (since the last col is NaN)
        if df.shape[1] == 9:
            df = df.iloc[:, :-1]

        # Ensure the DataFrame has exactly 8 columns
        if df.shape[1] != 8:
            print(f"Skipping file {fn} due to incorrect number of columns")
            return pd.DataFrame()

        # Assign column names
        df.columns = ['ID', 'Date', 'TS', 'Hand', 'HoldTime', 'Direction', 'LatencyTime', 'FlightTime']

        # Normalise the text columns to str (missing fields become 'nan')
        for col in ['ID', 'Date', 'TS', 'Hand', 'Direction']:
            df[col] = df[col].astype(str)

        # Convert numeric columns safely: unparseable entries become NaN
        numeric_cols = ['HoldTime', 'LatencyTime', 'FlightTime']
        for col in numeric_cols:
            df[col] = pd.to_numeric(df[col], errors='coerce')

        # Drop rows with NaNs in numeric cols
        return df.dropna(subset=numeric_cols)

    except Exception as e:
        # Deliberate best-effort: report the file and carry on with the rest
        print(f"Error reading {fn}: {e}")
        return pd.DataFrame()

# Read all keypress files.
# Only *.txt entries are parsed; other directory entries (for example the
# "*.txtZone.Identifier" sidecars) are not key logs and would only be read
# and then reported as skipped. Sorting makes the concatenation order
# independent of the OS directory order.
keys_fn_list = sorted(fn for fn in os.listdir(keys_root) if fn.lower().endswith('.txt'))
keys_list = [read_one_key_file(fn, root=keys_root) for fn in keys_fn_list]

# Concatenate all keypress data
keys = pd.concat(keys_list, ignore_index=True, axis=0)

# Print dataset stats (number of distinct IDs on each side)
print("Total user IDs in key logs:", keys['ID'].nunique())
print("Total user IDs in user info:", users['ID'].nunique())

# Get valid users: at least 2000 logged keystrokes per ID (computed once)
keystrokes_per_user = keys.groupby('ID').size()
user_w_sufficient_data = set(keystrokes_per_user[keystrokes_per_user >= 2000].index)

# Eligible IDs are either
#   * flagged Parkinsons with Impact == 'Mild', or
#   * not flagged Parkinsons and not flagged Levadopa.
# If the 'Impact' column is absent, nobody counts as 'Mild'; the fallback is
# index-aligned with `users` so the boolean masks combine row by row.
if 'Impact' in users.columns:
    is_mild = users['Impact'] == 'Mild'
else:
    is_mild = pd.Series(False, index=users.index)
mild_parkinsons = users['Parkinsons'] & is_mild
untreated_non_parkinsons = (~users['Parkinsons']) & (~users['Levadopa'])
user_eligible = set(users.loc[mild_parkinsons | untreated_non_parkinsons, 'ID'])
user_valid = user_w_sufficient_data.intersection(user_eligible)

# Filter valid keypress data: hold and latency times strictly between
# 0 and 2000, restricted to the valid users
plausible_hold = (keys['HoldTime'] > 0) & (keys['HoldTime'] < 2000)
plausible_latency = (keys['LatencyTime'] > 0) & (keys['LatencyTime'] < 2000)
valid_keys = keys[plausible_hold & plausible_latency & keys['ID'].isin(user_valid)]

# Aggregate features.
# 'mean' and 'std' are passed by name: handing np.mean / np.std to
# groupby.agg is deprecated in pandas (it currently dispatches to the groupby
# mean/std anyway and warns that this will change). The names keep today's
# results and the same output column labels. skew and kurtosis remain the
# scipy.stats callables.
stat_funcs = ['mean', 'std', skew, kurtosis]

hold_rows = valid_keys[valid_keys['Hand'] != 'S']
hold_by_user = hold_rows.groupby(['ID', 'Hand'])['HoldTime'].agg(stat_funcs)

latency_rows = valid_keys[valid_keys['Direction'].isin(['LL', 'LR', 'RL', 'RR'])]
latency_by_user = latency_rows.groupby(['ID', 'Direction'])['LatencyTime'].agg(stat_funcs)

# Reshape data: one row per ID, one column per "<statistic>_<group>"
hold_by_user_flat = hold_by_user.unstack()
hold_by_user_flat.columns = ['_'.join(col).strip() for col in hold_by_user_flat.columns.values]
hold_by_user_flat['mean_hold_diff'] = hold_by_user_flat['mean_L'] - hold_by_user_flat['mean_R']

latency_by_user_flat = latency_by_user.unstack()
latency_by_user_flat.columns = ['_'.join(col).strip() for col in latency_by_user_flat.columns.values]
latency_by_user_flat['mean_LR_RL_diff'] = latency_by_user_flat['mean_LR'] - latency_by_user_flat['mean_RL']
latency_by_user_flat['mean_LL_RR_diff'] = latency_by_user_flat['mean_LL'] - latency_by_user_flat['mean_RR']

# Put hold-time and latency features side by side (aligned on the ID index)
combined = pd.concat((hold_by_user_flat, latency_by_user_flat), axis=1)

# Attach the Parkinsons label to each ID, index by ID again, and keep only
# rows without any missing value
labels = users[['ID', 'Parkinsons']]
full_set = (
    combined.reset_index()
    .merge(labels, on='ID')
    .set_index('ID')
    .dropna()
)

# Write the finished table to disk
output_path = "../data/full_set.csv"
full_set.to_csv(output_path)

print(f"Dataset saved successfully as {output_path}")
print("Final dataset shape:", full_set.shape)
print(full_set.head())


Skipping file TBOPVHGLMX_1701.txtZone.Identifier due to incorrect number of columns
Skipping file MG8XVA5BFA_1609.txtZone.Identifier due to incorrect number of columns
Skipping file ZQ4UHHUPNZ_1703.txtZone.Identifier due to incorrect number of columns
Skipping file BFXNVL50DC_1610.txtZone.Identifier due to incorrect number of columns
Skipping file 9JRHCGCWAZ_1702.txtZone.Identifier due to incorrect number of columns
Skipping file YNESMGNGWK_1606.txtZone.Identifier due to incorrect number of columns
Skipping file 8MERXVINPN_1703.txtZone.Identifier due to incorrect number of columns
Skipping file RDJPGMWKAG_1701.txtZone.Identifier due to incorrect number of columns
Skipping file YAJFHG6OHD_1703.txtZone.Identifier due to incorrect number of columns
Skipping file HSOCXZDCZM_1702.txtZone.Identifier due to incorrect number of columns
Skipping file 1HOEBIGASW_1702.txtZone.Identifier due to incorrect number of columns
Skipping file G6OE5CXQPY_1703.txtZone.Identifier due to incorrect number of columns

  hold_by_user = valid_keys[valid_keys['Hand'] != 'S'].groupby(['ID', 'Hand'])['HoldTime'].agg([np.mean, np.std, skew, kurtosis])
  hold_by_user = valid_keys[valid_keys['Hand'] != 'S'].groupby(['ID', 'Hand'])['HoldTime'].agg([np.mean, np.std, skew, kurtosis])
  latency_by_user = valid_keys[valid_keys['Direction'].isin(['LL', 'LR', 'RL', 'RR'])].groupby(['ID', 'Direction'])['LatencyTime'].agg([np.mean, np.std, skew, kurtosis])
  latency_by_user = valid_keys[valid_keys['Direction'].isin(['LL', 'LR', 'RL', 'RR'])].groupby(['ID', 'Direction'])['LatencyTime'].agg([np.mean, np.std, skew, kurtosis])


Dataset saved successfully as full_set.csv
Final dataset shape: (262, 28)
                mean_L      mean_R      std_L      std_R    skew_L    skew_R  \
ID                                                                             
0EA27ICBLF   77.749454   79.306669  17.598336  24.609195  1.585104  4.130272   
0EA27ICBLF   77.749454   79.306669  17.598336  24.609195  1.585104  4.130272   
2JTCBKUP8T   89.355483   90.890535  22.041569  34.355843  0.795678  0.507986   
2JTCBKUP8T   89.355483   90.890535  22.041569  34.355843  0.795678  0.507986   
310NXPGJPD  153.521655  149.722970  43.931450  37.167793  1.351041  1.150740   

            kurtosis_L  kurtosis_R  mean_hold_diff     mean_LL  ...   skew_LR  \
ID                                                              ...             
0EA27ICBLF   11.521208   44.939249       -1.557215  263.580311  ...  0.884186   
0EA27ICBLF   11.521208   44.939249       -1.557215  263.580311  ...  0.884186   
2JTCBKUP8T    7.205031    2.213135       