In [1]:
import math
import os
import pickle
from pprint import pprint

import numpy as np
import pandas as pd

# Raw Data Format:
### User Data:

BirthYear | Gender | Parkinsons | Tremors | DiagnosisYear | Sided | UPDRS | Impact | Levadopa | DA | MAOB | Other

### Tappy Data

UserKey | Date | Timestamp | Hand | Hold Time | Direction | Latency Time | Flight Time

# Processed Data Format

## Data Decisions


Of all the users I select those that have the following characteristics (similar to previous study):

- Severity = Mild (Of those with PD)
    - Becuase the point is early diagnosis
- Medicatied = False
    - To avoid the interaction dopamine inhibitors will have on tremors
    
    
## Processed Data

Of the data that has the previous characteristics I include the following information:

| Parkinsons | Age | Gender | Date | Timestamp | Hand | Hold Time | Direction | Latency Time | Flight Time |

I leave out the following data so the processed data includes information someone not diagnosed with PD would know/information a program could easily pick up:

- Tremors
- Diagnosis Year
- Sided
- UPDRS
- Impact
- Levadopa
- DA
- MAOB
- Other

# Process User Data

Since there can be multiple "tappy" records for a single participant, I'll first get information from each of them

I will save the data in a dictionary:
 {UserID: dataframe}
 
As per the contraints laid out previously the dataframe will include the following:

- Parkinsons (0 if no, 1 if yes)
- Birth Year
- Gender (0 if male, 1 if female)
- Impact (0 if mild, 1 if moderate, 2 if severe, -1 if unknown)
- Medicated (0 if no, 1 if yes)

In [69]:
PATH_TO_USERS = os.getcwd() + "/Tappy Data/Users/"

# Useful lines of user data file
BIRTH_YEAR = 0
GENDER = 1
PARKINSONS = 2
IMPACT = 7
LEVADOPA = 8
DA = 9
MAOB = 10
OTHER = 11

def getUserFromFileName(filename):
    filename = filename[5:]
    return filename[:-4]
    
def createUserRow(filepath, user_id):
    """
    Extract data from user file at filepath.
    
    Creates and returns a dataframe with: 
        Parkinsons | BirthYear | Gender | Impact| Medicated
    
    Keyword Arguments:
    filepath - string, path to user file
    """
    ## Get user file ##
    f = open(filepath, 'r')
    lines = f.readlines()
    f.close()
    #pprint(lines)
    
    ## Extract data ##
    parkinsons = 0
    birth_year = 0
    gender = 0
    impact = -1
    medicated = 0
    
    # Parkinsons
    if "True" in lines[PARKINSONS]: parkinsons = 1
    
    # Birth Year
    year = [int(s) for s in lines[BIRTH_YEAR].split() if s.isdigit()]
    if year: birth_year = year[0]
    
    # Gender
    if "Female" in lines[GENDER]: gender = 1
        
    # Impact
    if "Mild" in lines[IMPACT][8:]: impact = 0
    elif "Med" in lines[IMPACT][8:]: impact = 1
    elif "Sev" in lines[IMPACT][8:]: impact = 2
    
    # Medicated
    if "True" in lines[LEVADOPA]: medicated = 1
    elif "True" in lines[DA]: medicated = 1
    elif "True" in lines[MAOB]: medicated = 1
    elif "True" in lines[OTHER]: medicated = 1
    
    return [user_id, parkinsons, birth_year, gender, impact, medicated]


users = [f for f in os.listdir(PATH_TO_USERS)]

column_names = ["UserID", "Parkinsons", "BirthYear", "Gender", "Impact", "Medicated"]
all_users = pd.DataFrame(columns=column_names)

i = 0
for user in users:
    all_users.loc[i] = createUserRow(PATH_TO_USERS + user, getUserFromFileName(user))
    i += 1
    
pd.set_option('display.max_rows', 230)

all_users.set_index("UserID", inplace=True)

selected_users = all_users[all_users["Impact"] < 1]
selected_users = selected_users[selected_users["Medicated"] == 0]

#selected_users["Parkinsons"].value_counts()

selected_users

0    58
1    14
Name: Parkinsons, dtype: int64

### Save/Load User Data

In [3]:
PATH_TO_SAVED_DATA = os.getcwd() + "/Processed Data/"

# Save
#pickle.dump(all_users, open(PATH_TO_SAVED_DATA + "all_users.p", "wb"))
#pickle.dump(selected_users, open(PATH_TO_SAVED_DATA + "selected_users.p", "wb"))

# Load
#all_users = pickle.load(open(PATH_TO_SAVED_DATA + "all_users.p", "rb"))
selected_users = pickle.load(open(PATH_TO_SAVED_DATA + "selected_users.p", "rb"))

## Process Keystroke Data

Using the selected users produce 2 numpy arrays:

1. Labels
    
    Ex: \[0, 1, 0, 0, 0, 1, 0, 1, 1, 0, ...\]
    
    Length = # participants


2. Sequence Data
    
    Sequence data = \[Sequence 1, Sequence  2, ..., Sequence 11107\]
    
    Each sequence = \[Observation 1, Observation 2, ..., Observation 250\]
    
    Each observation = \[Age, Gender, Hand, HoldTime, Direction, LatencyTime, FlightTime\]


2. Feature Data

    Each sequence is comprised of a set of features
    
    | Age | Gender | % Left hand usage | Mean flight | Mean latency | hold1 | hold2 | ... | hold 250 |
    
    **This means that the input size is now 255**
    

**Note:** Each sequence laid out above corresponds to 250 observation subsets/time steps of one month of a user's collected data

* Sample/Batch size = 11106

* Time Steps = 250

* Features = 7

In [13]:
PATH_TO_KEYSTROKE = os.getcwd() + "/Tappy Data/Keystrokes/"

# Keystoke data useful indices
DATE = 1 #YYMMDD
HAND = 3 # L or R
HOLD_TIME = 4 # mmmm.m ms
DIRECTION = 5 # LL, LR, LR, RR, S
LATENCY_TIME = 6 # ms
FLIGHT_TIME = 7 # ms

def divideSequence(sequence, n):
    for i in range(0, len(sequence), n):
        yield sequence[i:i+n]
    
    
def genFileSequences(filepath, birth_year, gender, label):
    sequence = []
    labels = [label]
    
    total_flight_time = 0
    total_latency_time = 0
    total_left_hand = 0
    total_keystrokes = 0
    
    f = open(filepath, 'r')
    lines = f.readlines()
    f.close()

    for line in lines:
        total_keystrokes += 1
        
        line = line.split("\t")
        try:
            if birth_year != 0:
                age = int(str(line[DATE])[:2]) + 2000
                age -= birth_year
            else: age = 0

            if line[HAND] == "L": 
                hand = 0
                total_left_hand += 1
            elif line[HAND] == "R": hand = 1
            elif line[HAND] == "S": hand = 2
            else: hand = -1

            if line[DIRECTION] == "LL": direction = 0
            elif line[DIRECTION] == "LR": direction = 1
            elif line[DIRECTION] == "RR": direction = 2
            elif line[DIRECTION] == "RL": direction = 3
            elif line[DIRECTION] == "SS": direction = 4
            elif line[DIRECTION] == "SL": direction = 5
            else: direction = -1
            
            try:
                age = float(age)
                gender = float(gender)
                hand = float(hand)
                hold_time = float(line[HOLD_TIME])
                direction = float(direction)
                latency_time = float(line[LATENCY_TIME])
                total_latency_time += latency_time
                flight_time = float(line[FLIGHT_TIME])
                total_flight_time += flight_time
            except:
                continue
            
            #sequence.append([age, gender, hand, hold_time, direction, latency_time, flight_time])
            feature = hold_time / 1000 # Decaseconds
            sequence.append(feature)
            
            avg_flight = (total_flight_time / total_keystrokes) / 1000
            avg_latency = (total_latency_time / total_keystrokes) / 1000
            left_hand_percent = total_left_hand / total_keystrokes
            
            if avg_flight > 1 or avg_latency > 1 or feature > 1 or left_hand_percent > 1:
                #print(avg_flight, avg_latency, feature, left_hand_percent)
                continue # Small handful of data we're skipping over # hold time = 8400 minutes
                
            if avg_flight < 0 or avg_latency < 0 or feature < 0 or left_hand_percent < 0:
                #print(avg_flight, avg_latency, feature, left_hand_percent)
                continue # None should be negative
            
            
        except ValueError:
            continue
        
        
    sequence = list(divideSequence(sequence, 100))
    if sequence[-1] != 100: sequence = sequence[:-1]
    
    # Add non-sequence features
    for obs in sequence:
        obs.insert(0, avg_latency)
        obs.insert(0, avg_flight)
        obs.insert(0, left_hand_percent)
        obs.insert(0, gender)
        obs.insert(0, age / 1000)
    
    labels = labels * len(sequence)
    return sequence, labels

def genUserSequences(user, birth_year, gender, label):
    user_sequences = []
    user_labels = []
    
    # Get all files by that user and add to np array
    #i = 0
    for f in os.listdir(PATH_TO_KEYSTROKE):
        if f[:10] == user:
            filepath = PATH_TO_KEYSTROKE + f
            new_sequences, new_labels = (genFileSequences(filepath, \
                                                          birth_year, \
                                                          gender, \
                                                          label))
            user_sequences += new_sequences
            user_labels += new_labels
    
    return user_sequences, user_labels

def genSequenceData(users):
    full_data = []
    labels = []
    
    # Add each user's data to full sequence data
    for user in users.index:
        
        birth_year = users._get_value(user, "BirthYear")
        gender = users._get_value(user, "Gender")
        label = users._get_value(user, "Parkinsons")
        
        new_data, new_labels = genUserSequences(user, birth_year, gender, label)
        full_data += new_data
        labels += new_labels
    return full_data, labels
            
        
        

sequence_data, labels = genSequenceData(selected_users)
sequence_data = np.asarray(sequence_data)
labels = np.asarray(labels)
#labels = np.asarray([[label] for label in labels])

In [14]:
print(sequence_data[0])

[0.065      0.         0.45254179 0.15155905 0.23952548 0.0625
 0.0625     0.0938     0.0703     0.0703     0.0859     0.0938
 0.0781     0.0859     0.0781     0.0781     0.0859     0.0781
 0.0781     0.0703     0.0703     0.0781     0.0703     0.0703
 0.1016     0.0781     0.0703     0.0469     0.0781     0.0859
 0.0781     0.0703     0.0859     0.0547     0.0781     0.0625
 0.0781     0.125      0.0781     0.1719     0.0703     0.1328
 0.1016     0.0781     0.0781     0.0781     0.0703     0.0625
 0.0781     0.0625     0.0625     0.0781     0.0703     0.0859
 0.0625     0.1016     0.0781     0.0625     0.0703     0.0781
 0.0547     0.0703     0.0625     0.0703     0.0781     0.0781
 0.0781     0.0938     0.0703     0.0859     0.0938     0.1406
 0.0547     0.0781     0.1016     0.0859     0.1016     0.0625
 0.0781     0.1094     0.0781     0.1094     0.0625     0.0781
 0.0938     0.0703     0.0625     0.0781     0.0625     0.0703
 0.0859     0.0703     0.0625     0.0781     0.0781    

In [15]:
for sequence in sequence_data:
    for observation in sequence:
        observation = observation.astype("float64")

In [16]:
pprint(sequence_data)
print(sequence_data.shape)
print(labels.shape)
print(np.bincount(labels))

array([[0.065     , 0.        , 0.45254179, ..., 0.0703    , 0.1016    ,
        0.0781    ],
       [0.065     , 0.        , 0.45254179, ..., 0.0156    , 0.0859    ,
        0.0625    ],
       [0.065     , 0.        , 0.45254179, ..., 0.0703    , 0.0703    ,
        0.0547    ],
       ...,
       [0.        , 0.        , 0.4736227 , ..., 0.0859    , 0.0781    ,
        0.0625    ],
       [0.        , 0.        , 0.4736227 , ..., 0.0781    , 0.0776    ,
        0.0776    ],
       [0.067     , 1.        , 0.50299401, ..., 0.0859    , 0.125     ,
        0.1016    ]])
(27874, 105)
(27874,)
[25377  2497]


## Save/Load Processed Data

In [418]:
PATH_TO_SAVED_DATA = os.getcwd() + "/Processed Data/"

# Save
pickle.dump(sequence_data, open(PATH_TO_SAVED_DATA + "processed_255.p", "wb"))
pickle.dump(labels, open(PATH_TO_SAVED_DATA + "labels_255.p", "wb"))

# Load
processed_data = pickle.load(open(PATH_TO_SAVED_DATA + "processed_255.p", "rb"))
labels = pickle.load(open(PATH_TO_SAVED_DATA + "labels_255.p", "rb"))

# Split into Training and Testing

In [419]:
train_size = math.ceil(len(processed_data) * .7)
test_size = math.floor(len(processed_data) * .3)

print("Total Data:",len(processed_data))
print("Training:",train_size)
print("Testing:",test_size)
print("Train + Test:",train_size + test_size)


training_labels = labels[:train_size]
testing_labels = labels[train_size:]

train_split = np.bincount(training_labels)
print("Training Split:\n\t","PD = 0:", train_split[0], " PD = 1:", train_split[1])
print("\t", \
      train_split[0] / (train_split[0] + train_split[1]), \
      "%:", \
      train_split[1] / (train_split[0] + train_split[1]), \
      "%")

test_split = np.bincount(testing_labels)
print("Testing Split:\n\t","PD = 0:",test_split[0]," PD = 1:",test_split[1])
print("\t", \
      test_split[0] / (test_split[0] + test_split[1]), \
      "%:", \
      test_split[1] / (test_split[0] + test_split[1]), \
      "%")

training_data = processed_data[:train_size]
testing_data = processed_data[train_size:]

print("Training Data size:", len(training_data))
print("Testing Data size:", len(testing_data))

Total Data: 11106
Training: 7775
Testing: 3331
Train + Test: 11106
Training Split:
	 PD = 0: 7043  PD = 1: 732
	 0.9058520900321544 %: 0.09414790996784565 %
Testing Split:
	 PD = 0: 3073  PD = 1: 258
	 0.9225457820474332 %: 0.0774542179525668 %
Training Data size: 7775
Testing Data size: 3331


## Save Training and Testing Data

In [420]:
PATH_TO_SAVED_DATA = os.getcwd() + "/Processed Data/"

SAVE = 1

# Save
if SAVE:
    pickle.dump(training_labels, open(PATH_TO_SAVED_DATA + "train_labels_255.p", "wb"))
    pickle.dump(testing_labels, open(PATH_TO_SAVED_DATA + "test_labels_255.p", "wb"))
    pickle.dump(training_data, open(PATH_TO_SAVED_DATA + "train_data_255.p", "wb"))
    pickle.dump(testing_data, open(PATH_TO_SAVED_DATA + "test_data_255.p", "wb"))
# Load
else:

    training_labels = pickle.load(open(PATH_TO_SAVED_DATA + "train_labels_1255.p", "rb"))
    testing_labels = pickle.load(open(PATH_TO_SAVED_DATA + "test_labels_255.p", "rb"))
    training_data = pickle.load(open(PATH_TO_SAVED_DATA + "train_data_255.p", "rb"))
    testing_data = pickle.load(open(PATH_TO_SAVED_DATA + "test_data_255.p", "rb"))

# Balanced Data

In [8]:
positive_rows = []

for i in range(len(labels)):
    if labels[i] == 1:
        positive_rows.append(i)
        
positive_examples = np.asarray([[]])

for i in range(len(sequence_data)):
    if i not in positive_rows:
        if i-1 in positive_rows:
            print("START:",i)
        if i+1 in positive_rows:
            print("END:",i)

END: 32938
START: 36243
END: 37014
START: 39511
END: 45292
START: 46058
END: 57945
START: 57969
END: 72543
START: 75167
END: 97961
START: 100436
END: 101054
START: 101874
END: 118706
START: 118717
END: 118732
START: 118763


KeyboardInterrupt: 

In [10]:
positive_examples = np.asarray(sequence_data[6586:7246])
positive_examples = np.append(positive_examples, sequence_data[7398:7896], axis=0)
positive_examples = np.append(positive_examples, sequence_data[9050:9202], axis=0)
positive_examples = np.append(positive_examples, sequence_data[11574:11578], axis=0)
positive_examples = np.append(positive_examples, sequence_data[14485:15007], axis=0)
positive_examples = np.append(positive_examples, sequence_data[19550:20040], axis=0)
positive_examples = np.append(positive_examples, sequence_data[20159:20322], axis=0)
positive_examples = np.append(positive_examples, sequence_data[23684:23686], axis=0)
positive_examples = np.append(positive_examples, sequence_data[23689:23695], axis=0)

print(positive_examples.shape)

negative_examples = np.asarray(sequence_data[0:6586])
negative_examples = np.append(negative_examples, sequence_data[7246:7398], axis=0)
negative_examples = np.append(negative_examples, sequence_data[7896:9050], axis=0)
negative_examples = np.append(negative_examples, sequence_data[9202:11574], axis=0)
negative_examples = np.append(negative_examples, sequence_data[11578:14485], axis=0)
negative_examples = np.append(negative_examples, sequence_data[15007:19550], axis=0)
negative_examples = np.append(negative_examples, sequence_data[20040:20159], axis=0)
negative_examples = np.append(negative_examples, sequence_data[20322:23684], axis=0)
negative_examples = np.append(negative_examples, sequence_data[23686:23689], axis=0)
negative_examples = np.append(negative_examples, sequence_data[23695:], axis=0)

np.random.shuffle(negative_examples)

negative_examples = negative_examples[:2497]

print(negative_examples.shape)

balanced_data = np.append(positive_examples, negative_examples, axis=0)
balanced_labels = np.ones(2497)
balanced_labels = np.append(balanced_labels, np.zeros(2497))

print(balanced_data.shape)
print(balanced_labels.shape)

(2497, 105)
(2497, 105)
(4994, 105)
(4994,)


In [11]:
positive_training = positive_examples[0:1748]
positive_testing = positive_examples[1748:]
negative_training = negative_examples[0:1748]
negative_testing = negative_examples[1748:]

training_labels = np.ones(1748)
training_labels = np.append(training_labels, np.zeros(1748))
training = np.append(positive_training, negative_training, axis=0)

print(training_labels.shape)
print(training.shape)


testing_labels = np.ones(749)
testing_labels = np.append(testing_labels, np.zeros(749))
testing = np.append(positive_testing, negative_testing, axis=0)

print(testing_labels.shape)
print(testing.shape)

(3496,)
(3496, 105)
(1498,)
(1498, 105)


In [12]:
PATH_TO_SAVED_DATA = os.getcwd() + "/Processed Data/"

SAVE = 1

# Save
if SAVE:
    pickle.dump(training_labels, open(PATH_TO_SAVED_DATA + "train_labels_bal_105.p", "wb"))
    pickle.dump(testing_labels, open(PATH_TO_SAVED_DATA + "test_labels_bal_105.p", "wb"))
    pickle.dump(training, open(PATH_TO_SAVED_DATA + "train_data_bal_105.p", "wb"))
    pickle.dump(testing, open(PATH_TO_SAVED_DATA + "test_data_bal_105.p", "wb"))
# Load
else:

    training_labels = pickle.load(open(PATH_TO_SAVED_DATA + "train_labels_bal_105.p", "rb"))
    testing_labels = pickle.load(open(PATH_TO_SAVED_DATA + "test_labels_bal_105.p", "rb"))
    training = pickle.load(open(PATH_TO_SAVED_DATA + "train_data_bal_105.p", "rb"))
    testing = pickle.load(open(PATH_TO_SAVED_DATA + "test_data_bal_105.p", "rb"))