In [1]:
# uses list of created features from here:
# https://ietresearch.onlinelibrary.wiley.com/doi/full/10.1049/iet-bmt.2018.5126

# x velocity (mean, min, max, std)
# y velocity (mean, min, max, std)
# velocity (mean, min, max, std)
# acceleration (mean, min, max, std)
# jerk (mean, min, max, std)
# angular velocity (mean, min, max, std)
# curvature time series (mean, min, max, std)
#X type (3 choices: mouse movement(MM), point click (PC), drag and drop (DD))
# elapsed time
# trajectory length
# distance end-to-end
# direction (8 choices: [0*x, 45*x)deg with x=1...8)
# straightness
# num of points
# sum of angles
#X largest deviation
#X num critical points
#X acceleration time at beginning

In [2]:
import pandas as pd
import numpy as np
import os
#from math import sqrt, atan2

In [3]:
from google.colab import drive
drive.mount('/content/drive')
root_path = os.path.abspath('./drive/MyDrive/PAM Detection/archive/Data')
out_path = os.path.abspath('./drive/MyDrive/PAM Detection')

#root_path = os.path.abspath('./archive/Data')

Mounted at /content/drive


In [4]:
user1_path = os.path.join(root_path, 'user 1')
user2_path = os.path.join(root_path, 'user 2')

In [5]:
# read data from all files
def read_data(user_path: str, file_path: str, index_col:str):
    data = pd.read_csv(os.path.join(user_path, file_path), sep='\t', index_col=index_col)
    data.index = pd.to_datetime(data.index)
    data.drop(data.filter(regex="Unnamed: "), axis=1, inplace=True)
    return data

mousedata_user1 = read_data(user1_path, 'mousedata.tsv', 'Time')
#inactivity_user1 = read_data(user1_path, 'inactivity.tsv', 'Stopped_Time')
usercondition_user1 = read_data(user1_path, 'usercondition.tsv', 'Time')

mousedata_user2 = read_data(user2_path, 'mousedata.tsv', 'Time')
#inactivity_user2 = read_data(user2_path, 'inactivity.tsv', 'Stopped_Time')
usercondition_user2 = read_data(user2_path, 'usercondition.tsv', 'Time')

print(mousedata_user1.head(5))

                           Event_Type    X    Y   Daylight
Time                                                      
2021-09-10 11:59:42.515770       Move  518  381  Afternoon
2021-09-10 11:59:42.523750       Move  511  388  Afternoon
2021-09-10 11:59:42.531727       Move  509  393  Afternoon
2021-09-10 11:59:42.539705       Move  505  397  Afternoon
2021-09-10 11:59:42.547684       Move  501  399  Afternoon


In [6]:
# remove duplicated times
print(mousedata_user1.shape)

mousedata_user1 = mousedata_user1[~mousedata_user1.index.duplicated(keep='first')]
mousedata_user2 = mousedata_user2[~mousedata_user2.index.duplicated(keep='first')]

print(mousedata_user1.shape)

(870384, 4)
(864357, 4)


In [7]:
#print(inactivity_user1)

#inactivity_user1.index.names = ['Time']
#inactivity_user2.index.names = ['Time']

#inactivity_user1 = inactivity_user1[inactivity_user1['Type'] == 'Mouse']
#inactivity_user2 = inactivity_user2[inactivity_user2['Type'] == 'Mouse']

#print(inactivity_user1)

In [8]:
print(mousedata_user1.head(5))

                           Event_Type    X    Y   Daylight
Time                                                      
2021-09-10 11:59:42.515770       Move  518  381  Afternoon
2021-09-10 11:59:42.523750       Move  511  388  Afternoon
2021-09-10 11:59:42.531727       Move  509  393  Afternoon
2021-09-10 11:59:42.539705       Move  505  397  Afternoon
2021-09-10 11:59:42.547684       Move  501  399  Afternoon


In [9]:
# filter to desired columns
def filter_cols(dataframe, cols_to_keep):
    dataframe.drop(dataframe.columns.difference(cols_to_keep), axis=1, inplace=True)

print(mousedata_user1.dtypes)

filter_cols(mousedata_user1, ['Event_Type', 'X', 'Y'])
#filter_cols(inactivity_user1, ['Type'])
filter_cols(usercondition_user1, ['PAM_Val'])

filter_cols(mousedata_user2, ['Event_Type', 'X', 'Y'])
#filter_cols(inactivity_user2, ['Type'])
filter_cols(usercondition_user2, ['PAM_Val'])

print(mousedata_user1.dtypes)

Event_Type    object
X              int64
Y              int64
Daylight      object
dtype: object
Event_Type    object
X              int64
Y              int64
dtype: object


In [10]:
# groups PAM values into groups of 4
# (1, 2, 3, 4), (5, 6, 7, 8), (9, 10, 11, 12), (13, 14, 15, 16) => (1), (2), (3), (4)

print(usercondition_user1.PAM_Val.value_counts())


usercondition_user1['PAM_Val'] = usercondition_user1['PAM_Val'].map(lambda x: (x // 4) + 1)
usercondition_user2['PAM_Val'] = usercondition_user2['PAM_Val'].map(lambda x: (x // 4) + 1)

print(usercondition_user1.PAM_Val.value_counts())

2     7
3     4
8     4
14    3
7     3
4     2
9     2
1     2
6     2
5     1
13    1
15    1
10    1
Name: PAM_Val, dtype: int64
1    13
2     8
3     7
4     5
Name: PAM_Val, dtype: int64


In [11]:
# splits into trajectories and assigns unique identifier and delta time (not correct at beginning of trajectories!!!)
def split_into_trajectories(mousedata, split_threshold):
    user = mousedata.assign(delta_time = mousedata.index.to_series().diff().fillna(pd.Timedelta(0.0)).astype(np.int64) * 1e-9)
    #print(comb_user)

    split_locations = user[user['delta_time'] >= split_threshold].index.to_numpy()
    #split_threshold = comb_user['delta_time'].describe()['mean'] + comb_user['delta_time'].describe()['std']
    #split_locations = comb_user[comb_user['delta_time'] > split_threshold].reset_index().filter(['Time'])     # start timesteps
    #print(comb_user['delta_time'].describe())
    #print('splitting at:', split_threshold)
    #print(split_locations)
    #print(split_locations[split_locations.duplicated()])


    def return_seq_num_list(data, index, start_timestamp=None, end_timestamp=None):
        if start_timestamp is None and end_timestamp is not None:
            temp_df = data.loc[data.index < end_timestamp]
        if start_timestamp is not None and end_timestamp is None:
            temp_df = data.loc[data.index >= start_timestamp]
        if start_timestamp is not None and end_timestamp is not None:
            temp_df = data.loc[(data.index >= start_timestamp) & (data.index < end_timestamp)]

        if len(temp_df) == 0:
            print(len(temp_df), 'Will skip')
            return [], index
        else:
            index_list = [index for i in range(len(temp_df))]
            return index_list, index + 1

    index_list, seq_num = return_seq_num_list(user, 0, end_timestamp=split_locations[0])
    for index in range(len(split_locations) - 1):
        temp_index_list, seq_num = return_seq_num_list(user, seq_num, split_locations[index], split_locations[index + 1])
        index_list += temp_index_list
    temp_index_list, seq_num = return_seq_num_list(user, seq_num, start_timestamp=split_locations[len(split_locations) - 1])
    index_list += temp_index_list

    user = user.assign(seq_num = index_list)

    return user

split_threshold = 1.0
user1 = split_into_trajectories(mousedata_user1, split_threshold)
print('user 1 done')
user2 = split_into_trajectories(mousedata_user2, split_threshold)
print('user 2 done')

print(user1)

user 1 done
user 2 done
                               Event_Type    X    Y  delta_time  seq_num
Time                                                                    
2021-09-10 11:59:42.515770           Move  518  381    0.000000        0
2021-09-10 11:59:42.523750           Move  511  388    0.007980        0
2021-09-10 11:59:42.531727           Move  509  393    0.007977        0
2021-09-10 11:59:42.539705           Move  505  397    0.007978        0
2021-09-10 11:59:42.547684           Move  501  399    0.007979        0
...                                   ...  ...  ...         ...      ...
2021-09-14 18:58:25.083623   Left_Pressed  932   13    0.099734     6711
2021-09-14 18:58:25.199313  Left_Released  932   13    0.115690     6711
2021-09-14 18:58:25.600241           Move  933   13    0.400928     6711
2021-09-14 18:58:27.096237   Left_Pressed  933   13    1.495996     6712
2021-09-14 18:58:27.238855  Left_Released  933   13    0.142618     6712

[864357 rows x 5 columns]


In [12]:
def add_pam(data, user_condition_data):
    def return_pam_list(data, pam_val, start_timestamp=None, end_timestamp=None):
        if start_timestamp is None and end_timestamp is not None:
            temp_df = data.loc[data.index < end_timestamp]
        if start_timestamp is not None and end_timestamp is None:
            temp_df = data.loc[data.index >= start_timestamp]
        if start_timestamp is not None and end_timestamp is not None:
            temp_df = data.loc[(data.index >= start_timestamp) & (data.index < end_timestamp)]

        index_list = [pam_val for i in range(len(temp_df))]
        return index_list

    split_locations = user_condition_data.reset_index().filter(['Time', 'PAM_Val'])     # start timesteps
    pam_list = return_pam_list(data, split_locations.loc[0]['PAM_Val'], end_timestamp=split_locations.loc[0]['Time'])
    for index in range(len(user_condition_data) - 1):
        pam_list += return_pam_list(data, split_locations.loc[index]['PAM_Val'], start_timestamp=split_locations.loc[index]['Time'], end_timestamp=split_locations.loc[index+1]['Time'])
    pam_list += return_pam_list(data, split_locations.loc[len(split_locations) - 1]['PAM_Val'], start_timestamp=split_locations.loc[len(split_locations) - 1]['Time'])

    data = data.assign(PAM_Val=pam_list)
    return data

comb_user1 = add_pam(user1, usercondition_user1)
comb_user2 = add_pam(user2, usercondition_user2)

print(comb_user1)

                               Event_Type    X    Y  delta_time  seq_num  \
Time                                                                       
2021-09-10 11:59:42.515770           Move  518  381    0.000000        0   
2021-09-10 11:59:42.523750           Move  511  388    0.007980        0   
2021-09-10 11:59:42.531727           Move  509  393    0.007977        0   
2021-09-10 11:59:42.539705           Move  505  397    0.007978        0   
2021-09-10 11:59:42.547684           Move  501  399    0.007979        0   
...                                   ...  ...  ...         ...      ...   
2021-09-14 18:58:25.083623   Left_Pressed  932   13    0.099734     6711   
2021-09-14 18:58:25.199313  Left_Released  932   13    0.115690     6711   
2021-09-14 18:58:25.600241           Move  933   13    0.400928     6711   
2021-09-14 18:58:27.096237   Left_Pressed  933   13    1.495996     6712   
2021-09-14 18:58:27.238855  Left_Released  933   13    0.142618     6712   

           

In [13]:
def get_features(user):
  '''
  x velocity (mean, min, max, std)
  y velocity (mean, min, max, std)
  velocity (mean, min, max, std)
  acceleration (mean, min, max, std)
  jerk (mean, min, max, std)
  angular velocity (mean, min, max, std)
  curvature time series (mean, min, max, std)
  type (3 choices: mouse movement(MM), point click (PC), drag and drop (DD))
  elapsed time
  trajectory length
  distance end-to-end
  direction (8 choices: [0*x, 45*x)deg with x=1...8)
  straightness
  num of points
  sum of angles
  largest deviation
  num critical points
  acceleration time at beginning
  '''
  user_grouped = user.groupby('seq_num')

  time_features_df = pd.DataFrame()

  #features_df = pd.DataFrame(columns=['seq_num',
  #                                 'x_vel_mean', 'x_vel_min', 'x_vel_max', 'x_vel_std',
  #                                 'y_vel_mean', 'y_vel_min', 'y_vel_max', 'y_vel_std',
  #                                 'vel_mean', 'vel_min', 'vel_max', 'vel_std',
  #                                 'acc_mean', 'acc_min', 'acc_max', 'acc_std',
  #                                 'jerk_mean', 'jerk_min', 'jerk_max', 'jerk_std',
  #                                 'ang_vel_mean', 'ang_vel_min', 'ang_vel_max', 'ang_vel_std',
  #                                 'curve_mean', 'curve_min', 'curve_max', 'curve_std',
  #                                 'type', 'elapsed_time', 'traj_len', 'dist', 'direction', 'straightness', 'num_points', 'angle_sum', 'max_dev', 'num_crit_points', 'begin_acc_time'])
  #features_df = pd.DataFrame(columns=['seq_num',
  #                                 'x_vel_mean', 'x_vel_min', 'x_vel_max', 'x_vel_std',
  #                                 'y_vel_mean', 'y_vel_min', 'y_vel_max', 'y_vel_std',
  #                                 'vel_mean', 'vel_min', 'vel_max', 'vel_std',
  #                                 'acc_mean', 'acc_min', 'acc_max', 'acc_std',
  #                                 'jerk_mean', 'jerk_min', 'jerk_max', 'jerk_std',
  #                                 'ang_vel_mean', 'ang_vel_min', 'ang_vel_max', 'ang_vel_std',
  #                                 'curve_mean', 'curve_min', 'curve_max', 'curve_std',
  #                                 'elapsed_time', 'traj_len', 'dist', 'direction', 'straightness', 'num_points', 'angle_sum'])

  for seq_num, seq in user_grouped:
    df = seq.copy()
    # time series features
    df = df.assign(delta_x = df['X'].diff().fillna(0.0))
    df = df.assign(delta_y = df['Y'].diff().fillna(0.0))
    df = df.assign(delta_x_vel = df['delta_x'] / df['delta_time'])
    df = df.assign(delta_y_vel = df['delta_y'] / df['delta_time'])
    #print(df['delta_x_vel']**2)
    df = df.assign(delta_vel = np.sqrt(df['delta_x_vel']**2 + df['delta_y_vel']**2))
    df = df.assign(delta_acc = df['delta_vel'] / df['delta_time'])
    df = df.assign(delta_jerk = df['delta_acc'] / df['delta_time'])
    df = df.assign(delta_ang = np.arctan2(df['delta_y'], df['delta_x']))
    df = df.assign(delta_ang_vel = df['delta_ang'] / df['delta_time'])
    df = df.assign(delta_traj_len = np.sqrt(df['delta_x']**2 + df['delta_y']**2))
    df = df.assign(delta_curve = df['delta_ang'] / df['delta_traj_len'])

    # used features
    features = {}

    #print(df['PAM_Val'][-1])
    #features['seq_num'] = seq_num
    #features['PAM_Val'] = df['PAM_Val'][-1]

    features['x_vel_mean'] = df['delta_x_vel'].mean()
    features['x_vel_min'] = df['delta_x_vel'].min()
    features['x_vel_max'] = df['delta_x_vel'].max()
    features['x_vel_std'] = df['delta_x_vel'].std()

    features['y_vel_mean'] = df['delta_y_vel'].mean()
    features['y_vel_min'] = df['delta_y_vel'].min()
    features['y_vel_max'] = df['delta_y_vel'].max()
    features['y_vel_std'] = df['delta_y_vel'].std()

    features['vel_mean'] = df['delta_vel'].mean()
    features['vel_min'] = df['delta_vel'].min()
    features['vel_max'] = df['delta_vel'].max()
    features['vel_std'] = df['delta_vel'].std()

    features['acc_mean'] = df['delta_acc'].mean()
    features['acc_min'] = df['delta_acc'].min()
    features['acc_max'] = df['delta_acc'].max()
    features['acc_std'] = df['delta_acc'].std()

    features['jerk_mean'] = df['delta_jerk'].mean()
    features['jerk_min'] = df['delta_jerk'].min()
    features['jerk_max'] = df['delta_jerk'].max()
    features['jerk_std'] = df['delta_jerk'].std()

    features['ang_vel_mean'] = df['delta_ang_vel'].mean()
    features['ang_vel_min'] = df['delta_ang_vel'].min()
    features['ang_vel_max'] = df['delta_ang_vel'].max()
    features['ang_vel_std'] = df['delta_ang_vel'].std()

    features['curve_mean'] = df['delta_curve'].mean()
    features['curve_min'] = df['delta_curve'].min()
    features['curve_max'] = df['delta_curve'].max()
    features['curve_std'] = df['delta_curve'].std()

    # features['type'] =
    #print((df.index[-1] - df.index[0]).total_seconds())
    features['elapsed_time'] = (df.index[-1] - df.index[0]).total_seconds()
    features['traj_len'] = df['delta_traj_len'].sum()
    features['dist'] = np.sqrt((df.iloc[-1:]['X'] - df.iloc[0]['X'])**2 + (df.iloc[-1:]['Y'] - df.iloc[0]['Y'])**2)
    features['direction'] = np.arctan2((df.iloc[-1:]['Y'] - df.iloc[0]['Y']), (df.iloc[-1:]['X'] - df.iloc[0]['X']))
    features['straightness'] = features['dist'] / features['traj_len']
    features['num_points'] = len(df)
    features['angle_sum'] = df['delta_ang'].sum()
    # features['max_dev'] =
    # features['num_crit_points'] =

    #neg_acc_start = df[df['delta_acc'] <= 0.0].index[0]
    #print(df.index.get_loc(neg_acc_start))
    #print(neg_acc_start)
    #print(df.index[df.index.get_loc(neg_acc_start)-1])
    #features['begin_acc_time'] = (df.index[df.index.get_loc(neg_acc_start)-1] - df.index[0]).total_seconds()

    df = pd.concat([df, pd.DataFrame(features)], axis=1)
    time_features_df = pd.concat([time_features_df, df], axis=0)
    #print(features)

    #print(features_df.shape)
    #print('==========================================')

  return time_features_df

user1_time_features = get_features(comb_user1)
print('user 1 done')
user2_time_features = get_features(comb_user2)
print('user 2 done')

print(user1_time_features)

user 1 done
user 2 done
                               Event_Type    X    Y  delta_time  seq_num  \
Time                                                                       
2021-09-10 11:59:42.515770           Move  518  381    0.000000        0   
2021-09-10 11:59:42.523750           Move  511  388    0.007980        0   
2021-09-10 11:59:42.531727           Move  509  393    0.007977        0   
2021-09-10 11:59:42.539705           Move  505  397    0.007978        0   
2021-09-10 11:59:42.547684           Move  501  399    0.007979        0   
...                                   ...  ...  ...         ...      ...   
2021-09-14 18:58:25.083623   Left_Pressed  932   13    0.099734     6711   
2021-09-14 18:58:25.199313  Left_Released  932   13    0.115690     6711   
2021-09-14 18:58:25.600241           Move  933   13    0.400928     6711   
2021-09-14 18:58:27.096237   Left_Pressed  933   13    1.495996     6712   
2021-09-14 18:58:27.238855  Left_Released  933   13    0.142618 

In [14]:
comb_user1.to_csv(os.path.join(out_path, 'user1_feat_act.csv'))
comb_user2.to_csv(os.path.join(out_path, 'user2_feat_act.csv'))

user1_time_features.to_csv(os.path.join(out_path, 'user1_feat_act_time.csv'))
user2_time_features.to_csv(os.path.join(out_path, 'user2_feat_act_time.csv'))

#user1_features.to_csv(os.path.join(out_path, 'user1_eng_feat_act_feats.csv'))
#user2_features.to_csv(os.path.join(out_path, 'user2_eng_feat_act_feats.csv'))