In [1]:
import os
import pandas as pd
from math import floor, ceil
from statistics import mean, median, mode

In [2]:
# Goal dataframe structure
# timestamp (min), speed, keystrokes, active window (app name), inactive time, PAM

# speed: average
# keystrokes: sum
# active window: mode
# inactive time: sum
# PAM: none (no duplicates in a minute)

# mousedata.tsv not needed

In [3]:
# PAM (photographic affect meter): a tool/app that measures emotion (aka affect)
# by showing photographs and asking users to select the one that best matches their mood

# Valence: pleasurable or not
# Arousal: activation or deactivation

#                 High Arousal
#             +------------------+
#             | 6 | 8 || 14 | 16 |
#             +------------------+
#             | 5 | 7 || 13 | 15 |
# Low Valence +==================+ High Valence
#             | 2 | 4 || 10 | 12 |
#             +------------------+
#             | 1 | 3 ||  9 | 11 |
#             +------------------+
#                 Low Arousal

# https://dl.acm.org/doi/10.1145/1978942.1979047



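# Each quadrant of the grid above holds 4 consecutive values, so PAM values
# can be grouped into 4 buckets (one per quadrant):
# (1, 2, 3, 4)     -> low valence,  low arousal
# (5, 6, 7, 8)     -> low valence,  high arousal
# (9, 10, 11, 12)  -> high valence, low arousal
# (13, 14, 15, 16) -> high valence, high arousal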

In [4]:
# Absolute location of the dataset; each participant has its own sub-folder.
root_path = os.path.abspath(os.path.join('.', 'archive', 'Data'))

user1_path, user2_path = (
    os.path.join(root_path, f'user {user_id}') for user_id in (1, 2)
)

In [5]:
# read data from all files
def read_data(user_path: str, file_path: str):
    """Load one tab-separated log file of a participant.

    Args:
        user_path: Directory holding the participant's files.
        file_path: File name (relative to ``user_path``) to read.

    Returns:
        A DataFrame with every column whose name contains ``"Unnamed: "``
        removed (pandas generates such names for header-less columns).
    """
    full_path = os.path.join(user_path, file_path)
    frame = pd.read_csv(full_path, sep='\t')
    # The pattern has no regex metacharacters, so a substring check matches
    # exactly the same column names.
    placeholder_cols = [col for col in frame.columns if "Unnamed: " in str(col)]
    return frame.drop(columns=placeholder_cols)

# Load the mouse log and the self-reported condition log of each participant.
mousedata_user1, usercondition_user1 = (
    read_data(user1_path, 'mousedata.tsv'),
    read_data(user1_path, 'usercondition.tsv'),
)

mousedata_user2, usercondition_user2 = (
    read_data(user2_path, 'mousedata.tsv'),
    read_data(user2_path, 'usercondition.tsv'),
)

In [6]:
# Peek at the raw mouse log (head() shows the first 5 rows by default).
print(mousedata_user1.head())

                         Time Event_Type    X    Y   Daylight
0  2021-09-10 11:59:42.515770       Move  518  381  Afternoon
1  2021-09-10 11:59:42.523750       Move  511  388  Afternoon
2  2021-09-10 11:59:42.531727       Move  509  393  Afternoon
3  2021-09-10 11:59:42.539705       Move  505  397  Afternoon
4  2021-09-10 11:59:42.547684       Move  501  399  Afternoon


In [7]:
# NOTE(review): this cell repeats the previous preview verbatim; presumably a
# different frame was meant to be inspected here - confirm.
print(mousedata_user1.head(n=5))

                         Time Event_Type    X    Y   Daylight
0  2021-09-10 11:59:42.515770       Move  518  381  Afternoon
1  2021-09-10 11:59:42.523750       Move  511  388  Afternoon
2  2021-09-10 11:59:42.531727       Move  509  393  Afternoon
3  2021-09-10 11:59:42.539705       Move  505  397  Afternoon
4  2021-09-10 11:59:42.547684       Move  501  399  Afternoon


In [8]:
# keep only the wanted columns
def filter_cols(dataframe, cols_to_keep):
    """Drop, in place, every column of ``dataframe`` not in ``cols_to_keep``.

    Args:
        dataframe: Frame to trim; it is modified directly.
        cols_to_keep: Column labels that must survive.

    Returns:
        None. The caller's frame is mutated.
    """
    unwanted = dataframe.columns[~dataframe.columns.isin(cols_to_keep)]
    dataframe.drop(columns=unwanted, inplace=True)

# Column types before trimming.
print(mousedata_user1.dtypes)

# Mouse logs keep the event columns; condition logs keep only the PAM score.
for mouse_frame in (mousedata_user1, mousedata_user2):
    filter_cols(mouse_frame, ['Time', 'Event_Type', 'X', 'Y'])
for condition_frame in (usercondition_user1, usercondition_user2):
    filter_cols(condition_frame, ['Time', 'PAM_Val'])

# Column types after trimming.
print(mousedata_user1.dtypes)

Time          object
Event_Type    object
X              int64
Y              int64
Daylight      object
dtype: object
Time          object
Event_Type    object
X              int64
Y              int64
dtype: object


In [9]:
# Collapse the 16 raw PAM scores into 4 buckets:
# (1, 2, 3, 4), (5, 6, 7, 8), (9, 10, 11, 12), (13, 14, 15, 16) => (1), (2), (3), (4)
#
# This cell overwrites PAM_Val, so it is not re-runnable: a second run
# would map the bucket ids 2, 3 and 4 to 1 as well.

print(usercondition_user1.PAM_Val.value_counts())

pam_buckets = {
    1: [1, 2, 3, 4],
    2: [5, 6, 7, 8],
    3: [9, 10, 11, 12],
    4: [13, 14, 15, 16],
}

for conditions in (usercondition_user1, usercondition_user2):
    for bucket_id, raw_scores in pam_buckets.items():
        conditions['PAM_Val'] = conditions['PAM_Val'].replace(raw_scores, bucket_id)

print(usercondition_user1.PAM_Val.value_counts())

PAM_Val
2     7
3     4
8     4
14    3
7     3
4     2
9     2
1     2
6     2
5     1
13    1
15    1
10    1
Name: count, dtype: int64
PAM_Val
1    15
2    10
4     5
3     3
Name: count, dtype: int64


In [10]:
# Bucketed PAM distribution of both participants, separated by a blank line.
print(
    'User 1',
    usercondition_user1.PAM_Val.value_counts(),
    '',
    'User 2',
    usercondition_user2.PAM_Val.value_counts(),
    sep='\n',
)

User 1
PAM_Val
1    15
2    10
4     5
3     3
Name: count, dtype: int64

User 2
PAM_Val
2    37
1    14
3    11
4     3
Name: count, dtype: int64


In [11]:
# Re-index every frame by its parsed 'Time' column and remove the now
# redundant column. Not re-runnable: after one run 'Time' no longer exists
# as a column, so the lookup below raises.
mousedata_user1 = (
    mousedata_user1
    .set_index(pd.DatetimeIndex(mousedata_user1['Time']))
    .drop(columns='Time')
)
mousedata_user2 = (
    mousedata_user2
    .set_index(pd.DatetimeIndex(mousedata_user2['Time']))
    .drop(columns='Time')
)
usercondition_user1 = (
    usercondition_user1
    .set_index(pd.DatetimeIndex(usercondition_user1['Time']))
    .drop(columns='Time')
)
usercondition_user2 = (
    usercondition_user2
    .set_index(pd.DatetimeIndex(usercondition_user2['Time']))
    .drop(columns='Time')
)

print(mousedata_user1.head(5))

                           Event_Type    X    Y
Time                                           
2021-09-10 11:59:42.515770       Move  518  381
2021-09-10 11:59:42.523750       Move  511  388
2021-09-10 11:59:42.531727       Move  509  393
2021-09-10 11:59:42.539705       Move  505  397
2021-09-10 11:59:42.547684       Move  501  399


In [12]:
import numpy as np

# Gap between consecutive mouse events as int64 (nanoseconds, judging by the
# printed values); the first event has no predecessor, so its gap is 0.
event_gaps = mousedata_user1.index.to_series().diff()
print(event_gaps.fillna(pd.Timedelta(0.0)).astype(np.int64))

Time
2021-09-10 11:59:42.515770             0
2021-09-10 11:59:42.523750       7980000
2021-09-10 11:59:42.531727       7977000
2021-09-10 11:59:42.539705       7978000
2021-09-10 11:59:42.547684       7979000
                                 ...    
2021-09-14 18:58:25.083623      99734000
2021-09-14 18:58:25.199313     115690000
2021-09-14 18:58:25.600241     400928000
2021-09-14 18:58:27.096237    1495996000
2021-09-14 18:58:27.238855     142618000
Name: Time, Length: 870384, dtype: int64


In [13]:
# First bucketed PAM reports, now indexed by timestamp (head() defaults to 5 rows).
print(usercondition_user1.head())

                            PAM_Val
Time                               
2021-09-10 12:03:49.599397        4
2021-09-10 12:05:18.375074        1
2021-09-10 12:56:42.248174        1
2021-09-10 13:27:00.550791        1
2021-09-10 13:57:10.866776        2


In [14]:
# assigns pam value, unique identifier, and delta time to each sequence
def label_sequences(mousedata, usercondition):
    """Label every mouse event with the PAM report that follows it.

    The PAM report timestamps split the mouse log into consecutive windows:
    window 0 holds every event up to and including the first report, window
    ``k`` (k >= 1) holds the events after report ``k - 1`` up to and including
    report ``k``. Events after the last report have no label and are left out.

    Args:
        mousedata: Mouse events indexed by timestamp.
        usercondition: PAM reports indexed by timestamp, with a ``PAM_Val``
            column. Reports are taken in the order they appear in the frame.

    Returns:
        A new DataFrame with the rows of ``mousedata`` that fall in some
        window, plus three columns:
        ``PAM_Val`` (value of the report closing the window),
        ``seq_num`` (0-based window number) and
        ``delta_time`` (int64 gap to the previous event of the same window;
        0 for the first event of a window).

    Raises:
        ValueError: If ``usercondition`` has no rows.
    """
    if len(usercondition) == 0:
        raise ValueError('usercondition has no PAM reports to label against')

    report_times = usercondition.index
    # Positional lookup keeps the value a scalar even if two reports share
    # a timestamp (a label-based .loc lookup would return several rows).
    pam_values = usercondition['PAM_Val']
    event_times = mousedata.index

    segments = []
    for seq_num in range(len(report_times)):
        in_window = event_times <= report_times[seq_num]
        if seq_num > 0:
            in_window = in_window & (event_times > report_times[seq_num - 1])

        segment = mousedata.loc[in_window]
        gaps = segment.index.to_series().diff().fillna(pd.Timedelta(0)).astype(np.int64)
        segments.append(
            segment.assign(
                PAM_Val=pam_values.iloc[seq_num],
                seq_num=seq_num,
                delta_time=gaps,
            )
        )

    # One concat at the end: concatenating inside the loop copies the whole
    # accumulated frame on every iteration.
    return pd.concat(segments)


comb_user1 = label_sequences(mousedata_user1, usercondition_user1)
comb_user2 = label_sequences(mousedata_user2, usercondition_user2)

print(comb_user1)

                               Event_Type    X    Y  PAM_Val  seq_num   
Time                                                                    
2021-09-10 11:59:42.515770           Move  518  381        4        0  \
2021-09-10 11:59:42.523750           Move  511  388        4        0   
2021-09-10 11:59:42.531727           Move  509  393        4        0   
2021-09-10 11:59:42.539705           Move  505  397        4        0   
2021-09-10 11:59:42.547684           Move  501  399        4        0   
...                                   ...  ...  ...      ...      ...   
2021-09-14 18:31:29.481415           Move  424  676        1       32   
2021-09-14 18:31:29.498370           Move  429  673        1       32   
2021-09-14 18:31:29.514327           Move  432  671        1       32   
2021-09-14 18:31:29.666919   Left_Pressed  432  671        1       32   
2021-09-14 18:31:29.747703  Left_Released  432  671        1       32   

                            delta_time  
Time     

In [15]:
# First ten labelled mouse events of user 1.
print(comb_user1.head(n=10))

                           Event_Type    X    Y  PAM_Val  seq_num  delta_time
Time                                                                         
2021-09-10 11:59:42.515770       Move  518  381        4        0           0
2021-09-10 11:59:42.523750       Move  511  388        4        0     7980000
2021-09-10 11:59:42.531727       Move  509  393        4        0     7977000
2021-09-10 11:59:42.539705       Move  505  397        4        0     7978000
2021-09-10 11:59:42.547684       Move  501  399        4        0     7979000
2021-09-10 11:59:42.555664       Move  500  401        4        0     7980000
2021-09-10 11:59:42.564639       Move  498  403        4        0     8975000
2021-09-10 11:59:42.571621       Move  498  403        4        0     6982000
2021-09-10 11:59:42.579599       Move  495  407        4        0     7978000
2021-09-10 11:59:42.588576       Move  495  408        4        0     8977000


In [16]:
# First ten labelled mouse events of user 2.
print(comb_user2.head(n=10))

                           Event_Type    X    Y  PAM_Val  seq_num  delta_time
Time                                                                         
2021-09-07 08:40:02.135293       Move  917  654        1        0           0
2021-09-07 08:40:02.180209       Move  915  654        1        0    44916000
2021-09-07 08:40:02.247588       Move  912  654        1        0    67379000
2021-09-07 08:40:02.997540       Move  914  654        1        0   749952000
2021-09-07 08:40:03.003413       Move  914  652        1        0     5873000
2021-09-07 08:40:03.014151       Move  917  652        1        0    10738000
2021-09-07 08:40:03.025856       Move  919  652        1        0    11705000
2021-09-07 08:40:03.028787       Move  919  649        1        0     2931000
2021-09-07 08:40:03.035623       Move  919  647        1        0     6836000
2021-09-07 08:40:03.038552       Move  922  647        1        0     2929000


In [17]:
# Sanity summary for user 1: number of PAM reports, number of labelled
# mouse events, missing values per column, and the bucketed PAM distribution.
pam_counts_user1 = usercondition_user1.PAM_Val.value_counts()

print('User 1:')
print(f'{sum(pam_counts_user1.values)} sequences')
print(f'{len(comb_user1)} timestamps')
print()
print('--- NaN Count ---')
print(comb_user1.isna().sum())
print()
print('--- PAM distribution ---')
print(pam_counts_user1)

User 1:
33 sequences
844221 timestamps

--- NaN Count ---
Event_Type    0
X             0
Y             0
PAM_Val       0
seq_num       0
delta_time    0
dtype: int64

--- PAM distribution ---
PAM_Val
1    15
2    10
4     5
3     3
Name: count, dtype: int64


In [18]:
# Sanity summary for user 2: number of PAM reports, number of labelled
# mouse events, missing values per column, and the bucketed PAM distribution.
pam_counts_user2 = usercondition_user2.PAM_Val.value_counts()

print('User 2:')
print(f'{sum(pam_counts_user2.values)} sequences')
print(f'{len(comb_user2)} timestamps')
print()
print('--- NaN Count ---')
print(comb_user2.isna().sum())
print()
print('--- PAM distribution ---')
print(pam_counts_user2)

User 2:
65 sequences
4272858 timestamps

--- NaN Count ---
Event_Type    0
X             0
Y             0
PAM_Val       0
seq_num       0
delta_time    0
dtype: int64

--- PAM distribution ---
PAM_Val
2    37
1    14
3    11
4     3
Name: count, dtype: int64


In [19]:
# Persist the labelled mouse events of each participant next to the notebook.
for labelled_frame, csv_path in (
    (comb_user1, './user1_preprocessed_4.csv'),
    (comb_user2, './user2_preprocessed_4.csv'),
):
    labelled_frame.to_csv(csv_path)