In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np

import json
import pickle
import re

import matplotlib.pyplot as plt

# Variables

Here are the selected variables:

- `TRIAL_INDEX`
- `EYE_USED`
- `CURRENT_FIX_X`
- `CURRENT_FIX_Y`
- `CURRENT_FIX_START`
- `CURRENT_FIX_DURATION`
- `NEXT_SAC_END_X`
- `NEXT_SAC_END_Y`
- `NEXT_SAC_AMPLITUDE`
- `NEXT_SAC_DIRECTION`
- `NEXT_SAC_DURATION`
- `NEXT_SAC_ANGLE`
- `NEXT_SAC_AVG_VELOCITY`

Since the event timestamps in the eye data are not expressed as clock times, we need to synchronize the mouse data and the eye data. To do so, we sent a message containing `TRIAL START = XXXX-XX-XX XX:XX:XX` at the beginning of each trial and `TRIAL END = XXXX-XX-XX XX:XX:XX` at the end.

# Load Data

In [3]:
# Participant identifier; it is substituted into every ./data/part_<id>/ path below.
part_id = 1

In [4]:
# Load this participant's tab-separated fixation table (the CURRENT_FIX_* / NEXT_SAC_* columns).
df = pd.read_csv("./data/part_{0}/part_{0}.csv".format(part_id), sep="\t")
df.head()

Unnamed: 0,TRIAL_INDEX,EYE_USED,CURRENT_FIX_X,CURRENT_FIX_Y,CURRENT_FIX_START,CURRENT_FIX_DURATION,NEXT_SAC_END_X,NEXT_SAC_END_Y,NEXT_SAC_AMPLITUDE,NEXT_SAC_DIRECTION,NEXT_SAC_DURATION,NEXT_SAC_ANGLE,NEXT_SAC_AVG_VELOCITY
0,1,RIGHT,75330,39970,6,71,74320,59130,507,DOWN,154,-9254,3292
1,1,RIGHT,74180,60260,231,151,71890,50690,273,UP,28,10499,9763
2,1,RIGHT,71820,51930,410,180,95760,51380,692,RIGHT,34,382,20359
3,1,RIGHT,95150,51770,624,156,109010,51950,398,RIGHT,32,28,12447
4,1,RIGHT,109050,52430,812,169,57650,12650,1806,LEFT,128,14238,14110


In [5]:
# Load this participant's tab-separated message table (one row per message, with its trial-relative time).
msg = pd.read_csv("./data/part_{0}/part_{0}_msg.csv".format(part_id), sep="\t")
msg.head()

Unnamed: 0,TRIAL_INDEX,CURRENT_MSG_TIME,CURRENT_MSG_TEXT
0,1,7,TRIAL START = 2018-04-03 14:52:01
1,1,59993,TRIAL END = 2018-04-03 14:53:01
2,2,6,TRIAL START = 2018-04-03 14:53:51
3,2,60288,TRIAL END = 2018-04-03 14:54:51
4,3,5,TRIAL START = 2018-04-03 14:55:58


In [6]:
# Load the JSON records file of this participant (scroll records are extracted from it below).
# The context manager guarantees the file handle is closed once parsing is done.
with open("./data/part_{0}/records-{0}.mrs".format(part_id)) as mrs_file:
    mrs_json = json.load(mrs_file)

In [7]:
# Load the pickled configuration of this participant (provides "rand_weblist", used below).
# NOTE(review): pickle.load can execute arbitrary code -- only load files you trust.
# The context manager guarantees the file handle is closed once loading is done.
with open("./data/part_{0}/part_{0}.cfg".format(part_id), 'rb') as cfg_file:
    config = pickle.load(cfg_file)

# Extract data

In [8]:
# Extract scroll data from json file
def extract_scroll(mrs_json, idx):
    """Return the scroll records of website ``idx`` as a DataFrame.

    Parameters
    ----------
    mrs_json : iterable of dict
        Parsed records; each item maps string keys to record lists.
    idx : int
        Website identifier to look for.

    Returns
    -------
    pandas.DataFrame or None
        Frame built from the value of the first key that matches, or
        ``None`` when no item contains a matching key.
    """
    # Keys come in two formats: scroll|mouse-website_id-part_id or
    # scroll|mouse-website_id, so the id may or may not be followed by more text.
    # The negative lookahead prevents e.g. idx=1 from matching "scroll-10".
    # Raw strings keep "\d" a regex token instead of an invalid string escape.
    pattern = re.compile(r"scroll-" + str(idx) + r"(?!\d)")

    for item in mrs_json:
        for key in item:
            if pattern.match(key):
                return pd.DataFrame(item[key])

    return None

In [9]:
# Build scroll offset dataset
# Collect one frame per website and concatenate once, instead of re-copying
# the accumulated frame on every iteration.
scroll_frames = []
for i in range(1, 19):
    scroll = extract_scroll(mrs_json, i)
    if scroll is None:
        # extract_scroll returns None when no key matches this website id;
        # skip it rather than fail on the column assignment below.
        continue
    scroll["website_id"] = i
    scroll_frames.append(scroll)

# pd.concat raises on an empty list, so fall back to an empty frame.
df_scroll = pd.concat(scroll_frames) if scroll_frames else pd.DataFrame()

In [10]:
# Extract the right datetime.
# The timestamp is given by `new Date().getTime()` in JavaScript, i.e. in ms.
# Since that function gives UTC time, hours must be added to it.
# There was a time change on 25 March 2018, so the shift between the datasets
# is no longer 1h but 2h; participants 701 to 711 keep the 1h shift
# (presumably recorded before the change -- confirm).
time_to_add = 1 if 700 < part_id < 712 else 2

utc_datetime = pd.to_datetime(df_scroll["timestamp"], unit="ms")
df_scroll["datetime"] = utc_datetime + pd.Timedelta(hours=time_to_add)

In [11]:
df_scroll.head()

Unnamed: 0,offset,timestamp,website_id,datetime
0,1,1522761550855,1,2018-04-03 15:19:10.855
1,7,1522761550869,1,2018-04-03 15:19:10.869
2,18,1522761550883,1,2018-04-03 15:19:10.883
3,26,1522761550890,1,2018-04-03 15:19:10.890
4,37,1522761550897,1,2018-04-03 15:19:10.897


# Messages cleaning

In [12]:
# Drop the messages whose text contains "!MODE RECORD" and renumber the rows
is_mode_record = msg["CURRENT_MSG_TEXT"].str.contains("!MODE RECORD")
msg = msg.loc[~is_mode_record].reset_index(drop=True)
msg.head()

Unnamed: 0,TRIAL_INDEX,CURRENT_MSG_TIME,CURRENT_MSG_TEXT
0,1,7,TRIAL START = 2018-04-03 14:52:01
1,1,59993,TRIAL END = 2018-04-03 14:53:01
2,2,6,TRIAL START = 2018-04-03 14:53:51
3,2,60288,TRIAL END = 2018-04-03 14:54:51
4,3,5,TRIAL START = 2018-04-03 14:55:58


In [13]:
# Extract message datetime in a new column
def split_equal(row):
    """Split ``CURRENT_MSG_TEXT`` ("<name> = <value>") into two fields.

    Parameters
    ----------
    row : pandas.Series
        A message row holding a ``CURRENT_MSG_TEXT`` string.

    Returns
    -------
    pandas.Series
        A copy of ``row`` without ``CURRENT_MSG_TEXT`` and with two new
        entries: ``EVENT_NAME`` (text before the first "=") and
        ``EVENT_DATETIME`` (text after it), both stripped of whitespace.

    Raises
    ------
    IndexError
        If the text contains no "=".
    """
    # Split on the first "=" only, so a value that itself contains "="
    # is kept whole instead of being truncated.
    parts = row["CURRENT_MSG_TEXT"].split("=", 1)

    # Work on a copy so the caller's row is left untouched.
    out = row.copy()
    out["EVENT_NAME"] = parts[0].strip()
    out["EVENT_DATETIME"] = parts[1].strip()

    del out["CURRENT_MSG_TEXT"]

    return out

# Run split_equal on every row (axis=1) and preview the resulting columns.
msg = msg.apply(split_equal, axis=1)
msg.head()

Unnamed: 0,TRIAL_INDEX,CURRENT_MSG_TIME,EVENT_NAME,EVENT_DATETIME
0,1,7,TRIAL START,2018-04-03 14:52:01
1,1,59993,TRIAL END,2018-04-03 14:53:01
2,2,6,TRIAL START,2018-04-03 14:53:51
3,2,60288,TRIAL END,2018-04-03 14:54:51
4,3,5,TRIAL START,2018-04-03 14:55:58


In [14]:
# Parse the EVENT_DATETIME strings into pandas datetimes so they can be compared and shifted.
msg["EVENT_DATETIME"] = pd.to_datetime(msg["EVENT_DATETIME"])

# Other cleaning

In [15]:
# Left, Right, Up or Down
# Store NEXT_SAC_DIRECTION as a pandas categorical, then list the categories found in the data.
df["NEXT_SAC_DIRECTION"] = df["NEXT_SAC_DIRECTION"].astype("category")
df["NEXT_SAC_DIRECTION"].cat.categories

Index(['.', 'DOWN', 'LEFT', 'RIGHT', 'UP'], dtype='object')

In [16]:
def _decimal_comma_to_float(column, drop_dots=False):
    """Parse a string column that uses "," as decimal mark into numbers.

    Parameters
    ----------
    column : pandas.Series
        String values to convert.
    drop_dots : bool, default False
        When True, every "." is removed before the decimal comma is
        converted.

    Returns
    -------
    pandas.Series
        Result of ``pd.to_numeric`` on the normalised strings.
    """
    text = column
    if drop_dots:
        # regex=False makes the replacement literal, so "." can never be
        # read as the regex wildcard whatever the pandas default is.
        text = text.str.replace(".", "", regex=False)
    return pd.to_numeric(text.str.replace(",", ".", regex=False))


for _col in ("CURRENT_FIX_Y", "CURRENT_FIX_X"):
    df[_col] = _decimal_comma_to_float(df[_col])

# Last fixations do not have NEXT_SAC information.
# Removing the dots also turns a lone "." (presumably the placeholder for a
# missing value -- confirm) into an empty string before conversion.
for _col in (
    "NEXT_SAC_AMPLITUDE",
    "NEXT_SAC_END_X",
    "NEXT_SAC_END_Y",
    "NEXT_SAC_DURATION",
    "NEXT_SAC_ANGLE",
    "NEXT_SAC_AVG_VELOCITY",
):
    df[_col] = _decimal_comma_to_float(df[_col], drop_dots=True)

# Time Sync

In [17]:
# Time sync
def sync_time(cell, msg):
    """Convert a time in ms into an absolute datetime using a reference message.

    Parameters
    ----------
    cell : number
        Time in milliseconds, on the same clock as ``CURRENT_MSG_TIME``.
    msg : pandas.DataFrame
        Frame whose row labelled 0 holds the reference message, with its
        ``CURRENT_MSG_TIME`` (ms) and ``EVENT_DATETIME`` (datetime).

    Returns
    -------
    The reference datetime shifted by ``cell - CURRENT_MSG_TIME`` milliseconds.
    """
    reference_ms = msg["CURRENT_MSG_TIME"][0]
    reference_datetime = msg.loc[0, "EVENT_DATETIME"]
    return reference_datetime + pd.Timedelta(milliseconds=cell - reference_ms)

In [18]:
def sync_and_clean(group):
    """Timestamp the fixations of one trial and keep those inside the trial.

    Reads the module-level ``msg`` frame to find the 'TRIAL START' and
    'TRIAL END' messages of the trial.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single trial, with ``TRIAL_INDEX`` and ``CURRENT_FIX_START``.

    Returns
    -------
    pandas.DataFrame
        A copy of the rows with a new ``DATETIME`` column, restricted to
        rows strictly after the start datetime and strictly before the
        end datetime.

    Raises
    ------
    KeyError
        If the trial has no 'TRIAL START' or no 'TRIAL END' message.
    """
    trial_index = group['TRIAL_INDEX'].unique()[0]

    msg_start_trial = msg.query("TRIAL_INDEX == @trial_index and EVENT_NAME == 'TRIAL START'").reset_index(drop=True)
    msg_end_trial = msg.query("TRIAL_INDEX == @trial_index and EVENT_NAME == 'TRIAL END'").reset_index(drop=True)
    if msg_start_trial.empty or msg_end_trial.empty:
        raise KeyError("TRIAL START/TRIAL END message missing for trial {0}".format(trial_index))

    start_ms = msg_start_trial.loc[0, "CURRENT_MSG_TIME"]
    start_datetime = msg_start_trial.loc[0, "EVENT_DATETIME"]
    end_datetime = msg_end_trial.loc[0, "EVENT_DATETIME"]

    # Copy first: the frame handed over by groupby.apply must not be mutated.
    group = group.copy()

    # Same formula as sync_time, applied to the whole column at once.
    elapsed = pd.to_timedelta(group["CURRENT_FIX_START"] - start_ms, unit="ms")
    group["DATETIME"] = start_datetime + elapsed

    inside_trial = (group["DATETIME"] > start_datetime) & (group["DATETIME"] < end_datetime)
    return group[inside_trial]

In [19]:
# Run sync_and_clean on the rows of each trial, then replace the resulting index with a fresh 0..n-1 one.
df = df.groupby("TRIAL_INDEX").apply(sync_and_clean).reset_index(drop = True)

# Websites and Conditions

In [20]:
# Condition 1:  Free    + NoPub
# Condition 2:  Target  + NoPub
# Condition 3:  Free    + Skin
# Condition 4:  Target  + Skin
# Condition 5:  Free    + Skin/MPU
# Condition 6:  Target  + Skin/MPU
def get_condition(trial_num):
    """Return the condition number (1-6) of a trial, or None if none applies.

    The trial entry is ``config["rand_weblist"][trial_num - 1]``; its
    ``type`` ("free" / "target"), ``ad_id`` and ``mpu_id`` select the
    condition listed in the table above.
    """
    entry = config["rand_weblist"][trial_num - 1]

    # Odd numbers are "free" trials, even numbers are "target" trials.
    if entry["type"] == "free":
        base = 1
    elif entry["type"] == "target":
        base = 2
    else:
        return None

    ad_id, mpu_id = entry["ad_id"], entry["mpu_id"]
    if ad_id == 0 and mpu_id == 0:
        return base          # NoPub
    if ad_id > 0 and mpu_id == 0:
        return base + 2      # Skin
    if ad_id > 0 and mpu_id > 0:
        return base + 4      # Skin/MPU

    return None

def condition_string(num):
    """Return the human-readable label of condition ``num`` (1-6), else None."""
    labels = (
        (1, "Free + NoPub"),
        (2, "Target + NoPub"),
        (3, "Free + Skin"),
        (4, "Target + Skin"),
        (5, "Free + Skin/MPU"),
        (6, "Target + Skin/MPU"),
    )
    # Compare with == in order, exactly like an if/elif chain would.
    for code, label in labels:
        if num == code:
            return label
    return None

In [21]:
def get_website_id(trial_num):
    """Return the ``id`` of the website shown in trial ``trial_num`` (1-based)."""
    trial_entry = config["rand_weblist"][trial_num - 1]
    return trial_entry["id"]

In [22]:
# Look up the website id of each row from its trial number.
df["WEBSITE_ID"] = df["TRIAL_INDEX"].apply(get_website_id)

In [23]:
# Look up the condition number (1-6, or None) of each row from its trial number.
df["CONDITION"] = df["TRIAL_INDEX"].apply(get_condition)

# Offset Sync

In [24]:
def get_last_offset(trial_scroll, date_eye):
    """Return the scroll offset in effect just before ``date_eye``.

    Parameters
    ----------
    trial_scroll : pandas.DataFrame
        Scroll records with ``datetime`` and ``offset`` columns.
    date_eye : datetime-like
        Moment to look up.

    Returns
    -------
    The ``offset`` of the last row (in frame order) whose ``datetime`` is
    strictly earlier than ``date_eye``, or 0 when there is no such row.
    """
    earlier = trial_scroll.loc[trial_scroll["datetime"] < date_eye]
    if not earlier.empty:
        return earlier.iloc[-1]["offset"]
    return 0

In [25]:
def get_offset(group):
    """Add an ``OFFSET`` column holding the scroll offset at each fixation.

    Reads the module-level ``df_scroll`` frame and calls ``get_last_offset``
    for every ``DATETIME`` value of the group.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single trial, with ``WEBSITE_ID`` and ``DATETIME``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` with the new ``OFFSET`` column.
    """
    website_id = group['WEBSITE_ID'].unique()[0]

    # Loop-invariant: select this website's scroll records once instead of
    # re-filtering df_scroll for every fixation.
    website_scroll = df_scroll[df_scroll["website_id"] == website_id]

    # Copy first: the frame handed over by groupby.apply must not be mutated.
    group = group.copy()
    group["OFFSET"] = group["DATETIME"].apply(lambda x: get_last_offset(website_scroll, x))
    return group

In [26]:
# group_keys=False keeps the original row index. Otherwise recent pandas adds
# TRIAL_INDEX as an index level on top of the existing column, which makes a
# later df.groupby("TRIAL_INDEX") ambiguous.
df = df.groupby("TRIAL_INDEX", group_keys=False).apply(get_offset)

In [27]:
df.groupby("TRIAL_INDEX")["OFFSET"].unique()

TRIAL_INDEX
1     [0, 200, 300, 600, 692, 700, 717, 800, 850, 90...
2     [0, 87, 100, 199, 200, 239, 300, 400, 485, 500...
3     [0, 9, 100, 200, 201, 446, 519, 597, 638, 700,...
4     [0, 195, 386, 556, 600, 697, 898, 900, 821, 79...
5     [0, 11, 100, 155, 500, 557, 600, 699, 700, 732...
6     [0, 500, 598, 600, 698, 700, 703, 800, 897, 90...
7     [0, 43, 163, 270, 634, 798, 821, 934, 1152, 12...
8     [0, 1, 200, 497, 500, 504, 571, 700, 946, 1111...
9     [0, 299, 400, 600, 700, 859, 900, 950, 1000, 1...
10    [0, 26, 172, 392, 400, 412, 800, 1000, 1073, 1...
11    [0, 6, 500, 975, 1000, 1002, 1183, 1500, 1800,...
12    [0, 18, 500, 1200, 1900, 2971, 3000, 3013, 362...
13    [0, 94, 203, 395, 900, 935, 992, 1076, 1309, 1...
14    [0, 186, 300, 400, 401, 586, 600, 795, 800, 85...
15    [0, 37, 265, 300, 319, 500, 703, 900, 905, 100...
16    [0, 101, 300, 328, 700, 889, 1000, 1010, 1100,...
17    [0, 400, 403, 503, 900, 909, 1366, 1400, 1759,...
18    [0, 257, 308, 674, 800, 815, 9

In [28]:
# Fixation Y plus the scroll offset at that moment (presumably the Y position within the whole page -- confirm).
df["Y_OFFSET"] = df["CURRENT_FIX_Y"] + df["OFFSET"]

# Export

In [30]:
# Write the cleaned table to the participant's folder, without the index column.
df.to_csv("./data/part_{0}/part_{0}_clean.csv".format(part_id), index=False)

In [31]:
df.head()

Unnamed: 0,TRIAL_INDEX,EYE_USED,CURRENT_FIX_X,CURRENT_FIX_Y,CURRENT_FIX_START,CURRENT_FIX_DURATION,NEXT_SAC_END_X,NEXT_SAC_END_Y,NEXT_SAC_AMPLITUDE,NEXT_SAC_DIRECTION,NEXT_SAC_DURATION,NEXT_SAC_ANGLE,NEXT_SAC_AVG_VELOCITY,DATETIME,WEBSITE_ID,CONDITION,OFFSET,Y_OFFSET
0,1,RIGHT,741.8,602.6,231,151,718.9,506.9,2.73,UP,28.0,104.99,97.63,2018-04-03 14:52:01.224,9,1,0,602.6
1,1,RIGHT,718.2,519.3,410,180,957.6,513.8,6.92,RIGHT,34.0,3.82,203.59,2018-04-03 14:52:01.403,9,1,0,519.3
2,1,RIGHT,951.5,517.7,624,156,1090.1,519.5,3.98,RIGHT,32.0,0.28,124.47,2018-04-03 14:52:01.617,9,1,0,517.7
3,1,RIGHT,1090.5,524.3,812,169,576.5,126.5,18.06,LEFT,128.0,142.38,141.1,2018-04-03 14:52:01.805,9,1,0,524.3
4,1,RIGHT,579.7,123.0,1109,183,502.4,137.9,2.18,LEFT,31.0,-168.77,70.24,2018-04-03 14:52:02.102,9,1,0,123.0
