In [11]:
#!pip install --upgrade pip
#!pip install datasets
#!pip install tensorboard
#!pip install transformers

## All other imports are here:

In [12]:
from dataclasses import dataclass, field
import json
import logging
import os
from typing import Optional

from math import isnan

import random

import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoConfig, Trainer, EvalPrediction, set_seed
from transformers.training_args import TrainingArguments

------------------------------

In [13]:
## Gaze4Hate dataset
filename = "../processed_data/click_output_data_Jan24.csv"
gaze_df = pd.read_csv(filename)
# Remove every column whose name matches "Unname"
# (presumably index columns left over from an earlier CSV export -- TODO confirm).
gaze_df = gaze_df.drop(columns=gaze_df.filter(regex="Unname").columns)
gaze_df.head(5)

Unnamed: 0,IP_INDEX,IP_LABEL,RECORDING_SESSION_LABEL,TRIAL_INDEX,IA_AVERAGE_FIX_PUPIL_SIZE,TRIAL_DWELL_TIME,TRIAL_FIXATION_COUNT,TRIAL_IA_COUNT,TRIAL_LABEL,IA_FIXATION_%,...,forward_reg_list,total_reg_list,backward_reg_count,forward_reg_count,total_reg_count,Clicked,token,Intensity_Category,Intensity_Category_Binary,Pupilsize_variation
0,1,Assertion_Image_Period,p1,5,931.0,6522.0,30,13,Trial: 5,0.1,...,"[3, 2, 2]","[3, 2, 2]",0.0,3.0,3.0,False,frauen,hate,hate,68.0
1,1,Assertion_Image_Period,p1,5,950.0,6522.0,30,13,Trial: 5,0.1,...,"[4, 4]","[1, 4, 4]",1.0,2.0,3.0,False,haben,hate,hate,111.0
2,1,Assertion_Image_Period,p1,5,901.0,6522.0,30,13,Trial: 5,0.1,...,[4],"[1, 4, 1]",2.0,1.0,3.0,False,einen,hate,hate,55.0
3,1,Assertion_Image_Period,p1,5,945.5,6522.0,30,13,Trial: 5,0.2,...,"[6, 5, 5, 5]","[6, 5, 5, 5]",0.0,4.0,4.0,True,signifikant,hate,hate,146.0
4,1,Assertion_Image_Period,p1,5,946.43,6522.0,30,13,Trial: 5,0.2333,...,"[6, 6]","[6, 3, 6]",1.0,2.0,3.0,True,geringeren,hate,hate,166.0


In [14]:
# Word-level dwell times for recording session "p1", trial 6.
is_p1_trial_6 = gaze_df['RECORDING_SESSION_LABEL'].eq("p1") & gaze_df['TRIAL_INDEX'].eq(6)
gaze_df.loc[is_p1_trial_6, 'IA_DWELL_TIME']

6     649
7     525
8     636
9       0
10    543
11    304
12    223
13    141
Name: IA_DWELL_TIME, dtype: int64

In [15]:
### Some words are never fixated. Sentence-level parameters such as TRIAL_DWELL_TIME and
### TRIAL_FIXATION_COUNT are still computed, but some word-level parameters are NaN.
### To be able to run the ML models we fill these values (per-trial average, zero, or an
### empty list, depending on the parameter); dropping the rows would introduce other problems.

# Columns whose fill value is the mean of the column within the same session/trial.
avg_col_for_nan = ['IA_AVERAGE_FIX_PUPIL_SIZE', 'IA_MAX_FIX_PUPIL_SIZE','IA_MIN_FIX_PUPIL_SIZE', 'Pupilsize_variation','IA_FIRST_FIXATION_DURATION', 'IA_SECOND_FIXATION_DURATION', 'IA_SECOND_RUN_FIXATION_%','IA_FIRST_RUN_FIXATION_%']

# Columns whose fill value is 0.
zero_col_for_nan = ['IA_REGRESSION_IN', 'IA_RUN_COUNT', 'IA_REGRESSION_IN_COUNT','IA_REGRESSION_OUT', 'IA_REGRESSION_OUT_COUNT', 'backward_reg_count', 'forward_reg_count', 'total_reg_count']

# Columns whose fill value is an empty list.
empty_list_for_nan = ['backward_reg_list', 'forward_reg_list', 'total_reg_list']


grouped_df_nans = gaze_df.groupby(['RECORDING_SESSION_LABEL', 'TRIAL_INDEX'], dropna=False)

# Maps (RECORDING_SESSION_LABEL, TRIAL_INDEX) -> {column name: fill value}.
# Sample key: ('p1', 5)
to_change_values_dict = dict()
# Iterate the groupby directly instead of looking every key up again with get_group().
for group_name, trial_df in grouped_df_nans:
    # Series.mean() skips NaN by default, so the average is taken over the rows of the
    # trial that have a value; it is NaN itself when no row of the trial has one.
    col_dict = {avg_col: trial_df[avg_col].mean() for avg_col in avg_col_for_nan}
    col_dict.update({zero_col: 0 for zero_col in zero_col_for_nan})
    # A fresh list per column and per group, so groups never share a list object.
    col_dict.update({empty_col: [] for empty_col in empty_list_for_nan})
    to_change_values_dict[group_name] = col_dict

to_change_values_dict # to_change_values_dict['p1', 5]    

{('p1', 5): {'IA_AVERAGE_FIX_PUPIL_SIZE': 928.465,
  'IA_MAX_FIX_PUPIL_SIZE': 979.3333333333334,
  'IA_MIN_FIX_PUPIL_SIZE': 876.6666666666666,
  'Pupilsize_variation': 102.66666666666667,
  'IA_FIRST_FIXATION_DURATION': 159.0,
  'IA_SECOND_FIXATION_DURATION': 223.66666666666666,
  'IA_SECOND_RUN_FIXATION_%': 0.03886666666666667,
  'IA_FIRST_RUN_FIXATION_%': 0.049983333333333345,
  'IA_REGRESSION_IN': 0,
  'IA_RUN_COUNT': 0,
  'IA_REGRESSION_IN_COUNT': 0,
  'IA_REGRESSION_OUT': 0,
  'IA_REGRESSION_OUT_COUNT': 0,
  'backward_reg_count': 0,
  'forward_reg_count': 0,
  'total_reg_count': 0,
  'backward_reg_list': [],
  'forward_reg_list': [],
  'total_reg_list': []},
 ('p1', 6): {'IA_AVERAGE_FIX_PUPIL_SIZE': 923.5714285714286,
  'IA_MAX_FIX_PUPIL_SIZE': 959.8571428571429,
  'IA_MIN_FIX_PUPIL_SIZE': 893.2857142857143,
  'Pupilsize_variation': 66.57142857142857,
  'IA_FIRST_FIXATION_DURATION': 162.57142857142858,
  'IA_SECOND_FIXATION_DURATION': 215.4,
  'IA_SECOND_RUN_FIXATION_%': 0.0556000

In [16]:
# Fill values computed for recording session 'p1', trial 6.
to_change_values_dict[('p1', 6)]

{'IA_AVERAGE_FIX_PUPIL_SIZE': 923.5714285714286,
 'IA_MAX_FIX_PUPIL_SIZE': 959.8571428571429,
 'IA_MIN_FIX_PUPIL_SIZE': 893.2857142857143,
 'Pupilsize_variation': 66.57142857142857,
 'IA_FIRST_FIXATION_DURATION': 162.57142857142858,
 'IA_SECOND_FIXATION_DURATION': 215.4,
 'IA_SECOND_RUN_FIXATION_%': 0.055600000000000004,
 'IA_FIRST_RUN_FIXATION_%': 0.06352857142857142,
 'IA_REGRESSION_IN': 0,
 'IA_RUN_COUNT': 0,
 'IA_REGRESSION_IN_COUNT': 0,
 'IA_REGRESSION_OUT': 0,
 'IA_REGRESSION_OUT_COUNT': 0,
 'backward_reg_count': 0,
 'forward_reg_count': 0,
 'total_reg_count': 0,
 'backward_reg_list': [],
 'forward_reg_list': [],
 'total_reg_list': []}

In [17]:
# Not filled anywhere in this cell; kept so that code relying on the name still finds it.
to_remove_index = []

## Rows with IA_DWELL_TIME == 0 are treated as words that were not fixated: every column
## listed in to_change_values_dict for the row's (session, trial) is overwritten with the
## trial mean, zero, or an empty list, depending on the parameter.
## Rows with a non-zero dwell time are left untouched, even if some of their columns are NaN.

# Snapshot the keys of the affected rows first, so gaze_df is not modified while it is
# being iterated (and only the rows that need filling are visited).
unfixated_keys = gaze_df.loc[gaze_df['IA_DWELL_TIME'] == 0, ['RECORDING_SESSION_LABEL', 'TRIAL_INDEX']]
for index, pno, tno in unfixated_keys.itertuples(index=True, name=None):
    for key, value in to_change_values_dict[pno, tno].items():
        # Copy list fill values so that rows of the same trial do not share one list object.
        gaze_df.at[index, key] = list(value) if isinstance(value, list) else value

## NOTE: if a filled average is still NaN, there was no fixation at all for the sentence.
## Collecting such rows in to_remove_index (e.g. by checking IA_AVERAGE_FIX_PUPIL_SIZE with
## pd.isnull) is currently disabled.
gaze_df.to_csv('gaze_word_level.csv', index=False)

In [18]:
# Number of word-level rows in the frame.
print('DF Length: ', gaze_df.shape[0])

DF Length:  27649


In [19]:
## Largest and smallest TRIAL_IA_COUNT in the dataset (the maximum is needed for padding)
trial_ia_counts = gaze_df['TRIAL_IA_COUNT']
trial_ia_counts.max(), trial_ia_counts.min()

(29, 6)

In [20]:
# Display the token column.
gaze_df.loc[:, 'token']

0               frauen
1                haben
2                einen
3          signifikant
4           geringeren
             ...      
27644             sind
27645    minderleister
27646               in
27647          unserer
27648     gesellschaft
Name: token, Length: 27649, dtype: object

------------------------