# Feature Prep for Gaze+BERT embeddings
## Feature-based
## 
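Layout of each saved vector: [BERT CLS embedding] + f1t1 + f1t2 + f1t3 + ... + f1tn + f2t1 + f2t2 + f2t3 + ... + f2tn + ... (f = feature, t = token position, n = max_token_length)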

## Setup

In [1]:
# import packages
from ast import literal_eval
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
import random
import torch
import os


import tensorflow as tf
import keras
from keras.preprocessing.sequence import TimeseriesGenerator
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

# Show every column when a DataFrame is displayed instead of eliding the middle ones.
pd.set_option('display.max_columns', None)
# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole session.
pd.options.mode.chained_assignment = None


import matplotlib.pyplot as plt
from pandas.plotting import register_matplotlib_converters
# Register pandas' date/time converters with matplotlib.
register_matplotlib_converters()
# Default size of every figure, as [width, height] in inches.
plt.rcParams['figure.figsize'] = [50, 10]


# check keras version
print('Keras Version:', keras.__version__)
# Reports whether this TensorFlow build was compiled with CUDA support; it does
# not check that a GPU is actually usable at runtime.
print('CUDA installation:', tf.test.is_built_with_cuda())

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
2024-02-11 22:06:58.282966: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-11 22:06:58.282991: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-11 22:06:58.283787: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory 

Keras Version: 2.15.0
CUDA installation: True


In [2]:
from transformers import AutoModelForMaskedLM, AutoModelForSequenceClassification,AutoTokenizer, AutoModel, AutoConfig, utils
from transformers import pipeline

from tqdm import tqdm

import torch
import pickle

import logging
import matplotlib.pyplot as plt
from torch.nn import functional as F

from sklearn.preprocessing import MinMaxScaler

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# One shared seed for Python's `random` module and for PyTorch.
random_seed = 42

# Seed both generators with the same value, `random` first and then PyTorch.
for seed_rng in (random.seed, torch.manual_seed):
    seed_rng(random_seed)

# When a GPU is present, seed every CUDA device explicitly as well.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)

In [4]:
# Seed NumPy's global random state and TensorFlow's global random seed.
# NOTE(review): these seeds (16 and 19) differ from each other and from the
# random_seed = 42 used for `random`/`torch` in this notebook — confirm this is
# intentional.
from numpy.random import seed
seed(16)
tf.random.set_seed(19)

### Read the already-processed data from the token-based version

## Data Reading

In [5]:
# Select the condition: it names the folder that holds the three split CSVs.
condition = "entire_dataset" # or 'entire_data'

# Load the splits in the order train -> val -> test from <condition>/<split>.csv.
train, val, test = (
    pd.read_csv(condition + csv_name)
    for csv_name in ('/train.csv', '/val.csv', '/test.csv')
)

# Target column: binary label (hate / nohate).
# The multiclass alternative (hate, neutral, positive) is 'Intensity_Category'.
y_col = 'Intensity_Category_Binary'

# Quick sanity check of the split sizes (rows, columns).
print(train.shape, test.shape, val.shape)

(2772, 33) (598, 33) (246, 33)


In [14]:
train.columns

Index(['level_0', 'RECORDING_SESSION_LABEL', 'TRIAL_INDEX',
       'IA_AVERAGE_FIX_PUPIL_SIZE', 'TRIAL_DWELL_TIME', 'TRIAL_FIXATION_COUNT',
       'IA_FIXATION_%', 'IA_FIXATION_COUNT', 'IA_DWELL_TIME_%',
       'IA_FIRST_FIXATION_DURATION', 'IA_LABEL', 'IA_MAX_FIX_PUPIL_SIZE',
       'IA_MIN_FIX_PUPIL_SIZE', 'IA_REGRESSION_IN', 'IA_REGRESSION_IN_COUNT',
       'IA_REGRESSION_OUT', 'IA_REGRESSION_OUT_COUNT', 'IA_SKIP',
       'IA_RUN_COUNT', 'sno', 'assertion', 'IA_FIRST_RUN_FIXATION_%',
       'cond_remark', 'cleaned_tokens', 'backward_reg_count',
       'forward_reg_count', 'total_reg_count', 'Clicked', 'token',
       'Intensity_Category', 'Intensity_Category_Binary',
       'Pupilsize_variation', 'index'],
      dtype='object')

In [6]:
import ast

# 'cleaned_tokens' comes back from the CSV as the string form of a Python list;
# parse it into a real list for every split.
for split_df in (train, val, test):
    split_df['cleaned_tokens'] = split_df['cleaned_tokens'].apply(ast.literal_eval)

# Peek at the token lists of the first four training rows.
words = train['cleaned_tokens'].values.tolist()[:4]
print(words)

[['der', 'mann', 'verprasst', 'das', 'geld'], ['alle', 'männer', 'sind', 'machos'], ['männer', 'gehören', 'auf', 'den', 'bau'], ['männer', 'verbingen', 'wenig', 'zeit', 'mit', 'ihren', 'kindern']]



# Load the tokenizer and encoder selected by `bert_type_short`.
# NOTE(review): `bert_type_short` is not assigned anywhere in the visible part
# of this notebook, so running this cell as-is would raise NameError — confirm
# whether it is defined elsewhere or whether this cell is a leftover.
# NOTE(review): this cell tests for "fine_tuned", while processed_split_data()
# and processed_split_data_rv() test for "finetuned" — confirm which spelling
# is intended.
if bert_type_short=='base':
    # Pretrained German BERT (uncased) from the Hugging Face hub.
    tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-uncased") #, use_fast=False)    
    model = AutoModel.from_pretrained('dbmdz/bert-base-german-uncased', output_hidden_states=True, return_dict=True, output_attentions=True)

if  bert_type_short =="fine_tuned":
    # Weights are read from a local directory, while the tokenizer is loaded
    # by its hub name.
    model_dir = '../sannes_playground/finetuned_models/rott-hatecheck'
    tokenizer_dir = 'chrisrtt/gbert-multi-class-german-hate'
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
    model = AutoModel.from_pretrained(model_dir, output_hidden_states=True, return_dict=True, output_attentions=True)
      

## Get BERT embeddings (the first-token [CLS] vector of the last hidden layer)

In [27]:
def return_bert_embeddings(text, tokenizer, model, max_length=50):
    """Return the last-layer embedding of the first token of ``text``.

    The text is encoded with ``tokenizer.encode_plus`` (PyTorch tensors, padded
    to ``max_length``; truncation is not requested) and passed through
    ``model`` with gradients disabled.

    Args:
        text: The sentence to embed.
        tokenizer: Object providing ``encode_plus`` that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        model: Callable as ``model(input_ids, attention_mask=...)`` whose
            result exposes ``last_hidden_state``.
        max_length: Length the encoded sequence is padded to. Defaults to 50,
            the value that used to be hard-coded.

    Returns:
        ``last_hidden_state[0][0]``: the vector at position 0 of the first
        (and only) sequence in the batch. The original code calls this the
        CLS embedding; for BERT-style tokenizers position 0 is the ``[CLS]``
        token.
    """
    encoded = tokenizer.encode_plus(
        text,
        return_tensors="pt",
        padding='max_length',
        max_length=max_length,
    )

    # Inference only: no autograd graph is needed for feature extraction.
    with torch.no_grad():
        outputs = model(
            encoded['input_ids'],
            attention_mask=encoded['attention_mask'],
        )

    # Batch element 0, token position 0.
    return outputs.last_hidden_state[0][0]

### Concatenate all embeddings and feature vectors (fixed-size)
#### NOTE: for the ablation study with different sets of features, it would be nice to automate this!

In [28]:
# Names of the dataset splits.
# NOTE(review): not referenced again in the visible part of this notebook (the
# driver cells spell the split names out) — confirm it is still needed.
split_types= ['train', 'test', 'val']

In [29]:
train.loc[1200]

level_0                                                                    1203
RECORDING_SESSION_LABEL                                                      p9
TRIAL_INDEX                                                                  73
IA_AVERAGE_FIX_PUPIL_SIZE     [0.22703388294980972, 0.22739626743975358, 0.2...
TRIAL_DWELL_TIME                                                         2319.0
TRIAL_FIXATION_COUNT                                                         13
IA_FIXATION_%                 [0.38865731969180245, 0.38865731969180245, 0.2...
IA_FIXATION_COUNT                                                  [4, 4, 3, 1]
IA_DWELL_TIME_%               [0.46767348976535195, 0.33100349475786317, 0.2...
IA_FIRST_FIXATION_DURATION    [0.17597292724196278, 0.10490693739424706, 0.1...
IA_LABEL                                                                 Word 1
IA_MAX_FIX_PUPIL_SIZE         [0.20864661654135339, 0.2030075187969925, 0.20...
IA_MIN_FIX_PUPIL_SIZE         [0.2376237

In [30]:
# Number of token slots per feature: processed_split_data() right-pads every
# per-token feature list with zeros up to this length.
# NOTE(review): presumably the longest token sequence in the dataset — confirm.
max_token_length = 14

import pickle
import ast
def processed_split_data(split_type, df, bert_type):
    """Build and save one fixed-size BERT+gaze feature vector per row of ``df``.

    For every row the function
      1. embeds ``row['assertion']`` with ``return_bert_embeddings``,
      2. parses each per-token feature column with ``literal_eval`` and
         right-pads it with zeros to the module-level ``max_token_length``,
      3. flattens the features feature-by-feature (all token slots of feature
         1, then all of feature 2, ...) in the order numeric gaze features,
         categorical gaze features, ``'Clicked'``,
      4. concatenates embedding and flattened features and saves the result to
         ``attribute_based/pts/<condition>/<bert_type>/<split_type>/tensor_<index>.pt``.

    Relies on the module-level names ``condition``, ``max_token_length`` and
    ``return_bert_embeddings``.

    Args:
        split_type: Name of the split; used only as the output sub-folder.
        df: DataFrame holding ``'assertion'`` and the feature columns listed
            below, each feature stored as the string form of a list.
        bert_type: ``'base'`` or ``'finetuned'``; selects tokenizer and model.

    Raises:
        ValueError: If ``bert_type`` is not one of the two supported values,
            or if a feature list is longer than ``max_token_length`` (padding
            could not keep the vector size fixed).
    """
    # Original note: a small set for trying out.
    num_gaze_features = [
        'IA_FIXATION_%', 'IA_RUN_COUNT', 'IA_DWELL_TIME_%',
        'IA_AVERAGE_FIX_PUPIL_SIZE', 'IA_MAX_FIX_PUPIL_SIZE',
        'IA_MIN_FIX_PUPIL_SIZE', 'Pupilsize_variation',
        'IA_FIRST_FIXATION_DURATION', 'IA_FIRST_RUN_FIXATION_%',
        'IA_REGRESSION_IN_COUNT', 'IA_REGRESSION_OUT_COUNT',
        'backward_reg_count', 'forward_reg_count', 'total_reg_count',
    ]
    cat_features = ['IA_REGRESSION_IN', 'IA_REGRESSION_OUT', "IA_SKIP"]
    rationale_feature = ['Clicked']

    # Order in which the features are laid out in the saved vector.
    feature_order = num_gaze_features + cat_features + rationale_feature

    if bert_type == 'base':
        tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-uncased")
        model = AutoModel.from_pretrained('dbmdz/bert-base-german-uncased', output_hidden_states=True, return_dict=True, output_attentions=True)
    elif bert_type == "finetuned":
        # Weights come from a local directory, the tokenizer from the hub.
        model_dir = '../sannes_playground/finetuned_models/rott-hatecheck'
        tokenizer_dir = 'chrisrtt/gbert-multi-class-german-hate'
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
        model = AutoModel.from_pretrained(model_dir, output_hidden_states=True, return_dict=True, output_attentions=True)
    else:
        # Previously this surfaced as an UnboundLocalError on the first row.
        raise ValueError(
            f"unknown bert_type {bert_type!r}; expected 'base' or 'finetuned'"
        )

    # The output folder is the same for every row, so create it once.
    folder_dir = "attribute_based/pts/"+condition+"/"+bert_type+"/"+split_type
    os.makedirs(folder_dir, exist_ok=True)

    for index_t, row in df.iterrows():
        print('processing the index: ', index_t)

        # The original author lowercases the assertion for the fine-tuned
        # model (their note: chrisrtt/gbert-multi-class-german-hate is case
        # sensitive).
        text = row['assertion']
        if bert_type == "finetuned":
            text = text.lower()

        CLS_emb = return_bert_embeddings(text, tokenizer, model)

        gaze_token_emb = []
        for feat_name in feature_order:
            feat = literal_eval(row[feat_name])
            if len(feat) > max_token_length:
                # Padding cannot shrink a list; without this check the saved
                # vector would silently change size for this row.
                raise ValueError(
                    f"row {index_t}: feature {feat_name!r} has {len(feat)} values, "
                    f"more than max_token_length={max_token_length}"
                )
            gaze_token_emb.append(feat + [0] * (max_token_length - len(feat)))

        # reshape(-1) flattens feature-major: f1t1..f1tn, f2t1..f2tn, ...
        token_emb = torch.cat((CLS_emb, torch.tensor(gaze_token_emb).reshape(-1)))

        torch.save(token_emb, folder_dir+"/tensor_"+str(index_t)+".pt")

      


In [31]:
# Build the saved tensors for both BERT variants and all three splits
# (train, then test, then val for each variant).
variations = ['base', 'finetuned'] # 'base_allfeats', 'finetuned_allfeats',
split_frames = (('train', train), ('test', test), ('val', val))

for variation in variations:
    for split_name, split_df in split_frames:
        processed_split_data(split_name, split_df, variation)

processing the index:  0
processing the index:  1
processing the index:  2
processing the index:  3
processing the index:  4
processing the index:  5
processing the index:  6
processing the index:  7
processing the index:  8
processing the index:  9
processing the index:  10
processing the index:  11
processing the index:  12
processing the index:  13
processing the index:  14
processing the index:  15
processing the index:  16
processing the index:  17
processing the index:  18
processing the index:  19
processing the index:  20
processing the index:  21
processing the index:  22
processing the index:  23
processing the index:  24
processing the index:  25
processing the index:  26
processing the index:  27
processing the index:  28
processing the index:  29
processing the index:  30
processing the index:  31
processing the index:  32
processing the index:  33
processing the index:  34
processing the index:  35
processing the index:  36
processing the index:  37
processing the index: 

KeyboardInterrupt: 

In [None]:
### VERSION 2: BERT embedding + clicked (rationale) feature + then gaze features
# Re-declares the zero-padding length used by processed_split_data_rv()
# (same value as in the earlier cell).
max_token_length = 14

import pickle
import ast
def processed_split_data_rv(split_type, df, bert_type):
    """Build and save one BERT+rationale+gaze feature vector per row of ``df``.

    Version 2 of the feature layout: the ``'Clicked'`` rationale feature comes
    directly after the BERT embedding, followed by the numeric and then the
    categorical gaze features. For every row the function
      1. embeds ``row['assertion']`` with ``return_bert_embeddings``,
      2. parses each per-token feature column with ``literal_eval`` and
         right-pads it with zeros to the module-level ``max_token_length``,
      3. flattens the features feature-by-feature (all token slots of feature
         1, then all of feature 2, ...),
      4. concatenates embedding and flattened features and saves the result to
         ``attribute_based/pts_rv/<condition>/<bert_type>/<split_type>/tensor_<index>.pt``.

    Relies on the module-level names ``condition``, ``max_token_length`` and
    ``return_bert_embeddings``.

    Args:
        split_type: Name of the split; used only as the output sub-folder.
        df: DataFrame holding ``'assertion'`` and the feature columns listed
            below, each feature stored as the string form of a list.
        bert_type: ``'base'`` or ``'finetuned'``; selects tokenizer and model.

    Raises:
        ValueError: If ``bert_type`` is not one of the two supported values,
            or if a feature list is longer than ``max_token_length`` (padding
            could not keep the vector size fixed).
    """
    # Original note: a small set for trying out.
    num_gaze_features = [
        'IA_FIXATION_%', 'IA_RUN_COUNT', 'IA_DWELL_TIME_%',
        'IA_AVERAGE_FIX_PUPIL_SIZE', 'IA_MAX_FIX_PUPIL_SIZE',
        'IA_MIN_FIX_PUPIL_SIZE', 'Pupilsize_variation',
        'IA_FIRST_FIXATION_DURATION', 'IA_FIRST_RUN_FIXATION_%',
        'IA_REGRESSION_IN_COUNT', 'IA_REGRESSION_OUT_COUNT',
        'backward_reg_count', 'forward_reg_count', 'total_reg_count',
    ]
    cat_features = ['IA_REGRESSION_IN', 'IA_REGRESSION_OUT', "IA_SKIP"]
    rationale_feature = ['Clicked']

    # Order in which the features are laid out in the saved vector:
    # rationale first, then numeric gaze, then categorical gaze.
    feature_order = rationale_feature + num_gaze_features + cat_features

    if bert_type == 'base':
        tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-uncased")
        model = AutoModel.from_pretrained('dbmdz/bert-base-german-uncased', output_hidden_states=True, return_dict=True, output_attentions=True)
    elif bert_type == "finetuned":
        # Weights come from a local directory, the tokenizer from the hub.
        model_dir = '../sannes_playground/finetuned_models/rott-hatecheck'
        tokenizer_dir = 'chrisrtt/gbert-multi-class-german-hate'
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
        model = AutoModel.from_pretrained(model_dir, output_hidden_states=True, return_dict=True, output_attentions=True)
    else:
        # Previously this surfaced as an UnboundLocalError on the first row.
        raise ValueError(
            f"unknown bert_type {bert_type!r}; expected 'base' or 'finetuned'"
        )

    # The output folder is the same for every row, so create it once.
    folder_dir = "attribute_based/pts_rv/"+condition+"/"+bert_type+"/"+split_type
    os.makedirs(folder_dir, exist_ok=True)

    for index_t, row in df.iterrows():
        print('processing the index: ', index_t)

        # The original author lowercases the assertion for the fine-tuned
        # model (their note: chrisrtt/gbert-multi-class-german-hate is case
        # sensitive).
        text = row['assertion']
        if bert_type == "finetuned":
            text = text.lower()

        CLS_emb = return_bert_embeddings(text, tokenizer, model)

        gaze_token_emb = []
        for feat_name in feature_order:
            feat = literal_eval(row[feat_name])
            if len(feat) > max_token_length:
                # Padding cannot shrink a list; without this check the saved
                # vector would silently change size for this row.
                raise ValueError(
                    f"row {index_t}: feature {feat_name!r} has {len(feat)} values, "
                    f"more than max_token_length={max_token_length}"
                )
            gaze_token_emb.append(feat + [0] * (max_token_length - len(feat)))

        # reshape(-1) flattens feature-major: f1t1..f1tn, f2t1..f2tn, ...
        token_emb = torch.cat((CLS_emb, torch.tensor(gaze_token_emb).reshape(-1)))

        torch.save(token_emb, folder_dir+"/tensor_"+str(index_t)+".pt")



In [None]:
# Build the version-2 tensors for both BERT variants and all three splits
# (train, then test, then val for each variant).
variations = ['base', 'finetuned']
split_frames = (('train', train), ('test', test), ('val', val))

for variation in variations:
    for split_name, split_df in split_frames:
        processed_split_data_rv(split_name, split_df, variation)

processing the index:  0
processing the index:  1
processing the index:  2
processing the index:  3
processing the index:  4
processing the index:  5
processing the index:  6
processing the index:  7
processing the index:  8
processing the index:  9
processing the index:  10
processing the index:  11
processing the index:  12
processing the index:  13
processing the index:  14
processing the index:  15
processing the index:  16
processing the index:  17
processing the index:  18
processing the index:  19
processing the index:  20
processing the index:  21
processing the index:  22
processing the index:  23
processing the index:  24
processing the index:  25
processing the index:  26
processing the index:  27
processing the index:  28
processing the index:  29
processing the index:  30
processing the index:  31
processing the index:  32
processing the index:  33
processing the index:  34
processing the index:  35
processing the index:  36
processing the index:  37
processing the index: 