In [19]:
from collections import defaultdict
import numpy as np
import pandas as pd
import scipy.io as io
import gzip
import math
import os
import re
import scipy

In [None]:
# TODO: still need to understand why those lines are necessary
# NOTE(review): `zurich` and `unigrdict` are not defined in any cell visible here;
# presumably a pandas DataFrame and a word -> frequency dict -- confirm before running.

# Look up the lower-cased string form of every WORDstrip entry in `unigrdict`.
zurich['BNCfreq'] = zurich.WORDstrip.map(lambda x: unigrdict.get(str(x).lower()))
# Entries without a frequency are filled with the smallest frequency in the column.
zurich.BNCfreq = zurich.BNCfreq.fillna(zurich.BNCfreq.min())
zurich.BNCfreq = zurich.BNCfreq/100 # the corpus has 100 million words, so dividing by 100 gives frequency per million
# Natural logarithm of the per-million frequency.
zurich.BNCfreq = np.log(zurich.BNCfreq)

# Negated log frequency, stored as a separate column.
zurich['BNCfreqinv']= -zurich.BNCfreq

In [3]:
def get_bncfreq(subdir = '\\BNC\\', file = 'all.al.gz', n_fields = 4):
    """
        Build a word -> frequency dictionary from the British National Corpus list.

        Args: subdir:   directory of the list, relative to the current working
                        directory; both '\\' and '/' are accepted as separators
              file:     name of the gzip-compressed frequency list;
                        four fields per line (0: freq, 1: word, 2: pos,
                        3: n_files the word occurs in)
              n_fields: lines with a different number of whitespace-separated
                        fields are ignored
        Return: FreqDict for BNC (defaultdict(float)); the frequencies of all
                lines sharing the same word are summed. The first line of the
                file is skipped.
    """
    bnc_freq = defaultdict(float)
    # Split on either separator so the Windows-style default also works on POSIX systems.
    parts = [part for part in subdir.replace('\\', '/').split('/') if part]
    path = os.path.join(os.getcwd(), *parts, file)
    # unzip automatically and make sure you don't read in binary mode (-> 'rt' instead of 'rb')
    with gzip.open(path, 'rt') as handle:
        # skip the first line, then stream the rest instead of loading the whole file
        next(handle, None)
        for line in handle:
            entry = line.split()
            if len(entry) == n_fields:
                bnc_freq[entry[1]] += float(entry[0])
    return bnc_freq

In [4]:
def get_matfiles(task:str, subdir = '\\results_zuco\\'):
    """
        Args: Task number ("task1", "task2", "task3") plus subdirectory
              (relative to the current working directory; both '\\' and '/'
              are accepted as separators).
        Return: 12 matlab files (one per subject) for given task, as full
                paths sorted by file name so that a subject index always
                refers to the same file.
        Raises: AssertionError if the task directory does not contain exactly
                12 .mat files.
    """
    # Split on either separator so the Windows-style default also works on POSIX systems.
    parts = [part for part in subdir.replace('\\', '/').split('/') if part]
    path = os.path.join(os.getcwd(), *parts, task)
    # os.listdir returns entries in arbitrary order and may include non-data
    # entries, so select the .mat files explicitly instead of dropping the first entry.
    files = sorted(os.path.join(path, name) for name in os.listdir(path)
                   if name.lower().endswith('.mat'))
    # raised explicitly (rather than via `assert`) so the check survives `python -O`
    if len(files) != 12:
        raise AssertionError('each task must contain 12 .mat files')
    return files

In [69]:
def inf_check(features):
    """
        Args: array of features; the first axis enumerates the samples (rows).
        Return: new array without the rows that contain +inf or -inf.
                NaN values are not treated as infinite and are kept.
    """
    features = np.asarray(features)
    # np.isinf is True for both +inf and -inf, so no separate negative check is needed
    has_inf = np.isinf(features)
    if features.ndim > 1:
        # collapse every axis except the first: one flag per row
        has_inf = has_inf.any(axis=tuple(range(1, features.ndim)))
    # a single boolean selection replaces the repeated np.delete copies
    return features[~has_inf]

In [81]:
def _sentence_level_frame(data, bnc_freq):
    """
        Average the word features of every sentence and scale each column by its maximum.
    """
    fields = ['SentLen', 'nFixations', 'meanPupilSize', 'GD', 'TRT', 'FFD', 'SFD',
              'GPT', 'BNCFreq']
    # all columns between SentLen and BNCFreq are read directly from the word structs
    word_fields = fields[1:-1]
    strip_punct = re.compile(r'[^\w\s]')
    features = np.zeros((len(data), len(fields)))

    for i, sent in enumerate(data):
        for word in sent.word:
            values = [getattr(word, field, 0) for field in word_fields]
            # missing fields and array-valued fields (e.g. empty arrays) count as 0
            features[i, 1:-1] += [0 if isinstance(value, np.ndarray) else value
                                  for value in values]
            token = strip_punct.sub('', word.content)
            # frequencies are divided by 100 before the log; the log is only taken
            # for non-zero frequencies, so unknown words contribute 0 instead of -inf
            freq = bnc_freq.get(token, 0) / 100
            if freq != 0:
                features[i, -1] += np.log(freq)
        features[i, 0] = len(sent.word)
        features[i, 1:] /= len(sent.word)

    features = inf_check(features)
    # normalize data featurewise: divide every column by its largest value
    col_max = np.array([max(col) for col in features.T])
    features = features / col_max
    # NOTE(review): wrapping `fields` in a list makes pandas build a one-level
    # MultiIndex; kept for backward compatibility -- consider `columns=fields`.
    return pd.DataFrame(data=features, index=range(len(features)), columns=[fields])


def _word_level_frame(data, task):
    """
        Collect one row per word holding every non-raw field found on any word.
    """
    fields = sorted({field for sent in data for word in sent.word
                     for field in word._fieldnames if not field.startswith('raw')},
                    reverse=True)
    fields = ['sent_id', 'word_id'] + fields
    suffix = '_NR' if task == 'task1' or task == 'task2' else '_TSR'

    # gather all rows first and build the frame once instead of writing cell by cell
    records = []
    for i, sent in enumerate(data):
        for j, word in enumerate(sent.word):
            records.append([str(i) + suffix, j]
                           + [getattr(word, field, np.nan) for field in fields[2:]])
    # dtype=object keeps the column dtypes of a frame that is filled in afterwards.
    # NOTE(review): wrapping `fields` in a list makes pandas build a one-level
    # MultiIndex; kept for backward compatibility -- consider `columns=fields`.
    return pd.DataFrame(records, index=range(len(records)), columns=[fields], dtype=object)


def mk_dataframe(task:str, subject:int, level:str):
    """
        Args: Task number ("task1", "task2", "task3"), test subject (0-11),
              level ("sentence" or "word").
        Return: DataFrame on the requested level.
                "sentence": one row per sentence with SentLen (number of words),
                the per-word means of nFixations, meanPupilSize, GD, TRT, FFD,
                SFD and GPT, and BNCFreq (sum of log(freq / 100) over the words
                with a non-zero BNC frequency, divided by the number of words).
                Rows containing infinite values are dropped and every column is
                divided by its maximum.
                "word": one row per word with sent_id, word_id and every
                non-raw field found on any word (NaN where a word lacks it).
        Raises: ValueError (a subclass of Exception) for any other level.
    """
    # validate before touching the disk so a typo fails immediately
    if level not in ('sentence', 'word'):
        raise ValueError('Data can only be processed on sentence or word level')

    files = get_matfiles(task)
    data = io.loadmat(files[subject], squeeze_me=True, struct_as_record=False)['sentenceData']

    if level == 'sentence':
        # the BNC list is only needed (and therefore only loaded) on sentence level
        return _sentence_level_frame(data, get_bncfreq())
    return _word_level_frame(data, task)

In [14]:
# file paths of the per-subject .mat files, one list per task
files_task1, files_task2, files_task3 = map(get_matfiles, ('task1', 'task2', 'task3'))

In [82]:
# Sentence-level feature table for task 1, subject index 0; the returned frame is the cell output.
mk_dataframe('task1', 0, level='sentence')

Unnamed: 0,SentLen,nFixations,meanPupilSize,GD,TRT,FFD,SFD,GPT,BNCFreq
0,0.511628,0.315152,0.708518,0.368106,0.261899,0.299736,0.376217,0.239384,0.860823
1,0.511628,0.315152,0.728860,0.489171,0.285866,0.417779,0.483852,0.271056,0.577363
2,0.372093,0.233333,0.636190,0.363573,0.203150,0.345914,0.545599,0.214355,0.751566
3,0.116279,0.586667,0.905290,0.516898,0.498041,0.516898,0.152916,0.455226,0.400268
4,0.302326,0.430769,0.723694,0.423823,0.354115,0.350948,0.293499,0.338561,0.562132
5,0.372093,0.250000,0.628592,0.434470,0.221413,0.326697,0.411519,0.202379,0.732506
6,0.441860,0.294737,0.808053,0.508893,0.272046,0.466467,0.628230,0.256197,0.574636
7,0.255814,0.218182,0.514103,0.345631,0.198662,0.293881,0.299624,0.181584,0.727797
8,0.651163,0.247619,0.710762,0.380342,0.200560,0.333004,0.507158,0.208272,0.773502
9,0.488372,0.304762,0.743206,0.420459,0.271850,0.393945,0.523153,0.248479,0.667541


In [15]:
# Load the `sentenceData` entry of the first task-1 file.
# `data` is indexed by sentence number.
data = io.loadmat(files_task1[0], squeeze_me=True, struct_as_record=False)['sentenceData']

In [8]:
# example: print every field of the first sentence, then stop
for sent in data:
    for field in sent._fieldnames:
        print("Field:", field, end="\n\n")
        print("Values:", getattr(sent, field), end="\n\n")
    break

Field: content

Values: Presents a good case while failing to provide a reason for us to care beyond the very basic dictums of human decency.

Field: rawData

Values: [[ 0.8102897   0.4924412   0.44047612 ...  0.30642092  0.20077913
   0.32715505]
 [ 0.1091525  -0.05207972 -0.28782055 ...  0.26032427 -0.04572273
   0.02665068]
 [ 0.70459163  0.5766893   1.1255366  ... -0.6907126  -1.1149031
  -0.67108846]
 ...
 [ 1.1299472   0.49951333  0.40294212 ...  0.8088422  -0.11194292
   0.3848682 ]
 [ 2.194058    1.2687929   1.1725545  ...  0.4536394  -1.0771086
   0.00784904]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]

Field: mean_t1

Values: [0.13733967 0.18169284 0.38413203 0.4809076  0.5031249  0.5072147
 0.4300718  0.4204321  0.5885104  0.54508054 1.0100969  0.34071308
 0.55860126 0.68211114 1.2959126  0.58885574 0.51419014 0.99041927
 0.67183656 0.6030719  0.5139628  0.52564293 0.7012217  0.6463046
 0.77796596 0.49136987 0.6431333  0.65274966 0.70108867 0.89

Values: [[0.1478582  0.16677441 0.36800182 0.43231592 0.53880072 0.54359919
  0.566199   0.38583276 0.59823704 0.4785122  1.12738538 0.25234357
  0.51257044 0.62898403 1.36107552 0.41869208 0.45671806 0.88795763
  0.4979279  0.50933373 0.43786007 0.36401913 0.60749781 0.52772081
  0.7581414  0.34873304 0.48791128 0.5035708  0.51738876 0.76089436
  1.705755   0.56932873 0.52578378 0.56639087 0.59586525 1.33616889
  0.63524038 0.61017174 0.61155295 0.56317753 0.71429044 0.65167689
  0.68642288 0.75263709 1.35147119 0.60893798 0.71398187 1.02077222
  0.7627781  0.98018056 0.99564457 1.37919998 1.804389   1.10674393
  0.93783909 0.97415465 0.97689229 1.12098169 1.06291521 1.28436732
  0.9806577  1.36407793 1.12506974 1.10891998 0.90906054 0.63991582
  0.26788858 1.36094069 1.30827022 1.05151939 0.91944027 0.66454059
  0.47503445 1.50056469 1.35143638 1.24151659 0.95988059 0.63684446
  1.10994363 1.15412688 0.89076966 0.70089066 0.74112272 0.69849712
  0.58850634 0.5845837  0.40849474 0.193

Values: [[-6.35901093e-03  4.84866403e-01  6.27028644e-01  2.93846592e-01
   2.35013440e-01  6.64822757e-01  2.68276215e-01  4.74548608e-01
   3.11987713e-01  2.05589369e-01  4.98355582e-01  3.79345044e-01
   8.54638621e-01  4.25959319e-01  4.52082336e-01  3.24015468e-01
   3.29685092e-01  2.78760850e-01  4.27598804e-01  3.58055204e-01
   3.50709498e-01  5.72173774e-01  2.57206857e-01  4.10003722e-01
   4.99778032e-01  5.48655987e-01  2.46542692e-01  5.34686625e-01
   4.47992086e-01  4.05215979e-01  4.00885224e-01  6.92645252e-01
   5.69610044e-01  1.07442364e+00  5.24844021e-01  3.61699760e-01
   4.99675930e-01  1.73839188e+00 -1.52610064e-01  6.79015398e-01
   2.48256087e-01 -1.67714834e-01  3.12789559e-01  7.22289920e-01
   6.48482606e-01  3.69871214e-01  1.09629124e-01  1.77994817e-01]
 [ 5.32337725e-02  4.30215463e-01  4.95239377e-01  6.68044284e-01
   4.47439045e-01  6.37616903e-01  3.89149874e-01  9.47117120e-01
   5.78081340e-01  3.82069051e-01  8.96298006e-01  6.13483191e-01
 

In [19]:
# NOTE(review): `fixations` is not assigned in any cell visible here -- confirm where it is defined.
fixations.pupilsize

array([ 866,  833,  857,  895,  951,  987, 1022, 1027, 1029, 1013,  991,
        960,  932,  921,  906,  885,  859,  820,  834,  838,  856,  885,
        902,  914,  902,  888,  875,  859], dtype=uint16)

In [111]:
# SFD field of the first word of the first sentence (an empty array in the recorded output).
data[0].word[0].SFD

array([], dtype=float64)

In [44]:
# Values of all `*_pupilsize` fields of the word at index 4 of the first sentence.
# Field names and values are taken from the same word, so a field that exists
# on one word but not on another cannot raise an AttributeError.
[getattr(data[0].word[4], field) for field in data[0].word[4]._fieldnames if field.endswith('_pupilsize')]

[991, 961, 961, 961, array([], dtype=float64)]

In [66]:
# Field names available on the first word of the first sentence.
data[0].word[0]._fieldnames

['content',
 'fixPositions',
 'nFixations',
 'meanPupilSize',
 'rawEEG',
 'rawET',
 'FFD',
 'FFD_pupilsize',
 'FFD_t1',
 'FFD_t2',
 'FFD_a1',
 'FFD_a2',
 'FFD_b1',
 'FFD_b2',
 'FFD_g1',
 'FFD_g2',
 'FFD_t1_diff',
 'FFD_t2_diff',
 'FFD_a1_diff',
 'FFD_a2_diff',
 'FFD_b1_diff',
 'FFD_b2_diff',
 'FFD_g1_diff',
 'FFD_g2_diff',
 'TRT',
 'TRT_pupilsize',
 'TRT_t1',
 'TRT_t2',
 'TRT_a1',
 'TRT_a2',
 'TRT_b1',
 'TRT_b2',
 'TRT_g1',
 'TRT_g2',
 'TRT_t1_diff',
 'TRT_t2_diff',
 'TRT_a1_diff',
 'TRT_a2_diff',
 'TRT_b1_diff',
 'TRT_b2_diff',
 'TRT_g1_diff',
 'TRT_g2_diff',
 'GD',
 'GD_pupilsize',
 'GD_t1',
 'GD_t2',
 'GD_a1',
 'GD_a2',
 'GD_b1',
 'GD_b2',
 'GD_g1',
 'GD_g2',
 'GD_t1_diff',
 'GD_t2_diff',
 'GD_a1_diff',
 'GD_a2_diff',
 'GD_b1_diff',
 'GD_b2_diff',
 'GD_g1_diff',
 'GD_g2_diff',
 'GPT',
 'GPT_pupilsize',
 'GPT_t1',
 'GPT_t2',
 'GPT_a1',
 'GPT_a2',
 'GPT_b1',
 'GPT_b2',
 'GPT_g1',
 'GPT_g2',
 'GPT_t1_diff',
 'GPT_t2_diff',
 'GPT_a1_diff',
 'GPT_a2_diff',
 'GPT_b1_diff',
 'GPT_b2_diff',

In [20]:
# Field names available on the first sentence struct.
data[0]._fieldnames

['content',
 'rawData',
 'mean_t1',
 'mean_t2',
 'mean_a1',
 'mean_a2',
 'mean_b1',
 'mean_b2',
 'mean_g1',
 'mean_g2',
 'mean_t1_sec',
 'mean_t2_sec',
 'mean_a1_sec',
 'mean_a2_sec',
 'mean_b1_sec',
 'mean_b2_sec',
 'mean_g1_sec',
 'mean_g2_sec',
 'mean_t1_diff',
 'mean_t2_diff',
 'mean_a1_diff',
 'mean_a2_diff',
 'mean_b1_diff',
 'mean_b2_diff',
 'mean_g1_diff',
 'mean_g2_diff',
 'mean_t1_diff_sec',
 'mean_t2_diff_sec',
 'mean_a1_diff_sec',
 'mean_a2_diff_sec',
 'mean_b1_diff_sec',
 'mean_b2_diff_sec',
 'mean_g1_diff_sec',
 'mean_g2_diff_sec',
 'word',
 'omissionRate',
 'allFixations',
 'wordbounds',
 'answer_mean_t1',
 'answer_mean_t2',
 'answer_mean_a1',
 'answer_mean_a2',
 'answer_mean_b1',
 'answer_mean_b2',
 'answer_mean_g1',
 'answer_mean_g2',
 'answer_mean_t1_diff',
 'answer_mean_t2_diff',
 'answer_mean_a1_diff',
 'answer_mean_a2_diff',
 'answer_mean_b1_diff',
 'answer_mean_b2_diff',
 'answer_mean_g1_diff',
 'answer_mean_g2_diff']

In [None]:
# get word level data: the `word` entries of the first sentence
word_data = data[0].word

In [None]:
# largest number of fields found on any single word
n_fieldnames = max({len(word._fieldnames) for sent in data for word in sent.word})

# the two identifier columns, followed by every non-raw field name seen on
# any word in reverse alphabetical order
fieldnames = ['sent_id', 'word_id'] + sorted(
    {field for sent in data for word in sent.word
     for field in word._fieldnames if not field.startswith('raw')},
    reverse=True,
)

# show the first word of the first sentence only
for i, sent in enumerate(data):
    for j, word in enumerate(sent.word):
        print("Index:", j, end="\n\n")
        print("Word:", word.content, end="\n\n")
        # one entry per name in `fieldnames`, including the two identifier columns
        print("Number of attributes: ", len(fieldnames), end="\n\n")
        # fields the word does not have are shown as NaN
        print([getattr(word, field, np.nan) for field in fieldnames[2:]])
        break
    break

In [None]:
# `content` field of the first entry in `word_data`.
word_data[0].content

In [None]:
# Number of fields on the first entry in `word_data`.
len(word_data[0]._fieldnames)

In [None]:
# Whitespace-separated tokens of the first sentence's `content`.
data[0].content.split()

In [None]:
# Load the `sentenceData` entry of the first task-1 file; `data` is indexed by sentence number.
# NOTE(review): an identical load already appears in an earlier cell of this notebook.
data = io.loadmat(files_task1[0], squeeze_me=True, struct_as_record=False)['sentenceData']

In [None]:
# example: read the `omissionRate` field of the first sentence and print it
omission_rate = data[0].omissionRate
print(omission_rate)

In [None]:
# get word level data: the `word` entries of the first sentence
# NOTE(review): the same assignment already appears in an earlier cell of this notebook.
word_data = data[0].word

In [None]:
# print the field names of the first word
# (`word_data` is indexed by word number)
print(word_data[0]._fieldnames)

In [None]:
# field names are taken from the first word only (not the union over all words)
word_fieldnames = word_data[0]._fieldnames
# print every field of the first word, then stop
for word in word_data:
    for field in word_fieldnames:
        print('Attribute:', field, end='\n\n')
        print('Values:', getattr(word, field), end='\n\n')
    break

In [None]:
# example: get first word
# (prints the word's `content`; the cell used to print `nFixations`, which
# contradicted this comment and duplicated the number-of-fixations example)
print(word_data[0].content)

In [None]:
# example: print the `nFixations` field (number of fixations) of the first word
print(word_data[0].nFixations)