## Data Exploration

Comparison of eye-tracking (ET) and EEG features between Normal Reading (NR) and Task-Specific Reading (TSR) / Annotation Reading (AR) in the [ZuCo](https://osf.io/q3zws/) corpus.

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.io as io
import gzip
import math
import matplotlib
import os
import re
import scipy

from scipy.stats import pearsonr, ttest_rel
from plot_funcs import *
from utils import *

In [None]:
# NOTE: do not execute this cell -- it is kept for reference only
# TODO: still need to understand why the following lines in Maria's code are necessary
# NOTE(review): `zurich` and `unigrdict` are not defined in any cell visible here -- confirm where they come from

#zurich['BNCfreq'] = zurich.WORDstrip.map(lambda x: unigrdict.get(str(x).lower()))
#zurich.BNCfreq = zurich.BNCfreq.fillna(zurich.BNCfreq.min())
#zurich.BNCfreq = zurich.BNCfreq/100 #because 100 million word - to get freq per million
#zurich.BNCfreq = np.log(zurich.BNCfreq)

#zurich['BNCfreqinv']= -zurich.BNCfreq

In [2]:
# gather the .mat files that get_matfiles returns for task 3
files = get_matfiles('task3')
# read the file at position 7 of that list and keep only its 'sentenceData' entry
data = io.loadmat(
    file_name=files[7],
    struct_as_record=False,
    squeeze_me=True,
)['sentenceData']

In [6]:
# one sentence-level data transformer per task (1, 2 and 3); all three share the
# same settings: raw values (no scaling for now) and fillna='zeros'
datatransform_t1, datatransform_t2, datatransform_t3 = (
    DataTransformer(task_name, level='sentence', scaling='raw', fillna='zeros')
    for task_name in ('task1', 'task2', 'task3')
)

In [7]:
# run every task's transformer (configured in the cell above) on subject indices 0..11
# NOTE: each call yields the data of one subject as a pd.DataFrame
sbjs_t1 = list(map(datatransform_t1, range(12)))
sbjs_t2 = list(map(datatransform_t2, range(12)))
sbjs_t3 = list(map(datatransform_t3, range(12)))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [9]:
# print, subject by subject, how many sentences each task's dataset contains so that
# completeness of the data and correctness of the transformation can be eyeballed
for subj_no, task_frames in enumerate(zip(sbjs_t1, sbjs_t2, sbjs_t3), start=1):
    n_t1, n_t2, n_t3 = map(len, task_frames)
    print("Subject:", subj_no, end="\n\n")
    print("Number of sents for task 1: ", n_t1)
    print("Number of sents for task 2: ", n_t2)
    print("Number of sents for task 3: ", n_t3, end="\n\n")

Subject: 1

Number of sents for task 1:  400
Number of sents for task 2:  300
Number of sents for task 3:  407

Subject: 2

Number of sents for task 1:  400
Number of sents for task 2:  300
Number of sents for task 3:  407

Subject: 3

Number of sents for task 1:  299
Number of sents for task 2:  300
Number of sents for task 3:  407

Subject: 4

Number of sents for task 1:  400
Number of sents for task 2:  300
Number of sents for task 3:  360

Subject: 5

Number of sents for task 1:  400
Number of sents for task 2:  300
Number of sents for task 3:  407

Subject: 6

Number of sents for task 1:  400
Number of sents for task 2:  300
Number of sents for task 3:  407

Subject: 7

Number of sents for task 1:  400
Number of sents for task 2:  250
Number of sents for task 3:  407

Subject: 8

Number of sents for task 1:  400
Number of sents for task 2:  300
Number of sents for task 3:  359

Subject: 9

Number of sents for task 1:  400
Number of sents for task 2:  300
Number of sents for task 3

### Eye-Tracking feature comparison between first and second half of Task 1 (NR)

The first and second halves of Task 1 (NR, sentiment) are compared to check whether there is an order effect that needs to be reported.

In [10]:
# split the task 1 data of all subjects into a first and a second half
# NOTE(review): the two results are iterated in parallel per subject below --
# presumably split_data keeps the subject order of sbjs_t1; confirm in utils
first_half, second_half = split_data(sbjs_t1)

In [None]:
# check whether data splitting was performed correctly by printing, per subject,
# the number of sentences that ended up in each half
# NOTE: subjects are numbered from 1 (i+1), consistent with the per-task
# sentence-count overview printed for all three tasks
for i, (first, second) in enumerate(zip(first_half, second_half)):
    print("Subject:", i+1)
    print()
    print("Number of sents in first half:", len(first))
    print()
    print("Number of sents in second half:", len(second))
    print()

In [11]:
# compute_means yields nine values per call; suffix 1 = first half of task 1
(sentlen_m1, omissions_m1, fixations_m1,
 pupilsize_m1, gd_m1, trt_m1,
 ffd_m1, gpt_m1, bncfreq_m1) = compute_means(first_half)

# same nine values for the second half of task 1 (suffix 2)
(sentlen_m2, omissions_m2, fixations_m2,
 pupilsize_m2, gd_m2, trt_m2,
 ffd_m2, gpt_m2, bncfreq_m2) = compute_means(second_half)

In [None]:
# compute_allvals yields nine values per call; suffix 1 = first half of task 1
(sentlens1, omissions1, fixations1,
 pupilsize1, gd1, trt1,
 ffd1, gpt1, bnc_freqs1) = compute_allvals(first_half)

# same nine values for the second half of task 1 (suffix 2)
(sentlens2, omissions2, fixations2,
 pupilsize2, gd2, trt2,
 ffd2, gpt2, bnc_freqs2) = compute_allvals(second_half)

In [None]:
# pass everything compute_allvals returns for the first half of task 1 to corr_mat
# NOTE(review): presumably this renders a correlation matrix of the features -- confirm in plot_funcs / utils
corr_mat(compute_allvals(first_half))

In [None]:
# paired t-test on the fixation means: first vs. second half of task 1
print("p-value:", ttest_rel(fixations_m1, fixations_m2).pvalue)

In [None]:
# paired t-test on the sentence-length means: first vs. second half of task 1
print("p-value:", ttest_rel(sentlen_m1, sentlen_m2).pvalue)

#### Mean fixations per word per subject

In [None]:
# pass the fixation means of the first and the second half of task 1 to plot_fix
plot_fix(fixations_m1, fixations_m2)

#### Omission rate on sentence level

In [None]:
# pass the omission means of the first and the second half of task 1 to plot_omissions
plot_omissions(omissions_m1, omissions_m2)

#### Mean GD per word per subject

In [None]:
# pass the GD means of the first and the second half of task 1 to plot_gd
plot_gd(gd_m1, gd_m2)

#### Mean TRT per word per subject

In [None]:
# pass the TRT means of the first and the second half of task 1 to plot_trt
plot_trt(trt_m1, trt_m2)

#### Mean FFD per word per subject

In [None]:
# pass the FFD means of the first and the second half of task 1 to plot_ffd
plot_ffd(ffd_m1, ffd_m2)

### Eye-Tracking feature comparison between Task 2 (NR) and Task 3 (TSR)

In [None]:
# compute_means yields nine values per call; suffix _t2 = task 2 (NR)
(sentlen_m_t2, omissions_m_t2, fixations_m_t2,
 pupilsize_m_t2, gd_m_t2, trt_m_t2,
 ffd_m_t2, gpt_m_t2, bncfreq_m_t2) = compute_means(sbjs_t2)

# same nine values for task 3 (TSR), suffix _t3
(sentlen_m_t3, omissions_m_t3, fixations_m_t3,
 pupilsize_m_t3, gd_m_t3, trt_m_t3,
 ffd_m_t3, gpt_m_t3, bncfreq_m_t3) = compute_means(sbjs_t3)

In [None]:
# compute_allvals yields nine values per call; suffix _t2 = task 2 (NR)
(sentlens_t2, omissions_t2, fixations_t2,
 pupilsize_t2, gd_t2, trt_t2,
 ffd_t2, gpt_t2, bnc_freqs_t2) = compute_allvals(sbjs_t2)

# same nine values for task 3 (TSR), suffix _t3
(sentlens_t3, omissions_t3, fixations_t3,
 pupilsize_t3, gd_t3, trt_t3,
 ffd_t3, gpt_t3, bnc_freqs_t3) = compute_allvals(sbjs_t3)

In [None]:
# paired t-test on the sentence-length means: task 2 (NR) vs. task 3 (TSR)
# (the second sample must be the task 3 means; testing task 2 against itself
# compares identical samples and says nothing about a task difference)
print("p-value:", ttest_rel(sentlen_m_t2, sentlen_m_t3)[1])

#### Mean fixations per word per subject

In [None]:
# pass the fixation means of task 2 (NR) and task 3 (TSR) to plot_fix
plot_fix(fixations_m_t2, fixations_m_t3)
# paired t-test on the same two samples that are plotted above
# (fixations_m_t2 / fixations_m_t3 are the names assigned from compute_means)
print("p-value:", ttest_rel(fixations_m_t2, fixations_m_t3)[1])

#### Omission rate on sentence level

In [None]:
# pass the omission means of task 2 (NR) and task 3 (TSR) to plot_omissions
plot_omissions(omissions_m_t2, omissions_m_t3)
# paired t-test on the same two samples
print("p-value:", ttest_rel(omissions_m_t2, omissions_m_t3).pvalue)

#### Mean GD per word per subject

In [None]:
# pass the GD means of task 2 (NR) and task 3 (TSR) to plot_gd
plot_gd(gd_m_t2, gd_m_t3)

#### Mean TRT per word per subject

In [None]:
# pass the TRT means of task 2 (NR) and task 3 (TSR) to plot_trt
# (second argument is the task 3 means, in line with the other task 2 vs. task 3 plots)
plot_trt(trt_m_t2, trt_m_t3)

#### Mean FFD per word per subject

In [None]:
# pass the FFD means of task 2 (NR) and task 3 (TSR) to plot_ffd
# (second argument is the task 3 means, in line with the other task 2 vs. task 3 plots)
plot_ffd(ffd_m_t2, ffd_m_t3)