In [55]:
import pandas as pd
import numpy as np
import json
import os

In [56]:
# Folder locations, all relative to the notebook's working directory.
PRE_PROCESS_DATA_FOLDER = "pre_processed_data"
TEXT_DATA_FOLDER = os.path.join("experiment_data", "set_texts")
FIXATION_DATA_FOLDER = os.path.join("pre_processed_data", "fixation_data_per_part")

# Build one DataFrame row per JSON file found directly in PRE_PROCESS_DATA_FOLDER.
data_list = []

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# (and everything derived from it) stable between runs and machines.
for file in sorted(os.listdir(PRE_PROCESS_DATA_FOLDER)):
    # Match on the extension only, so names such as "x.json.bak" are skipped.
    if not file.endswith(".json"):
        continue
    # Explicit encoding: the records contain non-ASCII text and the platform default
    # is not UTF-8 everywhere.
    # NOTE(review): assumes the files were written as UTF-8 (the JSON default) — confirm.
    with open(os.path.join(PRE_PROCESS_DATA_FOLDER, file), "r", encoding="utf-8") as f:
        load_data = json.load(f)
    # Each top-level JSON key becomes a column of the resulting frame.
    data_list.append(pd.Series(list(load_data.values()), index=load_data.keys()))
data = pd.DataFrame(data_list)
data.head()


Unnamed: 0,worker_id,worker_age,worker_lang,worker_fluency,set_name,set_trials,participant_type,platform_type,vision,target_error,...,trial_9_total_fix_points_p_filtered,pre_question_9_name,pre_question_9_time,question_9_name,question_9_time,question_9_answer,question_9_correct_flag,question_9_target_to_fixation_ratio,set_language,fixation_error
0,linkPfXXpFV1cct,20,Turkish,5,mturk_TR_v03,"[meco_para_7, a_HarvardUniversity_1, a_Rhine_3...",,,,False,...,271,q_before_a_Teacher_0_qa_4,5869.0,q_after_a_Teacher_0_qa_4,18067.0,fiziksel acı,1.0,0.00738,TR,False
1,linkPpaoeaQKkM0,18,Russian,4,mturk_TR_v03,"[meco_para_7, a_HarvardUniversity_1, a_Rhine_3...",,,,False,...,131,q_before_a_Teacher_0_qa_4,5902.0,q_after_a_Teacher_0_qa_4,70096.0,fiziksel acıya,1.0,0.007634,TR,False
2,A4W9APAHFWVLO,41,English,5,mturk_EN_v13,"[meco_para_12, a_DoctorWho_4, a_VictoriaAustra...",,,,False,...,26,q_before_a_FresnoCalifornia_0_qa_0,1656.0,q_after_a_FresnoCalifornia_0_qa_0,3148.0,southwest Fresno,1.0,0.0,EN,False
3,linkPnEwgBwkH4O,20,Turkish,5,mturk_TR_v03,"[meco_para_7, a_HarvardUniversity_1, a_Rhine_3...",,,,False,...,27,q_before_a_Teacher_0_qa_4,10271.0,q_after_a_Teacher_0_qa_4,19051.0,fiziksel acıya,1.0,0.0,TR,False
4,AYY0UC6KCN9RW,27,English,5,mturk_EN_v02,"[meco_para_3, a_Computationalcomplexitytheory_...",mturk,mturk,,False,...,103,q_before_a_DoctorWho_3_qa_0,12534.0,q_after_a_DoctorWho_3_qa_0,10072.0,DUDLEY SIMPSON,1.0,0.165049,EN,False


In [57]:
# Inclusion criteria. Every criterion is a boolean array aligned row-by-row with `data`.
approved_only = data["approved_flag"].gt(0).to_numpy()
no_fixation_error = data["fixation_error"].eq(False).to_numpy()
no_target_error = data["target_error"].eq(False).to_numpy()
sample_higher_10 = data["webgazer_sample_rate"].gt(10).to_numpy()
acc_higher = data["avg_roi_last_val"].gt(0).to_numpy()
# Keep workers whose id does not contain "link".
filter_mturks = np.array(["link" not in worker_id for worker_id in data["worker_id"]])
# Keep only the sets whose language is listed here.
filter_sets = np.array([set_lang in ["EN"] for set_lang in data["set_language"]])

# The thresholds sit below the nominal 1280x720 on purpose (some tolerance).
screen_x_above_1280 = data["screen_x"].gt(1110).to_numpy()
screen_y_above_720 = data["screen_y"].gt(615).to_numpy()
screen_above_1280_720 = np.logical_and(screen_x_above_1280, screen_y_above_720)

# Label -> criterion, in the order in which the funnel report below applies them.
dict_filter = {
    "filter_mturks": filter_mturks,
    "filter_sets": filter_sets,
    "Approved": approved_only,
    "Fix_Error, Target_Error": np.logical_and(no_fixation_error, no_target_error),
    "Sample Rate": sample_higher_10,
    "acc_thresh": acc_higher,
    "screen_above_1280_720": screen_above_1280_720,
}

# Funnel report: for each criterion, count the rows it removes among those that
# survived all previous criteria, relative to the number of survivors so far.
n_total = len(data)
current_filter = np.ones(n_total, dtype=bool)
for condition, f in dict_filter.items():
    n_data_filtered = int(np.count_nonzero(~f & current_filter))
    per_cent = n_data_filtered / n_total * 100
    print(f"For condition ({condition}), {per_cent:.2f}% has been filtered. ({n_data_filtered} out of {n_total})")
    current_filter = current_filter & f
    n_total = int(np.count_nonzero(current_filter))

# Final selection: a row is kept only when it passes every criterion.
mask = np.logical_and.reduce([
    filter_mturks,
    approved_only,
    no_fixation_error,
    no_target_error,
    sample_higher_10,
    screen_above_1280_720,
    acc_higher,
    filter_sets,
])
data_filtered = data.loc[mask].copy()
len(data_filtered)

For condition (filter_mturks), 37.36% has been filtered. (226 out of 605)
For condition (filter_sets), 43.27% has been filtered. (164 out of 379)
For condition (Approved), 18.60% has been filtered. (40 out of 215)
For condition (Fix_Error, Target_Error), 2.86% has been filtered. (5 out of 175)
For condition (Sample Rate), 8.24% has been filtered. (14 out of 170)
For condition (acc_thresh), 1.28% has been filtered. (2 out of 156)
For condition (screen_above_1280_720), 0.00% has been filtered. (0 out of 154)


154

Target data structure:  
An array of JSON objects:  
[  
    {  
        "worker_id": id,  
        "set_name": name,  
        "text_id": id,  
        "text": text,  
        "question_id": id,  
        "question": question    
    }  
]  

In [58]:
# Only need IS
# NOTE(review): no explicit filter on trial_{i}_condition is applied here; trials 5-9 are
# presumably the IS-condition trials — confirm.
#
# Build one record per (participant, trial) for trials 5..9. Each record keeps the
# selected columns and gains two trial-independent aliases, "text_name" and
# "question_name", so later cells do not need to know the trial index.
target_features_list = []
for i in range(5, 10):
    trial_col = f"trial_{i}_name"
    question_col = f"question_{i}_name"
    target_features = data_filtered[["worker_id", "set_name", f"trial_{i}_condition", trial_col, question_col]]
    print(len(target_features))
    # One plain dict per row; the column names are known exactly, so they are
    # looked up directly instead of being found by substring matching.
    for target_dict in target_features.to_dict("records"):
        target_dict["text_name"] = target_dict[trial_col]
        target_dict["question_name"] = target_dict[question_col]
        target_features_list.append(target_dict)
    print(i)
print(len(target_features_list))


154
5
154
6
154
7
154
8
154
9
770


In [59]:
# Load the stimulus table of every file in TEXT_DATA_FOLDER whose name contains "EN_"
# and stack them into a single frame.
# os.listdir() returns entries in arbitrary order; sorting makes the concatenation order
# (and therefore which duplicate trial_name comes last) reproducible.
data_list = [
    pd.read_csv(os.path.join(TEXT_DATA_FOLDER, file), index_col=0)
    for file in sorted(os.listdir(TEXT_DATA_FOLDER))
    if "EN_" in file
]

print(len(data_list))
text_data = pd.concat(data_list, ignore_index=True)
print(len(text_data))
text_data.head()

20
400


Unnamed: 0,stimulus,trial_name,task_type,correct_answer,lang
0,"In competitive sports, doping is the use of ba...",meco_para_3,NR,,EN
1,Can doping have adverse health effects?,meco_para_3_qa_0,NR,True,EN
2,"When considering computational problems, a pro...",a_Computationalcomplexitytheory_1,NR,,EN
3,What is the name of the alphabet is most commo...,a_Computationalcomplexitytheory_1_qa_1,NR,binary,EN
4,Many known complexity classes are suspected to...,a_Computationalcomplexitytheory_4,NR,,EN


Target data structure:  
An array of JSON objects:  
[  
    {  
        "worker_id": id,  
        "set_name": name,  
        "text_id": id,  
        "text": text,  
        "question_id": id,  
        "question": question    
    }  
]  

In [60]:
# Inspect the first record to check which keys it carries before texts are attached.
print(target_features_list[0])

{'worker_id': 'A4W9APAHFWVLO', 'set_name': 'mturk_EN_v13', 'trial_5_condition': 'is', 'trial_5_name': 'a_Kenya_1', 'question_5_name': 'q_after_a_Kenya_1_qa_0', 'text_name': 'a_Kenya_1', 'question_name': 'q_after_a_Kenya_1_qa_0'}


In [65]:
import pprint as pp  # not used in this cell; kept because the verification cell calls pp.pprint

# Attach the stimulus text to every record: "text" for the passage, "question" for the question.
# Map each trial_name to its stimulus once instead of re-scanning text_data for every record.
# When a trial_name occurs more than once the last row wins, exactly as a full linear scan
# that keeps overwriting would behave. Non-string names (e.g. NaN) are never matched.
stimulus_by_name = {
    name: stimulus
    for name, stimulus in zip(text_data["trial_name"], text_data["stimulus"])
    if isinstance(name, str)
}

# NOTE(review): in the sample outputs, question names look like 'q_after_a_Kenya_1_qa_0'
# while text_data lists questions as e.g. 'meco_para_3_qa_0' (no 'q_after_' prefix).
# The exact name is tried first; the prefix-stripped name is only a fallback — confirm naming.
QUESTION_NAME_PREFIX = "q_after_"

for item in target_features_list:
    text_name = item["text_name"]
    if isinstance(text_name, str) and text_name in stimulus_by_name:
        item["text"] = stimulus_by_name[text_name]

    question_name = item["question_name"]
    if isinstance(question_name, str):
        if question_name not in stimulus_by_name and question_name.startswith(QUESTION_NAME_PREFIX):
            question_name = question_name[len(QUESTION_NAME_PREFIX):]
        if question_name in stimulus_by_name:
            item["question"] = stimulus_by_name[question_name]

In [67]:
# verify
for item in target_features_list:
    if len(item) != 9:
        pp.pprint(item)

In [70]:
# Save into files
with open("target_experiments_IS_EN.json", "w") as outfile:
    json.dump(target_features_list, outfile, indent=4)