In [1]:
import numpy as np
import pandas as pd
import statsrat as sr
from statsrat import rw
from statsrat.expr.predef.cat import fast
import read_fast_surveys as rfs
from pseudo_guid_list import pguid
import seaborn as sns

In [2]:
# ---- Run switches ----------------------------------------------------------
# True  -> only exercise the import code on the small '*_debug' data folders.
# False -> import the full data folders and build the combined summary.
test_import = False

# True -> additionally fit CompAct_fast to each imported dataset with sr.fit_em.
fit_model = False

# Time limit passed to sr.fit_em as max_time (the old R code used Inf; the
# current code's default is 10).
max_time = 60

# ---- Learning model --------------------------------------------------------
# Model assembled from components of statsrat's rw module.
CompAct_fast = rw.model(
    name='CompAct_fast',
    fbase=rw.fbase.elem,
    fweight=rw.fweight.from_aux_norm,
    lrate=rw.lrate.from_aux_norm,
    aux=rw.aux.gradcomp_fast,
)

In [3]:
# Debug run: import the small in-lab sample stored in 'fast_data_inlab_debug'.
if test_import:
    path = 'fast_data_inlab_debug'

    # Maps raw response codes to category labels.
    resp_map = {'h': 'cati', 'g': 'catii', 'c': 'cat1', 'm': 'cat2', 's': 'cat3', 'r': 'cat4'}

    # Demographic questions, keyed by the name each answer gets in the summary.
    questions = {
        'age': 'What is your age?',
        'sex': 'What was your biological sex assigned at birth?',
        'latinx': 'Do you consider yourself to be of Hispanic, Latino, or Spanish origin?',
        'race': 'With which racial group do you most closely identify?',
        'employment': 'What is your current employment status?',
    }
    # Presumably (column holding the question, question text, column holding
    # the answer) -- TODO confirm against fast.read_csv.
    other_info = {key: ('questionText', text, 'slider.response')
                  for key, text in questions.items()}

    # Trial by trial task data plus one summary row per participant.
    ds_fall2020_inlab, summary_fall2020_inlab = fast.read_csv(
        path=path,
        x_col=['cue1', 'cue2'],
        resp_col=['trial_resp.keys', 'test_resp.keys'],
        resp_map=resp_map,
        ident_col='participant',
        conf_col='conf_rating.response',
        other_info=other_info,
        n_final=8,
    )

    # Survey data read from the same folder.
    foo = rfs.read_surveys(path=path,
                           fun=rfs.process_psychopy,
                           ident_col='participant')

    # Boolean mask over the summary rows: True where the participant also has
    # readable survey data.  The same mask is applied to the 'ident' dimension
    # of the trial level dataset.
    index = summary_fall2020_inlab.index.isin(foo.index.values)
    ds_fall2020_inlab = ds_fall2020_inlab[{'ident': index}]

    # Put task summary, survey scores and (optionally) model fit results side by side.
    # NOTE(review): pd.concat uses its default join here, so an ident present only
    # in the survey table would show up as an extra row -- presumably none exist; verify.
    if fit_model:
        bar = sr.fit_em(model=CompAct_fast,
                        ds=ds_fall2020_inlab,
                        max_time=max_time)
        frames = [summary_fall2020_inlab.loc[index], foo, bar]
    else:
        frames = [summary_fall2020_inlab.loc[index], foo]
    summary_fall2020_inlab = pd.concat(frames, axis=1)
    print(summary_fall2020_inlab)

b'Skipping line 229: expected 737 fields, saw 741\n'
b'Skipping line 229: expected 737 fields, saw 741\n'
b'Skipping line 229: expected 737 fields, saw 741\n'
b'Skipping line 229: expected 737 fields, saw 741\n'
b'Skipping line 229: expected 733 fields, saw 737\n'
b'Skipping line 229: expected 737 fields, saw 741\n'
b'Skipping line 229: expected 737 fields, saw 741\n'
b'Skipping line 229: expected 737 fields, saw 741\n'


             schedule    age     sex latinx    race           employment  \
ident                                                                      
NDARZC072PAH   design  18-24    Male     No   White    Full-time Student   
NDARMH389NCL   design  18-24  Female     No   White    Full-time Student   
NDARDJ786YFD   design  18-24    Male     No   White   Employed part-time   
NDARVN179CFY   design  18-24  Female     No   White   Employed part-time   
NDARPG437CRZ   design  18-24  Female     No   White    Full-time Student   

              tutorial_0a_last8_pct_correct  tutorial_0b_last8_pct_correct  \
ident                                                                        
NDARZC072PAH                          100.0                          100.0   
NDARMH389NCL                          100.0                          100.0   
NDARDJ786YFD                          100.0                          100.0   
NDARVN179CFY                          100.0                          100.0   

b'Skipping line 229: expected 737 fields, saw 741\n'
b'Skipping line 229: expected 733 fields, saw 737\n'


In [4]:
# Debug run: import the small online sample stored in 'fast_data_online_debug'.
if test_import:
    path = 'fast_data_online_debug'

    # Maps raw response codes to category labels.
    resp_map = {'h': 'cati', 'g': 'catii', 'l': 'cat1', 'o': 'cat2', 's': 'cat3', 'r': 'cat4'}

    # Demographic questions, keyed by the name each answer gets in the summary.
    questions = {
        'age': 'What is your age?',
        'sex': 'What was your biological sex assigned at birth?',
        'latinx': 'Do you consider yourself to be Hispanic, Latino, or of Spanish origin?',
        'race': 'With which racial group do you identify (select the one with which you MOST CLOSELY identify)?',
        'employment': 'What is your current employment status?',
    }
    # Presumably (column holding the question, question text, column holding
    # the answer) -- TODO confirm against fast.read_csv.
    other_info = {key: ('head', text, 'response')
                  for key, text in questions.items()}

    # Trial by trial task data plus one summary row per participant.
    # No ident column is given (ident_col = None); header = 2 is forwarded as is.
    ds_fall2020_online, summary_fall2020_online = fast.read_csv(
        path=path,
        x_col=['abstract_stim1', 'abstract_stim2'],
        resp_col=['response'],
        resp_map=resp_map,
        ident_col=None,
        other_info=other_info,
        header=2,
        n_final=8,
    )

    # Survey data read from the same folder.
    foo = rfs.read_surveys(path=path,
                           fun=rfs.process_testable,
                           ident_col=None,
                           header=2)

    # Boolean mask over the summary rows: True where the participant also has
    # readable survey data.  The same mask is applied to the 'ident' dimension
    # of the trial level dataset.
    index = summary_fall2020_online.index.isin(foo.index.values)
    ds_fall2020_online = ds_fall2020_online[{'ident': index}]

    # Put task summary, survey scores and (optionally) model fit results side by side.
    # NOTE(review): pd.concat uses its default join here, so an ident present only
    # in the survey table would show up as an extra row -- presumably none exist; verify.
    if fit_model:
        bar = sr.fit_em(model=CompAct_fast,
                        ds=ds_fall2020_online,
                        max_time=max_time)
        frames = [summary_fall2020_online.loc[index], foo, bar]
    else:
        frames = [summary_fall2020_online.loc[index], foo]
    summary_fall2020_online = pd.concat(frames, axis=1)
    print(summary_fall2020_online)

      schedule           age     sex latinx                race  \
ident                                                             
sub_0   design  20 years old  Female     No  More than one race   
sub_1   design  18 years old    Male     No               White   
sub_2   design  18 years old  Female     No               White   
sub_3   design  19 years old  Female     No               White   
sub_4   design  19 years old  Female    Yes               White   

               employment  tutorial_0a_last8_pct_correct  \
ident                                                      
sub_0   Full-time student                          100.0   
sub_1  Employed part-time                          100.0   
sub_2   Full-time student                          100.0   
sub_3   Full-time student                          100.0   
sub_4  Employed part-time                          100.0   

       tutorial_0b_last8_pct_correct  tutorial_0c_last8_pct_correct  \
ident                                 

b'Skipping line 4: expected 9 fields, saw 36\nSkipping line 5: expected 9 fields, saw 36\nSkipping line 6: expected 9 fields, saw 36\nSkipping line 7: expected 9 fields, saw 36\nSkipping line 8: expected 9 fields, saw 36\nSkipping line 9: expected 9 fields, saw 36\nSkipping line 10: expected 9 fields, saw 36\nSkipping line 11: expected 9 fields, saw 36\nSkipping line 12: expected 9 fields, saw 36\nSkipping line 13: expected 9 fields, saw 36\nSkipping line 14: expected 9 fields, saw 36\nSkipping line 15: expected 9 fields, saw 36\nSkipping line 16: expected 9 fields, saw 36\nSkipping line 17: expected 9 fields, saw 36\nSkipping line 18: expected 9 fields, saw 36\nSkipping line 19: expected 9 fields, saw 36\nSkipping line 20: expected 9 fields, saw 36\nSkipping line 21: expected 9 fields, saw 36\nSkipping line 22: expected 9 fields, saw 36\nSkipping line 23: expected 9 fields, saw 36\nSkipping line 24: expected 9 fields, saw 36\nSkipping line 25: expected 9 fields, saw 36\nSkipping line 

In [5]:
# Full run: import the data collected in the lab (using Psychopy).

if not test_import:
    path = 'fast_data_inlab'

    # Maps raw response codes to category labels.
    resp_map = {'h': 'cati', 'g': 'catii', 'c': 'cat1', 'm': 'cat2', 's': 'cat3', 'r': 'cat4'}

    # Demographic questions, keyed by the name each answer gets in the summary.
    questions = {
        'age': 'What is your age?',
        'sex': 'What was your biological sex assigned at birth?',
        'latinx': 'Do you consider yourself to be of Hispanic, Latino, or Spanish origin?',
        'race': 'With which racial group do you most closely identify?',
        'employment': 'What is your current employment status?',
    }
    # Presumably (column holding the question, question text, column holding
    # the answer) -- TODO confirm against fast.read_csv.
    other_info = {key: ('questionText', text, 'slider.response')
                  for key, text in questions.items()}

    # Import trial by trial task data (plus one summary row per participant).
    ds_fall2020_inlab, summary_fall2020_inlab = fast.read_csv(
        path=path,
        x_col=['cue1', 'cue2'],
        resp_col=['trial_resp.keys', 'test_resp.keys'],
        resp_map=resp_map,
        ident_col='participant',
        # conf_col = 'conf_rating.response',  # currently disabled; original note:
        #                                     # "INCLUDE CONFIDENCE RATINGS IN OAT SCORES."
        other_info=other_info,
        n_final=8,
    )

    # Import survey data from the same folder.
    foo = rfs.read_surveys(path=path,
                           fun=rfs.process_psychopy,
                           ident_col='participant')

    # Only keep people whose surveys can be read: boolean mask over the summary
    # rows, also applied to the 'ident' dimension of the trial level dataset.
    index = summary_fall2020_inlab.index.isin(foo.index.values)
    ds_fall2020_inlab = ds_fall2020_inlab[{'ident': index}]

    # Put task summary, survey scores and (optionally) model fit results side by side.
    # NOTE(review): pd.concat uses its default join here, so an ident present only
    # in the survey table would show up as an extra row -- presumably none exist; verify.
    if fit_model:
        bar = sr.fit_em(model=CompAct_fast,
                        ds=ds_fall2020_inlab,
                        max_time=max_time)
        frames = [summary_fall2020_inlab.loc[index], foo, bar]
    else:
        frames = [summary_fall2020_inlab.loc[index], foo]
    summary_fall2020_inlab = pd.concat(frames, axis=1)
    print(summary_fall2020_inlab)

In [6]:
# Full run: import the online data (collected using Testable).
#
# Important: many data files have missing confidence ratings but are otherwise
# readable, so there must be some flaw in the current Testable code.
# Confidence ratings are therefore NOT included in the OAT scores here.

if not test_import:
    path = 'fast_data_online'

    # Maps raw response codes to category labels.
    resp_map = {'h': 'cati', 'g': 'catii', 'l': 'cat1', 'o': 'cat2', 's': 'cat3', 'r': 'cat4'}

    # Demographic questions, keyed by the name each answer gets in the summary.
    questions = {
        'age': 'What is your age?',
        'sex': 'What was your biological sex assigned at birth?',
        'latinx': 'Do you consider yourself to be Hispanic, Latino, or of Spanish origin?',
        'race': 'With which racial group do you identify (select the one with which you MOST CLOSELY identify)?',
        'employment': 'What is your current employment status?',
    }
    # Presumably (column holding the question, question text, column holding
    # the answer) -- TODO confirm against fast.read_csv.
    other_info = {key: ('head', text, 'response')
                  for key, text in questions.items()}

    # Import trial by trial task data (plus one summary row per participant).
    # No ident column is given (ident_col = None); header = 2 is forwarded as is.
    ds_fall2020_online, summary_fall2020_online = fast.read_csv(
        path=path,
        x_col=['abstract_stim1', 'abstract_stim2'],
        resp_col=['response'],
        resp_map=resp_map,
        ident_col=None,
        other_info=other_info,
        header=2,
        n_final=8,
    )

    # Import survey data from the same folder.
    foo = rfs.read_surveys(path=path,
                           fun=rfs.process_testable,
                           ident_col=None,
                           header=2)

    # Only keep people whose surveys can be read: boolean mask over the summary
    # rows, also applied to the 'ident' dimension of the trial level dataset.
    index = summary_fall2020_online.index.isin(foo.index.values)
    ds_fall2020_online = ds_fall2020_online[{'ident': index}]

    # Put task summary, survey scores and (optionally) model fit results side by side.
    # NOTE(review): pd.concat uses its default join here, so an ident present only
    # in the survey table would show up as an extra row -- presumably none exist; verify.
    if fit_model:
        bar = sr.fit_em(model=CompAct_fast,
                        ds=ds_fall2020_online,
                        max_time=max_time)
        frames = [summary_fall2020_online.loc[index], foo, bar]
    else:
        frames = [summary_fall2020_online.loc[index], foo]
    summary_fall2020_online = pd.concat(frames, axis=1)
    print(summary_fall2020_online)

In [7]:
if not test_import:
    # Combine online and in lab summary data.
    # foo / bar are aliases rather than copies, so the 'source' column is also
    # added to summary_fall2020_inlab / summary_fall2020_online themselves.
    foo = summary_fall2020_inlab
    foo['source'] = 'inlab'
    bar = summary_fall2020_online
    bar['source'] = 'online'
    summary_fall2020 = pd.concat([foo, bar], axis = 0)

    # Average of the last-8 percent correct scores over the three stages used below.
    summary_fall2020['avg_last8_pct_correct'] = (
        summary_fall2020['tutorial_0c_last8_pct_correct']
        + summary_fall2020['training_last8_pct_correct']
        + summary_fall2020['transfer_last8_pct_correct']
    )/3

    # Define GUID column, inserting pseudo-GUIDs as needed.
    # Pseudo-GUIDs are handed out in row order, so which participant gets which
    # pseudo-GUID depends on the order of the rows in summary_fall2020.
    ident_val = summary_fall2020.index.values
    guid = []
    i = 0 # keep track of how many pseudo-GUIDs have been used so far
    for ident in ident_val:
        # the true GUIDs should begin with 'NDAR' and be 12 characters long
        if (ident[0:4] == 'NDAR') and (len(ident) == 12):
            guid.append(ident)
        else:
            try:
                guid.append(pguid[i])
            except IndexError as err:
                # Same exception type as before, but say what actually went wrong
                # instead of a bare "list index out of range".
                raise IndexError(
                    "Ran out of pseudo-GUIDs: pguid has no entry at position {} "
                    "(needed for ident {!r}). Add more entries to "
                    "pseudo_guid_list.pguid.".format(i, ident)
                ) from err
            i += 1
    summary_fall2020['guid'] = guid

    # Add performance criterion.
    # >= 75% on all stages (lumping all parts of stage 0 together as one stage) is the
    # criterion for good performance that we used to analyze the Spring 2020 data.
    # NOTE(review): only tutorial_0c stands in for stage 0 here -- confirm that this is
    # what "lumping all parts of stage 0 together" is meant to be.
    min_pct_correct = 75
    summary_fall2020['good_perf'] = (
        (summary_fall2020['tutorial_0c_last8_pct_correct'] >= min_pct_correct)
        & (summary_fall2020['training_last8_pct_correct'] >= min_pct_correct)
        & (summary_fall2020['transfer_last8_pct_correct'] >= min_pct_correct)
    )

    # Export processed data.
    # TODO: the trial level datasets still need to be combined before they can be exported:
    #ds_fall2020.to_netcdf("ds_fall2020.nc")
    summary_fall2020.to_csv('summary_fall2020.csv')

    # Previously there were 276 people whose data could be read:
    # 107 in lab and 170 online.
    # Now there are 112 in lab and 173 online.
    # NOTE(review): 107 + 170 = 277, not 276 -- one of the earlier counts is presumably off.

In [8]:
# Work out why the newer export suddenly contains more participants than the older one.
old = pd.read_csv('summary_fall2020 (ml with max_time = 30).csv')
new = pd.read_csv('summary_fall2020 (eb with max_time = 120 and min metric = 1).csv')
old_ident = old.ident.values
new_ident = new.ident.values

# Idents that appear in the new export but not in the old one.
not_found = [ni for ni in new_ident if ni not in old_ident]
print(new.loc[new.ident.isin(not_found)])

# Observations so far:
# - The newly found in lab files are all from November 16th or 19th.
# - I'm confident that I didn't add new data files.
# - The GUIDs for the in lab data are all unique, so there don't appear to be any duplicates.
# - The only explanation I can think of is that tinkering with the import code removed some
#   flaw that previously kept a few perfectly good data files from being imported.
# NOTE(review): the rows printed as new carry the same idents (in lab) and the same demographic
# answers (online) as the participants in the saved output of the '*_debug' imports earlier in
# this notebook -- worth checking whether the debug sample overlaps with, or was copied into,
# the main data folders.  Online idents look positional (sub_N), so they presumably would not
# reveal duplicate files -- TODO confirm.

            ident schedule           age     sex latinx                race  \
107  NDARZC072PAH   design         18-24    Male     No               White   
108  NDARMH389NCL   design         18-24  Female     No               White   
109  NDARDJ786YFD   design         18-24    Male     No               White   
110  NDARVN179CFY   design         18-24  Female     No               White   
111  NDARPG437CRZ   design         18-24  Female     No               White   
282       sub_187   design  20 years old  Female     No  More than one race   
283       sub_188   design  18 years old    Male     No               White   
284       sub_189   design  18 years old  Female     No               White   
285       sub_190   design  19 years old  Female     No               White   
286       sub_191   design  19 years old  Female    Yes               White   

              employment  tutorial_0a_last8_pct_correct  \
107    Full-time Student                          100.0   
108    Full-

inlab     Psychopy
online    Testable
inlab     Psychopy
dtype: object
