In [1]:
import numpy as np
import pandas as pd
import statsrat as sr
from statsrat import rw
from statsrat.expr.predef.cat import fast
import read_fast_surveys as rfs
from pseudo_guid_list import pguid

In [4]:
# Debugging switch: the import cells read from the 'debug_data_*' folders when this is True
# and from the 'fast_data_*' folders otherwise.
test_import = False

# Switch controlling whether the learning model is fit to the imported data.
fit_model = True

# Learning model, assembled from components of statsrat's `rw` module.
model_parts = {'fbase': rw.fbase.elem,
               'fweight': rw.fweight.from_aux_norm,
               'lrate': rw.lrate.from_aux_norm,
               'drate': rw.drate.zero,
               'aux': rw.aux.gradcomp_eta0}
model = rw.model(name = 'CompAct_eta0', **model_parts)

# Time limit handed to the model fitting routine
# (the old R code used Inf; the current code defaults to 10).
max_time = 30

In [10]:
# Import data collected in the lab (using Psychopy).

# Debug runs read from a separate folder.
path = 'debug_data_inlab' if test_import else 'fast_data_inlab'

# Demographic questions as worded in the in-lab files, keyed by the summary column they fill.
inlab_questions = {'age': 'What is your age?',
                   'sex': 'What was your biological sex assigned at birth?',
                   'latinx': 'Do you consider yourself to be of Hispanic, Latino, or Spanish origin?',
                   'race': 'With which racial group do you most closely identify?',
                   'employment': 'What is your current employment status?'}
# Each entry is a 3-tuple; presumably (column to search, text to match, column holding the answer)
# -- TODO confirm against fast.read_csv.
inlab_other_info = {label: ('questionText', question, 'slider.response')
                    for label, question in inlab_questions.items()}

# Import trial by trial task data.
# The conf_col argument (conf_col = 'conf_rating.response') is deliberately left out;
# pass it to include confidence ratings in OAT scores.
(ds_fall2020_inlab, summary_fall2020_inlab) = fast.read_csv(
    path = path,
    x_col = ['cue1', 'cue2'],
    resp_col = ['trial_resp.keys', 'test_resp.keys'],
    resp_map = {'h' : 'cati', 'g' : 'catii', 'c' : 'cat1', 'm' : 'cat2', 's' : 'cat3', 'r' : 'cat4'},
    ident_col = 'participant',
    other_info = inlab_other_info,
    n_final = 8)

# Import survey data.
print('\n Importing survey data \n')
foo = rfs.read_surveys(path = path, fun = rfs.process_psychopy, ident_col = 'participant')

# Only keep people whose surveys can be read: boolean mask over the task summary rows.
index = summary_fall2020_inlab.index.isin(foo.index.values)
ds_fall2020_inlab = ds_fall2020_inlab[dict(ident = index)]

# Frames to join side by side: task summary, survey data and (optionally) model fit results.
# NOTE(review): pd.concat with axis = 1 aligns on the index with an outer join by default, so a
# survey row whose ident is absent from the task summary would still appear (with missing task
# columns) -- confirm this cannot happen or is intended.
frames = [summary_fall2020_inlab.loc[index], foo]
if fit_model:
    print('\n Fitting model \n')
    bar = sr.fit_indv(model = model, ds = ds_fall2020_inlab, max_time = max_time)
    frames.append(bar)
summary_fall2020_inlab = pd.concat(frames, axis = 1)
print(summary_fall2020_inlab)

Usecols do not match columns, columns expected but not found: ['cue2', 'test_resp.keys', 'trial_resp.keys', 'cue1']
Usecols do not match columns, columns expected but not found: ['cue2', 'test_resp.keys', 'trial_resp.keys', 'cue1']
Usecols do not match columns, columns expected but not found: ['cue2', 'test_resp.keys', 'trial_resp.keys', 'cue1']
Usecols do not match columns, columns expected but not found: ['cue2', 'test_resp.keys', 'trial_resp.keys', 'cue1']
The following files could not be read by Pandas:
fast_data_inlab/_fall_faces_2020_Oct_12_1324.csv
fast_data_inlab/NDARGR305EJL_fall_faces_2020_Sep_17_1210.csv
fast_data_inlab/_fall_faces_2020_Oct_16_1602.csv
fast_data_inlab/_fall_faces_2020_Sep_21_1338.csv
Importing survey data 

'numpy.float64' object has no attribute 'values'
'numpy.float64' object has no attribute 'values'
'numpy.float64' object has no attribute 'values'
'numpy.float64' object has no attribute 'values'
'numpy.float64' object has no attribute 'values'
'numpy.flo

KeyError: "None of ['ident'] are in the columns"

In [9]:
# Quick look at the first five rows of the in-lab summary table.
print(summary_fall2020_inlab.head(n = 5))

             schedule    age     sex latinx    race           employment  \
ident                                                                      
NDARKM468VR6   design  18-24    Male     No   White    Full-time Student   
NDARJJ637MVP   design  18-24  Female     No   White    Full-time Student   
nan            design  18-24  Female     No   White    Full-time Student   
NDARTX594HBJ   design  18-24  Female     No   White   Employed part-time   
NDARRR883WV4   design  18-24  Female     No   White    Full-time Student   

              tutorial_0a_last8_pct_correct  tutorial_0b_last8_pct_correct  \
ident                                                                        
NDARKM468VR6                          100.0                          100.0   
NDARJJ637MVP                          100.0                          100.0   
nan                                    75.0                          100.0   
NDARTX594HBJ                           87.5                          100.0   

In [None]:
# Import online data (collected using Testable).

# IMPORTANT NOTE: many data files have missing confidence ratings but are otherwise readable,
# so there must be some flaw in the current Testable code.
# Confidence ratings are NOT included in OAT scores here.

# Debug runs read from a separate folder.
path = 'debug_data_online' if test_import else 'fast_data_online'

# Demographic questions as worded in the online files, keyed by the summary column they fill.
online_questions = {'age': 'What is your age?',
                    'sex': 'What was your biological sex assigned at birth?',
                    'latinx': 'Do you consider yourself to be Hispanic, Latino, or of Spanish origin?',
                    'race': 'With which racial group do you identify (select the one with which you MOST CLOSELY identify)?',
                    'employment': 'What is your current employment status?'}
# Each entry is a 3-tuple; presumably (column to search, text to match, column holding the answer)
# -- TODO confirm against fast.read_csv.
online_other_info = {label: ('head', question, 'response')
                     for label, question in online_questions.items()}

# Import trial by trial task data.
(ds_fall2020_online, summary_fall2020_online) = fast.read_csv(
    path = path,
    x_col = ['abstract_stim1', 'abstract_stim2'],
    resp_col = ['response'],
    resp_map = {'h' : 'cati', 'g' : 'catii', 'l' : 'cat1', 'o' : 'cat2', 's' : 'cat3', 'r' : 'cat4'},
    ident_col = None,
    other_info = online_other_info,
    header = 2,
    n_final = 8)

# Import survey data.
foo = rfs.read_surveys(path = path, fun = rfs.process_testable, ident_col = None, header = 2)

# Only keep people whose surveys can be read: boolean mask over the task summary rows.
index = summary_fall2020_online.index.isin(foo.index.values)
ds_fall2020_online = ds_fall2020_online[dict(ident = index)]

# Frames to join side by side: task summary, survey data and (optionally) model fit results.
# NOTE(review): pd.concat with axis = 1 aligns on the index with an outer join by default, so a
# survey row whose ident is absent from the task summary would still appear (with missing task
# columns) -- confirm this cannot happen or is intended.
frames = [summary_fall2020_online.loc[index], foo]
if fit_model:
    bar = sr.fit_indv(model = model, ds = ds_fall2020_online, max_time = max_time)
    frames.append(bar)
summary_fall2020_online = pd.concat(frames, axis = 1)
print(summary_fall2020_online)

In [None]:
# ***** COMBINE DATA FRAMES AND FIX/ADD COLUMNS *****

# Combine online and in lab summary data, tagging every row with where it was collected.
# (As before, the 'source' column is written onto the per-source frames themselves.)
foo = summary_fall2020_inlab
foo['source'] = 'inlab'
bar = summary_fall2020_online
bar['source'] = 'online'
summary_fall2020 = pd.concat([foo, bar], axis = 0)

# Average of the last-8-trials accuracy across the three stages used for the performance criterion.
summary_fall2020['avg_last8_pct_correct'] = (summary_fall2020['tutorial_0c_last8_pct_correct']
                                             + summary_fall2020['training_last8_pct_correct']
                                             + summary_fall2020['transfer_last8_pct_correct'])/3

# Re-index: move the 'ident' index into an ordinary column.
summary_fall2020 = summary_fall2020.reset_index(drop = False)

# Define GUID column, inserting pseudo-GUIDs as needed.
ident_val = summary_fall2020['ident'].values
guid = []
i = 0 # keep track of how many pseudo-GUIDs have been used so far
for ident in ident_val:
    # The true GUIDs should begin with 'NDAR' and be 12 characters long.
    # Anything else -- including missing (NaN) or non-string identifiers, which cannot be
    # sliced and previously raised a TypeError -- receives the next unused pseudo-GUID.
    if isinstance(ident, str) and (ident[0:4] == 'NDAR') and (len(ident) == 12):
        guid.append(ident)
    else:
        guid.append(pguid[i])
        i += 1
summary_fall2020['guid'] = guid

# Deal with those in-lab people who typed in an inappropriate ID: their 'ident' becomes their GUID.
is_inlab = summary_fall2020['source'] == 'inlab'
summary_fall2020.loc[is_inlab, 'ident'] = summary_fall2020.loc[is_inlab, 'guid'].values


def _strip_if_str(value):
    """Return `value` without leading/trailing whitespace if it is a string, else unchanged."""
    return value.strip() if isinstance(value, str) else value


# Fix the 'sex', 'employment' and 'latinx' columns: some responses carry stray surrounding
# whitespace (e.g. ' Male', ' Yes', ' Full-time Student').  Stripping every string value covers
# the cases that used to be patched one by one as well as any that were not listed (e.g. ' Female');
# missing and non-string values are left untouched.
for col in ['sex', 'employment', 'latinx']:
    summary_fall2020[col] = summary_fall2020[col].map(_strip_if_str)
summary_fall2020['employment'] = summary_fall2020['employment'].str.lower()

# Add performance criterion.
# >= 75% on all stages (lumping all parts of stage 0 together as one stage) is the criterion for good performance that we used to analyze the Spring 2020 data.
summary_fall2020['good_perf'] = ((summary_fall2020['tutorial_0c_last8_pct_correct'] >= 75)
                                 & (summary_fall2020['training_last8_pct_correct'] >= 75)
                                 & (summary_fall2020['transfer_last8_pct_correct'] >= 75))

# Check that there are no duplicates.
# nunique() is hash-based, so unlike np.unique (which sorts) it does not fail when the
# identifiers are of mixed types; dropna = False counts a missing identifier as a value.
n = summary_fall2020.shape[0]
n_unique = summary_fall2020['ident'].nunique(dropna = False)
if n == n_unique:
    print('No duplicates detected.')
else:
    print(str(n - n_unique) + ' duplicates detected.')

print(summary_fall2020)

In [None]:
# ***** EXPORT PROCESSED DATA *****
# Nothing is written during debug runs (test_import = True).
if not test_import:
    # TODO: the trial-level datasets still need to be combined before they can be exported:
    #ds_fall2020.to_netcdf("ds_fall2020.nc")
    summary_fall2020.to_csv(path_or_buf = 'summary_fall2020.csv', index = False)

    # History of the participant counts:
    # Previously there were 276 people whose data could be read (107 in lab and 170 online).
    # Now there are 112 in lab and 173 online.
    # NOTE(review): 107 + 170 = 277, not 276 -- one of the recorded figures is off; confirm.
    # The newly found in lab files are all from November 16th or 19th.
    # I'm confident that I didn't add new data files.
    # Also, the GUIDs for the in lab data are all unique, so there don't appear to be any duplicates.
    # The only thing I could think of was that somehow my tinkering with the import code removed
    # some flaw that previously was preventing a few perfectly good data files from being imported.
    # UPDATE: I have found that, for whatever reason, not all of the data files were copied into
    # the statsrat folders.  I have since fixed this problem.

In [None]:
# Load the Spring 2020 summary table, indexed by participant identifier.
# NOTE(review): the file name carries a ' (30 sec ML)' suffix after the '.csv' extension --
# confirm this is the intended file.
df = pd.read_csv(filepath_or_buffer = 'summary_spring2020.csv (30 sec ML)',
                 index_col = 'ident')