In [1]:
import numpy as np
import pandas as pd
import statsrat as sr
import edit_distance
from statsrat import rw
from statsrat.expr.predef.cat import fast
from fuzzywuzzy import fuzz, process
from Levenshtein import distance
from read_fast_surveys import promis_lookup

In [2]:
# Indicate whether the learning model should be fit.
fit_model = True

# Use Levenstein distance (for consistency with R script); otherwise use ratio.
use_distance = True

# Should processed data be exported?
export_data = True

# Define learning model.
model = rw.model(name = 'CompAct_eta0',
                 fbase = rw.fbase.elem,
                 fweight = rw.fweight.from_aux_norm,
                 lrate = rw.lrate.from_aux_norm,
                 drate = rw.drate.zero,
                 aux = rw.aux.gradcomp_eta0)

# Define time limit for model fitting (in old R code was Inf, but is 10 by default in current code).
max_time = 120

In [3]:
(ds_spring2020, summary_spring2020) = fast.read_csv(path = 'fast_data_spring/learning',
                                                    x_col = ['cue1', 'cue2'],
                                                    resp_col = ['trial_resp.keys', 'test_resp.keys'],
                                                    resp_map = {'h' : 'cati', 'g' : 'catii', 'c' : 'cat1', 'm' : 'cat2', 's' : 'cat3', 'r' : 'cat4'},
                                                    ident_col = 'sonaID',
                                                    conf_col = 'conf_rating.response',
                                                    n_final = 8)


print(summary_spring2020)

wrong number of trials for file fast_data_spring/learning/20_spring_faces_2020_Jan_28_1526.csv
trials found: 194
trials expected: 204
Usecols do not match columns, columns expected but not found: ['test_resp.keys', 'trial_resp.keys', 'cue1', 'cue2', 'conf_rating.response']
Usecols do not match columns, columns expected but not found: ['test_resp.keys', 'conf_rating.response']
Usecols do not match columns, columns expected but not found: ['test_resp.keys', 'trial_resp.keys', 'cue1', 'cue2', 'conf_rating.response']
Usecols do not match columns, columns expected but not found: ['test_resp.keys', 'conf_rating.response']
Usecols do not match columns, columns expected but not found: ['test_resp.keys', 'trial_resp.keys', 'cue1', 'cue2', 'conf_rating.response']
Usecols do not match columns, columns expected but not found: ['test_resp.keys', 'trial_resp.keys', 'cue1', 'cue2', 'conf_rating.response']
Usecols do not match columns, columns expected but not found: ['test_resp.keys', 'trial_resp.key

In [4]:
# **** import survey data ****
df_surveys = pd.read_csv('fast_data_spring/survey/JonesSec9.csv')
df_surveys.rename(columns = {'IDCode': 'ident',
                   'mjon1': 'anx1', # In the past 7 days...I felt fearful
                   'mjon2': 'anx2', # In the past 7 days...I found it hard to focus on anything other than my anxiety
                   'mjon3': 'anx3', # In the past 7 days...My worries overwhelmed me
                   'mjon4': 'anx4', # In the past 7 days...I felt uneasy
                   'mjon5': 'anx5', # In the past 7 days...I felt nervous
                   'mjon6': 'anx6', # In the past 7 days...I felt like I needed help for my anxiety
                   'mjon7': 'anx7', # In the past 7 days...I felt anxious
                   'mjon8': 'anx8', # In the past 7 days...I felt tense
                   'mjon9': 'ang1', # In the past 7 days...I was irritated more than people knew
                   'mjon10': 'ang2', # In the past 7 days...I felt angry
                   'mjon11': 'ang3', # In the past 7 days...I felt like I was ready to explode
                   'mjon12': 'ang4', # In the past 7 days...I was grouchy
                   'mjon13': 'ang5', # In the past 7 days...I felt annoyed
                   'mjon14': 'dep1', # In the past 7 days...I felt worthless
                   'mjon15': 'dep2', # In the past 7 days...I felt that I had nothing to look forward to
                   'mjon16': 'dep3', # In the past 7 days...I felt helpless
                   'mjon17': 'dep4', # In the past 7 days...I felt sad
                   'mjon18': 'dep5', # In the past 7 days...I felt like a failure
                   'mjon19': 'dep6', # In the past 7 days...I felt depressed
                   'mjon20': 'dep7', # In the past 7 days...I felt unhappy
                   'mjon21': 'dep8', # In the past 7 days...I felt hopeless
                   'mjon22': 'pos1', # In the past 7 days...I felt cheerful
                   'mjon23': 'pos2', # In the past 7 days...I felt attentive
                   'mjon24': 'pos3', # In the past 7 days...I felt delighted
                   'mjon25': 'pos4', # In the past 7 days...I felt happy
                   'mjon26': 'pos5', # In the past 7 days...I felt joyful
                   'mjon27': 'pos6', # In the past 7 days...I felt enthusiastic
                   'mjon28': 'pos7', # In the past 7 days...I felt determined
                   'mjon29': 'pos8', # In the past 7 days...I felt interested
                   'mjon30': 'pos9', # In the past 7 days...I was thinking creatively
                   'mjon31': 'pos10', # In the past 7 days...I liked myself
                   'mjon32': 'pos11', # In the past 7 days...I felt peaceful
                   'mjon33': 'pos12', # In the past 7 days...I felt good-natured
                   'mjon34': 'pos13', # In the past 7 days...I felt useful
                   'mjon35': 'pos14', # In the past 7 days...I felt understood
                   'mjon36': 'pos15'}, # In the past 7 days...I felt content
                  inplace = True)
df_surveys['ident'] = df_surveys['ident'].str.lower()
print(df_surveys.head())

       ident  Section Sum  anx1  anx2  anx3  anx4  anx5  anx6  anx7  anx8  \
0  1211hamaj         85.0   2.0   3.0   2.0   2.0   3.0   3.0   3.0   3.0   
1  0106raber        111.0   2.0   4.0   4.0   3.0   4.0   3.0   5.0   4.0   
2  0704jafai        104.0   3.0   3.0   3.0   3.0   4.0   4.0   3.0   2.0   
3  0424pahig        117.0   4.0   2.0   3.0   3.0   3.0   3.0   4.0   3.0   
4  0905gimac         91.0   2.0   2.0   3.0   3.0   3.0   2.0   3.0   2.0   

   ...  pos6  pos7  pos8  pos9  pos10  pos11  pos12  pos13  pos14  pos15  
0  ...   3.0   2.0   2.0   2.0    2.0    3.0    3.0    3.0    2.0    2.0  
1  ...   3.0   3.0   3.0   4.0    3.0    3.0    3.0    2.0    3.0    3.0  
2  ...   4.0   4.0   4.0   2.0    4.0    3.0    4.0    3.0    3.0    4.0  
3  ...   5.0   4.0   4.0   4.0    4.0    4.0    4.0    4.0    3.0    4.0  
4  ...   3.0   3.0   3.0   2.0    3.0    3.0    4.0    4.0    3.0    3.0  

[5 rows x 38 columns]


In [5]:
print(promis_lookup([np.NaN, 8.0], 'anx'))

[ nan 37.1]


In [6]:
# **** perform PROMIS calculations ****

col_names = []
for i in range(8):
    col_names += ['anx' + str(i + 1)]
df_surveys['promis_anx_sum'] = df_surveys[col_names].sum(axis = 1, skipna = False)
df_surveys['promis_anx_std'] = df_surveys[col_names].std(axis = 1, skipna = False)
df_surveys['promis_anx'] = promis_lookup(df_surveys['promis_anx_sum'], 'anx')

col_names = []
for i in range(5):
    col_names += ['ang' + str(i + 1)]
df_surveys['promis_ang_sum'] = df_surveys[col_names].sum(axis = 1, skipna = False)
df_surveys['promis_ang_std'] = df_surveys[col_names].std(axis = 1, skipna = False)
df_surveys['promis_ang'] = promis_lookup(df_surveys['promis_ang_sum'], 'ang')

col_names = []
for i in range(8):
    col_names += ['dep' + str(i + 1)]
df_surveys['promis_dep_sum'] = df_surveys[col_names].sum(axis = 1, skipna = False)
df_surveys['promis_dep_std'] = df_surveys[col_names].std(axis = 1, skipna = False)
df_surveys['promis_dep'] = promis_lookup(df_surveys['promis_dep_sum'], 'dep8b')

col_names = []
for i in range(15):
    col_names += ['pos' + str(i + 1)]
df_surveys['promis_pos_sum'] = df_surveys[col_names].sum(axis = 1, skipna = False)
df_surveys['promis_pos_std'] = df_surveys[col_names].std(axis = 1, skipna = False)
df_surveys['promis_pos'] = promis_lookup(df_surveys['promis_pos_sum'], 'pos')

In [7]:
# **** drop people not found in FAST data, and fix IDs ****

# https://www.datacamp.com/community/tutorials/fuzzy-string-python
# https://pypi.org/project/fuzzywuzzy/
# https://pypi.org/project/python-Levenshtein/#documentation

# Loop through survey data.
fast_ident = summary_spring2020.index.values # list of IDs found in the FAST data
new_ident = []
for i in df_surveys.index:
    ident_i = df_surveys['ident'].loc[i]
    
    if use_distance:
        # Use Levenstein distance (for consistency with the R script).
        dist = []
        for fident in fast_ident:
            dist += [distance(ident_i, fident)]
        best_match = fast_ident[np.argmin(dist)]
        min_dist = np.min(dist)
        print()
        print(min_dist)
        print(best_match)
        close_enough = min_dist <= 3
        exact_match = min_dist == 0
    else:
        # Use Levenstein ratio.
        best_match, match_pct = process.extractOne(ident_i, fast_ident, scorer = fuzz.ratio)
        close_enough = match_pct >= 70
        exact_match = match_pct == 100
    not_duplicate = not best_match in new_ident
    
    if close_enough and not_duplicate:
        if not exact_match:
            print(ident_i)
            print(best_match)
            print()
        new_ident += [best_match]
    else:
        df_surveys.drop(index = i, inplace = True)
df_surveys['new_ident'] = new_ident
df_surveys.drop(columns = 'ident', inplace = True)
df_surveys.set_index('new_ident', inplace = True)        

# Only keep people whose survey data can be found.
index = summary_spring2020.index.isin(df_surveys.index.values)
ds_spring2020 = ds_spring2020[{'ident': index}]

# Concatenate data frames.
summary_spring2020 = pd.concat([summary_spring2020.loc[index], df_surveys], axis = 1)

# I have spot-checked the OAT scores computed by the R and Python scripts, and they match up.


4
1111jalak

4
0601samer

3
0224mafai
0704jafai
0224mafai


0
0424pahig

4
0905iraqu

4
0830dehad

4
0308tileg

2
0607cafai
0407fafai
0607cafai


5
0908kamul

4
0129cyale

4
0418anfai

5
0829jurus

4
0602shpas

2
0602shpas
0502shpal
0602shpas


4
1127anhig

4
0818karoc

5
0417rosmi

3
0510shfai
0524sufai
0510shfai


5
0508elche

4
0607cafai

4
1101resie

0
0127trcen

5
0409gigeo

0
0103anche

0
1208jucor

4
0822susky

0
0215irara

4
0625limou

5
0128meher

0
1105mavis

5
0222rehom

3
0414mitrh
0414miral
0414mitrh


5
1116nawes

0
1203mihol

5
1004kasha

0
0521mileg

4
0605kacre

4
0425kawas

3
0605kacre
0425kache
0605kacre


4
0118cathu

4
0127trcen

3
0313lamon
0312lamia
0313lamon


1
1115susai
1115sasai
1115susai


4
0920mafra

3
0607cafai

4
0422faeas

4
0409gigeo

4
0620junot

15
0921lowes

5
1003vimar

0
0804eidis

4
0709kabis

4
0503cidoh

5
0709kabis

4
0630mypou

4
0625limou

0
0426liniw

5
0308tileg

5
0707reran

0
0212baben

4
0510shfai

3
0607cafai

4
0510histr

4
1105mavis


4
1004kasha

0
0630mypou

3
0917jahal

5
0510shfai

4
0125kefos

3
0625limou

0
1216edasf

5
0129cyale

4
0422faeas

5
0303kareg

0
0320almen

0
0221calyn

0
0810tatak

4
1030hutre

5
0920mafra

4
0917jahal

3
0810kaste

3
0422faeas
0702caeas
0422faeas


4
0704saleg

5
0303kareg

4
0917jahal

4
0625limou

2
1001renor

5
0129cyale

4
0908kamul

4
0707reran

4
1127anhig

5
0625limou

3
0614sunor

4
1116nawes

0
0303kilyd

15
0921lowes

0
1006shhol

4
0811magem

5
1111jeara

3
0128meher

5
0222rehom

4
0614sunor

4
0410legra

5
0720chlak

0
1001chcas

0
0720chlak

4
0424pahig

4
1001renor

4
0601keken

5
0920mafra

4
1111heche

5
1001renor

6
0409gigeo

0
0607cafai

4
0317mahor

4
1006shhol

0
1101resie

0
1125kilit

3
0816kamon

4
0503juroc

15
0921lowes

5
0508elche

9
0508elche

4
0507jelak

4
0717bomou

4
0921lowes

6
0724albro

4
0708loroc

3
0501trbou
0101crmou
0501trbou


4
0222rehom

2
0818karoc

0
0118cathu

6
0222rehom

0
0118keche

3
0818karoc

5
0224mafai

15
0921lowes

15
09

In [None]:
if fit_model:
    # Fit model.
    df_model = sr.fit_indv(model = model,
                           ds = ds_spring2020,
                           max_time = max_time)
    # Concatenate data frames.
    summary_spring2020 = pd.concat([summary_spring2020, df_model], axis = 1)

Fitting 1 of 127 (0.8%)
Fitting 2 of 127 (1.6%)
Fitting 3 of 127 (2.4%)
Fitting 4 of 127 (3.1%)
Fitting 5 of 127 (3.9%)
Fitting 6 of 127 (4.7%)
Fitting 7 of 127 (5.5%)
Fitting 8 of 127 (6.3%)
Fitting 9 of 127 (7.1%)
Fitting 10 of 127 (7.9%)
Fitting 11 of 127 (8.7%)
Fitting 12 of 127 (9.4%)
Fitting 13 of 127 (10.2%)
Fitting 14 of 127 (11.0%)
Fitting 15 of 127 (11.8%)
Fitting 16 of 127 (12.6%)
Fitting 17 of 127 (13.4%)
Fitting 18 of 127 (14.2%)
Fitting 19 of 127 (15.0%)
Fitting 20 of 127 (15.7%)
Fitting 21 of 127 (16.5%)
Fitting 22 of 127 (17.3%)
Fitting 23 of 127 (18.1%)


In [None]:
# Add performance criterion.
# >= 75% on all stages (lumping all parts of stage 0 together as one stage) is the criterion for good performance that we used to analyze the Spring 2020 data. 
summary_spring2020['good_perf'] = (summary_spring2020['tutorial_0c_last8_pct_correct'] >= 75)&(summary_spring2020['training_last8_pct_correct'] >= 75)&(summary_spring2020['transfer_last8_pct_correct'] >= 75)

In [None]:
if export_data:
    # ***** EXPORT PROCESSED DATA *****
    summary_spring2020.to_csv('summary_spring2020.csv', index = True)

In [None]:
print(summary_spring2020.shape[0])

print(summary_spring2020['rel_irl'].min())
print(summary_spring2020['rel_irl'].max())
print(summary_spring2020['rel_irl'].mean())

print(summary_spring2020['threat_benign_os'].min())
print(summary_spring2020['threat_benign_os'].max())
print(summary_spring2020['threat_benign_os'].mean())

In [None]:
print(summary_spring2020['promis_pos'].values)