In [1]:
import numpy as np
import pandas as pd
import statsrat as sr
import edit_distance
from statsrat import rw
from statsrat.expr.predef.cat import fast
from fuzzywuzzy import fuzz, process
from Levenshtein import distance
from read_fast_surveys import promis_lookup

In [2]:
# Time limit handed to the model fitting routines (the old R code used Inf; the current code defaults to 10).
max_time = 10

# Learning model that will be fitted to each participant's data.
model = rw.model(
    name = 'CompAct_eta0',
    fbase = rw.fbase.elem,
    fweight = rw.fweight.from_aux_norm,
    lrate = rw.lrate.from_aux_norm,
    drate = rw.drate.zero,
    aux = rw.aux.gradcomp_eta0,
)

# Fitting method: MLE, the EM algorithm, or no model fitting at all.
# Exactly one definition of fit_fun should be active.
#fit_fun = None # don't fit the model
#fit_fun = lambda ds: sr.fit_indv(model, ds, tau = None, max_time = max_time)
def fit_fun(ds):
    """Fit `model` to the data set `ds` using sr.fit_em (3 EM iterations, time limit `max_time`)."""
    return sr.fit_em(model, ds, max_em_iter = 3, max_time = max_time)

# Match IDs by Levenshtein distance (for consistency with the R script) when True; otherwise by match ratio.
use_distance = True

# Write the processed data to disk at the end of the notebook?
export_data = True

In [3]:
# Mapping passed as resp_map: response key -> category name
# (presumably the key pressed by the participant; verify against fast.read_csv).
key_to_category = {'h' : 'cati', 'g' : 'catii', 'c' : 'cat1', 'm' : 'cat2', 's' : 'cat3', 'r' : 'cat4'}

# Read the Spring 2020 learning task files; read_csv hands back the trial-level
# data set and a per-participant summary table.
ds_spring2020, summary_spring2020 = fast.read_csv(
    path = 'fast_data_spring/learning',
    x_col = ['cue1', 'cue2'],
    resp_col = ['trial_resp.keys', 'test_resp.keys'],
    resp_map = key_to_category,
    ident_col = 'sonaID',
    #conf_col = 'conf_rating.response',
    n_final = 8,
)


print(summary_spring2020)
# The OAT scores computed by the R and Python scripts have been spot-checked, and they match up.
# The confidence ratings are deliberately left out of this analysis (conf_col is disabled above).

wrong number of trials for file fast_data_spring/learning/20_spring_faces_2020_Jan_28_1526.csv
trials found: 194
trials expected: 204
Usecols do not match columns, columns expected but not found: ['trial_resp.keys', 'cue1', 'test_resp.keys', 'cue2']
Usecols do not match columns, columns expected but not found: ['test_resp.keys']
Usecols do not match columns, columns expected but not found: ['trial_resp.keys', 'cue1', 'test_resp.keys', 'cue2']
Usecols do not match columns, columns expected but not found: ['test_resp.keys']
Usecols do not match columns, columns expected but not found: ['trial_resp.keys', 'cue1', 'test_resp.keys', 'cue2']
Usecols do not match columns, columns expected but not found: ['trial_resp.keys', 'cue1', 'test_resp.keys', 'cue2']
Usecols do not match columns, columns expected but not found: ['trial_resp.keys', 'cue1', 'test_resp.keys', 'cue2']
Usecols do not match columns, columns expected but not found: ['trial_resp.keys', 'cue1', 'test_resp.keys', 'cue2']
Usecols 

In [4]:
# **** import survey data ****

# New names for the raw survey columns; the wording of each item is given next to its entry.
survey_col_names = {'IDCode': 'ident',
                    'mjon1': 'anx1', # In the past 7 days...I felt fearful
                    'mjon2': 'anx2', # In the past 7 days...I found it hard to focus on anything other than my anxiety
                    'mjon3': 'anx3', # In the past 7 days...My worries overwhelmed me
                    'mjon4': 'anx4', # In the past 7 days...I felt uneasy
                    'mjon5': 'anx5', # In the past 7 days...I felt nervous
                    'mjon6': 'anx6', # In the past 7 days...I felt like I needed help for my anxiety
                    'mjon7': 'anx7', # In the past 7 days...I felt anxious
                    'mjon8': 'anx8', # In the past 7 days...I felt tense
                    'mjon9': 'ang1', # In the past 7 days...I was irritated more than people knew
                    'mjon10': 'ang2', # In the past 7 days...I felt angry
                    'mjon11': 'ang3', # In the past 7 days...I felt like I was ready to explode
                    'mjon12': 'ang4', # In the past 7 days...I was grouchy
                    'mjon13': 'ang5', # In the past 7 days...I felt annoyed
                    'mjon14': 'dep1', # In the past 7 days...I felt worthless
                    'mjon15': 'dep2', # In the past 7 days...I felt that I had nothing to look forward to
                    'mjon16': 'dep3', # In the past 7 days...I felt helpless
                    'mjon17': 'dep4', # In the past 7 days...I felt sad
                    'mjon18': 'dep5', # In the past 7 days...I felt like a failure
                    'mjon19': 'dep6', # In the past 7 days...I felt depressed
                    'mjon20': 'dep7', # In the past 7 days...I felt unhappy
                    'mjon21': 'dep8', # In the past 7 days...I felt hopeless
                    'mjon22': 'pos1', # In the past 7 days...I felt cheerful
                    'mjon23': 'pos2', # In the past 7 days...I felt attentive
                    'mjon24': 'pos3', # In the past 7 days...I felt delighted
                    'mjon25': 'pos4', # In the past 7 days...I felt happy
                    'mjon26': 'pos5', # In the past 7 days...I felt joyful
                    'mjon27': 'pos6', # In the past 7 days...I felt enthusiastic
                    'mjon28': 'pos7', # In the past 7 days...I felt determined
                    'mjon29': 'pos8', # In the past 7 days...I felt interested
                    'mjon30': 'pos9', # In the past 7 days...I was thinking creatively
                    'mjon31': 'pos10', # In the past 7 days...I liked myself
                    'mjon32': 'pos11', # In the past 7 days...I felt peaceful
                    'mjon33': 'pos12', # In the past 7 days...I felt good-natured
                    'mjon34': 'pos13', # In the past 7 days...I felt useful
                    'mjon35': 'pos14', # In the past 7 days...I felt understood
                    'mjon36': 'pos15'} # In the past 7 days...I felt content

# Read the survey file, discard every row that has a missing value, and apply the new column names.
survey_raw = pd.read_csv('fast_data_spring/survey/JonesSec9.csv')
df_surveys = survey_raw.dropna().rename(columns = survey_col_names)

# Lower-case the participant IDs and use them as the row index.
df_surveys['ident'] = df_surveys['ident'].str.lower()
df_surveys = df_surveys.set_index('ident')

In [5]:
# **** perform PROMIS calculations ****

# One entry per PROMIS scale: (item prefix, number of items, key passed to promis_lookup).
promis_scales = [('anx', 8, 'anx'),
                 ('ang', 5, 'ang'),
                 ('dep', 8, 'dep8b'),
                 ('pos', 15, 'pos')]

# For every scale add three columns to df_surveys:
#   promis_<scale>_sum : raw sum of the item responses,
#   promis_<scale>_std : standard deviation of the item responses within a participant,
#   promis_<scale>     : value returned by promis_lookup for the raw sum.
# skipna = False is used for all scales (previously the anxiety scale used the
# pandas default, skipna = True, unlike the other three), so that a missing item
# yields NaN rather than a partial sum.
for scale, n_items, lookup_key in promis_scales:
    col_names = [scale + str(i + 1) for i in range(n_items)]
    sum_col = 'promis_' + scale + '_sum'
    df_surveys[sum_col] = df_surveys[col_names].sum(axis = 1, skipna = False)
    df_surveys['promis_' + scale + '_std'] = df_surveys[col_names].std(axis = 1, skipna = False)
    df_surveys['promis_' + scale] = promis_lookup(df_surveys[sum_col], lookup_key)

In [6]:
# **** drop people not found in FAST data, and fix IDs ****

# https://www.datacamp.com/community/tutorials/fuzzy-string-python
# https://pypi.org/project/fuzzywuzzy/
# https://pypi.org/project/python-Levenshtein/#documentation

# Matching thresholds.
# NOTE(review): IDs here are about 9 characters long, so an edit distance of 3
# accepts fairly dissimilar IDs; the non-exact pairs are printed below so that
# they can be checked by eye.
max_edit_distance = 3 # largest Levenshtein distance still accepted as a match
min_match_pct = 70 # smallest fuzz.ratio score still accepted as a match

survey_ident = df_surveys.index.values # list of IDs found in the survey data
if len(survey_ident) == 0:
    raise ValueError('no survey IDs available to match the FAST data against')

# Loop through FAST data, finding the closest survey ID for each participant.
new_ident = [] # matched survey ID for each participant that is kept
unmatched = [] # FAST IDs without a sufficiently close survey ID
for ident in summary_spring2020.index:

    if use_distance:
        # Use Levenshtein distance (for consistency with the R script).
        dist = [distance(ident, sident) for sident in survey_ident]
        best = int(np.argmin(dist))
        best_match = survey_ident[best]
        min_dist = dist[best]
        close_enough = min_dist <= max_edit_distance
        exact_match = min_dist == 0
    else:
        # Use ratio.
        best_match, match_pct = process.extractOne(ident, survey_ident, scorer = fuzz.ratio)
        close_enough = match_pct >= min_match_pct
        exact_match = match_pct == 100

    if close_enough:
        if not exact_match:
            # Report every non-exact match (FAST ID, then the survey ID it was matched to).
            print(ident)
            print(best_match)
            print('\n')
        new_ident += [best_match]
    else:
        unmatched += [ident]

# Only keep people whose survey data can be found. The drop is done once, after
# the loop, so that the table being iterated over is never modified mid-loop
# and the data set is copied at most once.
if unmatched:
    summary_spring2020 = summary_spring2020.drop(index = unmatched)
    ds_spring2020 = ds_spring2020.drop_sel(ident = unmatched)

# Re-index the summary table by the matched survey ID, remembering the original FAST ID.
old_ident = summary_spring2020.index.values
summary_spring2020['new_ident'] = new_ident
summary_spring2020 = summary_spring2020.set_index('new_ident')

# Concatenate data frames.
summary_spring2020 = pd.concat([summary_spring2020, df_surveys.loc[new_ident]], axis = 1)
summary_spring2020['ident'] = old_ident

0128meher
0823beher


0920mafra
0926mepra


0414mitrh
0414mithu


0709kabis
0306kabis


0422faeas
0425kawas


0316duher
0329doher


1115susai
1115sasai


0331jamur
0331jamurrow


0602shpas
0502shpal


0628elnot
0620junot


0319jebis
0319jevai


0126krchy
0426krdch


0818karoc
0718laroc


1101livis
1105mavis


1118cacat
1118cadocat


0702digol
0708cagol


0522kafhs
0529kacas


1116lotor
1113sonor


0501trbou
0101crmou


1030masky
1030malan


1216vabou
0116mabou




In [None]:
# Model fitting is skipped entirely when no fitting function was selected.
if fit_fun is not None:
    # Fit the model to the retained participants.
    df_model = fit_fun(ds_spring2020)
    # Index the results by matched survey ID.
    # NOTE(review): assumes the rows of df_model are in the same order as new_ident - TODO confirm.
    df_model['new_ident'] = new_ident
    df_model = df_model.set_index('new_ident')
    # Append the model results to the summary table, column-wise.
    summary_spring2020 = pd.concat([summary_spring2020, df_model], axis = 1)

initial estimation with uniform priors
Fitting 1 of 127 (0.8%)
Fitting 2 of 127 (1.6%)
Fitting 3 of 127 (2.4%)
Fitting 4 of 127 (3.1%)
Fitting 5 of 127 (3.9%)
Fitting 6 of 127 (4.7%)
Fitting 7 of 127 (5.5%)
Fitting 8 of 127 (6.3%)
Fitting 9 of 127 (7.1%)
Fitting 10 of 127 (7.9%)
Fitting 11 of 127 (8.7%)
Fitting 12 of 127 (9.4%)
Fitting 13 of 127 (10.2%)
Fitting 14 of 127 (11.0%)
Fitting 15 of 127 (11.8%)
Fitting 16 of 127 (12.6%)
Fitting 17 of 127 (13.4%)
Fitting 18 of 127 (14.2%)
Fitting 19 of 127 (15.0%)
Fitting 20 of 127 (15.7%)
Fitting 21 of 127 (16.5%)
Fitting 22 of 127 (17.3%)
Fitting 23 of 127 (18.1%)
Fitting 24 of 127 (18.9%)
Fitting 25 of 127 (19.7%)
Fitting 26 of 127 (20.5%)
Fitting 27 of 127 (21.3%)
Fitting 28 of 127 (22.0%)
Fitting 29 of 127 (22.8%)
Fitting 30 of 127 (23.6%)
Fitting 31 of 127 (24.4%)
Fitting 32 of 127 (25.2%)
Fitting 33 of 127 (26.0%)
Fitting 34 of 127 (26.8%)


In [None]:
# COMPARISON OF LOG-LIKELIHOOD VALUES BETWEEN THE PYTHON AND R CODE
# R log-likelihood values
# 1116nawes: -117.40835
# 1014cynis: -79.11026
# 0630mypou: -41.63949
# 1216edasf: -65.33993

#print(sr.log_lik(model, ds_spring2020.loc[{'ident': '1116nawes'}], par_val = [0.3079861, 0.10629251, 0.04122127, 10.0000000, 2.1102876]))
#print(sr.log_lik(model, ds_spring2020.loc[{'ident': '1014cynis'}], par_val = [4.0032015, 0.21623505, 1.99000000, 10.0000000, 2.9885900]))
#print(sr.log_lik(model, ds_spring2020.loc[{'ident': '0630mypou'}], par_val = [1.2883276, 0.16728406, 1.96853767, 1.0716446, 7.4701937]))
#print(sr.log_lik(model, ds_spring2020.loc[{'ident': '1216edasf'}], par_val = [0.9602774, 0.17666184, 1.73290646, 1.2863178, 4.0602626]))

# It looks like the log-likelihood values produced by the R and Python versions are close, but not quite identical.

In [None]:
# Add performance criterion.
# >= 75% on all stages (lumping all parts of stage 0 together as one stage) is the criterion for good performance that we used to analyze the Spring 2020 data.
criterion_pct = 75
stage_cols = ['tutorial_0c_last8_pct_correct',
              'training_last8_pct_correct',
              'transfer_last8_pct_correct']
# One boolean mask per stage; a participant must meet the criterion on every one of them.
stage_ok = [summary_spring2020[col] >= criterion_pct for col in stage_cols]
summary_spring2020['good_perf'] = stage_ok[0] & stage_ok[1] & stage_ok[2]

In [None]:
# ***** EXPORT PROCESSED DATA *****
# The merged summary table is only written to disk when exporting is switched on.
if export_data:
    export_path = 'summary_spring2020.csv'
    summary_spring2020.to_csv(export_path)