In [1]:
import pandas as pd
import numpy as np
import pdb 
from datetime import datetime, timedelta
from dis import dis
import matplotlib.pyplot as plt

# Silence pandas' SettingWithCopyWarning (the pandas default is 'warn').
pd.options.mode.chained_assignment = None

data = pd.read_csv("data.csv")

# Keep only what the analysis needs: discard the bookkeeping columns, then
# every row whose labeling state is anything other than 'gold_standard'.
unused_columns = ["read_id", "topic_id", "contest_id", "problem_id", "origin_created_at"]
data = data.drop(columns=unused_columns)
not_gold = data.loc[data['current_labeling_state'] != 'gold_standard']
data = data.drop(index=not_gold.index)

# Response time (RT) in seconds: submission timestamp minus the timestamp at
# which the problem appeared, divided by a one-second timedelta.
for timestamp_col in ('response_submitted_at', 'problem_appeared_at'):
    data[timestamp_col] = pd.to_datetime(data[timestamp_col])
response_time = data['response_submitted_at'] - data['problem_appeared_at']
response_time_seconds = response_time / np.timedelta64(1, 's')
data['RT'] = response_time_seconds

# Coarse outlier removal on RT.
# Zero or negative RTs are dropped (e.g. datetime problem in row 684584).
non_positive_rt = data.loc[data['RT'] <= 0]
data = data.drop(index=non_positive_rt.index)
# RTs of one hour or more are dropped (e.g. app runs without the subject
# paying attention).
hour_or_longer_rt = data.loc[data['RT'] >= 3600]
data = data.drop(index=hour_or_longer_rt.index)
def outlier_removal_RT(data, RT_threshold=None):
    """Drop trials whose response time exceeds a threshold and print summary stats.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a numeric 'RT' column (response time).
    RT_threshold : float, optional
        Largest 'RT' value that is kept. When omitted, three times
        ``np.std`` of the 'RT' column of the frame passed in is used. The
        threshold is computed at call time, not when the function is defined,
        so it always reflects the data actually being filtered.

    Returns
    -------
    pandas.DataFrame
        The rows with ``RT <= RT_threshold``. The index is reset and the
        previous index is kept as a regular column (``reset_index()`` without
        ``drop=True``).
    """
    if RT_threshold is None:
        RT_threshold = 3 * np.std(data['RT'])

    data = data[data['RT'] <= RT_threshold]
    data = data.reset_index()

    # Report mean and SD of the remaining response times.
    Avg_RT = np.average(data['RT'])
    SD_RT = np.std(data['RT'])
    print('Average Response Time (in seconds): %.2f' % (Avg_RT))
    print('Response Time SD: %.2f' % (SD_RT),'\n')
    return data

data = outlier_removal_RT(data)

# Exclude subjects with fewer than 10 trials (i.e. a trial count of 9 or less).
trials_per_user = data['user_id'].value_counts()
not_enough_trials = trials_per_user[trials_per_user <= 9].index
rows_of_sparse_users = data.loc[data['user_id'].isin(not_enough_trials)]
data = data.drop(index=rows_of_sparse_users.index)

# Order the trials by subject and, within a subject, by submission time.
data = data.sort_values(by=["user_id", "response_submitted_at"])

# function for n-back
def get_backward(dataframe, id_col, correct_col, chosen_col, n_back):    
    df = dataframe
    is_valid = df[id_col].value_counts(sort=False)>n_back
    result = np.array([]) # create empty list
    for i, subject in enumerate(np.unique(df[id_col])): # loop over subjects
        if is_valid.values[i] == True:
            true_false_list = np.empty([n_back]) # initialize with nan for first n_back trials
            true_false_list[:] = np.nan # select rows of currect subject
            df_subj = df.loc[df[id_col]==subject,]
            choices_subj = df_subj[chosen_col] 
            correct_subj = df_subj[correct_col] 
            tmp_list = np.array(choices_subj == correct_subj.shift(periods = n_back)) # calculate accuracy (True if choice[t] == correct[t-n_back], else False)
            true_false_list = np.append(true_false_list, tmp_list[n_back:]) # append to inialized list
            result = np.append(result, true_false_list)
            
        elif is_valid.values[i] == False:
            num_rows = df[id_col][df[id_col]==subject].value_counts()
            true_false_list = np.empty([num_rows.values[0]]) # initialize with nan for rows with invalid data
            true_false_list[:] = np.nan
            result = np.append(result, true_false_list)

    return(result)

# Append the 1-, 2- and 3-back accuracy columns
# ('1back_accuracy', '2back_accuracy', '3back_accuracy').
for trials_back in (1, 2, 3):
    data[f'{trials_back}back_accuracy'] = get_backward(
        dataframe=data,
        id_col='user_id',
        correct_col='current_correct_answer',
        chosen_col='chosen_answer',
        n_back=trials_back,
    )


#function for 1-forward
def get_forward(dataframe, id_col, correct_col, chosen_col, n_back):
    """Compare each choice with the same subject's correct answer |n_back| trials later.

    Trials are taken in the row order of ``dataframe`` within each subject, so
    the frame should already be ordered chronologically per subject.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Trial-level data.
    id_col : str
        Column identifying the subject.
    correct_col : str
        Column holding the correct answer of each trial.
    chosen_col : str
        Column holding the answer the subject chose.
    n_back : int
        Look-ahead offset, conventionally negative (``-1`` means "one trial
        ahead"). Only its magnitude is used.

    Returns
    -------
    numpy.ndarray
        Float array with one entry per row of ``dataframe``, in row order:
        1.0 where ``chosen[t] == correct[t + |n_back|]`` within the subject,
        0.0 where they differ, and NaN for the last ``|n_back|`` trials of each
        subject (hence for every trial of a subject with at most ``|n_back|``
        trials).
    """
    steps_ahead = abs(n_back)
    per_subject = dataframe.groupby(id_col, sort=False)

    # Correct answer from steps_ahead trials later, never crossing subjects.
    later_correct = per_subject[correct_col].shift(-steps_ahead)
    # Number of trials that still follow each row within its subject.
    trials_remaining = per_subject.cumcount(ascending=False).to_numpy()

    matches = (dataframe[chosen_col] == later_correct).to_numpy().astype(float)

    # Exactly one value per row is produced, so short subjects can no longer
    # make the output longer than the frame.
    return np.where(trials_remaining >= steps_ahead, matches, np.nan)

# Append the 1-forward accuracy column; n_back=-1 shifts the correct answers
# one trial forward inside get_forward.
data['1forward_accuracy'] = get_forward(
    data,
    'user_id',
    'current_correct_answer',
    'chosen_answer',
    -1,
)

# Write the prepared frame (including its index) to disk.
data.to_csv('prepped_data.csv')

Average Response Time (in seconds): 1.37
Response Time SD: 1.98 

