In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from dis import dis
pd.options.mode.chained_assignment = None  # default='warn'

data = pd.read_csv("data.csv")

# drop all variables & datapoints not needed
data = data.drop(columns = ["read_id", "topic_id", "contest_id", "problem_id", "origin", "origin_created_at"]) #drop columns not needed
not_gold = data.loc[data['current_labeling_state'] != 'gold_standard'] #drop all non gold_standards
data = data.drop(not_gold.index)

# calculate responsetime in seconds
data['response_submitted_at']= pd.to_datetime(data['response_submitted_at'])
data['problem_appeared_at']= pd.to_datetime(data['problem_appeared_at'])
response_time = (data['response_submitted_at'] - data['problem_appeared_at'])#calculate response time
response_time_seconds = response_time / np.timedelta64(1, 's') #transform to seconds
data['RT'] = response_time_seconds
data = data.drop(data.loc[data['RT'] <= 0].index) #excludes negative response time (e.g. datetime problem in row 684584)

# outlier removal based on responsetime
def outlier_removal_RT(data, RT_threshold=30): #remove all dp with longer RT than 30 seconds
    length1 = len(data['RT']) 
    data = data[data['RT'] <= RT_threshold]
    data = data.reset_index()
    length2 = len(data['RT'])
    Avg_RT = np.average(data['RT'])
    SD_RT = np.std(data['RT'])
    print('Average Response Time (in seconds): %.2f' % (Avg_RT))
    print('Response Time SD: %.2f' % (SD_RT),'\n')
    return data

data = outlier_removal_RT(data)

# sort data according to user and order of responses given
data = data.sort_values(
    by=["user_id", "response_submitted_at"]
)

# function for calculating n-back
def get_values(dataframe, id_col, correct_col, chosen_col, n_back):    
    df = dataframe
    result = np.array([]) # create empty list
    for _, subject in enumerate(np.unique(df[id_col])): # loop over subjects
        true_false_list = np.empty([n_back]) # initialize with nan for first n_back trials
        true_false_list[:] = np.nan # select rows of currect subject
        df_subj = df.loc[df[id_col]==subject,]
        choices_subj = df_subj[chosen_col] 
        correct_subj = df_subj[correct_col] 
        tmp_list = np.array(choices_subj == correct_subj.shift(periods = n_back)) # calculate accuracy (True if choice[t] == correct[t-n_back], else False)
        true_false_list = np.append(true_false_list, tmp_list[n_back:]) # append to inialized list
        true_false_list = np.multiply(true_false_list, 1) # convert boolean (True/False) to numbers (1,0)
        result = np.append(result, true_false_list)
    return(result)

#append 1-back
data['1back_accuracy'] = get_values(n_back = 1, dataframe = data, id_col = 'user_id', correct_col = 'current_correct_answer', chosen_col = 'chosen_answer')



#calculate vales of interest
print('Number of subjects:', data['user_id'].nunique(),'\n')
data['score'].value_counts(1) # rate of correct answers: 81.2%
#data.groupby(['user_id']).mean() # mean score and mean response time for each subject

#previous: any image
previous_any = pd.notnull(data['1back_accuracy'])
print('Ratio of chosen answers with any image in previous trial:\n',data['chosen_answer'][previous_any].value_counts(1),'\n')

#previous: nevus
previous_nevus = data['current_correct_answer'].shift(periods = 1) == "['nevus']"
print('Ratio of chosen answers with nevus image in previous trial:\n',data['chosen_answer'][previous_nevus].value_counts(1),'\n')

#previous: melanoma
previous_melanoma = data['current_correct_answer'].shift(periods = 1) == "['melanoma']"
print('Ratio of chosen answers with melanoma image in previous trial:\n',data['chosen_answer'][previous_melanoma].value_counts(1),'\n')


Average Response Time (in seconds): 1.33
Response Time SD: 1.48 

Number of subjects: 1173 

Ratio of chosen answers with any image in previous trial:
 ['nevus']       0.518415
['melanoma']    0.481585
Name: chosen_answer, dtype: float64 

Ratio of chosen answers with nevus image in previous trial:
 ['nevus']       0.51896
['melanoma']    0.48104
Name: chosen_answer, dtype: float64 

Ratio of chosen answers with melanoma image in previous trial:
 ['nevus']       0.5173
['melanoma']    0.4827
Name: chosen_answer, dtype: float64 

