In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import datetime

from download_tools.plugins.mouselab_mdp import preprocess_mouselab_data, add_click_count_columns
from download_tools.plugins.survey_html_form import process_html_demographics
from download_tools.plugins.survey_multi_choice import score_mouselab_questionnaires, get_mouselab_quiz_name, get_quiz_passer_ids, score_row, score_generic_questionnaires
from download_tools.plugins.survey_text import preprocess_survey_text, get_old_demographics
from download_tools.plugins.utils import get_demo_string

from datetime import datetime
import json
import dill as pickle

In [2]:
# Parameters
database_key = None
bonus_function = "lambda row: round((row[\"score\"] + 50) * .002, 2)"
simulated = False
participants_to_remove = [3,4,6,13]
sessions = ["quest_pilot"]
experiment_setting = "high_increasing"
COST = None
DEPTH = None
html_survey_names = {}
num_parts = None
old_experiment = False
manual_age_mapping = {}
experiment_specific_mapping = {}
trials_per_block = {"test": 20, "training": 20}
ranges_to_extract = None
mouselab_column_identifier = "name"
mouselab_mapping = {"0.0-1.*-0.*": "mouselab-quiz-pre", "0.0-3.*-3*": "mouselab-quiz-post"}
name_mapping = {"0.0-3.0-5.0": "crt", "0.0-3.0-7.0": "demographics", "0.0-3.0-4.0": "survey"}
max_attempts = {"mouselab-quiz-pre": 4, "mouselab-quiz-post": 1}
passing_score = {"mouselab-quiz-pre": 5, "mouselab-quiz-post": 0}
mouselab_quiz_solutions = {"mouselab-quiz-pre": {"Q1": "$-48 to $48", "Q2": "No, the cost is always $1.00.", "Q3": "There is no limit", "Q4": "The better I perform the higher my bonus will be.", "Q5": "No, the amount of cash at each node of the web may be different each time."}, "mouselab-quiz-post": {"Q1": "$-4 to $4", "Q2": "$-8 to $8", "Q3": "$-48 to $48", "Q4": "No, the cost is always $1.00.", "Q5": {"Very unmotivated": -2, "Slightly unmotivated": -1, "Neither motivated nor unmotivated": 0, "Slightly motivated": 1, "Very motivated": 2, "": 0}}}
ground_truth_file = "high_increasing"
node_classification = {"early": [1, 5, 9], "middle": [2, 6, 10], "late": [3, 4, 7, 8, 11, 12], "clicks": [1, 5, 9, 2, 6, 10, 3, 4, 7, 8, 11, 12]}
structure = "312_2_4_24"
analysis_run = "quest_pilot"
data_path = "/home/vfelso/github/planning-depth-differences/data"


In [3]:
# paths to use
inputs_path = Path(data_path).joinpath('inputs')
raw_data_path = Path(data_path).joinpath('raw')
processed_data_path = Path(data_path).joinpath(f'processed/{analysis_run}')
processed_data_path.mkdir(parents=True, exist_ok=True)

In [4]:
with open(inputs_path.joinpath(f"exp_inputs/rewards/{ground_truth_file}.json")) as json_file:
    ground_truths = json.load(json_file)

In [5]:
# load data
full_data = {}

# read in sessions
for run in sessions:
    for file_path in raw_data_path.glob(f"{run}/*.csv"):
        # don't want to save identifiable bonuses
        # file, information is already in data
        if "bonuses" not in str(file_path):
            file_name = file_path.stem
            curr_data_frame = pd.read_csv(file_path)
            curr_data_frame["run"] = run

            # remove participants who are me, or for other reasons
            curr_data_frame = curr_data_frame[~curr_data_frame["pid"].isin(participants_to_remove)]
            curr_data_frame = curr_data_frame.rename({"response" : "responses"}, axis="columns")
            
            if "internal_node_id" in curr_data_frame.columns:
                curr_data_frame["name"] = curr_data_frame.apply(lambda row : name_mapping[row["internal_node_id"]] if row["internal_node_id"] in name_mapping.keys() else row["name"], axis=1)
            if "response" in curr_data_frame.columns:
                curr_data_frame = curr_data_frame.rename({"response" : "responses"}, axis="columns")
                
            if file_name not in full_data:
                full_data[file_name] = [curr_data_frame]
            else:
                full_data[file_name].append(curr_data_frame)
                

full_data = {k: pd.concat(v) for k,v in full_data.items()}

In [6]:
individual_variables = full_data["general_info"].merge(full_data["question_data"], on=["pid","run"])

if all(pd.isnull(individual_variables["endhit"])):
    begin_hit = full_data["survey"].groupby("pid").min()["time_elapsed"]
    end_hit = full_data['survey-text'].groupby("pid").max()["time_elapsed"]
    finished_df = individual_variables[individual_variables["pid"].isin(end_hit.index)]
    finished_df["time_diff"] =  finished_df["pid"].apply(lambda pid: (end_hit[pid]-begin_hit[pid])/(60*1000))
else:
    time_format = '%Y-%m-%d %H:%M:%S.%f'
    finished_df = individual_variables[individual_variables["endhit"].apply(lambda endhit: isinstance(endhit, str))].reset_index()
    finished_df["time_diff"] = finished_df.apply(lambda row: ((datetime.strptime(row["endhit"], time_format) - datetime.strptime(row["beginhit"], time_format)).seconds % 3600 )/ 60.0, axis=1)
individual_variables = individual_variables.merge(finished_df[["time_diff", "pid", "run"]], how="left", on=["pid", "run"])

# check saved cost makes sense
if (COST is not None) and (DEPTH is not None):
    if isinstance(COST, dict):
        assert(np.all(individual_variables.apply(lambda row: row["COST"] == COST[row["codeversion"]][int(row["cond"])],axis=1)))
    else:
        unique_costs = np.unique(individual_variables["COST"])
        assert(len(unique_costs) == 1)
        assert(unique_costs[0] == COST)
    if DEPTH:
        if isinstance(DEPTH, dict):
            assert(np.all(individual_variables.apply(lambda row: row["DEPTH"] == DEPTH[row["codeversion"]][int(row["cond"])],axis=1)))
        else:
            unique_costs = np.unique(individual_variables["DEPTH"])
            assert(len(unique_costs) == 1)
            assert(unique_costs[0] == DEPTH)
    print(finished_df.groupby(["DEPTH", "COST"]).mean())
else:
    print(finished_df.groupby(["cond"]).mean())

      counterbalance  beginexp  endhit  bonus  status   pid  DEPTH  COST  \
cond                                                                       
0                0.0       NaN     NaN    0.0     1.0  21.5    0.0   0.0   

      MIN_TIME  inspectCost  bonusRate  branching  first_trial  final_bonus  \
cond                                                                          
0          7.0          1.0      0.002      312.0     0.666667     2.751667   

      displayed_bonus  time_diff  
cond                              
0            2.751667  59.833003  


  begin_hit = full_data["survey"].groupby("pid").min()["time_elapsed"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  finished_df["time_diff"] =  finished_df["pid"].apply(lambda pid: (end_hit[pid]-begin_hit[pid])/(60*1000))


In [7]:
finished_df["time_diff"].median()

57.017725

In [8]:
time_fields = {"startTime":lambda date: np.nan if not isinstance(date,str) else datetime.strptime(date[:15], "%a %b %d %Y") ,"beginhit":lambda date: np.nan if  not isinstance(date,str) else datetime.fromisoformat(date.split(" ")[0]), "beginexp" : lambda date: np.nan if  not isinstance(date,str) else datetime.fromisoformat(date.split(" ")[0]), "endhit" : lambda date: np.nan if  not isinstance(date,str) else datetime.fromisoformat(date.split(" ")[0])}

for time_field, time_func in time_fields.items():
    individual_variables[time_field] = individual_variables[time_field].apply(time_func)

individual_variables["beginhit"].describe()

  individual_variables["beginhit"].describe()


count                      21
unique                      3
top       2022-10-30 00:00:00
freq                       13
first     2022-10-27 00:00:00
last      2022-10-30 00:00:00
Name: beginhit, dtype: object

In [9]:
if not isinstance(old_experiment, list):
    old_experiment = [old_experiment] * len(sessions)

In [10]:
full_data["survey"]["internal_node_id"]
# 0.0-3.0-4.0 	

0     0.0-1.0-0.0
20    0.0-1.0-0.0
21    0.0-3.0-0.0
22    0.0-1.0-0.0
23    0.0-3.0-0.0
24    0.0-3.0-3.0
25    0.0-1.0-0.0
26    0.0-1.1-0.1
27    0.0-3.0-0.0
28    0.0-3.0-3.0
29    0.0-3.0-4.0
30    0.0-1.0-0.0
31    0.0-3.0-0.0
32    0.0-3.0-3.0
33    0.0-3.0-4.0
34    0.0-1.0-0.0
35    0.0-1.1-0.1
36    0.0-1.2-0.2
37    0.0-3.0-0.0
38    0.0-3.0-3.0
39    0.0-3.0-4.0
40    0.0-1.0-0.0
41    0.0-1.1-0.1
42    0.0-1.2-0.2
43    0.0-3.0-0.0
44    0.0-3.0-3.0
45    0.0-3.0-4.0
46    0.0-1.0-0.0
47    0.0-1.1-0.1
48    0.0-1.2-0.2
49    0.0-3.0-0.0
50    0.0-3.0-3.0
51    0.0-3.0-4.0
52    0.0-1.0-0.0
53    0.0-1.1-0.1
54    0.0-1.2-0.2
55    0.0-1.3-0.3
56    0.0-3.0-0.0
57    0.0-3.0-3.0
58    0.0-3.0-4.0
Name: internal_node_id, dtype: object

In [11]:
from collections import ChainMap

survey = full_data["survey"][full_data["survey"]["name"] == "survey"]

pages = []
names = []
question_ids = []
responses = []
reverse_coded = []
for row_idx, row in survey.iterrows():
    (keys, vals) = zip(*eval(row["responses"]).items())
    pages.append(str(keys))
    
    full_dict = dict(ChainMap(*[ans_dict for ans_dict in vals if ans_dict]))
    assert(len(full_dict) == 342)

    curr_question_ids = [[key for key in val.keys()]  if val else [] for val in vals]
    names.append(str([[key.split(".")[0] for key in val.keys()][0] if val else [] for val in vals ]))
    responses.append(str([[full_dict[quest] for quest in page] for page in curr_question_ids]))
    reverse_coded.append(str([[0 for quest in page] for page in curr_question_ids]))
    question_ids.append(str(curr_question_ids))
    

survey["question_id"] = question_ids
survey["name"] = names
survey["responses"] = responses
survey["pages"] = pages
survey["reverse_coded"] = reverse_coded

explode_columns =["question_id", "name", "reverse_coded", "responses", "pages"]
for explode_column in explode_columns:
    survey[explode_column] = survey[explode_column].apply(eval)
exploded_survey = survey.explode(explode_columns)
for explode_column in explode_columns:
    exploded_survey[explode_column] = exploded_survey[explode_column].apply(str)
    
# check for straightlining
assert(len(exploded_survey["responses"][exploded_survey["responses"].apply(lambda responses: (len(np.unique(responses))<=1)& len(responses)>5)])==0)

with open(inputs_path.joinpath(f"questionnaire_files/solutions_OnePart.pkl"), "rb") as f:
    questionnaires_presentation = pickle.load(f)
adjusted_questionnaires_presentation = {}

for key, val in questionnaires_presentation.items():
    new_key = [k.split(".")[0] for k in val.keys()][0]
    if new_key not in adjusted_questionnaires_presentation:
        adjusted_questionnaires_presentation[new_key] = {}
    adjusted_questionnaires_presentation[new_key] = {**adjusted_questionnaires_presentation[new_key], **val}

exploded_survey["open_ended"]=False
exploded_survey["correct"]=np.nan
scored_questionnaire_df = score_generic_questionnaires(exploded_survey, {"quest_pilot": adjusted_questionnaires_presentation}, group_identifier="name", default_open_ended={})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey["question_id"] = question_ids
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey["name"] = names
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey["responses"] = responses
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = valu

In [12]:
def score_new_crt(crt_df, crt_quiz_solutions):

    crt_df["responses"] = crt_df['responses'].apply(str)
    crt_df["open_ended"] = True
    crt_df["name"] = "crt"
    crt_df["correct"] = np.nan

    #     crt_df["question_id"]=crt_df["question_id"].apply(lambda crt_string: crt_string.replace("“","").replace("”",""))
    crt_df["responses"] = crt_df["responses"].apply(lambda crt_string: crt_string.replace("“", "").replace("”", ""))
    #     return crt_df
    scored_questionnaire_df = score_generic_questionnaires(crt_df, {"quest_pilot": {"crt": crt_quiz_solutions}},
                                                           group_identifier="name", default_open_ended={"crt": "other"})
    return scored_questionnaire_df


crt_quiz_solutions = {"crt1": {".10": "intuitive", "10": "intuitive", ".05": "correct", "5": "correct", "": "no response"}, "crt2": {"100": "intuitive", "5": "correct", "": "no response"}, "crt3": {"24": "intuitive", "47": "correct", "": "no response"}, "crt4": {"9": "intuitive", "4": "correct", "": "no response"}, "crt5": {"30": "intuitive", "29": "correct", "": "no response"}, "crt6": {"10": "intuitive", "20": "correct", "": "no response"}, "crt7": {"is ahead of where he began": "intuitive", "has lost money": "correct", "": "no response"}}
crt = full_data["survey-html-form"][full_data["survey-html-form"]["internal_node_id"]=="0.0-3.0-5.0"]

scored_crt = score_new_crt(full_data["survey-html-form"][full_data["survey-html-form"]["name"]=="crt"], crt_quiz_solutions)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  crt_df["responses"] = crt_df['responses'].apply(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  crt_df["open_ended"] = True
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  crt_df["name"] = "crt"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_ind

In [13]:
full_data["survey-multi-choice"]["name"] = "IQ"
full_data["survey-multi-choice"]["open_ended"] = False
full_data["survey-multi-choice"]["correct"] = np.nan
    
scored_iq = score_generic_questionnaires(full_data["survey-multi-choice"], {"quest_pilot": questionnaires_presentation},group_identifier="name", default_open_ended={})

In [14]:
scored_questionnaire_df = pd.concat([scored_questionnaire_df, scored_iq, scored_crt])

In [15]:
survey_texts = preprocess_survey_text(full_data["survey-text"])

if any(old_experiment):
    old_runs = np.asarray(sessions)[old_experiment]
    old_demographics, demo_text = get_old_demographics(survey_texts[survey_texts["run"].isin(old_runs)], experiment_specific_gender=experiment_specific_mapping, manual_age_mapping=manual_age_mapping)
    print(demo_text)

new_runs =  np.asarray(sessions)[[not ex for ex in old_experiment]]
html_survey = full_data["survey-html-form"]
if len(html_survey_names) > 0:
    if "name" not in html_survey:
        html_survey["name"] = np.nan

    html_survey["name"] = html_survey.apply(
        lambda row: get_mouselab_quiz_name(row, html_survey_names[row["run"]]) if not isinstance(row["name"], str) else row["name"],
        axis=1)
    demographics, demo_text = process_html_demographics(html_survey[(html_survey["name"] == "demographics")&(html_survey["run"].isin(new_runs))])
    full_data["survey-multi-choice"] = pd.concat([full_data["survey-multi-choice"], html_survey[html_survey["name"] != "demographics"]])
else:
    demographics, demo_text = process_html_demographics(full_data["survey-html-form"])

print(demo_text)


if any(old_experiment):
    gender_name, gender_count = np.unique(pd.concat([old_demographics["Q2"],demographics["gender"]]), return_counts=True)
    ages = pd.concat([old_demographics["Q1"],demographics["age"]]).apply(int)
    print(get_demo_string(ages, gender_count, gender_name))

4 females, 2 males; median age 21, age range 19-39


In [16]:
# TODO fix this 
# mouselab_datas = preprocess_mouselab_data(full_data["mouselab-mdp"],trials_per_block, ground_truths)
mouselab_datas = full_data["mouselab-mdp"]
mouselab_datas["queries"] = mouselab_datas["queries"].apply(eval)

mouselab_datas = mouselab_datas.merge(individual_variables, how="left", on=["pid", "run"])

# path may contain a bunch of 0s at the start due to miscoding
mouselab_datas["path"] = mouselab_datas["path"].apply(lambda path : eval(path)[-3:])

node_classification = {key : [str(node) for node in val] for key, val in node_classification.items()}
mouselab_datas = add_click_count_columns(mouselab_datas, node_classification)

In [17]:
#TODO would be nice to refactor this out and import it
def expand_range_dictionary(input_dictionary):
    trial_to_block = {}
    for block, trial_range in input_dictionary.items():
        if isinstance(trial_range, str):
            for trial_index in eval(trial_range):
                trial_to_block[trial_index] = block
        else:
            trial_to_block[block] = expand_range_dictionary(trial_range)
    return trial_to_block

if ranges_to_extract:
    trial_to_block = expand_range_dictionary(ranges_to_extract)

    mouselab_datas["block"] = mouselab_datas.apply(lambda row: trial_to_block[row["run"]][row["trial_index"]] if row["run"] in trial_to_block else trial_to_block[row["trial_index"]], axis=1)

In [19]:
questionnaires = full_data["survey"]
questionnaires["correct"] = np.nan

if "name" not in questionnaires:
    questionnaires["name"] = np.nan

questionnaires["name"] = questionnaires.apply(
    lambda row: get_mouselab_quiz_name(row, mouselab_mapping) if not isinstance(row["name"], str) else row["name"],
    axis=1)

mouselab_questionnaires = questionnaires[questionnaires["name"].isin(mouselab_mapping.values())].reset_index()

#TODO check if multiple responses
mouselab_questionnaires["responses"] = mouselab_questionnaires["responses"].apply(lambda responses: str({k.split("_")[1]: v for k,v in eval(responses).items() if k.split("_")[1] != "Q0"}))

mouselab_questionnaires["name"] = mouselab_questionnaires.apply(lambda row: get_mouselab_quiz_name(row, mouselab_mapping) if not isinstance(row["name"], str) else row["name"],axis=1)
mouselab_questionnaires = score_mouselab_questionnaires(mouselab_questionnaires, mouselab_quiz_solutions, mouselab_column_identifier)
mouselab_quiz = mouselab_questionnaires.drop_duplicates(["pid","question_id"], keep="last")

pivoted_mouselab_quiz = mouselab_quiz.pivot_table(values="score", index=["pid","run"], columns="question_id")

In [20]:
quiz_passers = get_quiz_passer_ids(mouselab_questionnaires, max_attempts=max_attempts, passing_score=passing_score, identifying_columns = ["pid", "run"])
passed_all_quizzes = list(set.intersection(*map(set,quiz_passers.values())))
print(len(passed_all_quizzes))
pivoted_mouselab_quiz["passed_quizzes"] = 0
pivoted_mouselab_quiz.loc[passed_all_quizzes, "passed_quizzes"] = 1

7


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey_data["attempt_num"] = survey_data["internal_node_id"].apply(


In [21]:
quiz_and_demo = pivoted_mouselab_quiz.join(demographics)
quiz_and_demo = quiz_and_demo.merge(individual_variables, how="left", on=["pid", "run"])

quiz_and_demo_subselection = quiz_and_demo[quiz_and_demo.apply(lambda row: (row["pid"], row["run"]) in \
                                 passed_all_quizzes, axis=1)]

quiz_and_demo_subselection["gender"]=quiz_and_demo_subselection["gender"].replace(np.nan, "participants with no demographic")
ages = [int(age) for age in quiz_and_demo_subselection["age"] if not pd.isnull(age)]

gender_values, gender_counts = np.unique(quiz_and_demo_subselection["gender"].values, return_counts = True)
print(get_demo_string(ages, gender_counts, gender_values))


if len(questionnaires)>0:
        scored_questionnaire_df[scored_questionnaire_df.apply(lambda row: (row["pid"], row["run"]) in \
                                 passed_all_quizzes, axis=1)].to_csv(processed_data_path.joinpath("questionnaires.csv"))
quiz_and_demo[quiz_and_demo.apply(lambda row: (row["pid"], row["run"]) in \
                                 passed_all_quizzes, axis=1)].to_csv(processed_data_path.joinpath("quiz-and-demo.csv"))
mouselab_datas[mouselab_datas.apply(lambda row: (row["pid"], row["run"]) in \
                                 passed_all_quizzes, axis=1)].to_csv(processed_data_path.joinpath("mouselab-mdp.csv"))
survey_texts[survey_texts.apply(lambda row: (row["pid"], row["run"]) in \
                                 passed_all_quizzes, axis=1)].to_csv(processed_data_path.joinpath("survey-text.csv"))
individual_variables[individual_variables.apply(lambda row: (row["pid"], row["run"]) in \
                                 passed_all_quizzes, axis=1)].to_csv(processed_data_path.joinpath("individual-variables.csv"))

4 females, 2 males, 1 participants with no demographics; median age 21, age range 19-39


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  quiz_and_demo_subselection["gender"]=quiz_and_demo_subselection["gender"].replace(np.nan, "participants with no demographic")


In [22]:
if num_parts:
    #TODO only supports one digit num_parts
    for part_num in range(1, num_parts +1):
        curr_runs = [run for run in finished_df["run"].unique() if run.endswith(f"Part{part_num}")]
        curr_runs_df = finished_df[finished_df["run"].isin(curr_runs)]
        median_info = curr_runs_df[curr_runs_df.apply(lambda row: (row["pid"], row["run"][:-1]+str(1)) in \
                                     passed_all_quizzes, axis=1)].median()
        print(f"median time: {median_info['time_diff']:.2f}, median bonus: {median_info['final_bonus']:.2f}")
else:
    median_info = finished_df[finished_df.apply(lambda row: (row["pid"], row["run"]) in \
                                 passed_all_quizzes, axis=1)].median()
    print(f"median time: {median_info['time_diff']:.2f}, median bonus: {median_info['final_bonus']:.2f}")

median time: 57.02, median bonus: 3.00


  median_info = finished_df[finished_df.apply(lambda row: (row["pid"], row["run"]) in \
