In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import datetime

from download_tools.plugins.mouselab_mdp import preprocess_mouselab_data, add_click_count_columns
from download_tools.plugins.survey_html_form import process_html_demographics
from download_tools.plugins.survey_multi_choice import score_mouselab_questionnaires, get_mouselab_quiz_name, get_quiz_passer_ids, score_row, score_generic_questionnaires
from download_tools.plugins.survey_text import preprocess_survey_text, get_old_demographics
from download_tools.plugins.utils import get_demo_string

from datetime import datetime
import json
import dill as pickle

In [2]:
# Parameters
# Run-level configuration read by the cells below. Names without a comment are
# not referenced in the visible cells.
database_key = "NEW"
# pids filtered out of every raw CSV as it is loaded
participants_to_remove = [679, 916]
simulated = False
# run names; each one is globbed as a sub-folder of the raw data directory
sessions = ["2PartPilot1Part1", "2PartPilot2Part1", "2PartPilot3Part1", "2PartPilot1Part2", "2PartPilot2Part2", "2PartPilot3Part2"]
experiment_setting = "high_increasing"
# expected COST / DEPTH values; the sanity-check assertions only run when both are not None
COST = None
DEPTH = None
# number of experiment parts; runs are assigned to a part by their "Part{n}" suffix
num_parts = 2
# per run: lookup handed to get_mouselab_quiz_name to label survey-html-form rows that have no string name
html_survey_names = {"2PartPilot1Part1": {"0.0-4.0-12.0": "crt", "0.0-4.0-14.0": "demographics", "0.0-4.0-16.0": "crt", "0.0-4.0-17.0": "demographics"}, "2PartPilot1Part2": {"0.0-9.0": "crt"}, "2PartPilot2Part1": {"0.0-4.0-12.0": "demographics", "0.0-4.0-16.0": "demographics"}, "2PartPilot2Part2": {"0.0-13.0": "crt", "0.0-9.0": "crt"}, "2PartPilot3Part1": {"0.0-4.0-12.0": "demographics", "0.0-4.0-16.0": "demographics"}, "2PartPilot3Part2": {"0.0-13.0": "crt", "0.0-9.0": "crt"}}
# a single flag is broadcast to one entry per session; flagged runs go through get_old_demographics
old_experiment = False
# forwarded to get_old_demographics (manual_age_mapping= / experiment_specific_gender=)
manual_age_mapping = {}
experiment_specific_mapping = {}
# forwarded to preprocess_mouselab_data
trials_per_block = {"test": 30}
# block label -> string that is eval'd into an iterable of trial indices
ranges_to_extract = {"training": "range(20)", "test": "range(20,30)"}
# forwarded to score_mouselab_questionnaires
mouselab_column_identifier = "name"
# lookup handed to get_mouselab_quiz_name to label survey-multi-choice rows that have no string name
mouselab_mapping = {"0.0-2.*-1.*": "mouselab-quiz-pre", "0.0-4.0-2.0": "mouselab-quiz-post", "0.0-3.0-*.0": "mouselab-quiz-post", "0.0-19.*-2.*": "mouselab-quiz-post", "0.0-17.*-1.*": "mouselab-quiz-pre"}
# per-quiz limits forwarded to get_quiz_passer_ids
max_attempts = {"mouselab-quiz-pre": 4, "mouselab-quiz-post": 1}
passing_score = {"mouselab-quiz-pre": 4, "mouselab-quiz-post": 0}
# answer keys: forwarded to score_mouselab_questionnaires and merged into the per-session solutions
mouselab_quiz_solutions = {"mouselab-quiz-pre": {"Q0": "$-48 to $48", "Q1": "$1", "Q2": "The better I perform the higher my bonus will be.", "Q3": "No, the amount of cash at each node of the web may be different each time."}, "mouselab-quiz-post": {"Q0": "$-4 to $4", "Q1": "$-8 to $8", "Q2": "$-48 to $48", "Q3": "No, the cost is always $1.00.", "Q4": {"Very unmotivated": -2, "Slightly unmotivated": -1, "Neither motivated nor unmotivated": 0, "Slightly motivated": 1, "Very motivated": 2, "": 0}}}
# merged into the per-session solutions handed to score_generic_questionnaires
crt_quiz_solutions = {"crt1": {".10": "intuitive", "10": "intuitive", ".05": "correct", "5": "correct", "": "no response"}, "crt2": {"100": "intuitive", "5": "correct", "": "no response"}, "crt3": {"24": "intuitive", "47": "correct", "": "no response"}, "crt4": {"9": "intuitive", "4": "correct", "": "no response"}, "crt5": {"30": "intuitive", "29": "correct", "": "no response"}, "crt6": {"10": "intuitive", "20": "correct", "": "no response"}, "crt7": {"is ahead of where he began": "intuitive", "has lost money": "correct", "": "no response"}}
# stem of the JSON file read from inputs/exp_inputs/rewards/
ground_truth_file = "312_2_4_24"
# node groups; the node ids are stringified before being handed to add_click_count_columns
node_classification = {"early": [1, 5, 9], "middle": [2, 6, 10], "late": [3, 4, 7, 8, 11, 12], "clicks": [1, 5, 9, 2, 6, 10, 3, 4, 7, 8, 11, 12]}
structure = "312_2_4_24"
# name of the sub-folder under processed/ that receives the output CSVs
analysis_run = "quest_second"
# root folder that contains inputs/, raw/ and processed/
data_path = "/home/vfelso/github/planning-depth-differences/data"

In [3]:
# Directory layout underneath the configured data root.
data_root = Path(data_path)
inputs_path = data_root / "inputs"
raw_data_path = data_root / "raw"
processed_data_path = data_root / f"processed/{analysis_run}"

# Create the output folder (and any missing parents); a no-op when it already exists.
processed_data_path.mkdir(parents=True, exist_ok=True)

In [4]:
# Parse the ground-truth rewards JSON selected by `ground_truth_file`.
ground_truth_path = inputs_path / f"exp_inputs/rewards/{ground_truth_file}.json"
with ground_truth_path.open() as json_file:
    ground_truths = json.load(json_file)

In [5]:
# load data
# One list of frames per CSV stem, filled in session order and concatenated at the end.
frames_by_name = {}

# read in sessions
for run in sessions:
    for file_path in raw_data_path.glob(f"{run}/*.csv"):
        # don't want to save identifiable bonuses file,
        # information is already in data
        if "bonuses" in str(file_path):
            continue

        run_frame = pd.read_csv(file_path)
        run_frame["run"] = run

        # remove participant who answered they were too young
        keep_rows = ~run_frame["pid"].isin(participants_to_remove)
        frames_by_name.setdefault(file_path.stem, []).append(run_frame[keep_rows])

full_data = {name: pd.concat(frames) for name, frames in frames_by_name.items()}

In [6]:
# Join the two per-participant tables on (pid, run).
individual_variables = full_data["general_info"].merge(full_data["question_data"], on=["pid","run"])

time_format = '%Y-%m-%d %H:%M:%S.%f'
# Only rows whose endhit is a string are treated as finished; anything else (e.g. NaN) is skipped.
finished_df = individual_variables[individual_variables["endhit"].apply(lambda endhit: isinstance(endhit, str))].reset_index()
# NOTE(review): `.seconds` drops whole days and `% 3600` drops whole hours, so this is
# the minutes-within-the-hour part of endhit - beginhit, not the total duration.
# Confirm this is intended (e.g. to cancel a whole-hour offset between the two stamps).
finished_df["time_diff"] = finished_df.apply(lambda row: ((datetime.strptime(row["endhit"], time_format) - datetime.strptime(row["beginhit"], time_format)).seconds % 3600 )/ 60.0, axis=1)
# Left merge: unfinished participants keep a row with a missing time_diff.
individual_variables = individual_variables.merge(finished_df[["time_diff", "pid", "run"]], how="left", on=["pid", "run"])

# check saved cost makes sense
if (COST is not None) and (DEPTH is not None):
    if isinstance(COST, dict):
        # dict form: expected value is looked up per codeversion and condition
        assert(np.all(individual_variables.apply(lambda row: row["COST"] == COST[row["codeversion"]][int(row["cond"])],axis=1)))
    else:
        unique_costs = np.unique(individual_variables["COST"])
        assert(len(unique_costs) == 1)
        assert(unique_costs[0] == COST)
    if DEPTH:
        if isinstance(DEPTH, dict):
            assert(np.all(individual_variables.apply(lambda row: row["DEPTH"] == DEPTH[row["codeversion"]][int(row["cond"])],axis=1)))
        else:
            unique_depths = np.unique(individual_variables["DEPTH"])
            assert(len(unique_depths) == 1)
            assert(unique_depths[0] == DEPTH)
    # numeric_only=True: finished_df still holds string columns (run, timestamps),
    # which make a plain .mean() raise on recent pandas versions.
    print(finished_df.groupby(["DEPTH", "COST"]).mean(numeric_only=True))
else:
    print(finished_df.groupby(["cond"]).mean(numeric_only=True))

           index  counterbalance  bonus  status         pid  final_bonus  \
cond                                                                       
0     251.244898             0.0    0.0     3.0  703.387755     1.334468   
1     280.282609             0.0    0.0     3.0  684.782609     1.120652   
2     290.942308             0.0    0.0     3.0  714.288462     1.278163   
3     319.617021             0.0    0.0     3.0  741.617021     1.218478   
4     341.791667             0.0    0.0     3.0  724.750000     1.142826   
5     344.285714             0.0    0.0     3.0  741.979592     1.321064   
6     354.800000             0.0    0.0     3.0  726.900000     1.318776   
7     360.770833             0.0    0.0     3.0  754.791667     1.433043   
8     400.065217             0.0    0.0     3.0  775.673913     1.194444   
9     409.250000             0.0    0.0     3.0  778.479167     1.102553   

      displayed_bonus  DEPTH  COST  MIN_TIME  inspectCost  bonusRate  \
cond           

In [7]:
def _parse_weekday_date(date):
    """Parse the first 15 characters of `date` with the format "%a %b %d %Y".

    Values that are already datetimes are returned unchanged so that re-running
    this cell does not wipe the column; any other non-string becomes NaN.
    """
    if isinstance(date, datetime):
        return date
    if not isinstance(date, str):
        return np.nan
    return datetime.strptime(date[:15], "%a %b %d %Y")


def _parse_iso_day(date):
    """Parse the part of `date` before the first space as an ISO date.

    Values that are already datetimes are returned unchanged so that re-running
    this cell does not wipe the column; any other non-string becomes NaN.
    """
    if isinstance(date, datetime):
        return date
    if not isinstance(date, str):
        return np.nan
    return datetime.fromisoformat(date.split(" ")[0])


# column name -> parser; the time-of-day part is discarded by both parsers
time_fields = {
    "startTime": _parse_weekday_date,
    "beginhit": _parse_iso_day,
    "beginexp": _parse_iso_day,
    "endhit": _parse_iso_day,
}

for time_field, time_func in time_fields.items():
    individual_variables[time_field] = individual_variables[time_field].apply(time_func)

individual_variables["beginhit"].describe()

  individual_variables["beginhit"].describe()


count                     717
unique                     20
top       2020-12-08 00:00:00
freq                      116
first     2020-11-24 00:00:00
last      2020-12-19 00:00:00
Name: beginhit, dtype: object

In [8]:
# Broadcast a single flag to one entry per session; a list is taken as already per-session.
if not isinstance(old_experiment, list):
    old_experiment = [old_experiment for _ in sessions]

In [9]:
survey_texts = preprocess_survey_text(full_data["survey-text"])

# Runs flagged as old experiments take their demographics from the survey-text answers.
if any(old_experiment):
    old_runs = np.asarray(sessions)[old_experiment]
    # .copy(): hand the helper an independent frame rather than a filtered slice
    old_survey_texts = survey_texts[survey_texts["run"].isin(old_runs)].copy()
    old_demographics, demo_text = get_old_demographics(old_survey_texts, experiment_specific_gender=experiment_specific_mapping, manual_age_mapping=manual_age_mapping)
    print(demo_text)

new_runs =  np.asarray(sessions)[[not ex for ex in old_experiment]]
html_survey = full_data["survey-html-form"]
if len(html_survey_names) > 0:
    if "name" not in html_survey:
        html_survey["name"] = np.nan

    # Label rows that do not have a string name yet, using the lookup for the row's run.
    html_survey["name"] = html_survey.apply(
        lambda row: get_mouselab_quiz_name(row, html_survey_names[row["run"]]) if not isinstance(row["name"], str) else row["name"],
        axis=1)
    # .copy(): the helper presumably assigns columns on the frame it receives, which on
    # a filtered slice is what produces SettingWithCopyWarning.
    new_demographics_rows = html_survey[(html_survey["name"] == "demographics")&(html_survey["run"].isin(new_runs))].copy()
    demographics, demo_text = process_html_demographics(new_demographics_rows)
    # Every non-demographics html form is scored together with the multi-choice surveys.
    # NOTE(review): this appends to full_data["survey-multi-choice"] in place, so
    # re-running this cell without reloading the data appends the same rows again.
    full_data["survey-multi-choice"] = pd.concat([full_data["survey-multi-choice"], html_survey[html_survey["name"] != "demographics"]])
else:
    demographics, demo_text = process_html_demographics(full_data["survey-html-form"])

print(demo_text)


# With old runs present, also report the demographics pooled over old and new runs.
if any(old_experiment):
    gender_name, gender_count = np.unique(pd.concat([old_demographics["Q2"],demographics["gender"]]), return_counts=True)
    ages = pd.concat([old_demographics["Q1"],demographics["age"]]).apply(int)
    print(get_demo_string(ages, gender_count, gender_name))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_demographics["responses"] = raw_demographics["responses"].apply(eval)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["question_id"] = df.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  questionnaire_df["responses"] = questionnaire_df.apply(


132 females, 117 males, 1 others; median age 32, age range 19-73


In [10]:
# Preprocess the raw mouselab trials, then attach the per-participant variables.
mouselab_datas = preprocess_mouselab_data(
    full_data["mouselab-mdp"], trials_per_block, ground_truths
).merge(individual_variables, how="left", on=["pid", "run"])


def _last_three_path_entries(path):
    """Evaluate the stored path string and keep its last three entries."""
    # NOTE(review): eval executes whatever text is stored in the column; only safe
    # because the CSVs are the project's own data.
    return eval(path)[-3:]


# path may contain a bunch of 0s at the start due to miscoding
mouselab_datas["path"] = mouselab_datas["path"].apply(_last_three_path_entries)

# Node ids are compared as strings downstream, so stringify every group.
node_classification = {
    group: list(map(str, nodes)) for group, nodes in node_classification.items()
}
mouselab_datas = add_click_count_columns(mouselab_datas, node_classification)

In [11]:
#TODO would be nice to refactor this out and import it
def expand_range_dictionary(input_dictionary):
    """Turn a {label: range-string} mapping into a {trial index: label} lookup.

    String values are evaluated (e.g. "range(20)") and every element of the
    result is mapped to the label. Non-string values are treated as nested
    dictionaries and expanded recursively under their own key.

    NOTE: the strings are passed to eval, so they must come from trusted
    configuration only.
    """
    expanded = {}
    for label, spec in input_dictionary.items():
        if not isinstance(spec, str):
            # nested mapping: keep it under its key, expanded the same way
            expanded[label] = expand_range_dictionary(spec)
            continue
        expanded.update(dict.fromkeys(eval(spec), label))
    return expanded

if ranges_to_extract:
    trial_to_block = expand_range_dictionary(ranges_to_extract)

    def _block_for_row(row):
        """Look up the block label for one trial row."""
        # A run that has its own entry uses its nested index -> block mapping;
        # otherwise the top-level mapping is indexed by trial index directly.
        if row["run"] in trial_to_block:
            return trial_to_block[row["run"]][row["trial_index"]]
        return trial_to_block[row["trial_index"]]

    mouselab_datas["block"] = mouselab_datas.apply(_block_for_row, axis=1)

In [12]:
questionnaires = full_data["survey-multi-choice"]

if "name" not in questionnaires:
    questionnaires["name"] = np.nan

# Label rows that do not have a string name yet via the mouselab quiz lookup.
questionnaires["name"] = questionnaires.apply(
    lambda row: get_mouselab_quiz_name(row, mouselab_mapping) if not isinstance(row["name"], str) else row["name"],
    axis=1)

# Split once into mouselab quizzes and everything else.
is_mouselab_quiz = questionnaires["name"].isin(mouselab_mapping.values())
mouselab_questionnaires = questionnaires[is_mouselab_quiz].reset_index()
questionnaires = questionnaires[~is_mouselab_quiz].reset_index()

# No second naming pass is needed here: every selected row already carries one of
# the quiz names from mouselab_mapping.
mouselab_questionnaires = score_mouselab_questionnaires(mouselab_questionnaires, mouselab_quiz_solutions, mouselab_column_identifier)
# Keep the last row per participant and question.
# NOTE(review): duplicates are keyed on (pid, question_id) without "run", while the
# pivot below is indexed by (pid, run) — confirm a pid never answers the same
# question_id in more than one run.
mouselab_quiz = mouselab_questionnaires.drop_duplicates(["pid","question_id"], keep="last")

pivoted_mouselab_quiz = mouselab_quiz.pivot_table(values="score", index=["pid","run"], columns="question_id")

In [13]:
if len(questionnaires)>0:
    # remove potentially sensitive questionnaire
    # .copy(): the column assignments below then write to an independent frame
    # instead of a filtered slice.
    questionnaires = questionnaires[~questionnaires["name"].isin(['decision-outcome'])].copy()

    # The solutions file does not depend on the session, so it is read once.
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    with open(inputs_path.joinpath("questionnaire_files/solutions.pkl"), "rb") as f:
        questionnaires_presentation = pickle.load(f)
    # Later dictionaries win on key clashes: presentation < mouselab quiz < crt.
    questionnaire_files = {
        session: {**questionnaires_presentation, **mouselab_quiz_solutions, **crt_quiz_solutions}
        for session in sessions
    }

    # Only the crt questionnaire is flagged as open-ended.
    questionnaires["open_ended"] = questionnaires["name"].apply(lambda entry: entry in ["crt"])
    # Rewrite "null" as "None" and strip curly quotes from string responses.
    questionnaires["responses"] = questionnaires["responses"].apply(lambda entry: entry.replace("null", "None").replace("”","").replace("“","") if isinstance(entry, str) else entry)
    scored_questionnaire_df = score_generic_questionnaires(questionnaires, questionnaire_files, group_identifier="name", default_open_ended={"crt": "other"})

In [14]:
# Passer ids per quiz, identified by (pid, run).
quiz_passers = get_quiz_passer_ids(
    mouselab_questionnaires,
    max_attempts=max_attempts,
    passing_score=passing_score,
    identifying_columns=["pid", "run"],
)

# A participant counts only when they appear in the passer list of every quiz.
passer_sets = [set(passer_ids) for passer_ids in quiz_passers.values()]
passed_all_quizzes = list(set.intersection(*passer_sets))
print(len(passed_all_quizzes))

# Flag column: 0 by default, 1 for the rows of participants who passed everything.
pivoted_mouselab_quiz["passed_quizzes"] = 0
pivoted_mouselab_quiz.loc[passed_all_quizzes, "passed_quizzes"] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey_data["attempt_num"] = survey_data["internal_node_id"].apply(


250


In [15]:
# only include participants who really finished experiment, and were paid
# (the first element of each passer key is looked up in the demographics index)
finished_ids = demographics.index
passed_all_quizzes = [passer for passer in passed_all_quizzes if passer[0] in finished_ids]

In [16]:
quiz_and_demo = pivoted_mouselab_quiz.join(demographics)
quiz_and_demo = quiz_and_demo.merge(individual_variables, how="left", on=["pid", "run"])

# Set for constant-time membership tests; the keys are (pid, run) tuples.
passed_lookup = set(passed_all_quizzes)


def _passed_part_one_quizzes(frame):
    """Boolean mask of the rows in `frame` whose participant passed all quizzes.

    Each row is looked up as (pid, run with its last character replaced by "1"),
    so rows from any part of a run are matched against the corresponding
    "...1" run. Returns a NumPy bool array with one entry per row; an empty
    frame gives an empty mask.
    """
    return np.array(
        [(pid, run[:-1] + str(1)) in passed_lookup for pid, run in zip(frame["pid"], frame["run"])],
        dtype=bool,
    )


quiz_and_demo_passed = quiz_and_demo[_passed_part_one_quizzes(quiz_and_demo)]

# Work on a copy so the gender relabelling neither warns nor leaks into the export.
quiz_and_demo_subselection = quiz_and_demo_passed.copy()
quiz_and_demo_subselection["gender"]=quiz_and_demo_subselection["gender"].replace(np.nan, "participants with no demographics")
ages = [int(age) for age in quiz_and_demo_subselection["age"] if not pd.isnull(age)]

gender_values, gender_counts = np.unique(quiz_and_demo_subselection["gender"].values, return_counts = True)
print(get_demo_string(ages, gender_counts, gender_values))


# Write every table restricted to the participants who passed.
if len(questionnaires)>0:
    scored_questionnaire_df[_passed_part_one_quizzes(scored_questionnaire_df)].to_csv(processed_data_path.joinpath("questionnaires.csv"))
# exported without the gender relabelling applied above
quiz_and_demo_passed.to_csv(processed_data_path.joinpath("quiz-and-demo.csv"))
mouselab_datas[_passed_part_one_quizzes(mouselab_datas)].to_csv(processed_data_path.joinpath("mouselab-mdp.csv"))
survey_texts[_passed_part_one_quizzes(survey_texts)].to_csv(processed_data_path.joinpath("survey-text.csv"))
individual_variables[_passed_part_one_quizzes(individual_variables)].to_csv(processed_data_path.joinpath("individual-variables.csv"))

131 females, 116 males, 1 others; median age 32, age range 19-73


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  quiz_and_demo_subselection["gender"]=quiz_and_demo_subselection["gender"].replace(np.nan, "participants with no demographics")


In [17]:
# Only these two columns are reported, so the median is restricted to them; a
# median over the whole frame would also hit its string columns.
summary_columns = ["time_diff", "displayed_bonus"]
# Set for constant-time membership tests; the keys are (pid, run) tuples.
passed_lookup = set(passed_all_quizzes)

if num_parts:
    # One summary line per part; runs are assigned to a part by their "Part{n}" suffix.
    for part_num in range(1, num_parts +1):
        curr_runs = [run for run in finished_df["run"].unique() if run.endswith(f"Part{part_num}")]
        curr_runs_df = finished_df[finished_df["run"].isin(curr_runs)]
        # Each row is looked up under its run name with the last character replaced by "1".
        passed = np.array(
            [(pid, run[:-1]+str(1)) in passed_lookup for pid, run in zip(curr_runs_df["pid"], curr_runs_df["run"])],
            dtype=bool,
        )
        median_info = curr_runs_df.loc[passed, summary_columns].median()
        print(f"median time: {median_info['time_diff']:.2f}, median bonus: {median_info['displayed_bonus']:.2f}")
else:
    # Without parts, rows are looked up under their run name unchanged.
    passed = np.array(
        [(pid, run) in passed_lookup for pid, run in zip(finished_df["pid"], finished_df["run"])],
        dtype=bool,
    )
    median_info = finished_df.loc[passed, summary_columns].median()
    print(f"median time: {median_info['time_diff']:.2f}, median bonus: {median_info['displayed_bonus']:.2f}")

median time: 24.68, median bonus: 3.25
median time: 25.35, median bonus: 0.00


  median_info = curr_runs_df[curr_runs_df.apply(lambda row: (row["pid"], row["run"][:-1]+str(1)) in \
  median_info = curr_runs_df[curr_runs_df.apply(lambda row: (row["pid"], row["run"][:-1]+str(1)) in \


In [18]:
# Count of non-null pids per questionnaire name, restricted to participants who
# passed all quizzes (each row is looked up under its run name with the last
# character replaced by "1").
passed_rows = questionnaires.apply(
    lambda row: (row["pid"], row["run"][:-1]+str(1)) in passed_all_quizzes,
    axis=1,
)
questionnaires[passed_rows].groupby(["name"])["pid"].count()

name
AES             232
AUDIT           232
BIS             231
BISBAS          232
CFC             239
DOSPERT         232
EAT             232
FTP             238
IQ              221
LSAS_A          231
LSAS_B          231
OCIR            232
STAI            233
STICSA_S        248
STICSA_T        248
UPPS-P          232
Zhung           232
crt             223
life-regrets    239
ppmlr           238
ppmsr           238
pptlr           238
pptsr           238
regrets         238
satisfaction    238
Name: pid, dtype: int64