In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import datetime

from download_tools.plugins.mouselab_mdp import preprocess_mouselab_data, add_click_count_columns
from download_tools.plugins.survey_html_form import process_html_demographics
from download_tools.plugins.survey_multi_choice import score_mouselab_questionnaires, get_mouselab_quiz_name, get_quiz_passer_ids, score_row, score_generic_questionnaires
from download_tools.plugins.survey_text import preprocess_survey_text, get_old_demographics
from download_tools.plugins.utils import get_demo_string

from datetime import datetime
import json
import dill as pickle

In [None]:
# Directory layout for this analysis run; everything lives under data_path.
data_root = Path(data_path)
inputs_path = data_root / 'inputs'
raw_data_path = data_root / 'raw'
processed_data_path = data_root / f'processed/{analysis_run}'
# Create the output directory (and any missing parents) before anything is written to it.
processed_data_path.mkdir(parents=True, exist_ok=True)

In [None]:
# Ground-truth reward structures, stored as JSON.
ground_truth_path = inputs_path / f"exp_inputs/rewards/{ground_truth_file}.json"
with open(ground_truth_path) as ground_truth_handle:
    ground_truths = json.load(ground_truth_handle)

# Questionnaire definition file (JSON content despite the .txt extension).
questionnaire_path = inputs_path / "questionnaire_files/questionnaire_OnePart.txt"
with open(questionnaire_path, "rb") as questionnaire_handle:
    questionnaire_presentation = json.load(questionnaire_handle)

# question_id -> reverse_coded flag, flattened over every questionnaire in the file.
reverse_coded_dictionary = {
    question["question_id"]: question["reverse_coded"]
    for questionnaire in questionnaire_presentation.values()
    for question in questionnaire["questions"]
}

In [None]:
# load data
# file stem -> list of per-session frames; concatenated into full_data below
frames_by_file = {}

# read in sessions
for run in sessions:
    for file_path in raw_data_path.glob(f"{run}/*.csv"):
        # don't want to save identifiable bonuses
        # file, information is already in data
        if "bonuses" in str(file_path):
            continue

        file_name = file_path.stem
        print(file_name)

        session_frame = pd.read_csv(file_path)
        # NOTE(review): every frame is tagged with analysis_run, not with the
        # session name `run` -- confirm that this is intended.
        session_frame["run"] = analysis_run

        # remove participant who answered they were too young
        keep_rows = ~session_frame["pid"].isin(participants_to_remove)
        session_frame = session_frame[keep_rows].rename({"response": "responses"}, axis="columns")

        # trials whose internal_node_id has an entry in name_mapping get that name,
        # all other trials keep the name they already have
        if "internal_node_id" in session_frame.columns:
            session_frame["name"] = session_frame.apply(
                lambda row: name_mapping[row["internal_node_id"]]
                if row["internal_node_id"] in name_mapping
                else row["name"],
                axis=1,
            )

        frames_by_file.setdefault(file_name, []).append(session_frame)

# one concatenated frame per file stem
full_data = {file_name: pd.concat(frames) for file_name, frames in frames_by_file.items()}

In [None]:
# people who opened it multiple times, just keep last
full_data["general_info"] = full_data["general_info"].drop_duplicates(keep="last", subset=["pid", "run"])
full_data["question_data"] = full_data["question_data"].drop_duplicates(keep="last", subset=["pid", "run"])

# participant-level variables, one row per (pid, run) present in both tables
individual_variables = full_data["general_info"].merge(full_data["question_data"], on=["pid", "run"])

# Select time_elapsed *before* aggregating: reducing every column first is wasted
# work and can raise on non-numeric or mixed-type columns.
begin_hit = full_data["survey"].groupby("pid")["time_elapsed"].min()
survey_text = full_data["survey-text"]
# largest time_elapsed per participant among survey-text trials whose responses do not mention "Q3"
end_hit = survey_text[
    survey_text["responses"].apply(lambda responses: "Q3" not in responses)
].groupby("pid")["time_elapsed"].max()

# participants that have at least one such survey-text trial
finished_df = individual_variables[individual_variables["pid"].isin(end_hit.index)].copy(deep=True)
# NOTE(review): time_diff is end_hit scaled by 1/(60*1000) (presumably ms -> minutes);
# begin_hit is computed above but is not subtracted -- confirm this is intended.
finished_df["time_diff"] = finished_df["pid"].apply(lambda pid: (end_hit[pid])/(60*1000))
individual_variables = individual_variables.merge(finished_df[["time_diff", "pid", "run"]], how="left", on=["pid", "run"])

# check saved cost makes sense
if (COST is not None) and (DEPTH is not None):
    if isinstance(COST, dict):
        # per-codeversion, per-condition expected cost
        assert(np.all(individual_variables.apply(lambda row: row["COST"] == COST[row["codeversion"]][int(row["cond"])],axis=1)))
    else:
        # a single expected cost shared by every participant
        unique_costs = np.unique(individual_variables["COST"])
        assert(len(unique_costs) == 1)
        assert(unique_costs[0] == COST)
    if DEPTH:
        if isinstance(DEPTH, dict):
            assert(np.all(individual_variables.apply(lambda row: row["DEPTH"] == DEPTH[row["codeversion"]][int(row["cond"])],axis=1)))
        else:
            unique_depths = np.unique(individual_variables["DEPTH"])
            assert(len(unique_depths) == 1)
            assert(unique_depths[0] == DEPTH)
    # numeric_only=True: the frame also holds string columns, which pandas >= 2.0
    # refuses to average (TypeError) instead of silently dropping them.
    print(finished_df.groupby(["DEPTH", "COST"]).mean(numeric_only=True))
else:
    print(finished_df.groupby(["cond"]).mean(numeric_only=True))

In [None]:
def _parse_start_time(date):
    """Parse the first 15 characters of a startTime string (format "%a %b %d %Y"); non-strings become NaN."""
    return np.nan if not isinstance(date, str) else datetime.strptime(date[:15], "%a %b %d %Y")


def _parse_iso_day(date):
    """Parse the part before the first space as an ISO date; non-strings become NaN."""
    return np.nan if not isinstance(date, str) else datetime.fromisoformat(date.split(" ")[0])


# column name -> parser applied to every value of that column
time_fields = {
    "startTime": _parse_start_time,
    "beginhit": _parse_iso_day,
    "beginexp": _parse_iso_day,
    "endhit": _parse_iso_day,
}

for time_field, time_func in time_fields.items():
    individual_variables[time_field] = individual_variables[time_field].apply(time_func)

# cell output: summary of the parsed beginhit values
individual_variables["beginhit"].describe()

In [None]:
# delete dates / possible location info in general info
for identifying_column in ("startTime", "beginhit", "beginexp", "endhit", "language"):
    del individual_variables[identifying_column]

# Survey trials

In [None]:
# Questionnaire definition file (JSON content despite the .txt extension).
# NOTE(review): an earlier cell performs the same load; this cell rebuilds the same objects.
with open(inputs_path.joinpath("questionnaire_files/questionnaire_OnePart.txt"), "rb") as f:
    questionnaire_presentation = json.load(f)

# question_id -> reverse_coded flag; the questionnaire names themselves are not
# needed here, so only the values are iterated.
reverse_coded_dictionary = {}
for quest_info in questionnaire_presentation.values():
    for quest in quest_info["questions"]:
        reverse_coded_dictionary[quest["question_id"]] = quest["reverse_coded"]

In [None]:
survey = full_data["survey"][full_data["survey"]["name"] == "survey"].copy(deep=True)

# Parse every stored response string exactly once into a list of
# (page_num, {question_id: response}) pairs, skipping empty pages.
# NOTE(review): eval() executes whatever text is stored in the responses column;
# only run this on trusted data files.
parsed_pages = survey["responses"].apply(
    lambda responses: [(page_num, page) for page_num, page in eval(responses).items() if page]
)

# Each derived column is a list (one entry per page) of lists (one entry per item),
# all with the same shape so that they can be exploded together below.
survey["pages"] = parsed_pages.apply(
    lambda pages: [[page_num for _ in page] for page_num, page in pages])
survey["question_id"] = parsed_pages.apply(
    lambda pages: [list(page.keys()) for _, page in pages])
survey["reverse_coded"] = parsed_pages.apply(
    lambda pages: [[reverse_coded_dictionary[question_id] for question_id in page] for _, page in pages])
# questionnaire name = part of the question id before the first "."
survey["name"] = parsed_pages.apply(
    lambda pages: [[question_id.split(".")[0] for question_id in page] for _, page in pages])
survey["responses"] = parsed_pages.apply(
    lambda pages: [list(page.values()) for _, page in pages])

# first explode: one row per page; second explode: one row per item
exploded_columns = ["responses", "question_id", "reverse_coded", "name", "pages"]
exploded_survey = survey.explode(column=exploded_columns).explode(column=exploded_columns)

# NOTE(review): unpickling runs arbitrary code; only load solution files from a trusted source.
with open(inputs_path.joinpath("questionnaire_files/solutions_OnePart.pkl"), "rb") as f:
    questionnaires_presentation = pickle.load(f)

# every question_id must have the same number of distinct reverse_coded values
assert len(np.unique(exploded_survey[["question_id", "reverse_coded"]].drop_duplicates().groupby(["question_id"]).count()["reverse_coded"])) == 1

# question_id to 0 or 1 (whether reversed)
reversed_questions = dict(exploded_survey[["question_id", "reverse_coded"]].to_records(index=False))

# questionnaire name -> {question_id -> {response -> score}}, plus a "catch" group
adjusted_questionnaires_presentation = {"catch" : {}}

for val in questionnaires_presentation.values():
    # the entry is filed under the prefix of its first question id
    new_key = [k.split(".")[0] for k in val.keys()][0]
    if new_key not in adjusted_questionnaires_presentation:
        adjusted_questionnaires_presentation[new_key] = {}
    adjusted_questionnaires_presentation[new_key] = {**adjusted_questionnaires_presentation[new_key], **val}

    # Visit only the items contributed by *this* entry. Walking the merged
    # dictionary would revisit items from earlier entries that share the same
    # prefix and reverse their (already reversed) scores a second time.
    for item_key, curr_score_dict in val.items():
        if "catch" in item_key:
            adjusted_questionnaires_presentation["catch"][item_key] = curr_score_dict
        if item_key in reversed_questions and reversed_questions[item_key] == 1:
            # pair the sorted response keys with the scores taken in the opposite order
            ordered_responses = sorted(curr_score_dict.keys())
            reverse_score_dict = dict(zip(ordered_responses, [curr_score_dict[k] for k in reversed(ordered_responses)]))
            adjusted_questionnaires_presentation[new_key][item_key] = reverse_score_dict

exploded_survey["score"] = exploded_survey.apply(lambda row: adjusted_questionnaires_presentation[row["name"]][row["question_id"]][row["responses"]], axis=1)

In [None]:
# Per participant and page: number of distinct responses, distinct reverse_coded
# values and distinct items.
unique_responses_per_page = (
    exploded_survey.groupby(["pid", "pages"], as_index=False).nunique()
    [["pid", "pages", "responses", "reverse_coded", "question_id"]]
    .rename(columns={"question_id": "num_items_page"})
)

# A page is suspicious when it mixes reverse-coded and regular items, has more
# than 5 items, and still received one single response value throughout.
# Vectorised on purpose: a row-wise apply returns a DataFrame instead of a
# boolean Series when the frame is empty, which breaks the column assignment.
unique_responses_per_page["suspicious"] = (
    (unique_responses_per_page["reverse_coded"] == 2)
    & (unique_responses_per_page["num_items_page"] > 5)
    & (unique_responses_per_page["responses"] == 1)
)

# participants with at least one suspicious page
straightliners = list(unique_responses_per_page[unique_responses_per_page["suspicious"]]["pid"].unique())
# participants who scored 0 on the catch.1 item
catch_failures = list(np.unique(exploded_survey[(exploded_survey["question_id"]=="catch.1")&(exploded_survey["score"]==0)].pid))

# CRT trials

In [None]:
def score_new_crt(crt_df, crt_quiz_solutions):
    """Score CRT responses with score_generic_questionnaires.

    crt_df is modified in place: its "name" column is set to "crt" and curly
    double quotes are stripped from every entry of its "responses" column.

    :param crt_df: frame with a "responses" column of strings
    :param crt_quiz_solutions: solutions registered under the "crt" group for analysis_run
    :return: whatever score_generic_questionnaires returns for these inputs
    """
    crt_df["name"] = "crt"
    crt_df["responses"] = crt_df["responses"].apply(
        lambda raw_answer: raw_answer.replace("“", "").replace("”", "")
    )

    solutions_by_run = {analysis_run: {"crt": crt_quiz_solutions}}
    return score_generic_questionnaires(
        crt_df,
        solutions_by_run,
        open_ended=True,
        group_identifier="name",
        default_open_ended={"crt": "other"},
    )

# For every CRT item: response text -> answer category.
crt_quiz_solutions = {
    "crt1": {".10": "intuitive", "10": "intuitive", ".05": "correct", "5": "correct", "": "no response"},
    "crt2": {"100": "intuitive", "5": "correct", "": "no response"},
    "crt3": {"24": "intuitive", "47": "correct", "": "no response"},
    "crt4": {"9": "intuitive", "4": "correct", "": "no response"},
    "crt5": {"30": "intuitive", "29": "correct", "": "no response"},
    "crt6": {"10": "intuitive", "20": "correct", "": "no response"},
    "crt7": {"is ahead of where he began": "intuitive", "has lost money": "correct", "": "no response"},
}

html_form_trials = full_data["survey-html-form"]
crt = html_form_trials[html_form_trials["name"] == "crt"].copy(deep=True)

scored_crt = score_new_crt(crt, crt_quiz_solutions)

# only a "correct" answer earns a point
crt_mapping = {"correct": 1, "other": 0, "intuitive": 0, "no response": 0}
scored_crt["score"] = scored_crt["score"].apply(lambda category: crt_mapping[category])

# IQ trials

In [None]:
# Every multi-choice trial is labelled "IQ"; the assignment modifies the frame
# stored in full_data in place.
iq_trials = full_data["survey-multi-choice"]
iq_trials["name"] = "IQ"

scored_iq = score_generic_questionnaires(
    iq_trials,
    {analysis_run: questionnaires_presentation},
    group_identifier="name",
    default_open_ended={},
)

#  Combined questionnaires

In [None]:
# Stack the scored survey, IQ and CRT frames into one long frame.
scored_frames = [exploded_survey, scored_iq, scored_crt]
scored_questionnaire_df = pd.concat(scored_frames)

In [None]:
html_form_data = full_data["survey-html-form"]
demographics_df = html_form_data[html_form_data["name"] == "demographics"].copy(deep=True)

# participant put negative age (confirmed with prolific demographics)
negative_age_rows = demographics_df["pid"] == 346
demographics_df.loc[negative_age_rows, "responses"] = demographics_df.loc[negative_age_rows, "responses"].apply(
    lambda responses: responses.replace('-63','63')
)

# participants with prolific data only
prolific_demographics = pd.read_csv(processed_data_path.joinpath("prolific_demographics.csv"), index_col=0)
pids_with_prolific_demographics_only = set(prolific_demographics["Participant id"]) - set(demographics_df["pid"])

# "Male"/"Female" are lower-cased, every other value becomes "other"
prolific_demographics["Sex"] = prolific_demographics["Sex"].apply(
    lambda gender: gender.lower() if gender in ["Male", "Female"] else "other"
)

# Build a responses string for each prolific-only participant; effort is filled with '-1'.
prolific_only = prolific_demographics[
    prolific_demographics["Participant id"].isin(pids_with_prolific_demographics_only)
]
new_rows = [
    {"pid": row["Participant id"], "responses" : f"{{'gender' : '{row['Sex']}', 'age' : '{row['Age']}', 'effort' : '-1'}}"}
    for _, row in prolific_only.iterrows()
]

demographics_df = pd.concat([demographics_df, pd.DataFrame(new_rows)])
demographics, demo_text = process_html_demographics(demographics_df)
print(demo_text)

In [None]:
# use the original block sizes whenever trial ranges are going to be extracted
block_sizes = original_trials_per_block if ranges_to_extract else trials_per_block
mouselab_datas = preprocess_mouselab_data(full_data["mouselab-mdp"], block_sizes, ground_truths)

# attach the participant-level variables to every trial
mouselab_datas = mouselab_datas.merge(individual_variables, how="left", on=["pid", "run"])

# path may contain a bunch of 0s at the start due to miscoding
# NOTE(review): eval() executes the stored text; only run on trusted data files.
mouselab_datas["path"] = mouselab_datas["path"].apply(lambda stored_path: eval(stored_path)[-3:])

# node ids are turned into strings before being handed to add_click_count_columns
node_classification = {label: [str(node) for node in nodes] for label, nodes in node_classification.items()}
mouselab_datas = add_click_count_columns(mouselab_datas, node_classification)

In [None]:
#TODO would be nice to refactor this out and import it
def expand_range_dictionary(input_dictionary):
    """Invert a {block: range-string} mapping into {trial_index: block}.

    String values are evaluated as Python expressions and every element they
    yield is mapped to the key they were stored under. Non-string values are
    treated as nested dictionaries and expanded recursively under the same key.

    :param input_dictionary: mapping of block (or outer key) to a range string or nested mapping
    :return: dictionary of trial index -> block, with nested dictionaries kept under their outer key
    """
    trial_to_block = {}
    for block, trial_range in input_dictionary.items():
        if not isinstance(trial_range, str):
            # nested mapping: expand it on its own and keep it under the outer key
            trial_to_block[block] = expand_range_dictionary(trial_range)
            continue
        # NOTE(review): eval() runs arbitrary code; range strings must come from trusted configuration.
        trial_indices = eval(trial_range)
        trial_to_block.update(dict.fromkeys(trial_indices, block))
    return trial_to_block

if ranges_to_extract:
    trial_to_block = expand_range_dictionary(ranges_to_extract)

    def _block_for_trial(row):
        """Look up the block of a trial, using the run-specific table when one exists."""
        if row["run"] in trial_to_block:
            return trial_to_block[row["run"]][row["trial_index"]]
        return trial_to_block[row["trial_index"]]

    mouselab_datas["block"] = mouselab_datas.apply(_block_for_trial, axis=1)

In [None]:
# NOTE: this is the same object as full_data["survey"], so the column
# assignments below also modify the frame stored in full_data.
questionnaires = full_data["survey"]
questionnaires["correct"] = np.nan

if "name" not in questionnaires:
    questionnaires["name"] = np.nan


def _resolve_quiz_name(row):
    """Keep an existing string name; otherwise derive one with get_mouselab_quiz_name."""
    if isinstance(row["name"], str):
        return row["name"]
    return get_mouselab_quiz_name(row, mouselab_mapping)


def _keep_quiz_page_answers(row):
    """Return the stringified {question: answer} dict of the page that belongs to this quiz.

    Response keys are split on "_" into a page part and a question part. Answers
    are kept when the question part is not "Q0" and the page part is "P8" for
    "mouselab-quiz-pre" rows or "P0" for "mouselab-quiz-post" rows.
    """
    quiz_pages = {"mouselab-quiz-pre": "P8", "mouselab-quiz-post": "P0"}
    wanted_page = quiz_pages.get(row["name"])
    kept_answers = {}
    # NOTE(review): eval() executes the stored text; only run on trusted data files.
    for response_key, answer in eval(row["responses"]).items():
        key_parts = response_key.split("_")
        if key_parts[1] != "Q0" and key_parts[0] == wanted_page:
            kept_answers[key_parts[1]] = answer
    return str(kept_answers)


questionnaires["name"] = questionnaires.apply(_resolve_quiz_name, axis=1)

# split the survey trials into mouselab quizzes and everything else
is_mouselab_quiz = questionnaires["name"].isin(mouselab_mapping.values())
mouselab_questionnaires = questionnaires[is_mouselab_quiz].reset_index()
questionnaires = questionnaires[~is_mouselab_quiz].reset_index()

mouselab_questionnaires["name"] = mouselab_questionnaires.apply(_resolve_quiz_name, axis=1)
mouselab_questionnaires["responses"] = mouselab_questionnaires.apply(_keep_quiz_page_answers, axis=1)
mouselab_questionnaires = score_mouselab_questionnaires(mouselab_questionnaires, mouselab_quiz_solutions, mouselab_column_identifier)
# one row per participant and question: the last one in the frame
mouselab_quiz = mouselab_questionnaires.drop_duplicates(["pid","question_id"], keep="last")

# wide table: one row per (pid, run), one column per quiz question
pivoted_mouselab_quiz = mouselab_quiz.pivot_table(values="score", index=["pid","run"], columns="question_id")

In [None]:
quiz_passers = get_quiz_passer_ids(
    mouselab_questionnaires,
    max_attempts=max_attempts,
    passing_score=passing_score,
    identifying_columns=["pid", "run"],
)
# keep only the (pid, run) pairs that appear in the passer list of every quiz
passer_sets = [set(passers) for passers in quiz_passers.values()]
passed_all_quizzes = list(set.intersection(*passer_sets))
print(f"Number who passed quizzes: {len(passed_all_quizzes)}")

# straightliners are excluded by pid alone
passed_all_quizzes = [
    (pid, passer_run) for pid, passer_run in passed_all_quizzes if pid not in straightliners
]
print(f"Number who passed quizzes and didn't straightline: {len(passed_all_quizzes)}")

# flag the retained participants in the pivoted quiz table (indexed by pid, run)
pivoted_mouselab_quiz["passed_quizzes"] = 0
pivoted_mouselab_quiz.loc[passed_all_quizzes, "passed_quizzes"] = 1

In [None]:
def _only_quiz_passers(frame):
    """Return the rows of frame whose (pid, run) pair is listed in passed_all_quizzes."""
    return frame[frame.apply(lambda row: (row["pid"], row["run"]) in passed_all_quizzes, axis=1)]


quiz_and_demo = pivoted_mouselab_quiz.join(demographics)
quiz_and_demo = quiz_and_demo.merge(individual_variables, how="left", on=["pid", "run"])

quiz_and_demo_subselection = _only_quiz_passers(quiz_and_demo).copy(deep=True)

# participants without demographics get an explicit gender label so they are counted
quiz_and_demo_subselection["gender"] = quiz_and_demo_subselection["gender"].replace(np.nan, "participants with no demographic")
ages = [int(age) for age in quiz_and_demo_subselection["age"] if not pd.isnull(age)]

gender_values, gender_counts = np.unique(quiz_and_demo_subselection["gender"].values, return_counts = True)
print(get_demo_string(ages, gender_counts, gender_values))


# Write the processed tables, restricted to participants who passed all quizzes.
if len(questionnaires)>0:
    _only_quiz_passers(scored_questionnaire_df).to_csv(processed_data_path.joinpath("questionnaires.csv"))
_only_quiz_passers(quiz_and_demo).to_csv(processed_data_path.joinpath("quiz-and-demo.csv"))
_only_quiz_passers(mouselab_datas).to_csv(processed_data_path.joinpath("mouselab-mdp.csv"))
_only_quiz_passers(individual_variables).to_csv(processed_data_path.joinpath("individual-variables.csv"))

In [None]:
# scored questionnaire rows of participants who passed all quizzes
valid_questionnaires = scored_questionnaire_df[scored_questionnaire_df.apply(lambda row: (row["pid"], row["run"]) in \
                                 passed_all_quizzes, axis=1)]

# wide table: one row per pid, one column per question_id
individual_items = valid_questionnaires.pivot_table(
        index=["pid"], columns="question_id", values="score"
    )

#DOSPERT rationality index
# make sure the three DOSPERT sub-scores of items 1..30 are floats before combining them
for dospert_item in range(1, 31):
    for dospert_scale in ("dospert-rp", "dospert-eb", "dospert"):
        dospert_column = f"{dospert_scale}.{dospert_item}"
        individual_items[dospert_column] = individual_items[dospert_column].astype(float)

# per item: max(rp - eb, 7 - rp) when the dospert score is above 4, otherwise 0
for dospert_item in range(1, 31):
    individual_items[f"dospert-rational.{dospert_item}"] = individual_items.apply(
        lambda row: max(row[f"dospert-rp.{dospert_item}"] - row[f"dospert-eb.{dospert_item}"],
                        7 - row[f"dospert-rp.{dospert_item}"]) if row[f"dospert.{dospert_item}"] > 4 else 0, axis=1)

#prepare for adding combined score to combined dataframe
combined_rational = individual_items.apply(
    lambda row: sum([row[f"dospert-rational.{dospert_item}"] for dospert_item in range(1, 31)]), axis=1)
combined_rational=combined_rational.reset_index()
combined_rational["name"] = "dospert-rational"
# the unnamed Series becomes column 0 after reset_index
combined_rational=combined_rational.rename(columns={0:"score"})

individual_items.to_csv(processed_data_path.joinpath("individual_items.csv"))

# Select "score" before summing: summing every column first also tries to add up
# the string / list columns, which is wasted work and can raise on mixed types.
summed_scores = valid_questionnaires.groupby(["pid", "name"])["score"].sum().reset_index()
summed_scores = pd.concat([summed_scores, combined_rational])
combined_scores = summed_scores.pivot_table(
    index=["pid"], columns="name", values="score"
)
combined_scores.to_csv(processed_data_path.joinpath("combined_scores.csv"))


In [None]:
# Medians over finished participants who passed all quizzes.
# numeric_only=True: finished_df still holds string columns, and pandas >= 2.0
# raises a TypeError on them instead of silently skipping them.
median_info = finished_df[finished_df.apply(lambda row: (row["pid"], row["run"]) in \
                                 passed_all_quizzes, axis=1)].median(numeric_only=True)
print(f"median time: {median_info['time_diff']:.2f}, median bonus: {median_info['final_bonus']:.2f}")