In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import datetime

from download_tools.plugins.mouselab_mdp import preprocess_mouselab_data, add_click_count_columns
from download_tools.plugins.survey_html_form import process_html_demographics
from download_tools.plugins.survey_multi_choice import score_mouselab_questionnaires, get_mouselab_quiz_name, get_quiz_passer_ids, score_row, score_generic_questionnaires
from download_tools.plugins.survey_text import preprocess_survey_text, get_old_demographics
from download_tools.plugins.utils import get_demo_string

from datetime import datetime
import json
import dill as pickle

In [2]:
# Parameters
database_key = None
bonus_function = "lambda row: round((row[\"score\"] + 50) * .002, 2)"
simulated = False
participants_to_remove=[]
sessions = [ "0.0.3", "0.0.4", "0.0.5"]
experiment_setting = "high_increasing"
COST = None
DEPTH = None
html_survey_names = {}
num_parts = None
old_experiment = [False,False,False]
manual_age_mapping = {}
experiment_specific_mapping = {}
trials_per_block = {"test": 20, "training": 20}
ranges_to_extract = None
mouselab_column_identifier = "name"
mouselab_mapping = {"0.0-1.*-0.*": "mouselab-quiz-pre", "0.0-3.*-3*": "mouselab-quiz-post"}
name_mapping = {"0.0-3.0-5.0": "crt", "0.0-3.0-7.0": "demographics", "0.0-3.0-4.0": "survey"}
max_attempts = {"mouselab-quiz-pre": 4, "mouselab-quiz-post": 1}
passing_score = {"mouselab-quiz-pre": 5, "mouselab-quiz-post": 0}
mouselab_quiz_solutions = {"mouselab-quiz-pre": {"Q1": "$-48 to $48", "Q2": "No, the cost is always $1.00.", "Q3": "There is no limit", "Q4": "The better I perform the higher my bonus will be.", "Q5": "No, the amount of cash at each node of the web may be different each time."}, "mouselab-quiz-post": {"Q1": "$-4 to $4", "Q2": "$-8 to $8", "Q3": "$-48 to $48", "Q4": "No, the cost is always $1.00.", "Q5": {"Very unmotivated": -2, "Slightly unmotivated": -1, "Neither motivated nor unmotivated": 0, "Slightly motivated": 1, "Very motivated": 2, "": 0}}}
ground_truth_file = "high_increasing"
node_classification = {"early": [1, 5, 9], "middle": [2, 6, 10], "late": [3, 4, 7, 8, 11, 12], "clicks": [1, 5, 9, 2, 6, 10, 3, 4, 7, 8, 11, 12]}
structure = "312_2_4_24"
analysis_run = "quest_main"
data_path = "/home/vfelso/github/planning-depth-differences/data"


In [3]:
# paths to use
inputs_path = Path(data_path).joinpath('inputs')
raw_data_path = Path(data_path).joinpath('raw')
processed_data_path = Path(data_path).joinpath(f'processed/{analysis_run}')
processed_data_path.mkdir(parents=True, exist_ok=True)

In [4]:
with open(inputs_path.joinpath(f"questionnaire_files/questionnaire_OnePart.txt"), "rb") as f:
    questionnaire_presentation = json.load(f)
reverse_coded_dictionary = {}
for quest_namq, quest_info in questionnaire_presentation.items():
    for quest in quest_info["questions"]:
        reverse_coded_dictionary[quest["question_id"]] = quest["reverse_coded"]

In [5]:
with open(inputs_path.joinpath(f"exp_inputs/rewards/{ground_truth_file}.json")) as json_file:
    ground_truths = json.load(json_file)

In [6]:
# load data
full_data = {}

# read in sessions
for run in sessions:
    for file_path in raw_data_path.glob(f"{run}/*.csv"):
        # don't want to save identifiable bonuses
        # file, information is already in data
        if "bonuses" not in str(file_path):
            file_name = file_path.stem
            curr_data_frame = pd.read_csv(file_path)
            curr_data_frame["run"] = analysis_run #TODO

            # remove participants who are me, or for other reasons
            curr_data_frame = curr_data_frame[~curr_data_frame["pid"].isin(participants_to_remove)]
            curr_data_frame = curr_data_frame.rename({"response" : "responses"}, axis="columns")
            
            if "internal_node_id" in curr_data_frame.columns:
                curr_data_frame["name"] = curr_data_frame.apply(lambda row : name_mapping[row["internal_node_id"]] if row["internal_node_id"] in name_mapping.keys() else row["name"], axis=1)
            if "response" in curr_data_frame.columns:
                curr_data_frame = curr_data_frame.rename({"response" : "responses"}, axis="columns")
                
            if file_name not in full_data:
                full_data[file_name] = [curr_data_frame]
            else:
                full_data[file_name].append(curr_data_frame)
                

full_data = {k: pd.concat(v) for k,v in full_data.items()}

In [7]:
individual_variables = full_data["general_info"].merge(full_data["question_data"], on=["pid","run"])

if all(pd.isnull(individual_variables["endhit"])):
    begin_hit = full_data["survey"].groupby("pid").min()["time_elapsed"]
    end_hit = full_data['survey-text'][full_data['survey-text']["responses"].apply(lambda responses: "Q3" not in responses)].groupby("pid").max()["time_elapsed"]
    finished_df = individual_variables[individual_variables["pid"].isin(end_hit.index)]
    finished_df["time_diff"] =  finished_df["pid"].apply(lambda pid: (end_hit[pid])/(60*1000))
else:
    time_format = '%Y-%m-%d %H:%M:%S.%f'
    finished_df = individual_variables[individual_variables["endhit"].apply(lambda endhit: isinstance(endhit, str))].reset_index()
    finished_df["time_diff"] = finished_df.apply(lambda row: ((datetime.strptime(row["endhit"], time_format) - datetime.strptime(row["beginhit"], time_format)).seconds % 3600 )/ 60.0, axis=1)
individual_variables = individual_variables.merge(finished_df[["time_diff", "pid", "run"]], how="left", on=["pid", "run"])

# check saved cost makes sense
if (COST is not None) and (DEPTH is not None):
    if isinstance(COST, dict):
        assert(np.all(individual_variables.apply(lambda row: row["COST"] == COST[row["codeversion"]][int(row["cond"])],axis=1)))
    else:
        unique_costs = np.unique(individual_variables["COST"])
        assert(len(unique_costs) == 1)
        assert(unique_costs[0] == COST)
    if DEPTH:
        if isinstance(DEPTH, dict):
            assert(np.all(individual_variables.apply(lambda row: row["DEPTH"] == DEPTH[row["codeversion"]][int(row["cond"])],axis=1)))
        else:
            unique_costs = np.unique(individual_variables["DEPTH"])
            assert(len(unique_costs) == 1)
            assert(unique_costs[0] == DEPTH)
    print(finished_df.groupby(["DEPTH", "COST"]).mean())
else:
    print(finished_df.groupby(["cond"]).mean())

  begin_hit = full_data["survey"].groupby("pid").min()["time_elapsed"]


      counterbalance  endhit  bonus   status    pid  final_bonus  \
cond                                                               
0                0.0     NaN    0.0  4.38587  418.1     2.872065   

      displayed_bonus  DEPTH  COST  MIN_TIME  inspectCost  bonusRate  \
cond                                                                   
0            2.872065    0.0   0.0       7.0          1.0      0.002   

      branching  first_trial  time_diff  
cond                                     
0         312.0     0.526087  67.359726  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  finished_df["time_diff"] =  finished_df["pid"].apply(lambda pid: (end_hit[pid])/(60*1000))


In [8]:
finished_df=finished_df.drop_duplicates(subset=["pid"],keep="last")

In [9]:
finished_df["time_diff"].mean()

69.24415902014653

In [10]:
finished_df["final_bonus"].mean()

2.8794767441860465

In [11]:
finished_df["time_diff"].median()

64.53743333333334

In [12]:
time_fields = {"startTime":lambda date: np.nan if not isinstance(date,str) else datetime.strptime(date[:15], "%a %b %d %Y") ,"beginhit":lambda date: np.nan if  not isinstance(date,str) else datetime.fromisoformat(date.split(" ")[0]), "beginexp" : lambda date: np.nan if  not isinstance(date,str) else datetime.fromisoformat(date.split(" ")[0]), "endhit" : lambda date: np.nan if  not isinstance(date,str) else datetime.fromisoformat(date.split(" ")[0])}

for time_field, time_func in time_fields.items():
    individual_variables[time_field] = individual_variables[time_field].apply(time_func)

individual_variables["beginhit"].describe()

  individual_variables["beginhit"].describe()


count                    5647
unique                      6
top       2022-11-07 00:00:00
freq                     4075
first     2022-11-07 00:00:00
last      2022-11-12 00:00:00
Name: beginhit, dtype: object

In [13]:
if not isinstance(old_experiment, list):
    old_experiment = [old_experiment] * len(sessions)

In [14]:
from collections import ChainMap

survey = full_data["survey"][full_data["survey"]["name"] == "survey"]

pages = []
names = []
question_ids = []
responses = []
reverse_coded = []
for row_idx, row in survey.iterrows():
    (keys, vals) = zip(*eval(row["responses"]).items())
    pages.append(str(keys))

    full_dict = dict(ChainMap(*[ans_dict for ans_dict in vals if ans_dict]))
    assert (len(full_dict) == 342)

    curr_question_ids = [[key for key in val.keys()] if val else [] for val in vals]
    names.append(str([[key.split(".")[0] for key in val.keys()][0] if val else [] for val in vals]))
    responses.append(str([[full_dict[quest] for quest in page] for page in curr_question_ids]))
    reverse_coded.append(str([[reverse_coded_dictionary[quest] for quest in page] for page in curr_question_ids]))
    question_ids.append(str(curr_question_ids))

survey["question_id"] = question_ids
survey["name"] = names
survey["responses"] = responses
survey["pages"] = pages
survey["reverse_coded"] = reverse_coded

explode_columns = ["question_id", "name", "reverse_coded", "responses", "pages"]
for explode_column in explode_columns:
    survey[explode_column] = survey[explode_column].apply(eval)
exploded_survey = survey.explode(explode_columns)

# check for straightlining
assert (len(exploded_survey["responses"][exploded_survey["responses"].apply(
    lambda responses: (len(np.unique(responses)) <= 1) & len(responses) > 5)]) == 0)

with open(inputs_path.joinpath(f"questionnaire_files/solutions_OnePart.pkl"), "rb") as f:
    questionnaires_presentation = pickle.load(f)
adjusted_questionnaires_presentation = {}

for key, val in questionnaires_presentation.items():
    new_key = [k.split(".")[0] for k in val.keys()][0]
    if new_key not in adjusted_questionnaires_presentation:
        adjusted_questionnaires_presentation[new_key] = {}
    adjusted_questionnaires_presentation[new_key] = {**adjusted_questionnaires_presentation[new_key], **val}

exploded_survey = exploded_survey[exploded_survey["name"].apply(lambda name: isinstance(name, str))]
scored_questionnaire_df = score_generic_questionnaires(exploded_survey[(exploded_survey["pid"]==736)&(exploded_survey["name"].isin(["bis"]))],
                                                       {analysis_run: adjusted_questionnaires_presentation},
                                                       group_identifier="name", default_open_ended={})


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey["question_id"] = question_ids
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey["name"] = names
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey["responses"] = responses
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = valu

In [15]:
from collections import Counter
Counter(exploded_survey[exploded_survey.apply(
    lambda row: (len(np.unique(row["responses"])) <= 1) & (any(row["reverse_coded"])) & (len(row["responses"]) > 5), axis=1)].groupby(["pid"]).count()["rt"])

Counter({1: 25, 3: 2, 5: 1, 2: 5, 4: 2, 10: 1, 12: 1})

In [16]:
exploded_survey[exploded_survey.apply(
    lambda row: (len(np.unique(row["responses"])) <= 1) & (any(row["reverse_coded"])) & (len(row["responses"]) > 5), axis=1)].groupby(["pid"]).count()["rt"]

pid
41      1
73      1
94      1
106     1
187     1
226     1
244     3
285     1
292     1
294     5
333     1
367     2
372     4
395    10
410     1
430     1
433     2
473     1
478     1
519     1
550    12
568     1
595     1
614     2
653     1
669     2
673     1
715     1
722     1
727     1
753     2
789     1
806     3
839     1
850     4
889     1
912     1
Name: rt, dtype: int64

In [17]:
straightliners = (list(np.unique(exploded_survey[exploded_survey.apply(
    lambda row: (len(np.unique(row["responses"])) <= 1) & (any(row["reverse_coded"])) & (len(row["responses"]) > 5), axis=1)]["pid"])))

In [18]:
catch_failures = list(np.unique(scored_questionnaire_df[(scored_questionnaire_df["question_id"]=="catch.1")&(scored_questionnaire_df["score"]==0)].pid))

In [19]:
def score_new_crt(crt_df, crt_quiz_solutions):
    crt_df["name"] = "crt"
    #     crt_df["question_id"]=crt_df["question_id"].apply(lambda crt_string: crt_string.replace("“","").replace("”",""))
    crt_df["responses"] = crt_df["responses"].apply(lambda crt_string: crt_string.replace("“", "").replace("”", ""))
    #     return crt_df
    scored_questionnaire_df = score_generic_questionnaires(crt_df, {analysis_run: {"crt": crt_quiz_solutions}}, open_ended=True,
                                                           group_identifier="name", default_open_ended={"crt": "other"})
    return scored_questionnaire_df

crt_quiz_solutions = {"crt1": {".10": "intuitive", "10": "intuitive", ".05": "correct", "5": "correct", "": "no response"}, "crt2": {"100": "intuitive", "5": "correct", "": "no response"}, "crt3": {"24": "intuitive", "47": "correct", "": "no response"}, "crt4": {"9": "intuitive", "4": "correct", "": "no response"}, "crt5": {"30": "intuitive", "29": "correct", "": "no response"}, "crt6": {"10": "intuitive", "20": "correct", "": "no response"}, "crt7": {"is ahead of where he began": "intuitive", "has lost money": "correct", "": "no response"}}
crt = full_data["survey-html-form"][full_data["survey-html-form"]["internal_node_id"]=="0.0-3.0-5.0"]

scored_crt = score_new_crt(full_data["survey-html-form"][full_data["survey-html-form"]["name"]=="crt"], crt_quiz_solutions)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  crt_df["name"] = "crt"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  crt_df["responses"] = crt_df["responses"].apply(lambda crt_string: crt_string.replace("“", "").replace("”", ""))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  questionnaires["responses"] = questionnaires["responses"].apply(
A va

In [20]:
full_data["survey-multi-choice"]["name"] = "IQ"

scored_iq = score_generic_questionnaires(full_data["survey-multi-choice"], {analysis_run: questionnaires_presentation},group_identifier="name", default_open_ended={})

In [21]:
scored_questionnaire_df = pd.concat([scored_questionnaire_df, scored_iq, scored_crt])

In [None]:
scored_questionnaire_df

In [None]:
# survey_texts = preprocess_survey_text(full_data["survey-text"])


if any(old_experiment):
    old_runs = np.asarray(sessions)[old_experiment]
    old_demographics, demo_text = get_old_demographics(survey_texts[survey_texts["run"].isin(old_runs)], experiment_specific_gender=experiment_specific_mapping, manual_age_mapping=manual_age_mapping)
    print(demo_text)

new_runs =  np.asarray(sessions)[[not ex for ex in old_experiment]]
html_survey = full_data["survey-html-form"]
if len(html_survey_names) > 0:
    if "name" not in html_survey:
        html_survey["name"] = np.nan

    html_survey["name"] = html_survey.apply(
        lambda row: get_mouselab_quiz_name(row, html_survey_names[row["run"]]) if not isinstance(row["name"], str) else row["name"],
        axis=1)
    demographics, demo_text = process_html_demographics(html_survey[(html_survey["name"] == "demographics")&(html_survey["run"].isin(new_runs))])
    full_data["survey-multi-choice"] = pd.concat([html_survey[html_survey["name"] != "demographics"]])
else:
    demographics, demo_text = process_html_demographics(full_data["survey-html-form"])

print(demo_text)


if any(old_experiment):
    gender_name, gender_count = np.unique(pd.concat([old_demographics["Q2"],demographics["gender"]]), return_counts=True)
    ages = pd.concat([old_demographics["Q1"],demographics["age"]]).apply(int)
    print(get_demo_string(ages, gender_count, gender_name))

In [None]:
# TODO fix this 
# mouselab_datas = preprocess_mouselab_data(full_data["mouselab-mdp"],trials_per_block, ground_truths)
mouselab_datas = full_data["mouselab-mdp"]
mouselab_datas["queries"] = mouselab_datas["queries"].apply(eval)

mouselab_datas = mouselab_datas.merge(individual_variables, how="left", on=["pid", "run"])

# path may contain a bunch of 0s at the start due to miscoding
mouselab_datas["path"] = mouselab_datas["path"].apply(lambda path : eval(path)[-3:])

node_classification = {key : [str(node) for node in val] for key, val in node_classification.items()}
mouselab_datas = add_click_count_columns(mouselab_datas, node_classification)

In [None]:
#TODO would be nice to refactor this out and import it
def expand_range_dictionary(input_dictionary):
    trial_to_block = {}
    for block, trial_range in input_dictionary.items():
        if isinstance(trial_range, str):
            for trial_index in eval(trial_range):
                trial_to_block[trial_index] = block
        else:
            trial_to_block[block] = expand_range_dictionary(trial_range)
    return trial_to_block

if ranges_to_extract:
    trial_to_block = expand_range_dictionary(ranges_to_extract)

    mouselab_datas["block"] = mouselab_datas.apply(lambda row: trial_to_block[row["run"]][row["trial_index"]] if row["run"] in trial_to_block else trial_to_block[row["trial_index"]], axis=1)

In [22]:
questionnaires = full_data["survey"]
questionnaires["correct"] = np.nan

if "name" not in questionnaires:
    questionnaires["name"] = np.nan

questionnaires["name"] = questionnaires.apply(
    lambda row: get_mouselab_quiz_name(row, mouselab_mapping) if not isinstance(row["name"], str) else row["name"],
    axis=1)

mouselab_questionnaires = questionnaires[questionnaires["name"].isin(mouselab_mapping.values())].reset_index()

#TODO check if multiple responses
mouselab_questionnaires["responses"] = mouselab_questionnaires["responses"].apply(lambda responses: str({k.split("_")[1]: v for k,v in eval(responses).items() if k.split("_")[1] != "Q0"}))

mouselab_questionnaires["name"] = mouselab_questionnaires.apply(lambda row: get_mouselab_quiz_name(row, mouselab_mapping) if not isinstance(row["name"], str) else row["name"],axis=1)
mouselab_questionnaires = score_mouselab_questionnaires(mouselab_questionnaires, mouselab_quiz_solutions, mouselab_column_identifier)
mouselab_quiz = mouselab_questionnaires.drop_duplicates(["pid","question_id"], keep="last")

pivoted_mouselab_quiz = mouselab_quiz.pivot_table(values="score", index=["pid","run"], columns="question_id")

In [23]:
quiz_passers = get_quiz_passer_ids(mouselab_questionnaires, max_attempts=max_attempts, passing_score=passing_score, identifying_columns = ["pid", "run"])
passed_all_quizzes = list(set.intersection(*map(set,quiz_passers.values())))
print(len(passed_all_quizzes))
pivoted_mouselab_quiz["passed_quizzes"] = 0
pivoted_mouselab_quiz.loc[passed_all_quizzes, "passed_quizzes"] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey_data["attempt_num"] = survey_data["internal_node_id"].apply(


739


In [25]:
quiz_passers = set([pid for pid, run in pivoted_mouselab_quiz[pivoted_mouselab_quiz["passed_quizzes"]==1].index])

In [26]:
len(set.intersection(set(finished_df["pid"]), quiz_passers)-set(straightliners))

684

In [27]:
len(np.unique(mouselab_questionnaires["pid"]))

837

In [None]:
684/837

In [None]:
# TODO
# quiz_and_demo = pivoted_mouselab_quiz.join(demographics)
# quiz_and_demo = quiz_and_demo.merge(individual_variables, how="left", on=["pid", "run"])

# quiz_and_demo_subselection = quiz_and_demo[quiz_and_demo.apply(lambda row: (row["pid"], row["run"]) in \
#                                  passed_all_quizzes, axis=1)]

# quiz_and_demo_subselection["gender"]=quiz_and_demo_subselection["gender"].replace(np.nan, "participants with no demographic")
# ages = [int(age) for age in quiz_and_demo_subselection["age"] if not pd.isnull(age)]

# gender_values, gender_counts = np.unique(quiz_and_demo_subselection["gender"].values, return_counts = True)
# print(get_demo_string(ages, gender_counts, gender_values))


if len(questionnaires)>0:
        scored_questionnaire_df[scored_questionnaire_df.apply(lambda row: (row["pid"], row["run"]) in \
                                 passed_all_quizzes, axis=1)].to_csv(processed_data_path.joinpath("questionnaires.csv"))
# quiz_and_demo[quiz_and_demo.apply(lambda row: (row["pid"], row["run"]) in \
#                                  passed_all_quizzes, axis=1)].to_csv(processed_data_path.joinpath("quiz-and-demo.csv"))
mouselab_datas[mouselab_datas.apply(lambda row: (row["pid"], row["run"]) in \
                                 passed_all_quizzes, axis=1)].to_csv(processed_data_path.joinpath("mouselab-mdp.csv"))
# survey_texts[survey_texts.apply(lambda row: (row["pid"], row["run"]) in \
#                                  passed_all_quizzes, axis=1)].to_csv(processed_data_path.joinpath("survey-text.csv"))
individual_variables[individual_variables.apply(lambda row: (row["pid"], row["run"]) in \
                                 passed_all_quizzes, axis=1)].to_csv(processed_data_path.joinpath("individual-variables.csv"))

In [None]:
valid_questionnaires = scored_questionnaire_df[scored_questionnaire_df.apply(lambda row: (row["pid"], row["run"]) in \
                                 passed_all_quizzes, axis=1)]
valid_questionnaires = valid_questionnaires[~pd.isnull(valid_questionnaires["question_id"])]


crt_mapping = {"correct": 1, "other": 0, "intuitive":0, "no response":0}
valid_questionnaires["score"] = valid_questionnaires["score"].apply(lambda score: score if not score in crt_mapping.keys() else crt_mapping[score])

individual_items = valid_questionnaires.pivot_table(
        index=["pid"], columns="question_id", values="score"
    )
# individual_items = individual_items.join(demographics[["age", "effort", "gender"]], on="pid")
individual_items.to_csv(processed_data_path.joinpath("individual_items.csv"))

summed_scores = valid_questionnaires.groupby(["pid", "name"]).sum()["score"].reset_index()
combined_scores = summed_scores.pivot_table(
    index=["pid"], columns="name", values="score"
)
# combined_scores = combined_scores.join(demographics[demographics.columns.difference(["female", "male", "other"])], on="pid")
combined_scores.to_csv(processed_data_path.joinpath("combined_scores.csv"))

In [None]:
if num_parts:
    #TODO only supports one digit num_parts
    for part_num in range(1, num_parts +1):
        curr_runs = [run for run in finished_df["run"].unique() if run.endswith(f"Part{part_num}")]
        curr_runs_df = finished_df[finished_df["run"].isin(curr_runs)]
        median_info = curr_runs_df[curr_runs_df.apply(lambda row: (row["pid"], row["run"][:-1]+str(1)) in \
                                     passed_all_quizzes, axis=1)].median()
        print(f"median time: {median_info['time_diff']:.2f}, median bonus: {median_info['final_bonus']:.2f}")
else:
    median_info = finished_df[finished_df.apply(lambda row: (row["pid"], row["run"]) in \
                                 passed_all_quizzes, axis=1)].median()
    print(f"median time: {median_info['time_diff']:.2f}, median bonus: {median_info['final_bonus']:.2f}")

In [None]:
mean_info = finished_df[finished_df.apply(lambda row: (row["pid"], row["run"]) in \
                                 passed_all_quizzes, axis=1)].mean()
print(f"mean time: {mean_info['time_diff']:.2f}, mean bonus: {mean_info['final_bonus']:.2f}")