In [98]:
import pandas as pd
import json
import numpy as np

# Load the CSV file
csv_file_path = './data/prolific_export_65566952c6b6b608da9084b0.csv'
csv_data = pd.read_csv(csv_file_path)

# Load the JSON file
json_file_path = './data/user-study-narrative-sketch-default-rtdb-export.json'
with open(json_file_path, 'r') as file:
    actions = json.load(file)

# Extracting and renaming demographic information from the CSV data
# demographics = csv_data[['Participant id', 'Age', 'Sex', 'Ethnicity simplified', 'Country of birth', 'Country of residence', 'Nationality', 'Language', 'Student status', 'Employment status']]
demographics = csv_data[['Participant id', 'Age', 'Sex']]
demographics = demographics.rename(columns={
    'Participant id': 'Participant ID'
})


In [117]:
# Flatten the JSON data 
flattened_data = []
for participant_id, data in actions.items():


    # Route Path Navigation Data
    for route_data in data.get('routePathNavigation', {}).values():
        combined_data = {**route_data, 'navType': 'routePathNav'}
        flattened_data.append(combined_data)

    # User Selections Data
    for selection_data in data.get('userSelections', {}).values():
        combined_data = {**selection_data, 'navType': 'userSelectionNav'}
        flattened_data.append(combined_data)

actions_data_df = pd.DataFrame(flattened_data)

# Convert the currentIndex if exist to an integer
actions_data_df['currentIndex'] = actions_data_df['currentIndex'].astype('Int64').astype('str')


# Updating the Route Path for 'training' and 'task' modes using vectorized operations
condition = actions_data_df['mode'].isin(['training', 'task'])
actions_data_df['routePath'] = np.where(
    condition, 
    "/" + actions_data_df['mode'] + 
    "/" + actions_data_df['currentIndex'] +
    actions_data_df['routePath']
)


actions_data_df[actions_data_df['mode']=='training'].head(10)


TypeError: can only concatenate str (not "int") to str

In [111]:

# Flatten the JSON data and combine with demographic data
flattened_data = []
for participant_id, data in actions.items():
    # Ensure participant is in demographics
    if participant_id not in demographics['Participant ID'].values:
        continue

    demo_info = demographics[demographics['Participant ID'] == participant_id].iloc[0].to_dict()

    # Route Path Navigation Data
    for route_data in data.get('routePathNavigation', {}).values():
        combined_data = {**demo_info, **route_data, 'navType': 'routePathNav'}
        flattened_data.append(combined_data)

    # User Selections Data
    for selection_data in data.get('userSelections', {}).values():
        combined_data = {**demo_info, **selection_data, 'navType': 'userSelectionNav'}
        flattened_data.append(combined_data)

# Convert to DataFrame
combined_data_df = pd.DataFrame(flattened_data)
# exclude "reason" column
combined_data_df

Unnamed: 0,Participant ID,Age,Sex,routePath,timestamp,navType,currentIndex,decision,mode,reason,rightSelection,selection,story
0,5d05c531dc2d54001838ce76,44,Female,/intro,1700167160650,routePathNav,,,,,,,
1,5d05c531dc2d54001838ce76,44,Female,/motifs,1700167258392,routePathNav,,,,,,,
2,5d05c531dc2d54001838ce76,44,Female,/about,1700167298863,routePathNav,,,,,,,
3,5d05c531dc2d54001838ce76,44,Female,/training,1700167328371,routePathNav,,,,,,,
4,5d05c531dc2d54001838ce76,44,Female,/task,1700167709762,routePathNav,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,603b750eab9d377aeac41724,25,Female,,1700167732736,userSelectionNav,5.0,confirm,task,Each passage leads on from one another.,ShortFork,Ladder,story_Muddle_Puddle_ShortFork_seed158.json
117,603b750eab9d377aeac41724,25,Female,,1700167793622,userSelectionNav,6.0,confirm,task,last passage is a bit random and disrupts the ...,Ladder,LongFork,story_Muddle_Puddle_Ladder_seed241.json
118,603b750eab9d377aeac41724,25,Female,,1700167840064,userSelectionNav,7.0,confirm,task,The story is relevant and follows on from each...,LongFork,Ladder,story_Muddle_Puddle_LongFork_seed288.json
119,603b750eab9d377aeac41724,25,Female,,1700167888720,userSelectionNav,8.0,confirm,task,"Three separate passages. However, the first an...",WideBranch,WideMerge,story_Muddle_Puddle_WideBranch_seed496.json


In [None]:

# Ensure 'Timestamp (Decision)' and 'Timestamp (Route Path)' columns are correctly formatted and exist
combined_data_df['Timestamp (Decision)'] = combined_data_df.get('timestamp', pd.NaT)
combined_data_df['Timestamp (Route Path)'] = combined_data_df.get('timestamp', pd.NaT)

# group by participant id and sort by timestamp, 
# and get the difference between the current and next timestamp, /end has no next difference duration



# Merging Timestamps using combine_first
combined_data_df['Timestamp'] = combined_data_df['Timestamp (Decision)'].combine_first(combined_data_df['Timestamp (Route Path)'])

# Convert the currentIndex if exist to an integer
combined_data_df['currentIndex'] = combined_data_df['currentIndex'].astype('Int64')


# Updating the Route Path for 'training' and 'task' modes using vectorized operations
condition = combined_data_df['mode'].isin(['training', 'task'])
combined_data_df['Route Path'] = np.where(
    condition, 
    "/" + combined_data_df['mode'] + "/" + combined_data_df['currentIndex'].astype(str),
    combined_data_df['routePath']
)

# Dropping old timestamp columns and other unnecessary columns
combined_data_df = combined_data_df.drop(columns=['Timestamp (Route Path)', 'Timestamp (Decision)', 'timestamp', 'mode', 'currentIndex', 'routePath'])

# Show the updated DataFrame structure
combined_data_df.head(10)

In [63]:
combined_data_df.to_csv('output.csv', index=False)

In [64]:
# Group the data by participant ID
grouped_data = combined_data_df.groupby('Participant ID')

# Sort the data by timestamp and route path within each group
sorted_data = grouped_data.apply(lambda x: x.sort_values(['Timestamp']))

# Reset the index
sorted_data.reset_index(drop=True, inplace=True)



# Show the updated DataFrame structure
sorted_data


Unnamed: 0,Participant ID,Age,Sex,decision,reason,rightSelection,selection,story,Timestamp,Route Path
0,5d05c531dc2d54001838ce76,44,Female,,,,,,1700167160650,/intro
1,5d05c531dc2d54001838ce76,44,Female,,,,,,1700167258392,/motifs
2,5d05c531dc2d54001838ce76,44,Female,,,,,,1700167298863,/about
3,5d05c531dc2d54001838ce76,44,Female,,,,,,1700167328371,/training
4,5d05c531dc2d54001838ce76,44,Female,cancel,,ShortFork,Linear,story_Muddle_Puddle_ShortFork_seed942.json,1700167401299,/training/0
...,...,...,...,...,...,...,...,...,...,...
116,603b750eab9d377aeac41724,25,Female,confirm,last passage is a bit random and disrupts the ...,Ladder,LongFork,story_Muddle_Puddle_Ladder_seed241.json,1700167793622,/task/6
117,603b750eab9d377aeac41724,25,Female,confirm,The story is relevant and follows on from each...,LongFork,Ladder,story_Muddle_Puddle_LongFork_seed288.json,1700167840064,/task/7
118,603b750eab9d377aeac41724,25,Female,confirm,"Three separate passages. However, the first an...",WideBranch,WideMerge,story_Muddle_Puddle_WideBranch_seed496.json,1700167888720,/task/8
119,603b750eab9d377aeac41724,25,Female,confirm,First and last passage seem to link together.,SharpBranch,WideMerge,story_Muddle_Puddle_SharpBranch_seed178.json,1700167928701,/task/9
