In [166]:
import numpy as np
import pandas as pd
import json
import re
import csv
import ast

In [167]:
# Load the multi-task results with dependent variables by round. Later cells
# read its `gameId`, `task`, `complexity`, `stageIds` and `data.A` columns.
multi_task_data = pd.read_csv('../raw_data/multi_task_data_with_dv_by_rounds.csv')

In [168]:
# Keep only the stage identifier, the task descriptors and the outcome measures.
multi_task_dv = multi_task_data.loc[
    :,
    [
        "stageIds",
        "task",
        "complexity",
        "playerCount",
        "score",
        "speed",
        "efficiency",
        "raw_duration_min",
        "default_duration_min",
    ],
]

In [169]:
# Use the singular `stageId` name so this table can be joined on that key later.
multi_task_dv = multi_task_dv.rename({"stageIds": "stageId"}, axis="columns")

In [170]:
# Preview the dependent-variable table.
multi_task_dv

Unnamed: 0,stageId,task,complexity,playerCount,score,speed,efficiency,raw_duration_min,default_duration_min
0,wTtnYPymbumNdGkqS,Moral Reasoning,Low,3,0.0,38.276872,0.000000,3.086167,5
1,jKvQoGxqtb4JZ2q9o,Moral Reasoning,Medium,3,0.0,57.929140,0.000000,2.103550,5
2,8eDYCY8W9uCGCbg8K,Moral Reasoning,High,3,0.0,66.676111,0.000000,1.666200,5
3,7HwXhDZAbBbjimTma,Room Assignment,Low,3,86.0,40.481865,3481.440395,2.975917,5
4,DPMTfdfK7TNBMQzr6,Room Assignment,Medium,3,91.0,0.168333,15.318282,4.991600,5
...,...,...,...,...,...,...,...,...,...
1390,y5BnQMBzyuqYPi4Cx,Wolf Goat Cabbage,High,3,60.0,17.278609,1036.716544,4.136083,5
1391,yZEpzgxumLFMEaY4T,Wolf Goat Cabbage,Low,3,0.0,15.752281,0.000000,4.212400,5
1392,J7aBjR8d5BkdReojo,Allocating Resources,Medium,3,100.0,0.000556,0.055555,3.019517,3
1393,4xfgW4gtPoBK6yrKS,Allocating Resources,High,3,50.0,77.660124,3883.006206,0.670200,3


In [171]:
# Build a nested dictionary that lets us look up a stage from its game, task
# and complexity level:
#
#   game_id_lookup[gameId][task][complexity] -> stageId
#
# `complexity` is stored exactly as it appears in the "complexity" column
# (e.g. "Low", "Medium", "High"). If the same (gameId, task, complexity)
# triple occurs more than once, the last row wins.
game_id_lookup = {}

for cur_gameid, task, complexity, stageId in zip(
    multi_task_data["gameId"],
    multi_task_data["task"],
    multi_task_data["complexity"],
    multi_task_data["stageIds"],
):
    # setdefault creates the per-game and per-task dictionaries on first sight.
    game_id_lookup.setdefault(cur_gameid, {}).setdefault(task, {})[complexity] = stageId

In [172]:
# Spot-check the lookup for a single game.
game_id_lookup['9qn6LAzd4Fs9nf2KD']

{'Moral Reasoning': {'Low': 'wTtnYPymbumNdGkqS',
  'Medium': 'jKvQoGxqtb4JZ2q9o',
  'High': '8eDYCY8W9uCGCbg8K'},
 'Room Assignment': {'Low': '7HwXhDZAbBbjimTma',
  'Medium': 'DPMTfdfK7TNBMQzr6',
  'High': 'tziywZNRtREfFkNnn'},
 'Sudoku': {'Low': 'QrbemxQDvBjTvqGqw',
  'Medium': 'XvAo5Yfpy6u3ZPqjJ',
  'High': 'vkmBiBwZQ8G9HKXex'},
 'Guess the Correlation': {'Low': 'CsPJ8XWYNBk6pXGY5',
  'Medium': 'N7zAttJGJCZF5ejCf',
  'High': 'HFbCmmSfbAzFnSF4m'}}

In [173]:
# Load the stage records; later cells use the `_id` and `startTimeAt` columns.
# NOTE(review): the saved output echoes this line, which suggests pandas emitted
# a warning while reading the file (possibly about mixed column dtypes) -
# confirm, and pass explicit dtypes if so.
multi_task_stages = pd.read_csv('../raw_data/multi_task_stages.csv')

  multi_task_stages = pd.read_csv('../raw_data/multi_task_stages.csv')


In [174]:
# Sanity check: look up the recorded start time of one stage by its `_id`.
multi_task_stages[multi_task_stages["_id"]=="wTtnYPymbumNdGkqS"]["startTimeAt"].unique()[0]

'2023-04-10T15:48:47Z'

In [175]:
# Export every chat message as one CSV row, labelled with the complexity stage
# (Low / Medium / High) that was running when the message was sent.
#
# Each row of `multi_task_data` carries a chat log in `data.A`. The log is split
# into stages by comparing each message's `timeStamp` with the stages'
# `startTimeAt` values from `multi_task_stages`.
# NOTE(review): the log presumably covers all complexity stages of the task and
# is repeated on each of the task's rows - that would explain why duplicates
# have to be removed at the end; confirm against the raw data.

# Column layout of the exported file.
fieldnames = ["conversation_num", "stageId", "gameId", "message", "speaker_nickname", "timestamp"]

# (key used in game_id_lookup, suffix used in conversation_num), listed in the
# order the stages are assumed to be played.
COMPLEXITY_LABELS = [("Low", "LOW"), ("Medium", "MEDIUM"), ("High", "HIGH")]

# stageId -> first non-null startTimeAt. Built once instead of filtering
# `multi_task_stages` three times for every row.
stage_start_lookup = (
    multi_task_stages
    .dropna(subset=["startTimeAt"])
    .drop_duplicates(subset="_id")
    .set_index("_id")["startTimeAt"]
    .to_dict()
)


def _stage_for_timestamp(timestamp, stages):
    """Return the entry of `stages` that was active at `timestamp`.

    Parameters
    ----------
    timestamp : str
        Message time, compared as a string against the stage start times.
    stages : list of (suffix, stage_id, start) tuples
        Non-empty, in play order. `start` may be None when the stage has no
        recorded start time.

    Returns
    -------
    tuple
        The last stage whose start time is known and not after `timestamp`.
        Messages that precede every known start fall back to the first stage.
    """
    chosen = stages[0]
    for stage in stages[1:]:
        start = stage[2]
        # `>=` so a message sent exactly at a stage's start belongs to that stage.
        if start is not None and timestamp >= start:
            chosen = stage
    return chosen


conversation_rows = []

for _, row in multi_task_data.iterrows():
    raw_chat = row['data.A']
    if not isinstance(raw_chat, str):
        # No chat text on this row (e.g. NaN): nothing to export.
        continue

    # `data.A` holds comma-separated dict literals; wrap them into a list.
    # literal_eval only accepts literals, so file contents are never executed.
    chat_list = ast.literal_eval("[" + raw_chat + "]")

    task = row["task"]
    game_id = row["gameId"]

    # Stages that exist for this game/task, with their start times when known.
    stage_ids = game_id_lookup.get(game_id, {}).get(task, {})
    stages = [
        (suffix, stage_ids[complexity], stage_start_lookup.get(stage_ids[complexity]))
        for complexity, suffix in COMPLEXITY_LABELS
        if complexity in stage_ids
    ]
    if not stages:
        # No known stage to attribute these messages to.
        continue

    for chat in chat_list:
        timestamp = chat['timeStamp']
        # NOTE(review): timestamps are compared as strings, as before; this is
        # only chronological if both sides share one ISO-8601 format - confirm.
        suffix, stageId, _start = _stage_for_timestamp(timestamp, stages)

        conversation_rows.append({
            'conversation_num': game_id + "_" + task + "_" + suffix,
            'stageId': stageId,
            'gameId': game_id,
            'message': chat['text'],
            'speaker_nickname': chat['player']['_id'],
            'timestamp': timestamp,
        })

# Remove duplicate rows, then write the file once. index=False keeps the row
# index out of the file, so reading it back does not add an "Unnamed: 0" column.
conversations_raw = pd.DataFrame(conversation_rows, columns=fieldnames).drop_duplicates(ignore_index=True)
conversations_raw.to_csv('../raw_data/multi_task_conversations.csv', index=False)

In [176]:
# Reload the conversation file written by the previous cell.
conversations_raw = pd.read_csv('../raw_data/multi_task_conversations.csv')

Join each chat message to the dependent variables of its stage (matched on `stageId`) and save the result.

In [196]:
# Attach the stage-level dependent variables to every message. The left join
# keeps messages whose stage has no row in `multi_task_dv`.
conversations_with_dv = pd.merge(
    conversations_raw,
    multi_task_dv,
    how="left",
    on="stageId",
)

# NOTE(review): the row index is written as an unnamed first column - confirm
# whether downstream readers rely on it.
conversations_with_dv.to_csv('../raw_data/multi_task_conversations_with_dv.csv')

Add in user/composition information

In [178]:
# Load the detailed per-user results and use the singular `stageId` key name.
user_info = pd.read_csv('../raw_data/results_by_user_detailed.csv')
user_info = user_info.rename({"stageIds": "stageId"}, axis="columns")

Summary Statistics

Numeric: 
- 'birth_year', 'CRT', 'income_max', 'income_min', 'IRCS_GS', 'IRCS_GV', 'IRCS_IB', 'IRCS_IR', 'IRCS_IV', 'IRCS_RS', 'political_fiscal', 'political_social', 'RME'

Non-Numeric (recoded manually):
- 'country', 'education_level', 'gender', 'marital_status', 'political_party', 'race'

In [179]:
# Keep a column only if at least this fraction of its values is non-missing.
threshold = 0.8  # 80% threshold
min_non_null = int(threshold * len(user_info))
user_info = user_info.dropna(axis="columns", thresh=min_non_null)

Recode everything into numeric:
- Country: 1 if US, 0 otherwise
- Education: ordinal variable (-1 if NA)
- Gender: 0 if Male, 1 if Female, 2 if other, -1 if NA
- Marital Status: ordinal variable (-1 if NA)
- Political party: ordinal variable (-1 if NA)
- Race: 1 if White, 0 otherwise

In [180]:
# Country: 1 if 'United States', 0 otherwise (missing values also become 0).
# Assign the column directly rather than through `.loc[:, 'country']`: on
# pandas >= 2.0 the `.loc` form writes into the existing column and keeps its
# original dtype, which would hide the column from `select_dtypes('number')`.
# NOTE(review): re-running this cell on the already recoded column yields all 0.
user_info['country'] = user_info['country'].eq('United States').astype(int)

In [181]:
# Education levels from lowest to highest; the position in the list is the code.
education_order = [
    'Less than a high school diploma',
    'High school diploma',
    'Some college or vocational training',
    '2-year college degree',
    '4-year college degree',
    'Post-college degree',
]
education_cat = pd.Categorical(
    user_info['education_level'],
    categories=education_order,
    ordered=True,
)
# Keep the categorical column and add its integer codes alongside it
# (values outside `education_order`, including missing ones, get -1).
user_info['education_level'] = education_cat
user_info['education_level_numeric'] = education_cat.codes

In [182]:
# Gender codes follow the list position: Male=0, Female=1, Other=2
# (values outside the list, including missing ones, get -1).
gender_order = ['Male', 'Female', 'Other']
user_info['gender'] = pd.Categorical(
    user_info['gender'],
    categories=gender_order,
    ordered=True,
).codes

In [184]:
# Marital status codes follow the list position
# (values outside the list, including missing ones, get -1).
maritalstatus_order = [
    'Single Never Married',
    'Married or Domestic Partnership',
    'Divorced',
    'Widowed',
    'Separated',
]
user_info['marital_status'] = pd.Categorical(
    user_info['marital_status'],
    categories=maritalstatus_order,
    ordered=True,
).codes

In [185]:
# Political party codes follow the list position
# (values outside the list, including missing ones, get -1).
politicalparty_order = [
    'Democrat',
    'Independent',
    'Republican',
    'Other Party',
    'Neutral',
]
user_info['political_party'] = pd.Categorical(
    user_info['political_party'],
    categories=politicalparty_order,
    ordered=True,
).codes

In [186]:
# Race: 1 if 'White', 0 otherwise (missing values also become 0).
# Assign the column directly rather than through `.loc[:, 'race']`: on
# pandas >= 2.0 the `.loc` form writes into the existing column and keeps its
# original dtype, which would hide the column from `select_dtypes('number')`.
# NOTE(review): re-running this cell on the already recoded column yields all 0.
user_info['race'] = user_info['race'].eq('White').astype(int)

In [193]:
# Per-user feature table keyed by stage. Education is taken from
# `education_level_numeric` (the integer codes) because `education_level`
# itself holds the categorical labels, not numbers.
user_features_numeric = user_info[[
    'stageId',
    'birth_year', 'CRT', 'income_max', 'income_min',
    'IRCS_GS', 'IRCS_GV', 'IRCS_IB', 'IRCS_IR', 'IRCS_IV', 'IRCS_RS',
    'political_fiscal', 'political_social', 'RME',
    'country', 'education_level_numeric', 'gender', 'marital_status', 'political_party', 'race',
]]

In [194]:
# Every numeric column of `user_info`, leaving out 'ID' if it is present.
numeric_frame = user_info.select_dtypes(include='number')
numeric_columns = numeric_frame.columns.difference(['ID'])

# Per-stage mean and standard deviation of each numeric column.
per_stage = user_info.groupby('stageId')
grouped_df = per_stage[numeric_columns].agg(['mean', 'std'])

# Collapse the (column, statistic) column pairs into flat "<column>_<statistic>" names.
grouped_df.columns = ['_'.join(pair).strip() for pair in grouped_df.columns.to_flat_index()]


In [198]:
# Combine the per-stage composition features with the messages and their
# dependent variables.
# NOTE(review): the composition table is the left side of this left join, so
# stages without messages are kept and messages whose stage has no composition
# row are dropped - confirm this is the intended direction.
conversations_with_dv_and_composition = pd.merge(
    grouped_df,
    conversations_with_dv,
    how="left",
    on="stageId",
)

# NOTE(review): the row index is written as an unnamed first column - confirm
# whether downstream readers rely on it.
conversations_with_dv_and_composition.to_csv('../raw_data/multi_task_conversations_with_dv_and_composition.csv')