In [1]:
import pandas as pd
import numpy as np
import os, sys, re, json
from ast import literal_eval
import random
from sklearn.model_selection import train_test_split, StratifiedKFold
import krippendorff
from sklearn.metrics import cohen_kappa_score
from metric_utils import CAC

# Construction

In [44]:
df = {
    'Model': [],
    'Experiment': [],
    'Proposer Belief': [],
    'Responder Belief': [],
    'Proposal': [],
    'Proposer Reasoning': [],
    'Decision': [],
    'Responder Reasoning': []
}

In [45]:
for model in os.listdir('../simulations'):
    print(model)
    curr_dir = os.path.join('../simulations', model)

    if not os.path.isdir(curr_dir):
        continue

    for setting in [('greedy', 'fair'), ('fair', 'greedy'), ('greedy', 'greedy'), ('selfless', 'greedy'), ('selfless', 'fair'), ('greedy', 'selfless'), ('fair', 'fair'), ('fair', 'selfless'), ('selfless', 'selfless')]:
        print(setting)
        set_dir = os.path.join(curr_dir, f'{setting[0]}-{setting[1]}')

        if not os.path.isdir(set_dir):
            continue

        for exp in os.listdir(set_dir):
            if 'tom' not in exp:
                continue
            print(exp)
            exp_dir = os.path.join(set_dir, exp)
            if not os.path.isdir(exp_dir):
                continue
            
            for _ in range(2):
                session = random.randint(1, 10)

                with open(os.path.join(exp_dir, f'{session}.json'), 'r') as f:
                    data = json.load(f)
                f.close()

                turn = random.randint(0, len(data) - 1)
                data = data[turn]

                df['Model'].append(model)
                df['Experiment'].append(exp.replace('belief_private_', ''))
                df['Proposer Belief'].append(setting[0])
                df['Responder Belief'].append(setting[1])
                df['Proposal'].append(data['proposer'])
                df['Proposer Reasoning'].append(data['proposer_tom'])
                df['Decision'].append(data['responder'])
                df['Responder Reasoning'].append(data['responder_tom'])

llama-3.1-8b-instant
('greedy', 'fair')
belief_private_tom-both
belief_private_tom-zero
belief_private_tom-first
('fair', 'greedy')
belief_private_tom-both
belief_private_tom-zero
belief_private_tom-first
('greedy', 'greedy')
belief_private_tom-both
belief_private_tom-zero
belief_private_tom-first
('selfless', 'greedy')
belief_private_tom-both
belief_private_tom-zero
belief_private_tom-first
('selfless', 'fair')
belief_private_tom-both
belief_private_tom-zero
belief_private_tom-first
('greedy', 'selfless')
belief_private_tom-both
belief_private_tom-zero
belief_private_tom-first
('fair', 'fair')
belief_private_tom-both
belief_private_tom-zero
belief_private_tom-first
('fair', 'selfless')
belief_private_tom-both
belief_private_tom-zero
belief_private_tom-first
('selfless', 'selfless')
belief_private_tom-both
belief_private_tom-zero
belief_private_tom-first
gpt-4o
('greedy', 'fair')
belief_private_tom-both
belief_private_tom-zero
belief_private_tom-first
('fair', 'greedy')
belief_private_

In [46]:
df = pd.DataFrame(df)

In [47]:
df.shape

(378, 8)

In [69]:
prop_df = df[['Model', 'Experiment', 'Proposer Belief', 'Responder Belief', 'Proposal', 'Proposer Reasoning']]
resp_df = df[['Model', 'Experiment', 'Proposer Belief', 'Responder Belief', 'Proposal', 'Decisions', 'Responder Reasoning']]

In [70]:
x1 = prop_df[prop_df['Model'] == 'deepseek-r1-distill-qwen-32b'].sample(frac=1).reset_index(drop=True)
x2 = prop_df[prop_df['Model'] == 'gpt-4o'].sample(frac=1).reset_index(drop=True)
x3 = prop_df[prop_df['Model'] == 'llama-3.3-70b-versatile'].sample(frac=1).reset_index(drop=True)

In [71]:
z1 = np.array_split(x1, 3)
z2 = np.array_split(x2, 3)
z3 = np.array_split(x3, 3)

  return bound(*args, **kwds)


In [89]:
df_ver = [pd.DataFrame([]) for _ in range(3)]

In [90]:
for i in range(3):
    df_ver[i] = pd.concat([df_ver[i], z1[i]])
    df_ver[i] = pd.concat([df_ver[i], z2[i]])
    df_ver[i] = pd.concat([df_ver[i], z3[i]])

    df_ver[i]['Proposal Consistency'] = ["" for _ in range(54)]
    df_ver[i]['Belief Consistency'] = ["" for _ in range(54)]

In [103]:
df_ver[0].reset_index(drop=True).to_csv('../human_verification/prop-s1.csv', columns=['Proposer Belief', 'Proposal', 'Proposer Reasoning', 'Proposal Consistency', 'Belief Consistency'])
df_ver[1].reset_index(drop=True).to_csv('../human_verification/prop-s2.csv', columns=['Proposer Belief', 'Proposal', 'Proposer Reasoning', 'Proposal Consistency', 'Belief Consistency'])
df_ver[2].reset_index(drop=True).to_csv('../human_verification/prop-s3.csv', columns=['Proposer Belief', 'Proposal', 'Proposer Reasoning', 'Proposal Consistency', 'Belief Consistency'])

In [77]:
y1 = resp_df[resp_df['Model'] == 'deepseek-r1-distill-qwen-32b'].sample(frac=1).reset_index(drop=True)
y2 = resp_df[resp_df['Model'] == 'gpt-4o'].sample(frac=1).reset_index(drop=True)
y3 = resp_df[resp_df['Model'] == 'llama-3.3-70b-versatile'].sample(frac=1).reset_index(drop=True)

In [78]:
a1 = np.array_split(y1, 3)
a2 = np.array_split(y2, 3)
a3 = np.array_split(y3, 3)

  return bound(*args, **kwds)


In [95]:
df_ver_r = [pd.DataFrame([]) for _ in range(3)]

In [96]:
for i in range(3):
    df_ver_r[i] = pd.concat([df_ver_r[i], a1[i]])
    df_ver_r[i] = pd.concat([df_ver_r[i], a2[i]])
    df_ver_r[i] = pd.concat([df_ver_r[i], a3[i]])

    df_ver_r[i]['Decision Consistency'] = ["" for _ in range(54)]
    df_ver_r[i]['Belief Consistency'] = ["" for _ in range(54)]

In [104]:
df_ver_r[0].reset_index(drop=True).to_csv('../human_verification/resp-s1.csv', columns=['Proposal', 'Responder Belief', 'Decision', 'Responder Reasoning', 'Decision Consistency', 'Belief Consistency'])
df_ver_r[1].reset_index(drop=True).to_csv('../human_verification/resp-s2.csv', columns=['Proposal', 'Responder Belief', 'Decision', 'Responder Reasoning', 'Decision Consistency', 'Belief Consistency'])
df_ver_r[2].reset_index(drop=True).to_csv('../human_verification/resp-s3.csv', columns=['Proposal', 'Responder Belief', 'Decision', 'Responder Reasoning', 'Decision Consistency', 'Belief Consistency'])

# Statistics

In [2]:
p1 = pd.read_csv('../human_verification/prop-s1.csv')
p2 = pd.read_csv('../human_verification/prop-s2.csv')
p3 = pd.read_csv('../human_verification/prop-s3.csv')

In [3]:
p1['q1'] = p1.apply(lambda x: 1 if 'yes' in x['q1'] else 0, axis=1)
p1['q2'] = p1.apply(lambda x: 1 if 'yes' in x['q2'] else 0, axis=1)

p2['q1'] = p2.apply(lambda x: 1 if 'yes' in x['q1'] else 0, axis=1)
p2['q2'] = p2.apply(lambda x: 1 if 'yes' in x['q2'] else 0, axis=1)

p3['q1'] = p3.apply(lambda x: 1 if 'yes' in x['q1'] else 0, axis=1)
p3['q2'] = p3.apply(lambda x: 1 if 'yes' in x['q2'] else 0, axis=1)

In [4]:
r1 = pd.read_csv('../human_verification/resp-s1.csv')
r2 = pd.read_csv('../human_verification/resp-s2.csv')
r3 = pd.read_csv('../human_verification/resp-s3.csv')

In [5]:
r1['q1'] = r1.apply(lambda x: 1 if 'yes' in x['q1'] else 0, axis=1)
r1['q2'] = r1.apply(lambda x: 1 if 'yes' in x['q2'] else 0, axis=1)

r2['q1'] = r2.apply(lambda x: 1 if 'yes' in x['q1'] else 0, axis=1)
r2['q2'] = r2.apply(lambda x: 1 if 'yes' in x['q2'] else 0, axis=1)

r3['q1'] = r3.apply(lambda x: 1 if 'yes' in x['q1'] else 0, axis=1)
r3['q2'] = r3.apply(lambda x: 1 if 'yes' in x['q2'] else 0, axis=1)

In [7]:
h_p1 = pd.read_csv('../human_verification/responses/prop-s1.csv')
h_p2 = pd.read_excel('../human_verification/responses/prop-s2.xlsx')
h_p3 = pd.read_csv('../human_verification/responses/prop-s3.csv')

In [8]:
h_p1['Proposal Consistency'] = h_p1.apply(lambda x: 1 if 'yes' in x['Proposal Consistency'] else 0, axis=1)
h_p1['Belief Consistency'] = h_p1.apply(lambda x: 1 if 'yes' in x['Belief Consistency'] else 0, axis=1)

h_p2['Proposal Consistency'] = h_p2.apply(lambda x: 1 if 'yes' in x['Proposal Consistency'] else 0, axis=1)
h_p2['Belief Consistency'] = h_p2.apply(lambda x: 1 if 'yes' in x['Belief Consistency'] else 0, axis=1)

h_p3['Proposal Consistency'] = h_p3.apply(lambda x: 1 if 'yes' in x['Proposal Consistency'] else 0, axis=1)
h_p3['Belief Consistency'] = h_p3.apply(lambda x: 1 if 'yes' in x['Belief Consistency'] else 0, axis=1)

In [9]:
h_r1 = pd.read_csv('../human_verification/responses/resp-s1.csv')
h_r2 = pd.read_excel('../human_verification/responses/resp-s2.xlsx')
h_r3 = pd.read_csv('../human_verification/responses/resp-s3.csv')

In [10]:
h_r1['Decision Consistency'] = h_r1.apply(lambda x: 1 if 'yes' in x['Decision Consistency'] else 0, axis=1)
h_r1['Belief Consistency'] = h_r1.apply(lambda x: 1 if 'yes' in x['Belief Consistency'] else 0, axis=1)

h_r2['Decision Consistency'] = h_r2.apply(lambda x: 1 if 'yes' in x['Decision Consistency'] else 0, axis=1)
h_r2['Belief Consistency'] = h_r2.apply(lambda x: 1 if 'yes' in x['Belief Consistency'] else 0, axis=1)

h_r3['Decision Consistency'] = h_r3.apply(lambda x: 1 if 'yes' in x['Decision Consistency'] else 0, axis=1)
h_r3['Belief Consistency'] = h_r3.apply(lambda x: 1 if 'yes' in x['Belief Consistency'] else 0, axis=1)

In [26]:
sum([h_r3['Decision Consistency'].mean(), h_r3['Belief Consistency'].mean(), h_p3['Proposal Consistency'].mean(), h_p3['Belief Consistency'].mean()])/4

np.float64(0.7962962962962963)

In [28]:
sum([h_r2['Decision Consistency'].mean(), h_r2['Belief Consistency'].mean(), h_p2['Proposal Consistency'].mean(), h_p2['Belief Consistency'].mean()])/4

np.float64(0.8518518518518519)

In [27]:
sum([h_r1['Decision Consistency'].mean(), h_r1['Belief Consistency'].mean(), h_p1['Proposal Consistency'].mean(), h_p1['Belief Consistency'].mean()])/4

np.float64(0.6898148148148149)

In [11]:
df = {
    'Model': [],
    'Experiment': [],
    'Proposer Belief': [],
    'Responder Belief': [],
    'Proposal': [],
    'Proposer Reasoning': [],
    'Decision': [],
    'Responder Reasoning': []
}

for model in os.listdir('../simulations'):
    print(model)
    curr_dir = os.path.join('../simulations', model)

    if not os.path.isdir(curr_dir):
        continue

    for setting in [('greedy', 'fair'), ('fair', 'greedy'), ('greedy', 'greedy'), ('selfless', 'greedy'), ('selfless', 'fair'), ('greedy', 'selfless'), ('fair', 'fair'), ('fair', 'selfless'), ('selfless', 'selfless')]:
        print(setting)
        set_dir = os.path.join(curr_dir, f'{setting[0]}-{setting[1]}')

        if not os.path.isdir(set_dir):
            continue

        for exp in os.listdir(set_dir):
            if 'tom' not in exp:
                continue
            print(exp)
            exp_dir = os.path.join(set_dir, exp)
            if not os.path.isdir(exp_dir):
                continue
            
            for session in range(10):
                try:
                    with open(os.path.join(exp_dir, f'{session+1}.json'), 'r') as f:
                        data = json.load(f)
                    f.close()

                    for turn in range(len(data)):
                        curr = data[turn]

                        df['Model'].append(model)
                        df['Experiment'].append(exp.replace('belief_private_', ''))
                        df['Proposer Belief'].append(setting[0])
                        df['Responder Belief'].append(setting[1])
                        df['Proposal'].append(curr['proposer'])
                        df['Proposer Reasoning'].append(curr['proposer_tom'])
                        df['Decision'].append(curr['responder'])
                        df['Responder Reasoning'].append(curr['responder_tom'])
                except FileNotFoundError as e:
                    continue

claude-3-5-haiku-20241022
('greedy', 'fair')
belief_private_tom-both
belief_private_tom-first
belief_private_tom-zero
('fair', 'greedy')
belief_private_tom-both
belief_private_tom-first
belief_private_tom-zero
('greedy', 'greedy')
belief_private_tom-both
belief_private_tom-first
belief_private_tom-zero
('selfless', 'greedy')
belief_private_tom-both
belief_private_tom-first
belief_private_tom-zero
('selfless', 'fair')
belief_private_tom-both
belief_private_tom-first
belief_private_tom-zero
('greedy', 'selfless')
belief_private_tom-both
belief_private_tom-first
belief_private_tom-zero
('fair', 'fair')
belief_private_tom-both
belief_private_tom-first
belief_private_tom-zero
('fair', 'selfless')
belief_private_tom-both
belief_private_tom-first
belief_private_tom-zero
('selfless', 'selfless')
belief_private_tom-both
belief_private_tom-first
belief_private_tom-zero
deepseek-r1-distill-qwen-32b
('greedy', 'fair')
belief_private_tom-both
belief_private_tom-first
belief_private_tom-zero
('fair'

In [12]:
df = pd.DataFrame(df)

In [13]:
p1['reasoning_type'] = ['' for _ in range(p1.shape[0])]
p2['reasoning_type'] = ['' for _ in range(p2.shape[0])]

p1['model'] = ['' for _ in range(p1.shape[0])]
p2['model'] = ['' for _ in range(p2.shape[0])]

p3['reasoning_type'] = ['' for _ in range(p3.shape[0])]
p3['model'] = ['' for _ in range(p3.shape[0])]

r1['reasoning_type'] = ['' for _ in range(r1.shape[0])]
r2['reasoning_type'] = ['' for _ in range(r2.shape[0])]

r1['model'] = ['' for _ in range(r1.shape[0])]
r2['model'] = ['' for _ in range(r2.shape[0])]

r3['reasoning_type'] = ['' for _ in range(r3.shape[0])]
r3['model'] = ['' for _ in range(r3.shape[0])]

In [15]:
reasonings = []

for i in range(len(df)):
    for j in range(len(p1)):
        if p1['Proposer Reasoning'][j] == df['Proposer Reasoning'][i] and p1['Proposer Belief'][j] == df['Proposer Belief'][i]:
            p1.at[j, 'reasoning_type'] = df['Experiment'][i]
            p1.at[j, 'model'] = df['Model'][i]
        
        if p2['Proposer Reasoning'][j] == df['Proposer Reasoning'][i] and p2['Proposer Belief'][j] == df['Proposer Belief'][i]:
            p2.at[j, 'reasoning_type'] = df['Experiment'][i]
            p2.at[j, 'model'] = df['Model'][i]
        
        if p3['Proposer Reasoning'][j] == df['Proposer Reasoning'][i] and p3['Proposer Belief'][j] == df['Proposer Belief'][i]:
            p3.at[j, 'reasoning_type'] = df['Experiment'][i]
            p3.at[j, 'model'] = df['Model'][i]
        

        if r1['Responder Reasoning'][j] == df['Responder Reasoning'][i] and r1['Responder Belief'][j] == df['Responder Belief'][i]:
            r1.at[j, 'reasoning_type'] = df['Experiment'][i]
            r1.at[j, 'model'] = df['Model'][i]
        
        if r2['Responder Reasoning'][j] == df['Responder Reasoning'][i] and r2['Responder Belief'][j] == df['Responder Belief'][i]:
            r2.at[j, 'reasoning_type'] = df['Experiment'][i]
            r2.at[j, 'model'] = df['Model'][i]

        if r3['Responder Reasoning'][j] == df['Responder Reasoning'][i] and r3['Responder Belief'][j] == df['Responder Belief'][i]:
            r3.at[j, 'reasoning_type'] = df['Experiment'][i]
            r3.at[j, 'model'] = df['Model'][i]

In [17]:
df

Unnamed: 0,Model,Experiment,Proposer Belief,Responder Belief,Proposal,Proposer Reasoning,Decision,Responder Reasoning
0,claude-3-5-haiku-20241022,tom-both,greedy,fair,Proposal: I get $9 and you get $1,Round 1 Reasoning:\n\n1. My Beliefs:\n- As a g...,"Based on my previous reasoning and analysis, w...",Let me analyze the situation based on the give...
1,claude-3-5-haiku-20241022,tom-both,greedy,fair,Proposal: I get $7 and you get $3,Round 2 Reasoning:\n\n1. My Beliefs:\n- My ini...,"Based on my previous reasoning and analysis, w...",Let me analyze the new proposal:\n\n1. My beli...
2,claude-3-5-haiku-20241022,tom-both,greedy,fair,Proposal: I get $6 and you get $4,Round 3 Reasoning:\n\n1. My Beliefs:\n- Player...,"Based on my previous reasoning and analysis, w...",Let me analyze the new proposal:\n\n1. My beli...
3,claude-3-5-haiku-20241022,tom-both,greedy,fair,Proposal: I get $5 and you get $5,Round 4 Reasoning:\n\n1. My Beliefs:\n- Player...,"Based on my previous reasoning and analysis, w...",Let me analyze the new proposal:\n\n1. My beli...
4,claude-3-5-haiku-20241022,tom-both,greedy,fair,Based on my reasoning and belief of being a gr...,Let me analyze the situation based on the give...,Based on my previous reasoning and analysis of...,Let me analyze the situation carefully:\n\n1. ...
...,...,...,...,...,...,...,...,...
4930,o3-mini,tom-zero,selfless,selfless,Proposal: I get $0 and you get $10.,"I'm sorry, but I can't share details about my ...",Decision: Accept,"I'm sorry, but I can’t share details about my ..."
4931,o3-mini,tom-zero,selfless,selfless,Proposal: I get $0 and you get $10.,"I'm sorry, but I can't share that information.",Decision: Accept,"I'm sorry, but I can’t share details about my ..."
4932,o3-mini,tom-zero,selfless,selfless,Proposal: I get $0 and you get $10,"I'm sorry, but I can’t share details about my ...",Decision: Accept,"I'm sorry, but I can’t share details about my ..."
4933,o3-mini,tom-zero,selfless,selfless,Proposal: I get $0 and you get $10.,"I'm sorry, but I can't share details about my ...",Decision: Accept,"I'm sorry, but I can’t share details about my ..."


In [19]:
h_p1['reasoning_type'] = p1['reasoning_type']
h_p2['reasoning_type'] = p2['reasoning_type']

h_r1['reasoning_type'] = r1['reasoning_type']
h_r2['reasoning_type'] = r2['reasoning_type']

h_r3['reasoning_type'] = r3['reasoning_type']
h_p3['reasoning_type'] = p3['reasoning_type']


h_p1['model'] = p1['model']
h_p2['model'] = p2['model']

h_r1['model'] = r1['model']
h_r2['model'] = r2['model']

h_r3['model'] = r3['model']
h_p3['model'] = p3['model']

In [77]:
# aggregate columns by reasoning_type for each df
pd.concat([h_p1, h_p2, h_p3]).groupby('reasoning_type').agg({
    'Proposal Consistency': 'mean',
    'Belief Consistency': 'mean',
})

Unnamed: 0_level_0,Proposal Consistency,Belief Consistency
reasoning_type,Unnamed: 1_level_1,Unnamed: 2_level_1
tom-both,0.62963,0.907407
tom-first,0.555556,0.740741
tom-zero,0.722222,0.907407


In [78]:
pd.concat([h_p1, h_p2, h_p3]).groupby('model').agg({
    'Proposal Consistency': 'mean',
    'Belief Consistency': 'mean',
})

Unnamed: 0_level_0,Proposal Consistency,Belief Consistency
model,Unnamed: 1_level_1,Unnamed: 2_level_1
deepseek-r1-distill-qwen-32b,0.62963,0.888889
gpt-4o,0.592593,0.759259
llama-3.3-70b-versatile,0.685185,0.907407


In [20]:
pd.concat([h_r1, h_r2, h_r3]).groupby('reasoning_type').agg({
    'Decision Consistency': 'mean',
    'Belief Consistency': 'mean',
})

Unnamed: 0_level_0,Decision Consistency,Belief Consistency
reasoning_type,Unnamed: 1_level_1,Unnamed: 2_level_1
,0.876543,0.753086


In [81]:
pd.concat([h_r1, h_r2, h_r3]).groupby('model').agg({
    'Decision Consistency': 'mean',
    'Belief Consistency': 'mean',
})

Unnamed: 0_level_0,Decision Consistency,Belief Consistency
model,Unnamed: 1_level_1,Unnamed: 2_level_1
deepseek-r1-distill-qwen-32b,0.944444,0.685185
gpt-4o,0.814815,0.740741
llama-3.3-70b-versatile,0.87037,0.833333


In [86]:
pd.concat([p1, p2, p3]).groupby('model').agg({
    'q1': 'mean',
    'q2': 'mean',
})

Unnamed: 0_level_0,q1,q2
model,Unnamed: 1_level_1,Unnamed: 2_level_1
deepseek-r1-distill-qwen-32b,0.611111,0.759259
gpt-4o,0.740741,0.685185
llama-3.3-70b-versatile,0.851852,0.87037


In [87]:
pd.concat([p1, p2, p3]).groupby('reasoning_type').agg({
    'q1': 'mean',
    'q2': 'mean',
})

Unnamed: 0_level_0,q1,q2
reasoning_type,Unnamed: 1_level_1,Unnamed: 2_level_1
tom-both,0.740741,0.796296
tom-first,0.740741,0.722222
tom-zero,0.722222,0.796296


In [88]:
pd.concat([r1, r2, r3]).groupby('model').agg({
    'q1': 'mean',
    'q2': 'mean',
})

Unnamed: 0_level_0,q1,q2
model,Unnamed: 1_level_1,Unnamed: 2_level_1
deepseek-r1-distill-qwen-32b,0.962963,0.703704
gpt-4o,0.888889,0.722222
llama-3.3-70b-versatile,0.962963,0.777778


In [89]:
pd.concat([r1, r2, r3]).groupby('reasoning_type').agg({
    'q1': 'mean',
    'q2': 'mean',
})

Unnamed: 0_level_0,q1,q2
reasoning_type,Unnamed: 1_level_1,Unnamed: 2_level_1
tom-both,0.962963,0.740741
tom-first,0.925926,0.611111
tom-zero,0.925926,0.851852


In [98]:
h_r3['reasoning_type'].value_counts()

reasoning_type
tom-first    22
tom-zero     19
tom-both     13
Name: count, dtype: int64

In [115]:
pq1_annot = {
    'p1_q1_h': h_p1['Proposal Consistency'].values.tolist(),
    'p1_q1': p1['q1'].values.tolist(),
    'p2_q1_h': h_p2['Proposal Consistency'].values.tolist(),
    'p2_q1': p2['q1'].values.tolist(),
    'p3_q1_h': h_p3['Proposal Consistency'].values.tolist(),
    'p3_q1': p3['q1'].values.tolist(),
    'm1': p1['model'].values.tolist(),
    'm2': p2['model'].values.tolist(),
    'm3': p3['model'].values.tolist(),
    'rt1': p1['reasoning_type'].values.tolist(),
    'rt2': p2['reasoning_type'].values.tolist(),
    'rt3': p3['reasoning_type'].values.tolist(),
}

pq2_annot = {
    'p1_q2_h': h_p1['Belief Consistency'].values.tolist(),
    'p1_q2': p1['q2'].values.tolist(),
    'p2_q2_h': h_p2['Belief Consistency'].values.tolist(),
    'p2_q2': p2['q2'].values.tolist(),
    'p3_q2_h': h_p3['Belief Consistency'].values.tolist(),
    'p3_q2': p3['q2'].values.tolist(),
    'm1': p1['model'].values.tolist(),
    'm2': p2['model'].values.tolist(),
    'm3': p3['model'].values.tolist(),
    'rt1': p1['reasoning_type'].values.tolist(),
    'rt2': p2['reasoning_type'].values.tolist(),
    'rt3': p3['reasoning_type'].values.tolist(),
}

In [116]:
rq1_annot = {
    'r1_q1_h': h_r1['Decision Consistency'].values.tolist(),
    'r1_q1': r1['q1'].values.tolist(),
    'r2_q1_h': h_r2['Decision Consistency'].values.tolist(),
    'r2_q1': r2['q1'].values.tolist(),
    'r3_q1_h': h_r3['Decision Consistency'].values.tolist(),
    'r3_q1': r3['q1'].values.tolist(),
    'm1': r1['model'].values.tolist(),
    'm2': r2['model'].values.tolist(),
    'm3': r3['model'].values.tolist(),
    'rt1': r1['reasoning_type'].values.tolist(),
    'rt2': r2['reasoning_type'].values.tolist(),
    'rt3': r3['reasoning_type'].values.tolist(),
}

rq2_annot = {
    'r1_q2_h': h_r1['Belief Consistency'].values.tolist(),
    'r1_q2': r1['q2'].values.tolist(),
    'r2_q2_h': h_r2['Belief Consistency'].values.tolist(),
    'r2_q2': r2['q2'].values.tolist(),
    'r3_q2_h': h_r3['Belief Consistency'].values.tolist(),
    'r3_q2': r3['q2'].values.tolist(),
    'm1': r1['model'].values.tolist(),
    'm2': r2['model'].values.tolist(),
    'm3': r3['model'].values.tolist(),
    'rt1': r1['reasoning_type'].values.tolist(),
    'rt2': r2['reasoning_type'].values.tolist(),
    'rt3': r3['reasoning_type'].values.tolist(),
}

In [117]:
pq1 = pd.DataFrame(pq1_annot)
pq2 = pd.DataFrame(pq2_annot)

rq1 = pd.DataFrame(rq1_annot)
rq2 = pd.DataFrame(rq2_annot)

In [105]:
!pip install krippendorff

Collecting krippendorff
  Using cached krippendorff-0.8.1-py3-none-any.whl.metadata (3.0 kB)
Using cached krippendorff-0.8.1-py3-none-any.whl (18 kB)
Installing collected packages: krippendorff
Successfully installed krippendorff-0.8.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [133]:
overall = np.array([x1[:, 0], x1[:, 1]]).T

In [134]:
overall = np.concatenate((overall, np.array([x2[:, 0], x2[:, 1]]).T), axis=0)
overall = np.concatenate((overall, np.array([x3[:, 0], x3[:, 1]]).T), axis=0)

overall = np.concatenate((overall, np.array([x1[:, 2], x1[:, 3]]).T), axis=0)
overall = np.concatenate((overall, np.array([x1[:, 4], x3[:, 5]]).T), axis=0)
overall = np.concatenate((overall, np.array([x2[:, 2], x2[:, 3]]).T), axis=0)
overall = np.concatenate((overall, np.array([x2[:, 4], x2[:, 5]]).T), axis=0)
overall = np.concatenate((overall, np.array([x3[:, 2], x3[:, 3]]).T), axis=0)
overall = np.concatenate((overall, np.array([x3[:, 4], x3[:, 5]]).T), axis=0)

In [136]:
krippendorff.alpha(overall.T, level_of_measurement='nominal')

0.30526094276094284

In [106]:
import krippendorff

In [180]:
pq1

Unnamed: 0,p1_q1_h,p1_q1,p2_q1_h,p2_q1,p3_q1_h,p3_q1,m1,m2,m3,rt1,rt2,rt3
0,1,1,0,0,1,1,deepseek-r1-distill-qwen-32b,deepseek-r1-distill-qwen-32b,deepseek-r1-distill-qwen-32b,tom-zero,tom-zero,tom-both
1,1,1,1,1,0,0,deepseek-r1-distill-qwen-32b,deepseek-r1-distill-qwen-32b,deepseek-r1-distill-qwen-32b,tom-first,tom-first,tom-first
2,1,0,0,0,0,0,deepseek-r1-distill-qwen-32b,deepseek-r1-distill-qwen-32b,deepseek-r1-distill-qwen-32b,tom-zero,tom-zero,tom-both
3,0,0,0,0,1,1,deepseek-r1-distill-qwen-32b,deepseek-r1-distill-qwen-32b,deepseek-r1-distill-qwen-32b,tom-both,tom-both,tom-both
4,1,1,0,1,0,0,deepseek-r1-distill-qwen-32b,deepseek-r1-distill-qwen-32b,deepseek-r1-distill-qwen-32b,tom-zero,tom-first,tom-first
5,1,1,1,1,0,0,deepseek-r1-distill-qwen-32b,deepseek-r1-distill-qwen-32b,deepseek-r1-distill-qwen-32b,tom-first,tom-first,tom-first
6,0,0,1,1,0,0,deepseek-r1-distill-qwen-32b,deepseek-r1-distill-qwen-32b,deepseek-r1-distill-qwen-32b,tom-zero,tom-both,tom-zero
7,1,1,1,1,0,0,deepseek-r1-distill-qwen-32b,deepseek-r1-distill-qwen-32b,deepseek-r1-distill-qwen-32b,tom-first,tom-both,tom-zero
8,1,1,1,1,1,1,deepseek-r1-distill-qwen-32b,deepseek-r1-distill-qwen-32b,deepseek-r1-distill-qwen-32b,tom-zero,tom-first,tom-zero
9,1,1,0,1,1,1,deepseek-r1-distill-qwen-32b,deepseek-r1-distill-qwen-32b,deepseek-r1-distill-qwen-32b,tom-first,tom-both,tom-both


In [189]:
alpha_all = {key: [] for key in ["Component", "Proposal Consistency", "Proposer Belief Consistency", "Decision Consistency", "Responder Belief Consistency"]}

for col, data in zip(['Proposal Consistency', 'Proposer Belief Consistency', 'Decision Consistency', 'Responder Belief Consistency'], [pq1, pq2, rq1, rq2]):
    for model in ['deepseek-r1-distill-qwen-32b', 'gpt-4o', 'llama-3.3-70b-versatile']:
        temp = data[data['m1'] == model].values
        
        curr_data = np.array([temp[:, 0], temp[:, 1]])
        curr_data = np.concatenate((curr_data, np.array([temp[:, 2], temp[:, 3]])), axis=1)
        curr_data = np.concatenate((curr_data, np.array([temp[:, 4], temp[:, 5]])), axis=1)

        # curr_data = curr_data.T

        if model not in alpha_all['Component']:
            alpha_all['Component'].append(model)
        alpha_all[col].append(round(krippendorff.alpha(reliability_data=curr_data.astype(int), level_of_measurement='nominal'), 2))

for col, data in zip(['Proposal Consistency', 'Proposer Belief Consistency', 'Decision Consistency', 'Responder Belief Consistency'], [pq1, pq2, rq1, rq2]):
    for r_type in ['tom-zero', 'tom-first', 'tom-both']:
        temp = data[data['rt1'] == r_type].values
        temp = np.concatenate((temp, data[data['rt2'] == r_type].values), axis=0)
        temp = np.concatenate((temp, data[data['rt3'] == r_type].values), axis=0)
        
        curr_data = np.array([temp[:, 0], temp[:, 1]])
        curr_data = np.concatenate((curr_data, np.array([temp[:, 2], temp[:, 3]])), axis=1)
        curr_data = np.concatenate((curr_data, np.array([temp[:, 4], temp[:, 5]])), axis=1)

        print(curr_data.shape)

        # curr_data = curr_data.T

        if r_type not in alpha_all['Component']:
            alpha_all['Component'].append(r_type)
        alpha_all[col].append(round(krippendorff.alpha(reliability_data=curr_data.astype(int), level_of_measurement='nominal'), 2))

(2, 162)
(2, 162)
(2, 162)
(2, 162)
(2, 162)
(2, 162)
(2, 162)
(2, 162)
(2, 162)
(2, 162)
(2, 162)
(2, 162)


In [190]:
pd.DataFrame(alpha_all)

Unnamed: 0,Component,Proposal Consistency,Proposer Belief Consistency,Decision Consistency,Responder Belief Consistency
0,deepseek-r1-distill-qwen-32b,0.81,0.3,0.79,0.09
1,gpt-4o,0.17,0.27,0.27,0.02
2,llama-3.3-70b-versatile,0.23,-0.11,0.16,0.47
3,tom-zero,0.49,0.1,0.48,0.01
4,tom-first,0.38,0.18,0.29,0.21
5,tom-both,0.41,0.34,0.24,0.23


In [17]:
corrs = []

for col, df in zip(["Proposal Consistency", "Proposer Belief Consistency", "Decision Consistency", "Responder Belief Consistency"], [pq1, pq2, rq1, rq2]):
    vals = df.values
    print((vals[:, 0] == 1).mean()*100)
    print((vals[:, 2] == 1).mean()*100)
    print((vals[:, 4] == 1).mean()*100)
    
    print('---')
    # corrs.append(vals[0:2, 0:2])
    # corrs.append(vals[2:4, 2:4])
    # corrs.append(vals[4:6, 4:6])

40.74074074074074
75.92592592592592
74.07407407407408
---
81.48148148148148
88.88888888888889
85.18518518518519
---
77.77777777777779
100.0
85.18518518518519
---
75.92592592592592
75.92592592592592
74.07407407407408
---
