In [1]:
import pandas as pd
import numpy as np
import plotly.io as pio
pio.renderers.default='notebook'

import random
import tqdm
import warnings
import plotly
import plotly.graph_objects as go

from scipy.stats import spearmanr
from plotly.subplots import make_subplots
from collections import Counter

DATA = "../data/experiment_2"

MODELS = [
    "gpt3_rating",
    "gpt3_text1_rating",
    "gpt3.5_text2_rating",
    "gpt3.5_text3_rating",
    "gpt3.5_chat_rating",
    "gpt4_rating",
    "gpt3_scm",
    "gpt4_scm",
    "human_scm"
]

np.random.seed(1)
random.seed(1)

NUM_PARTICIPANTS = 610

# Load experiment 2 participant ratings

In [2]:
human_df = pd.read_csv(f"{DATA}/clean_human_ratings.csv", index_col=0)

# Filter out control and tutorial arguments
human_df = human_df[~human_df["is_control"]].dropna()
human_df["ranking"] = human_df.groupby(["pid", "is_single_premise"])["rating"].rank(pct="True", ascending=True)

# Map participants to blocks based on the multi premise arguments that they saw
blocks = {}
block_splits = {}
for pid, pid_df in human_df[~human_df["is_single_premise"]].groupby(["pid"]):
    block = tuple(pid_df["argument"].sort_values().tolist())
    if block not in blocks:
        blocks[block] = [pid]
    else:
        blocks[block].append(pid)
        
    bs = (pid_df.iloc[0]["domain"], pid_df.iloc[0]["conclusion_type"])
    if bs not in block_splits:
        block_splits[bs] = []
    block_splits[bs].append(block)
    
pid_to_block = {}
for i,pids in enumerate(blocks.values()):
    for pid in pids:
        pid_to_block[pid] = i
human_df["block"] = human_df["pid"].map(pid_to_block)

assert len(human_df["pid"].unique()) == NUM_PARTICIPANTS
assert len(pid_to_block) == NUM_PARTICIPANTS
assert len(blocks) == 61
assert all(len(set(v))== 11 if k == ('Birds', 'General') else len(set(v)) == 10 for k,v in block_splits.items())
assert all(len(set(v)) == 10 for v in blocks.values())

human_df.head()

Unnamed: 0,pid,tid,argument,domain,conclusion_type,is_single_premise,is_control,premises,conclusion,rating,light_cut,medium_cut,hard_cut,ranking,block
0,0,0,"(('Eagles',), 'All birds')",Birds,General,True,False,"('Eagles',)",All birds,0,False,False,False,0.041667,
1,0,1,"(('Crows',), 'All birds')",Birds,General,True,False,"('Crows',)",All birds,7,False,False,False,0.583333,
2,0,10,"(('Vultures',), 'All birds')",Birds,General,True,False,"('Vultures',)",All birds,3,False,False,False,0.1875,
3,0,11,"(('Falcons',), 'All birds')",Birds,General,True,False,"('Falcons',)",All birds,3,False,False,False,0.1875,
4,0,12,"(('Herons',), 'All birds')",Birds,General,True,False,"('Herons',)",All birds,7,False,False,False,0.583333,


In [3]:
# Check our blocks
argument_block_map = {}
for split in block_splits:
    for block in block_splits[split]:
        for argument in block:
            if argument in argument_block_map:
                argument_block_map[argument].add(block)
            else:
                argument_block_map[argument] = set([block])
            
for argument, argument_blocks in argument_block_map.items():
    if len(set(argument_blocks)) > 1:
        print(argument)

(('Sparrows', 'Robins'), 'All birds')
(('Magpies', 'Ducks'), 'All birds')
(('Turkeys', 'Roosters'), 'All birds')
(('Seagulls', 'Ducks'), 'All birds')
(('Chickens', 'Magpies'), 'All birds')
(('Herons', 'Eagles'), 'All birds')
(('Sparrows', 'Ducks'), 'All birds')
(('Falcons', 'Vultures'), 'All birds')
(('Swans', 'Parrots'), 'All birds')
(('Swans', 'Sparrows'), 'All birds')


Some bird arguments appear in more than one block because we had 11 blocks for general birds rather than 10.

# Load experiment 2 model ratings

In [4]:
model_df = pd.read_csv(f"{DATA}/model_ratings.csv", index_col=0)
model_df = model_df[["argument"] + MODELS].reset_index(drop=True)

# Add in split labels to model_df
cols = model_df.columns.tolist()[1:]
s = model_df.shape[0]
model_df = model_df.merge(human_df[["argument","domain","conclusion_type","is_single_premise"]].drop_duplicates(), on="argument")
cols = ["argument","domain","conclusion_type","is_single_premise"] + cols
model_df = model_df[cols]

assert set(human_df["argument"]) == set(model_df["argument"])
assert len(set(model_df["argument"])) == model_df.shape[0]

model_df.head()

Unnamed: 0,argument,domain,conclusion_type,is_single_premise,gpt3_rating,gpt3_text1_rating,gpt3.5_text2_rating,gpt3.5_text3_rating,gpt3.5_chat_rating,gpt4_rating,gpt3_scm,gpt4_scm,human_scm
0,"(('Airplanes', 'Buses'), 'All vehicles')",Vehicles,General,False,0.0,85.657166,74.211105,59.515127,50.0,70.0,9.769913,10.916667,10.772222
1,"(('Airplanes', 'Helicopters'), 'All vehicles')",Vehicles,General,False,0.0,41.889276,22.741273,32.105938,50.0,55.0,8.313786,7.666667,8.063889
2,"(('Airplanes', 'Taxis'), 'All vehicles')",Vehicles,General,False,0.0,78.416258,79.044539,63.641823,,70.0,9.363121,10.583333,10.622222
3,"(('Airplanes', 'Trams'), 'Trains')",Vehicles,Specific,False,0.0,85.249731,82.56942,98.796711,100.0,85.0,12.175505,12.260417,13.163889
4,"(('Airplanes', 'Zeppelins'), 'All vehicles')",Vehicles,General,False,0.0,89.903792,17.75267,28.767269,90.0,50.0,7.396477,7.541667,8.252778


# Bootstrap model correlations

In [37]:
ITERS = 1000

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
    rows = []
    for i in tqdm.tqdm(range(ITERS)):

        # Sample N participants with replacement from each block
        pid_sample = []
        for block_pids in blocks.values():
            
            block_sample = random.choices(block_pids, k=len(block_pids))
            
            assert len(block_sample) == len(block_pids)
            
            pid_sample += block_sample
        
        assert len(pid_sample) == NUM_PARTICIPANTS

        # Construct a new bootstrapped 'human_df' dataframe from the above sample
        tdf = pd.DataFrame([], columns=human_df.columns)
        for ps in pid_sample:
            ptdf = human_df[human_df["pid"] == ps]
            tdf = pd.concat([tdf, ptdf])
        
        assert tdf.shape[0] == human_df.shape[0]
            
        # Construct a new aggregated human ranking dataframe from the above dataframe
        sample_human_df = pd.DataFrame({"sample_human_ranking": tdf[["argument", "ranking"]].groupby("argument")["ranking"].mean()}).reset_index()
        
        assert sample_human_df.shape[0] == len(set(sample_human_df["argument"]))
        
        # Compare model ratings/rankings to sample human rankings
        for g, gdf in model_df.groupby(["domain", "conclusion_type", "is_single_premise"]):
            domain, conclusion_type, argument_type = g
            
            sample_human_model_df = gdf.merge(sample_human_df, on="argument")
            
            assert sample_human_model_df.shape[0] == gdf.shape[0]

            # Calculate Spearman R for all models
            statistics, pvalues = [], []
            for model in MODELS:
                spearman = spearmanr(
                    sample_human_model_df["sample_human_ranking"].tolist(), 
                    sample_human_model_df[model].tolist(),
                    nan_policy="omit"
                )
                
                statistics.append(spearman.correlation)
                pvalues.append(spearman.pvalue)

            rows.append((i, domain, conclusion_type, argument_type) + tuple(statistics) + tuple(pvalues))

bootstrap_df = pd.DataFrame(rows, columns=["i", "domain", "conclusion_type", "is_single_premise"] + [f"{r}_spearmanr" for r in MODELS] + [f"{r}_pval" for r in MODELS])
bootstrap_df = bootstrap_df.rename({c: f'{c.replace("_rating", "")}' for c in bootstrap_df.columns}, axis=1)
bootstrap_df.to_csv(f"{DATA}/results/blockwise_participant_bootstrap.csv")

100%|███████████████████████████████| 1000/1000 [39:47<00:00,  2.39s/it]


In [38]:
bootstrap_df = pd.read_csv(f"{DATA}/results/blockwise_participant_bootstrap.csv", index_col=0)

rows = []
for g, gdf in tqdm.tqdm(bootstrap_df.groupby(["domain", "conclusion_type", "is_single_premise"])):
    means, ses = [], []
    for model in MODELS:
        model = model.replace("_rating", "")
        means.append(np.mean(gdf[f"{model}_spearmanr"]))
        ses.append(np.std(gdf[f"{model}_spearmanr"], ddof=1))
    rows.append(g + tuple(means) + tuple(ses))

bootstrap_summary_df = pd.DataFrame(rows, columns=["domain", "conclusion_type", "is_single_premise"] + [f"{m}_mean" for m in MODELS] + [f"{m}_se" for m in MODELS])
bootstrap_summary_df = bootstrap_summary_df.rename({c: f'{c.replace("_rating", "")}_spearmanr' if "gpt" in c or "scm" in c else c for c in bootstrap_summary_df.columns}, axis=1)
bootstrap_summary_df.to_csv(f"{DATA}/results/blockwise_participant_bootstrap_summary.csv")
bootstrap_summary_df

100%|█████████████████████████████████| 12/12 [00:00<00:00, 1492.15it/s]


Unnamed: 0,domain,conclusion_type,is_single_premise,gpt3_mean_spearmanr,gpt3_text1_mean_spearmanr,gpt3.5_text2_mean_spearmanr,gpt3.5_text3_mean_spearmanr,gpt3.5_chat_mean_spearmanr,gpt4_mean_spearmanr,gpt3_scm_mean_spearmanr,...,human_scm_mean_spearmanr,gpt3_se_spearmanr,gpt3_text1_se_spearmanr,gpt3.5_text2_se_spearmanr,gpt3.5_text3_se_spearmanr,gpt3.5_chat_se_spearmanr,gpt4_se_spearmanr,gpt3_scm_se_spearmanr,gpt4_scm_se_spearmanr,human_scm_se_spearmanr
0,Birds,General,False,,0.021884,0.055346,0.279387,0.098381,0.233251,0.007667,...,0.171576,,0.061032,0.059287,0.071098,0.058819,0.072145,0.062361,0.072451,0.089803
1,Birds,General,True,,0.382415,0.305921,0.109419,0.032391,0.364681,0.245743,...,0.146546,,0.079212,0.071366,0.066782,0.092752,0.075509,0.070703,0.07233,0.071407
2,Birds,Specific,False,,0.197246,0.411904,0.327905,0.472194,0.265485,0.474981,...,0.473035,,0.057633,0.053736,0.06186,0.059341,0.056928,0.062002,0.060769,0.070659
3,Birds,Specific,True,,0.371659,0.252523,0.425692,0.438994,0.417338,0.511474,...,0.554556,,0.031609,0.036504,0.028531,0.033764,0.033975,0.033763,0.038935,0.034468
4,Mammals,General,False,,0.054404,0.041103,0.293893,-0.050206,0.183427,0.24847,...,0.369043,,0.07037,0.052151,0.073944,0.058783,0.079778,0.06575,0.069198,0.082646
5,Mammals,General,True,,0.636571,0.525624,0.338115,0.167898,0.010888,0.685547,...,0.592804,,0.053659,0.059356,0.064716,0.071517,0.086863,0.051721,0.063952,0.065705
6,Mammals,Specific,False,,0.520238,0.722954,0.552505,0.807703,0.740692,0.748993,...,0.822756,,0.026143,0.026189,0.028889,0.026354,0.02803,0.03178,0.030003,0.029073
7,Mammals,Specific,True,,0.541083,0.645416,0.621527,0.763255,0.706085,0.710837,...,0.81124,,0.021125,0.020878,0.020121,0.018905,0.019945,0.019663,0.020303,0.019978
8,Vehicles,General,False,,0.322138,0.509027,0.530854,0.175959,0.395483,0.497397,...,0.531232,,0.056972,0.060328,0.059456,0.057761,0.053417,0.055465,0.057793,0.060813
9,Vehicles,General,True,,0.854876,0.693178,0.835738,0.379631,0.640777,0.672265,...,0.88207,,0.017597,0.026782,0.022587,0.02765,0.020848,0.025303,0.029891,0.019079


# Calculate Split-half Reliability

In [77]:
ITERS = 1000

with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    rows = []
    for i in tqdm.tqdm(range(ITERS)):
        for split, split_df in human_df.groupby(["domain", "conclusion_type", "is_single_premise"]):
            domain, conclusion_type, is_single_premise = split
            a_block_ratings = pd.DataFrame([], columns=split_df.columns)
            b_block_ratings = pd.DataFrame([], columns=split_df.columns)
    
            # Split each block of participants in two
            for block, block_df in split_df.groupby("block"):

                # Randomly divide this block's participants into sets A and B
                block_pids = list(block_df["pid"].unique())
                a_pids = random.sample(block_pids, k=len(block_pids)//2)
                b_pids = [bp for bp in block_pids if bp not in a_pids]
                a_block_df = block_df[block_df["pid"].isin(a_pids)]
                b_block_df = block_df[block_df["pid"].isin(b_pids)]

                a_block_ratings = pd.concat([a_block_ratings, a_block_df]).reset_index(drop=True)
                b_block_ratings = pd.concat([b_block_ratings, b_block_df]).reset_index(drop=True)
                
                assert set(a_block_df["argument"]) == set(b_block_df["argument"])
                assert a_block_df.shape[0] + b_block_df.shape[0] == block_df.shape[0]
                
                # Each argument in A and B can have a difference of at most 1 participant, so the difference in the total number of participants should be no more than the max number of arguments
                assert abs(a_block_df.shape[0] - b_block_df.shape[0]) <= max(len(a_block_df["argument"].unique()), len(b_block_df["argument"].unique()))
            
            assert set(a_block_ratings["argument"]) == set(b_block_ratings["argument"])
            assert a_block_ratings.shape[0] + b_block_ratings.shape[0] == split_df.shape[0]
            
            # Each argument in A and B can have a difference of at most 1 participant, so the difference in the total number of participants should be no more than the max number of arguments
            assert abs(a_block_ratings.shape[0] - b_block_ratings.shape[0]) <= max(len(a_block_ratings["argument"].unique()), len(b_block_ratings["argument"].unique()))
            
            # Calculated mean argument ranking for A and B
            a_ratings = pd.DataFrame({"a_ranking": a_block_ratings.groupby(["argument", "domain", "conclusion_type", "is_single_premise"])["ranking"].mean()}).reset_index()
            b_ratings = pd.DataFrame({"b_ranking": b_block_ratings.groupby(["argument", "domain", "conclusion_type", "is_single_premise"])["ranking"].mean()}).reset_index()
            
            assert set(a_ratings["argument"]) == set(a_block_ratings["argument"])
            assert set(b_ratings["argument"]) == set(b_block_ratings["argument"])
            
            # Calculate number of participants per argument in A and B
            a_num_participants = pd.DataFrame({"num_participants": a_block_ratings.groupby("argument").size()}).reset_index()
            b_num_participants = pd.DataFrame({"num_participants": b_block_ratings.groupby("argument").size()}).reset_index()
            a_ratings = a_ratings.merge(a_num_participants, on="argument")
            b_ratings = b_ratings.merge(b_num_participants, on="argument")
            
            # Ratings dataframes should contain no duplicates and all the right arguments
            for x_ratings in [a_ratings, b_ratings]:
                assert x_ratings.drop_duplicates().shape[0] == x_ratings.shape[0]
                assert x_ratings.shape[0] == len(set(x_ratings["argument"]))
                assert set(x_ratings["argument"]) == set(split_df["argument"])
                
            # Number of participants for each argument in samples a and b should be roughly the same
            num_ratings = a_ratings[["argument", "num_participants"]].merge(b_ratings[["argument", "num_participants"]], on="argument")
            num_ratings["num_diff"] = abs(num_ratings["num_participants_x"] - num_ratings["num_participants_y"])
            assert all(x <= 1 for x in num_ratings["num_diff"])
            assert a_ratings["num_participants"].sum() + b_ratings["num_participants"].sum() == split_df.shape[0]
            
            # Calculate correlation between A and B rankings
            t_ratings_df = a_ratings.merge(b_ratings, on=["argument", "domain", "conclusion_type", "is_single_premise"])
            ss =  spearmanr(t_ratings_df["a_ranking"], t_ratings_df["b_ranking"])
            rows.append((i,domain,conclusion_type,is_single_premise,ss.correlation,ss.pvalue))

ratings_df = pd.DataFrame(rows, columns=["i", "domain", "conclusion_type", "is_single_premise", "statistic", "pvalue"])
ratings_df.to_csv(f"{DATA}/results/blockwise_split_half_reliabilities.csv")

100%|██████████| 1000/1000 [13:18<00:00,  1.25it/s]


In [78]:
ratings_df = pd.read_csv(f"{DATA}/results/blockwise_split_half_reliabilities.csv", index_col=0)

rows = []
for g, gdf in tqdm.tqdm(ratings_df.groupby(["domain", "conclusion_type", "is_single_premise"])):
    rows.append(g + (gdf["statistic"].mean(), gdf["statistic"].std()))
summary_ratings_df = pd.DataFrame(rows, columns=["domain", "conclusion_type", "is_single_premise", "spearmanr_mean", "spearmanr_std"])

summary_ratings_df.to_csv(f"{DATA}/results/blockwise_split_half_reliabilities_summary.csv")
summary_ratings_df

100%|██████████| 12/12 [00:00<00:00, 2400.06it/s]


Unnamed: 0,domain,conclusion_type,is_single_premise,spearmanr_mean,spearmanr_std
0,Birds,General,False,0.115118,0.081461
1,Birds,General,True,0.733808,0.081343
2,Birds,Specific,False,0.326532,0.070603
3,Birds,Specific,True,0.562417,0.038141
4,Mammals,General,False,0.161668,0.084412
5,Mammals,General,True,0.755261,0.068096
6,Mammals,Specific,False,0.765347,0.030081
7,Mammals,Specific,True,0.804185,0.019442
8,Vehicles,General,False,0.372382,0.065552
9,Vehicles,General,True,0.941449,0.01799


# Plot bootstrapped model correlations + split half reliability

In [7]:
SCALE_FACTOR = 1 # Set to 10+ for higher res

def plot_e2_figure(supplemental_figure):

    summary_ratings_df = pd.read_csv(f"{DATA}/results/blockwise_split_half_reliabilities_summary.csv", index_col=0)
    summary_ratings_df["argument_type"] = summary_ratings_df["is_single_premise"].apply(lambda x: "single" if x else "multi")

    bootstrap_summary_df = pd.read_csv(f"{DATA}/results/blockwise_participant_bootstrap_summary.csv", index_col=0)
    bootstrap_summary_df["argument_type"] = bootstrap_summary_df["is_single_premise"].apply(lambda x: "single" if x else "multi")


    GRID_COLOUR= "#e6e6e6"
    SPLIT_RELIABILITY_COLOUR = "#cbc9ff"

    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=[
            '<b>Specific, Single Premise</b>', 
            '<b>General, Single Premise</b>', 
            '<b>Specific, Two Premise</b>', 
            '<b>General, Two Premise</b>'
        ],
        shared_yaxes=True,
        vertical_spacing=0.05,
        horizontal_spacing=0.02,
        specs=[[{'type': 'scatter'}, {'type': 'scatter'}],
               [{'type': 'scatter'}, {'type': 'scatter'}]]
    )

    domain_mapping = {'Mammals': 1, 'Birds': 2, 'Vehicles': 3}

    if supplemental_figure:
        marker_colors = ['brown', 'purple', 'green', 'orange', 'red']
        offsets = [-0.2, -0.1, 0, 0.1, 0.2]
        rating_columns = [
            'gpt3_text1_mean_spearmanr', 
            'gpt3.5_text2_mean_spearmanr',
            'gpt3.5_text3_mean_spearmanr',
            'gpt3.5_chat_mean_spearmanr',
            'gpt4_mean_spearmanr', 
        ]
    else:
        marker_colors = ['green', '#94d1a4', 'red', 'pink', 'blue']
        offsets = [-0.2, -0.1, 0, 0.1, 0.2]
        rating_columns = [
            'gpt3_text1_mean_spearmanr', 
            'gpt3_scm_mean_spearmanr', 
            'gpt4_mean_spearmanr', 
            'gpt4_scm_mean_spearmanr', 
            'human_scm_mean_spearmanr'
        ]


    def add_dumbbell_trace(fig, df, rating_col, marker_color, row, col, legend, offset):
        temp_df = df.copy()
        temp_df['x'] = temp_df['domain'].map(domain_mapping) + offset
        if not supplemental_figure:
            name = rating_col.split('_')[0].replace("gpt", "GPT-") if 'scm' not in rating_col else ' '.join([rc for rc in rating_col.split('_')[:2] if rc != "corr"]).replace("gpt", "GPT-").replace("scm", "SCM")
        else:
            name = {
                'gpt3_text1_mean_spearmanr': "GPT-3 (text-davinci-001)",
                'gpt3_scm_mean_spearmanr': "GPT-3 SCM",
                'gpt3.5_text2_mean_spearmanr': "GPT-3.5 (text-davinci-002)",
                'gpt3.5_text3_mean_spearmanr': "GPT-3.5 (text-davinci-003)",
                'gpt3.5_chat_mean_spearmanr': "GPT-3.5 (chat-turbo)",
                'gpt4_mean_spearmanr': "GPT-4",
                'gpt4_scm_mean_spearmanr': "GPT-4 SCM",
                'human_scm_mean_spearmanr': "Human SCM"
            }[rating_col]
        fig.add_trace(go.Scatter(
            x=temp_df['x'],
            y=temp_df[rating_col],
            error_y=dict(type='data', array=df[rating_col.replace("mean", "se")], symmetric=True, visible=True, width=7*SCALE_FACTOR, thickness=5*SCALE_FACTOR, color=marker_color),
            mode='markers',
            marker=dict(color=marker_color, size=20*SCALE_FACTOR),
            showlegend=legend,
            legendgroup=rating_col,
            name=name,
            customdata=temp_df['domain'],
            texttemplate="%{customdata}<br>%{y}",
            hovertemplate="%{customdata}<br>%{y}",
        ), row=row, col=col)


    shapes = []
    for i, (conclusion_type, argument_type) in enumerate([
            ('Specific', 'single'), 
            ('General', 'single'), 
            ('Specific', 'multi'), 
            ('General', 'multi')
        ], start=1):

        row, col = (i - 1) // 2 + 1, (i - 1) % 2 + 1
        data = bootstrap_summary_df[(bootstrap_summary_df['conclusion_type'] == conclusion_type) & (bootstrap_summary_df['argument_type'] == argument_type)]

        split_half_df = summary_ratings_df[(summary_ratings_df["conclusion_type"] == conclusion_type) & (summary_ratings_df["argument_type"] == argument_type)]
        split_half_df["x"] = split_half_df["domain"].map(domain_mapping)
        xwidth = 0.25
        shapes = [
            dict(type="rect", xref=f"x{row if row > 1 else ''}", yref=f"y{col if col > 1 else ''}", x0=r["x"] - xwidth, y0=r["spearmanr_mean"] + r["spearmanr_std"], x1=r["x"] + xwidth, y1=r["spearmanr_mean"] - r["spearmanr_std"], fillcolor=SPLIT_RELIABILITY_COLOUR, line_width=0, layer="below")
            for _, r in split_half_df.iterrows()
        ]
        for s in shapes:
            fig.add_shape(s, row=row, col=col)

        for rating_col, marker_color, offset in zip(rating_columns, marker_colors, offsets):
            add_dumbbell_trace(fig, data, rating_col, marker_color, row, col, (i==1), offset)

    fig.update_layout(
        width=1600*SCALE_FACTOR,
        height=800*SCALE_FACTOR, 
        legend=dict(orientation='v', yanchor='top', xanchor='right', x=1.13 if not supplemental_figure else 1.31, font=dict(size=20*SCALE_FACTOR)),
        plot_bgcolor="#f5f6f7",
    )

    fig.update_annotations(font_size=24)

    fig.update_yaxes(range=[-0.5, 1], matches='y')
    fig.update_xaxes(tickvals=list(domain_mapping.values()), range=[0.5,3.5], ticktext=[f"{k}</br>" for k in list(domain_mapping.keys())])
    for r in range(1,3):
        for c in range(1,3):
            fig.update_xaxes(showgrid=False, tickfont={"size": 20*SCALE_FACTOR}, tickmode="array", tickvals=[1,2,3], row=r, col=c, showticklabels=r==2)
            fig.update_yaxes(gridwidth=6*SCALE_FACTOR, zerolinewidth=6*SCALE_FACTOR, zerolinecolor=GRID_COLOUR, gridcolor=GRID_COLOUR, tickfont={"size": 20*SCALE_FACTOR}, tickmode="array", tickvals=[-0.5,0,0.5,1], ticktext=[f"{k} " for k in [-0.5,0,0.5,1]], row=r, col=c)

    fig.show()

In [8]:
plot_e2_figure(True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [22]:
plot_e2_figure(False)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

# Compare model bootstraps

In [81]:
bootstrap_df = pd.read_csv(f"{DATA}/results/blockwise_participant_bootstrap.csv", index_col=0)

# In each split, calculate proportion of times that pairs of models beat each other
pairs = [
    ("gpt3", "gpt3_scm"),
    ("gpt3_scm", "gpt3"),
    ("gpt3", "gpt4"),
    ("gpt3_scm", "gpt4"),
    ("gpt4_scm", "gpt3"),
    ("gpt4", "gpt3"),
    ("gpt4", "gpt4_scm"),
    ("gpt4_scm", "gpt4"),
    ("gpt4", "human_scm"),
    ("gpt4_scm", "human_scm"),
    ("gpt3", "human_scm"),
    ("gpt3_scm", "human_scm"),
]

rows = []
for g, gdf in bootstrap_df.groupby(["domain", "conclusion_type", "is_single_premise"]):
    
    assert gdf.shape[0] == 1000
    
    outcomes = []
    for pair in pairs:
        p1, p2 = pair
        outcomes.append(gdf[gdf[f"{p2}_spearmanr"] > gdf[f"{p1}_spearmanr"]].shape[0] / gdf.shape[0])
    rows.append(g + tuple(outcomes))

bootstrap_outcome_df = pd.DataFrame(rows, columns=["domain", "conclusion_type", "is_single_premise"] + [f"{p2}_beats_{p1}" for p1,p2 in pairs])
bootstrap_outcome_df.to_csv(f"{DATA}/blockwise_participant_bootstrap_model_comparisons.csv")

In [82]:
# Generate latex table for paper
bootstrap_outcome_df["ct_order"] = bootstrap_outcome_df["conclusion_type"].map({"Specific": 0, "General": 1})
bootstrap_outcome_df["isp_order"] = bootstrap_outcome_df["is_single_premise"].map({True:0, False: 1})
bootstrap_outcome_df["d_order"] = bootstrap_outcome_df["domain"].map({"Mammals":0,"Birds":1,"Vehicles":2})

bootstrap_outcome_df = bootstrap_outcome_df.sort_values(by=["ct_order", "isp_order", "d_order"])
bootstrap_outcome_df["num_premises"] = bootstrap_outcome_df["is_single_premise"].map({True: "Single", False: "Multi"})
df = bootstrap_outcome_df[["conclusion_type", "num_premises", "domain", "human_scm_beats_gpt4", "human_scm_beats_gpt3", "gpt4_beats_gpt3", "gpt4_beats_gpt4_scm", "gpt3_beats_gpt3_scm"]]

In [83]:
lines = [
    "\\textbf{Conc}. & \\textbf{Prems} & \\textbf{Domain} &  \\human{\\textbf{H}} vs \\gptfour{\\textbf{4}}   & \\human{\\textbf{H}} vs \\gptthree{\\textbf{3}}  & \\gptfour{\\textbf{4}} vs \\gptthree{\\textbf{3}} & \\gptfour{\\textbf{4}} vs \\gptfourscm{\\textbf{4 SCM}} & \\gptthree{\\textbf{3}} vs \\gptthreescm{\\textbf{3 SCM}}\\\\",  
]

first = True
prow = None
for _, row in df.iterrows():
    
    line = ""
    if not first:
        if row["conclusion_type"] != prow["conclusion_type"] or row["num_premises"] != prow["num_premises"]:
            lines.append("\hline")
            
        for i, c in enumerate(["conclusion_type", "num_premises", "domain"]):
            if row["num_premises"] != prow["num_premises"] or c == "domain":
                line += f"{' ' if i > 0 else ''}{row[c]} &"
            else:
                line += f"{' ' if i > 0 else ''}&"
    else:
        lines.append("\hline")
        for i, c in enumerate(["conclusion_type", "num_premises", "domain"]):
            line += f"{' ' if i > 0 else ''}{row[c]} &"
    
    latex_name = lambda x: x.replace("_", "").replace("3", "three").replace("4", "four").replace("humanscm", "human").lower()
    t = len(df.columns[3:])
    bs, hl, hr = "\\", "{", "}"
    for i, comparison in enumerate(df.columns[3:]):
            c1,c2 = comparison.split("_beats_")
            if row[comparison] >= 0.95:
                line += f" {bs}{latex_name(c1)}{hl}{bs}textbf{hl}{row[comparison]}{hr}{hr}{' &' if i != t-1 else ''}"
            elif row[comparison] <= 0.05:
                line += f" {bs}{latex_name(c2)}{hl}{bs}textbf{hl}{row[comparison]}{hr}{hr}{' &' if i != t-1 else ''}"
            else:
                line += f" {row[comparison]}{' &' if i != t-1 else ''}"
    
    lines.append(line + f"{bs}{bs}")
    first = False
    prow = row

with open("../tables/table6.txt", "w") as f:
    for l in lines:
        f.write("%s\n" % l)

# How often do chat models give a neutral response?

In [65]:
df = pd.read_csv("../data/experiment_2/llm_ratings.csv", index_col=0)

In [76]:
for m, mdf in df.groupby("llm_model"):
    mdf = mdf[mdf["conclusion_type"] == "General"]
    print(m)
    print(sum([1 if "difficult to determine" in rc or "neutral" in rc else 0 for rc in mdf["llm_raw_completion"]])/mdf.shape[0])
    print()

davinci
0.0

gpt-3.5-turbo-0613
0.23655913978494625

gpt-4-0314
0.0

text-davinci-001
0.0

text-davinci-002
0.0

text-davinci-003
0.0

