In [1]:
import pandas as pd
import numpy as np

import random
import tqdm
import warnings
import plotly
import plotly.graph_objects as go

from scipy.stats import spearmanr
from plotly.subplots import make_subplots

PROCESSED_DATA = "../data/processed"

MODELS = [
  "gpt3_normalised_rating",
  "gpt4_normalised_rating",
  "gpt3_scm",
  "gpt4_scm",
  "human_scm"
]

TOP_PROMPT_GENERATOR = "S3-C1-A1-Q1-O1-T"

np.random.seed(1)
random.seed(1)

# Load experiment 2 participant ratings

In [2]:
human_df = pd.read_csv(f"{PROCESSED_DATA}/experiment_2_dedeyne_master.csv", index_col=0)

# Filter out control and tutorial arguments
human_df = human_df[(~human_df["is_control"]) & (~human_df["is_tutorial"])].dropna().rename({"argument": "argument_id"}, axis=1)

# Transform 0-100 argument ratings into argument rankings for each participant and conclusion type
# human_df["ranking"] = human_df.groupby(["pid", "is_single_premise"])["rating"].rank(method="dense", ascending=True)

# CK: I think we should use fractional ranking given that we want to average rankings across participants. For example
# if a participant gave a rating of 0 to 5 arguments and a rating of 100 to 5 arguments, then the dense ranking would
# assign ranks of 1,1,1,1,1,2,2,2,2,2 to the arguments. That doesn't seem right given that we'd be comparing this ranking
# to rankings for other participants that might range from 1 to 10. Another way to see that there's a problem with
# dense rankings is that they're not symmetric -- all of them start at 1 but it's not the case that all of them end at 10,
# and it doesn't seem right for there to be an asymmetry in the way the top ranks and bottom ranks are handled.

human_df["ranking"] = human_df.groupby(["pid", "is_single_premise"])["rating"].rank(pct="True", ascending=True)

# Map participants to blocks based on the multi premise arguments that they saw
blocks = {}
for pid, pid_df in human_df[~human_df["is_single_premise"]].groupby(["pid"]):
    block = tuple(pid_df["argument_id"].sort_values().tolist())
    if block not in blocks:
        blocks[block] = [pid]
    else:
        blocks[block].append(pid)
pid_to_block = {}
for i,pids in enumerate(blocks.values()):
    for pid in pids:
        pid_to_block[pid] = i
human_df["block"] = human_df["pid"].map(pid_to_block)

# I don't understand these blocks. Why are there 67? And why do some of them include a single participant only?

human_df.head()

Unnamed: 0,pid,tid,domain,conclusion_type,is_osherson,premises,conclusion,rating,argument_id,is_single_premise,is_control,is_tutorial,light_cut,medium_cut,hard_cut,ranking,block
0,0,0,Birds,General,False,"('Eagles',)",All birds,0,"(('Eagles',), 'All birds')",True,False,False,False,False,False,0.041667,0
1,0,1,Birds,General,False,"('Crows',)",All birds,7,"(('Crows',), 'All birds')",True,False,False,False,False,False,0.583333,0
2,0,10,Birds,General,False,"('Vultures',)",All birds,3,"(('Vultures',), 'All birds')",True,False,False,False,False,False,0.1875,0
3,0,11,Birds,General,False,"('Falcons',)",All birds,3,"(('Falcons',), 'All birds')",True,False,False,False,False,False,0.1875,0
4,0,12,Birds,General,False,"('Herons',)",All birds,7,"(('Herons',), 'All birds')",True,False,False,False,False,False,0.583333,0


# Load experiment 2 model ratings

In [3]:
model_df = pd.read_csv(f"{PROCESSED_DATA}/e2_combined_df.csv", index_col=0)
model_df["argument_type"] = model_df["is_single_premise"].apply(lambda x: "single" if x else "multi")
model_df = model_df[model_df["prompt_generator"] == TOP_PROMPT_GENERATOR]
model_df = model_df[["argument_id", "domain", "conclusion_type", "argument_type"] + MODELS].reset_index(drop=True)

model_df.head()

Unnamed: 0,argument_id,domain,conclusion_type,argument_type,gpt3_normalised_rating,gpt4_normalised_rating,gpt3_scm,gpt4_scm,human_scm
0,"(('Airplanes', 'Buses'), 'All vehicles')",Vehicles,General,multi,0.914,0.998,0.652174,0.727053,0.758303
1,"(('Airplanes', 'Helicopters'), 'All vehicles')",Vehicles,General,multi,0.812,0.994,0.44369,0.52038,0.519173
2,"(('Airplanes', 'Taxis'), 'All vehicles')",Vehicles,General,multi,0.922,0.998,0.572917,0.719354,0.750755
3,"(('Airplanes', 'Trams'), 'Trains')",Vehicles,Specific,multi,0.976,0.996,0.819444,0.818765,0.816501
4,"(('Airplanes', 'Zeppelins'), 'All vehicles')",Vehicles,General,multi,0.8,0.992,0.563406,0.510417,0.534118


# Bootstrap model correlations

In [9]:
ITERS = 100
ITERS = 5 

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
    rows = []
    for i in tqdm.tqdm(range(ITERS)):

        # Sample N participants with replacement from each block
        pid_sample = []
        for block_pids in blocks.values():
            pid_sample += random.choices(block_pids, k=len(block_pids))

        # Construct a new bootstrapped 'human_df' dataframe from the above sample
        tdf = pd.DataFrame([], columns=human_df.columns)
        for ps in pid_sample:
            ptdf = human_df[human_df["pid"] == ps]
            tdf = pd.concat([tdf, ptdf])
            
        # Construct a new aggregated human ranking dataframe from the above dataframe
        sample_human_df = pd.DataFrame({"sample_human_ranking": tdf[["argument_id", "ranking"]].groupby("argument_id")["ranking"].mean()}).reset_index()
        
        # Compare model ratings/rankings to sample human rankings
        for g, gdf in model_df.groupby(["domain", "conclusion_type", "argument_type"]):
            domain, conclusion_type, argument_type = g
            
            sample_human_model_df = gdf.merge(sample_human_df, on="argument_id")

            # Calculate Spearman R for all models
            statistics, pvalues = [], []
            for model in MODELS:
                spearman = spearmanr(
                    sample_human_model_df["sample_human_ranking"].tolist(), 
                    sample_human_model_df[model].tolist()
                )
                
                statistics.append(spearman.correlation)
                pvalues.append(spearman.pvalue)

            rows.append((i, domain, conclusion_type, argument_type) + tuple(statistics) + tuple(pvalues))

bootstrap_df = pd.DataFrame(rows, columns=["i", "domain", "conclusion_type", "argument_type"] + [f"{r}_spearmanr" for r in MODELS] + [f"{r}_pval" for r in MODELS])
bootstrap_df = bootstrap_df.rename({c: f'{c.replace("_normalised_rating", "")}' for c in bootstrap_df.columns}, axis=1)
bootstrap_df.to_csv(f"{PROCESSED_DATA}/experiment_2_prompt_sweep_bootstrap_correlations_participantblockwise.csv")

  0%|          | 0/5 [00:00<?, ?it/s]

In [5]:
bootstrap_df = pd.read_csv(f"{PROCESSED_DATA}/experiment_2_prompt_sweep_bootstrap_correlations_participantblockwise.csv", index_col=0)

rows = []
for g, gdf in tqdm.tqdm(bootstrap_df.groupby(["domain", "conclusion_type", "argument_type"])):
    means, ses = [], []
    for model in MODELS:
        model = model.replace("_normalised_rating", "")
        means.append(np.mean(gdf[f"{model}_spearmanr"]))
        ses.append(np.std(gdf[f"{model}_spearmanr"], ddof=1))
    rows.append(g + tuple(means) + tuple(ses))

bootstrap_summary_df = pd.DataFrame(rows, columns=["domain", "conclusion_type", "argument_type"] + [f"{m}_mean" for m in MODELS] + [f"{m}_se" for m in MODELS])
bootstrap_summary_df = bootstrap_summary_df.rename({c: f'{c.replace("_normalised_rating", "")}_spearmanr' if "gpt" in c or "scm" in c else c for c in bootstrap_summary_df.columns}, axis=1)
bootstrap_summary_df.to_csv(f"{PROCESSED_DATA}/experiment_2_prompt_sweep_bootstrap_correlations_participantblockwise_summary.csv")
bootstrap_summary_df

100%|██████████| 12/12 [00:00<00:00, 643.86it/s]


Unnamed: 0,domain,conclusion_type,argument_type,gpt3_mean_spearmanr,gpt4_mean_spearmanr,gpt3_scm_mean_spearmanr,gpt4_scm_mean_spearmanr,human_scm_mean_spearmanr,gpt3_se_spearmanr,gpt4_se_spearmanr,gpt3_scm_se_spearmanr,gpt4_scm_se_spearmanr,human_scm_se_spearmanr
0,Birds,General,multi,0.317168,0.243728,-0.019329,0.224663,0.026925,0.072164,0.084235,0.056098,0.076364,0.12374
1,Birds,General,single,0.442261,0.370583,0.461217,0.581739,0.11913,0.02555,0.046738,0.057568,0.015224,0.043365
2,Birds,Specific,multi,0.32516,0.246834,0.443209,0.43557,0.378573,0.034819,0.011412,0.025505,0.034821,0.065925
3,Birds,Specific,single,0.429189,0.418749,0.500589,0.499124,0.552404,0.011068,0.023047,0.014409,0.016312,0.011675
4,Mammals,General,multi,0.291832,0.158038,0.282538,0.269701,0.334997,0.0785,0.072813,0.067246,0.093754,0.105222
5,Mammals,General,single,0.317243,0.020437,0.627102,0.726325,0.640234,0.08903,0.026956,0.061572,0.047344,0.050391
6,Mammals,Specific,multi,0.579871,0.717152,0.712019,0.720395,0.76579,0.031162,0.039049,0.037623,0.035065,0.033553
7,Mammals,Specific,single,0.589429,0.703619,0.671415,0.72084,0.776784,0.014205,0.013878,0.020092,0.022093,0.015542
8,Vehicles,General,multi,0.533878,0.366775,0.351941,0.401112,0.510037,0.06627,0.021981,0.072704,0.041836,0.051103
9,Vehicles,General,single,0.875652,0.629004,0.458087,0.622957,0.872174,0.012526,0.020763,0.052909,0.031441,0.019081


# Calculate Split-half Reliability

In [6]:
ITERS = 100
ITERS = 5 

with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    rows = []
    for i in tqdm.tqdm(range(ITERS)):
        for group, grouped_human_df in human_df.groupby(["domain", "conclusion_type", "is_single_premise"]):
            domain, conclusion_type, is_single_premise = group
            a_ratings = pd.DataFrame([], columns=["argument_id", "domain", "conclusion_type", "is_single_premise", "a_rating", "a_ranking"])
            b_ratings = pd.DataFrame([], columns=["argument_id", "domain", "conclusion_type", "is_single_premise", "b_rating", "b_ranking"])
    
            # Split each block of participants in two
            for block, block_df in grouped_human_df.groupby("block"):

                # Randomly divide this block's participants into sets A and B
                block_pids = list(block_df["pid"].unique())
                a_pids = random.sample(block_pids, k=len(block_pids)//2)
                b_pids = [bp for bp in block_pids if bp not in a_pids]
                a_block_df = block_df[block_df["pid"].isin(a_pids)]
                b_block_df = block_df[block_df["pid"].isin(b_pids)]

                # Calculated mean argument ranking for A and B
                a_block_ratings = pd.DataFrame({"a_ranking": a_block_df.groupby(["argument_id", "domain", "conclusion_type", "is_single_premise"])["ranking"].mean()}).reset_index()
                b_block_ratings = pd.DataFrame({"b_ranking": b_block_df.groupby(["argument_id", "domain", "conclusion_type", "is_single_premise"])["ranking"].mean()}).reset_index()
                a_ratings = pd.concat([a_ratings, a_block_ratings]).reset_index(drop=True)
                b_ratings = pd.concat([b_ratings, b_block_ratings]).reset_index(drop=True)

            # Calculate correlation between A and B rankings
            t_ratings_df = a_ratings.merge(b_ratings, on=["argument_id", "domain", "conclusion_type", "is_single_premise"])
            ss = spearmanr(t_ratings_df["a_ranking"], t_ratings_df["b_ranking"])
            rows.append((i,domain,conclusion_type,is_single_premise,ss.correlation,ss.pvalue))

ratings_df = pd.DataFrame(rows, columns=["i", "domain", "conclusion_type", "is_single_premise", "statistic", "pvalue"])
ratings_df.to_csv(f"{PROCESSED_DATA}/e2_split_half_reliabilities_blockwise.csv")

100%|██████████| 5/5 [00:11<00:00,  2.38s/it]


In [7]:
ratings_df = pd.read_csv(f"{PROCESSED_DATA}/e2_split_half_reliabilities_blockwise.csv")

rows = []
for g, gdf in ratings_df.groupby(["domain", "conclusion_type", "is_single_premise"]):
    rows.append(g + (gdf["statistic"].mean(), gdf["statistic"].std()))
summary_ratings_df = pd.DataFrame(rows, columns=["domain", "conclusion_type", "is_single_premise", "spearmanr_mean", "spearmanr_std"])

summary_ratings_df.to_csv(f"{PROCESSED_DATA}/e2_split_half_reliabilities_blockwise_summary.csv")
summary_ratings_df

Unnamed: 0,domain,conclusion_type,is_single_premise,spearmanr_mean,spearmanr_std
0,Birds,General,False,0.124096,0.057336
1,Birds,General,True,0.160775,0.017047
2,Birds,Specific,False,0.282848,0.049961
3,Birds,Specific,True,0.68891,0.021108
4,Mammals,General,False,0.149761,0.090976
5,Mammals,General,True,0.202847,0.020258
6,Mammals,Specific,False,0.443075,0.037939
7,Mammals,Specific,True,0.815847,0.005856
8,Vehicles,General,False,0.338872,0.040009
9,Vehicles,General,True,0.629368,0.020778


# Plot bootstrapped model correlations + split half reliability

In [8]:
summary_ratings_df = pd.read_csv(f"{PROCESSED_DATA}/e2_split_half_reliabilities_blockwise_summary.csv", index_col=0)
summary_ratings_df["argument_type"] = summary_ratings_df["is_single_premise"].apply(lambda x: "single" if x else "multi")

bootstrap_summary_df = pd.read_csv(f"{PROCESSED_DATA}/experiment_2_prompt_sweep_bootstrap_correlations_participantblockwise_summary.csv", index_col=0)

GRID_COLOUR= "#e6e6e6"
SPLIT_RELIABILITY_COLOUR = "#cbc9ff"

fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=[
        '<b>Specific, Single Premise</b>', 
        '<b>General, Single Premise</b>', 
        '<b>Specific, Two Premise</b>', 
        '<b>General, Two Premise</b>'
    ],
    shared_yaxes=True,
    vertical_spacing=0.05,
    horizontal_spacing=0.02,
    specs=[[{'type': 'scatter'}, {'type': 'scatter'}],
           [{'type': 'scatter'}, {'type': 'scatter'}]]
)

domain_mapping = {'Mammals': 1, 'Birds': 2, 'Vehicles': 3}
marker_colors = ['green', '#94d1a4', 'red', 'pink', 'blue']
offsets = [-0.2, -0.1, 0, 0.1, 0.2]
rating_columns = [
                    'gpt3_mean_spearmanr', 
                    'gpt3_scm_mean_spearmanr', 
                    'gpt4_mean_spearmanr', 
                    'gpt4_scm_mean_spearmanr', 
                    'human_scm_mean_spearmanr'
                 ]


def add_dumbbell_trace(fig, df, rating_col, marker_color, row, col, legend, offset):
    temp_df = df.copy()
    temp_df['x'] = temp_df['domain'].map(domain_mapping) + offset
    fig.add_trace(go.Scatter(
        x=temp_df['x'],
        y=temp_df[rating_col],
        error_y=dict(type='data', array=df[rating_col.replace("mean", "se")], symmetric=True, visible=True, width=7, thickness=5, color=marker_color),
        mode='markers',
        marker=dict(color=marker_color, size=20),
        showlegend=legend,
        legendgroup=rating_col,
        name=rating_col.split('_')[0].replace("gpt", "GPT-") if 'scm' not in rating_col else ' '.join([rc for rc in rating_col.split('_')[:2] if rc != "corr"]).replace("gpt", "GPT-").replace("scm", "SCM"),
        customdata=temp_df['domain'],
        texttemplate="%{customdata}<br>%{y}",
        hovertemplate="%{customdata}<br>%{y}",
    ), row=row, col=col)


shapes = []
for i, (conclusion_type, argument_type) in enumerate([
        ('Specific', 'single'), 
        ('General', 'single'), 
        ('Specific', 'multi'), 
        ('General', 'multi')
    ], start=1):
    
    row, col = (i - 1) // 2 + 1, (i - 1) % 2 + 1
    data = bootstrap_summary_df[(bootstrap_summary_df['conclusion_type'] == conclusion_type) & (bootstrap_summary_df['argument_type'] == argument_type)]

    split_half_df = summary_ratings_df[(summary_ratings_df["conclusion_type"] == conclusion_type) & (summary_ratings_df["argument_type"] == argument_type)]
    split_half_df["x"] = split_half_df["domain"].map(domain_mapping)
    shapes = [
        dict(type="rect", xref=f"x{row if row > 1 else ''}", yref=f"y{col if col > 1 else ''}", x0=r["x"] - 0.25, y0=r["spearmanr_mean"] + r["spearmanr_std"], x1=r["x"] + 0.25, y1=r["spearmanr_mean"] - r["spearmanr_std"], fillcolor=SPLIT_RELIABILITY_COLOUR, line_width=0, layer="below")
        for _, r in split_half_df.iterrows()
    ]
    for s in shapes:
        fig.add_shape(s, row=row, col=col)

    for rating_col, marker_color, offset in zip(rating_columns, marker_colors, offsets):
        add_dumbbell_trace(fig, data, rating_col, marker_color, row, col, (i==1), offset)

fig.update_layout(
    width=1600,
    height=800, 
    legend=dict(orientation='v', yanchor='top', xanchor='right', x=1.13, font=dict(size=20)),
    plot_bgcolor="#f5f6f7",
)

fig.update_annotations(font_size=24)

fig.update_yaxes(range=[-0.5, 1], matches='y')
fig.update_xaxes(tickvals=list(domain_mapping.values()), range=[0.5,3.5], ticktext=[f"{k}</br>" for k in list(domain_mapping.keys())])
for r in range(1,3):
    for c in range(1,3):
        fig.update_xaxes(showgrid=False, tickfont={"size": 20}, tickmode="array", tickvals=[1,2,3], row=r, col=c, showticklabels=r==2)
        fig.update_yaxes(gridwidth=6, zerolinewidth=6, zerolinecolor=GRID_COLOUR, gridcolor=GRID_COLOUR, tickfont={"size": 20}, tickmode="array", tickvals=[-0.5,0,0.5,1], ticktext=[f"{k} " for k in [-0.5,0,0.5,1]], row=r, col=c)

fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/