In [60]:
import pandas as pd
import numpy as np
import plotly.io as pio
pio.renderers.default='notebook'

import plotly
import plotly.graph_objects as go

from plotly.subplots import make_subplots
from statsmodels.stats.descriptivestats import sign_test
from typing import List, Dict

PHENOMENON_ORDER = [
    "Similarity",
    "Typicality",
    "Specificity",
    "Monotonicity (General)",
    "Monotonicity (Specific)",
    "Diversity (General)",
    "Diversity (Specific)",
    "Nonmonotonicity (General)",
    "Nonmonotonicity (Specific)",
    "Asymmetry",
    "Inclusion Fallacy"
]

DOMAIN_ORDER = ["Mammals", "Birds", "Vehicles"]


GRID_COLOUR= "#e6e6e6"
COLOURS = {"human_rating": "blue", "gpt-4-0314_rating": "red", "text-davinci-003_rating": "green"}

In [43]:
def read_df():
    human_df = pd.read_csv("../data/experiment_1/aggregated_human_ratings.csv", index_col=0)
    llm_df = pd.read_csv("../data/experiment_1/llm_ratings.csv", index_col=0)

    df = human_df[["argpair", "phenomenon", "domain", "average_human_rating"]].copy().rename({"average_human_rating": "human_rating"}, axis=1)
    df = df.sort_values(by="argpair")
    for llm_model, llm_model_df in llm_df.groupby("llm_model"):
        llm_model_df = llm_model_df.sort_values(by="argpair")
        df[f"{llm_model}_rating"] = llm_model_df["llm_rating"].tolist()

    # Realign ratings so that higher number means stronger SCM arg was stronger
    for c in df.columns:
        if "rating" in c:
            df[c] = 5 - df[c]

    df["phenomenon_order"] = df["phenomenon"].apply(lambda x: PHENOMENON_ORDER.index(x))
    
    return df

# Plot experiment 1 figure

In [45]:
df = read_df()

SCALE = 1

chat_rating_proportions = {}
for llm_model in ["gpt-3.5-turbo-0613", "gpt-4-0314"]:
    rating_counts = df.groupby(["domain", "phenomenon"])[f"{llm_model}_rating"].value_counts().to_dict()
    rating_total_counts = df.groupby(["domain", "phenomenon"]).size().to_dict()
    chat_rating_proportions[llm_model] = {k: v / rating_total_counts[(k[0], k[1])] for k,v in rating_counts.items()}

fig = make_subplots(rows=1, cols=3, horizontal_spacing=0.02, subplot_titles=[f"<b>{d}</b>" for d in DOMAIN_ORDER])
for i in range(1,4):
    fig.add_shape(
        type='line',
        x0=2.5,
        x1=2.5,
        y0=-1,
        y1=12,
        yref='y',
        xref='x',
        line=dict(color=GRID_COLOUR, width=6*SCALE),
        layer="below",
        row=1, col=i
    )
    
for i, domain in enumerate(DOMAIN_ORDER):

    tdf = df[df["domain"] == domain].sort_values(by="phenomenon_order", ascending=False)

    # Add human and GPT-3 violins
    for rating_type in COLOURS:
        if "gpt-4" not in rating_type:
            fig.add_trace(go.Violin(
                x=np.array(tdf[rating_type]), 
                y=np.array([p.replace(" ", "    <br>") for p in tdf["phenomenon"]]), 
                line_color=COLOURS[rating_type],
                line_width=0,
                name={"text-davinci-003_rating": "GPT-3", "human_rating": "Human"}[rating_type], 
                showlegend=(i==0),
                fillcolor=COLOURS[rating_type],
                bandwidth=0.3,
            ), row=1, col=i+1)

    # Add GPT-4 bars
    for po, phenomenon in enumerate(PHENOMENON_ORDER):
        for rating in range(6):
            k = (domain, phenomenon, rating)
            if k in chat_rating_proportions["gpt-4-0314"]:
                if rating == 5:
                    rating -= 0.05
                if rating == 0:
                    rating += 0.05

                fig.add_shape(
                    type="line", x0=rating, x1=rating, y0=len(PHENOMENON_ORDER)-po-1, y1=len(PHENOMENON_ORDER)-po-1+0.8*chat_rating_proportions["gpt-4-0314"][k],
                    line=dict(
                        color=COLOURS["gpt-4-0314_rating"], width=12*SCALE,
                    ),
                    name="GPT-4",
                    row=1, col=i+1
                )
                
    fig.add_trace(go.Violin(x=[2.5],y=["Specificity"],line_width=0,fillcolor=COLOURS["gpt-4-0314_rating"],name="GPT-4",showlegend=(i==0)))

fig.update_yaxes(ticksuffix = "    ", tickfont={"size": 20*SCALE}, row=1, col=1)
fig.update_yaxes(showticklabels=False, row=1, col=2)
fig.update_yaxes(showticklabels=False, row=1, col=3)

for i in range(1,4):
    fig.update_xaxes(range=[0,5], gridwidth=6*SCALE, zerolinewidth=6*SCALE, gridcolor=GRID_COLOUR, tickfont={"size": 15*SCALE}, tickmode="array", tickvals=[0,1.25,2.5,3.75,5], row=1, col=i, ticktext=["<br>           Weaker<br>              argument<br>                is stronger", "<br>", "<br>Neutral", "<br>", "<br>Stronger            <br>argument              <br>is stronger               "])
  
fig.update_traces(orientation='h', side='positive', width=1.8, points=False)
fig.update_layout(
    xaxis_showgrid=True, 
    xaxis_zeroline=True, 
    width=1500*SCALE, height=1500*SCALE, 
    legend={"font": {"size": 25*SCALE}}, 
    plot_bgcolor="#f5f6f7"
)

fig.update_annotations(font_size=35*SCALE, borderpad=10)
fig.layout.annotations[0].update(x=0.08)
fig.layout.annotations[1].update(x=0.382)
fig.layout.annotations[2].update(x=0.75)
    
fig.show()

# Scorecard for table 4

In [137]:
sign_test(df[(df["phenomenon"] == "Similarity") & (df["domain"] == "Mammals")]["human_rating"].tolist())



(12.0, 8.046627044677734e-07)

In [148]:
df = read_df()

scorecard_ratings = [
    "text-davinci-003",
    "gpt-3.5-turbo-0613",
    "gpt-4-0314",
    "human",
]

for sr in scorecard_ratings:
    df[f"{sr}_rating"] = df[f"{sr}_rating"] - 2.5
df["phenomenon_order"] = df["phenomenon"].apply(lambda x: PHENOMENON_ORDER.index(x))

rows = []
for g, gdf in df.groupby(["phenomenon", "domain"]):
    phenomenon, domain = g
    for sr in scorecard_ratings:
        statistic, pvalue = sign_test(gdf[f"{sr}_rating"])
        rows.append((phenomenon, domain, sr, statistic, pvalue))

sign_test_results_df = pd.DataFrame(rows, columns=["phenomenon", "domain", "rating_type", "statistic", "pvalue"])
sign_test_results_df["significant"] = sign_test_results_df["pvalue"].apply(lambda x: x < 0.05)
sign_test_results_df["po"] = sign_test_results_df["phenomenon"].apply(lambda x: PHENOMENON_ORDER.index(x))
sign_test_results_df = sign_test_results_df.sort_values(by="po")

#sdf = pd.melt(sign_test_results_df, id_vars=["phenomenon", "domain", "rating_type"], value_vars=["pvalue", "statistic"], value_name="t")[["phenomenon", "domain", "rating_type", "t"]].rename({"t": "pvalue"}, axis=1)

sdf = sign_test_results_df[["phenomenon", "domain", "rating_type", "statistic", "pvalue"]]
sdf["phenomenon_order"] = sdf["phenomenon"].apply(lambda x: PHENOMENON_ORDER.index(x))
sdf["domain_order"] = sdf["domain"].apply(lambda x: DOMAIN_ORDER.index(x))
sdf["rating_type_order"] = sdf["rating_type"].apply(lambda x: scorecard_ratings.index(x))

sdf["pvalue"] = sdf["pvalue"].apply(lambda x: "{:.4f}".format(x))

human_df = sdf[sdf["rating_type"] == "human"].sort_values(by=["phenomenon_order", "domain_order", "rating_type_order"]).rename({"pvalue": "human_pvalue", "statistic": "human_statistic"}, axis=1)
for sr in scorecard_ratings:
    if sr != "human":
        human_df[f"{sr}_pvalue"] = sdf[sdf["rating_type"] == sr].sort_values(by=["phenomenon_order", "domain_order", "rating_type_order"])["pvalue"].tolist()
        human_df[f"{sr}_statistic"] = sdf[sdf["rating_type"] == sr].sort_values(by=["phenomenon_order", "domain_order", "rating_type_order"])["statistic"].tolist()
        
scorecard_df = human_df[["phenomenon", "domain"] + [f"{sr}_pvalue" for sr in scorecard_ratings] + [f"{sr}_statistic" for sr in scorecard_ratings]]
scorecard_df.to_csv("../data/experiment_1/scorecard.csv")




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [155]:
scorecard_df

Unnamed: 0,phenomenon,domain,text-davinci-003_pvalue,gpt-3.5-turbo-0613_pvalue,gpt-4-0314_pvalue,human_pvalue,text-davinci-003_statistic,gpt-3.5-turbo-0613_statistic,gpt-4-0314_statistic,human_statistic
103,Similarity,Mammals,0.1686,0.1686,0.0,0.0,4.0,4.0,12.0,12.0
99,Similarity,Birds,0.69,0.4049,0.0,0.0,-1.5,2.5,11.5,10.5
107,Similarity,Vehicles,0.1078,0.0041,0.0,0.0,4.5,7.5,11.5,11.5
127,Typicality,Mammals,0.5413,0.2863,0.0227,0.0,-2.0,3.0,6.0,10.5
123,Typicality,Birds,0.5413,0.1516,0.0015,0.0,2.0,4.0,8.0,10.0
131,Typicality,Vehicles,1.0,0.0227,0.0,0.0,0.0,6.0,12.0,12.0
115,Specificity,Mammals,0.8388,0.0,0.0,0.0,1.0,12.0,12.0,12.0
111,Specificity,Birds,0.8388,0.0,0.0,0.0,1.0,10.0,12.0,12.0
119,Specificity,Vehicles,0.5413,0.0015,0.0,0.0,-2.0,8.0,12.0,12.0
55,Monotonicity (General),Mammals,0.0227,0.0,0.0,0.0,6.0,11.0,12.0,12.0


# Create LaTeX tables

In [159]:
def create_table(
    models: List[str],
    model_names: Dict[str,str],
    model_colors: Dict[str,str],
) -> str:
    
    scorecard_df = pd.read_csv("../data/experiment_1/scorecard.csv", index_col=0)
    
    output = ["\begin{table}[t!]","\centering"]
    n = len(models)
    
    m = ", ".join([model_names[x] for x in models[:-1]]) + f" and {model_names[models[-1]]}"
    caption = f"\caption{'{'}\small{'{'}Quantitative evaluation of {m} on the 11 phenomena across all three domains. The number represents the p-value on a sign test, with significant p-values indicating a preference for one argument over the other. Most of the time this preference is in the theoretically predicted direction (*), but when the weaker argument is endorsed significantly more ($\circ$) it is in the opposite.{'}'}{'}'}"
    
    output.append(caption)
    output.append("\vspace{1mm}")
    
    l = round(0.48 / n, 2)
    output.append("\begin{tabular}{|p{0.2\linewidth}| p{0.2\linewidth} | " + f"p{'{'}{l}\linewidth{'}'} "*n +"|}")
    
    output.append("\hline")
    output.append("\textbf{Phenomenon} & \textbf{Domain} & " + " & ".join([f"{model_colors[x]}{'{'}\textbf{'{'}{model_names[x]}{'}'}{'}'}" for x in models]) + "\\\\ \hline")
    
    for phenomenon in PHENOMENON_ORDER:
        pdf = scorecard_df[scorecard_df["phenomenon"] == phenomenon]
        leftover_phenomenon = None
        for domain in DOMAIN_ORDER:
            ddf = pdf[pdf["domain"] == domain]
            line = []
            if domain == DOMAIN_ORDER[0]:
                if len(phenomenon) > 10 and " " in phenomenon:
                    phenomenon, leftover_phenomenon = phenomenon.split(" ")
                line.append(phenomenon)
            elif leftover_phenomenon:
                line.append(leftover_phenomenon)
                leftover_phenomenon = None
            else:
                line.append(" ")
            line.append(domain)
            for model in models:
                score = round(ddf[f"{model}_pvalue"].iloc[0],2)
                negative = float(ddf[f"{model}_statistic"].iloc[0]) < 0
                if not negative:
                    point = "*"
                else:
                    point = "$\circ$"
                if score < 0.001:
                    line.append(f"{model_colors[model]}{'{'}\textbf{'{'}$<$0.001 {point}{'}'}{'}'}")
                elif score < 0.05:
                    line.append(f"{model_colors[model]}{'{'}\textbf{'{'}$<${score} {point}{'}'}{'}'}")
                else:
                    line.append(str(score))
            line[-1] += " \\\\"
            
            output.append(" & ".join(line))
        
        output.append("\hline")
    
    output.append("\end{tabular}")
    output.append("\label{tbl:expt1stats}")
    output.append("\vspace{-3mm}")
    output.append("\end{table}")
    
    output_table = "\n".join(output).replace("\b", "\\b").replace("\t", "\\t").replace("\v", "\\v")
    return output_table
    

In [162]:
t = create_table(
    models = [
        "text-davinci-003",
        "gpt-3.5-turbo-0613",
        "gpt-4-0314",
        "human",
    ],
    model_names = {
        "text-davinci-003": "GPT-3.5 (text-davinci-003)",
        "gpt-3.5-turbo-0613": "GPT-3.5 (chat-turbo)",
        "gpt-4-0314": "GPT-4",
        "human": "Humans",
    },
    model_colors = {
        "text-davinci-003": "\gptthreefivethree",
        "gpt-3.5-turbo-0613": "\gptthreefivechat",
        "gpt-4-0314": "\gptfour",
        "human": "\human",
    }
)

print(t)

\begin{table}[t!]
\centering
\caption{\small{Quantitative evaluation of GPT-3.5 (text-davinci-003), GPT-3.5 (chat-turbo), GPT-4 and Humans on the 11 phenomena across all three domains. The number represents the p-value on a sign test, with significant p-values indicating a preference for one argument over the other. Most of the time this preference is in the theoretically predicted direction (*), but when the weaker argument is endorsed significantly more ($\circ$) it is in the opposite.}}
\vspace{1mm}
\begin{tabular}{|p{0.2\linewidth}| p{0.2\linewidth} | p{0.12\linewidth} p{0.12\linewidth} p{0.12\linewidth} p{0.12\linewidth} |}
\hline
\textbf{Phenomenon} & \textbf{Domain} & \gptthreefivethree{\textbf{GPT-3.5 (text-davinci-003)}} & \gptthreefivechat{\textbf{GPT-3.5 (chat-turbo)}} & \gptfour{\textbf{GPT-4}} & \human{\textbf{Humans}}\\ \hline
Similarity & Mammals & 0.17 & 0.17 & \gptfour{\textbf{$<$0.001 *}} & \human{\textbf{$<$0.001 *}} \\
  & Birds & 0.69 & 0.4 & \gptfour{\textbf{$<$0.

In [161]:
t = create_table(
    models = [
        "text-davinci-003",
        "gpt-4-0314",
        "human",
    ],
    model_names = {
        "text-davinci-003": "GPT-3.5 (text-davinci-003)",
        "gpt-3.5-turbo-0613": "GPT-3.5 (chat-turbo)",
        "gpt-4-0314": "GPT-4",
        "human": "Humans",
    },
    model_colors = {
        "text-davinci-003": "\gptthreefivethree",
        "gpt-3.5-turbo-0613": "\gptthreefivechat",
        "gpt-4-0314": "\gptfour",
        "human": "\human",
    }
)

print(t)

\begin{table}[t!]
\centering
\caption{\small{Quantitative evaluation of GPT-3.5 (text-davinci-003), GPT-4 and Humans on the 11 phenomena across all three domains. The number represents the p-value on a sign test, with significant p-values indicating a preference for one argument over the other. Most of the time this preference is in the theoretically predicted direction (*), but when the weaker argument is endorsed significantly more ($\circ$) it is in the opposite.}}
\vspace{1mm}
\begin{tabular}{|p{0.2\linewidth}| p{0.2\linewidth} | p{0.16\linewidth} p{0.16\linewidth} p{0.16\linewidth} |}
\hline
\textbf{Phenomenon} & \textbf{Domain} & \gptthreefivethree{\textbf{GPT-3.5 (text-davinci-003)}} & \gptfour{\textbf{GPT-4}} & \human{\textbf{Humans}}\\ \hline
Similarity & Mammals & 0.17 & \gptfour{\textbf{$<$0.001 *}} & \human{\textbf{$<$0.001 *}} \\
  & Birds & 0.69 & \gptfour{\textbf{$<$0.001 *}} & \human{\textbf{$<$0.001 *}} \\
  & Vehicles & 0.11 & \gptfour{\textbf{$<$0.001 *}} & \human{\t

#### Original table
Just saving the original table (before I wrote the above code) here just in case

\begin{table}[t!]
\centering
\caption{\small{Quantitative evaluation of GPT-3.5, GPT-4, and human performance on the 11 phenomena across all three domains. The number represents the p-value on a sign test, with significant p-values indicating a preference for one argument over the other. Most of the time this preference is in the theoretically predicted direction (*), but when the weaker argument is endorsed significantly more ($\circ$) it is in the opposite.}}
\vspace{1mm}
\begin{tabular}{|p{0.2\linewidth}| p{0.2\linewidth} | p{0.16\linewidth} p{0.15\linewidth} p{0.16\linewidth}|}
\hline
\textbf{Phenomenon} & \textbf{Domain} & \gptthree{\textbf{GPT-3.5}} & \gptfour{\textbf{GPT-4}} & \human{\textbf{Human}} \\ \hline
Similarity & Mammals & 0.84 & \gptfour{\textbf{$<$0.001 *}}  & \human{\textbf{$<$0.001 *}} \\ 
 & Birds & 0.42 & \gptfour{\textbf{$<$0.001 *}} & \human{\textbf{$<$0.001 *}} \\ 
 & Vehicles & 0.23 & \gptfour{\textbf{$<$0.001 *}} & \human{\textbf{$<$0.001 *}} \\ 
 %[0.5pt] \\
\hline
Typicality & Mammals & 0.54 & 0.15 & \human{\textbf{$<$0.001 *}} \\ 
 & Birds & 0.54 & \gptfour{\textbf{0.01 *}} & \human{\textbf{$<$0.001 *}} \\ 
 & Vehicles & 1.00 & \gptfour{\textbf{$<$0.001 *}} & \human{\textbf{$<$0.001 *}} \\ 
% [0.5pt] \\
\hline
Specificity & Mammals & 0.84 & \gptfour{\textbf{$<$0.001 *}} & \human{\textbf{$<$0.001 *}} \\ 
 & Birds & 0.84 & \gptfour{\textbf{$<$0.001 *}} & \human{\textbf{$<$0.001 *}} \\ 
 & Vehicles & 0.54 & \gptfour{\textbf{$<$0.001 *}} & \human{\textbf{$<$0.001 *}} \\
% [0.5pt] \\
\hline
Monotonicity & Mammals & \gptthree{\textbf{$<$0.001 *}} & \gptfour{\textbf{$<$0.001 *}} & \human{\textbf{$<$0.001 *}} \\ 
(General) & Birds & \gptthree{\textbf{$<$0.001 *}} & \gptfour{\textbf{$<$0.001 *}} & \human{\textbf{$<$0.001 *}} \\ 
 & Vehicles & \gptthree{\textbf{$<$0.001 *}} & \gptfour{\textbf{$<$0.001 *}} & \human{\textbf{$<$0.001 *}} \\ 
% [0.5pt] \\
\hline
Monotonicity & Mammals & \gptthree{\textbf{$<$0.001 *}} & \gptfour{\textbf{$<$0.001 *}} & 0.06 \\ 
(Specific) & Birds & \gptthree{\textbf{$<$0.001 *}} & \gptfour{\textbf{$<$0.001 *}} & \human{\textbf{$<$0.001 *}} \\ 
 & Vehicles & \gptthree{\textbf{$<$0.001 *}} & \gptfour{\textbf{$<$0.001 *}} & 0.29 \\ 
% [0.5pt] \\
\hline
Diversity & Mammals & 0.84 & \gptfour{\textbf{$<$0.001 *}} & 0.06 \\ 
(General) & Birds & 0.54 & \gptfour{\textbf{$<$0.001 *}} & 1.00 \\ 
 & Vehicles & 1.00 & 0.06 & \human{\textbf{0.03 $\circ$}} \\ 
% [0.5pt] \\
\hline
Diversity & Mammals & 1.00 & 0.54 & \human{\textbf{0.01 $\circ$}} \\ 
(Specific) & Birds & 0.54 & 0.84 & 0.68 \\ 
 & Vehicles & 0.31 & 0.84 & 0.68 \\ 
% [0.5pt] \\
\hline
Non-Monotonicity & Mammals & \gptthree{\textbf{$<$0.001 $\circ$}} & 0.15 & \human{\textbf{$<$0.001 *}} \\ 
(General) & Birds & \gptthree{\textbf{$<$0.001 $\circ$}} & \gptfour{\textbf{$<$0.001 *}} & \human{\textbf{$<$0.001 *}} \\ 
 & Vehicles & \gptthree{\textbf{$<$0.001 $\circ$}} & 0.54 & \human{\textbf{0.01 *}} \\ 
% [0.5pt] \\
\hline
Non-Monotonicity & Mammals & \gptthree{\textbf{$<$0.001 $\circ$}}  & \gptfour{\textbf{$<$0.001 $\circ$}} & 0.15 \\ 
(Specific) & Birds & \gptthree{\textbf{$<$0.001 $\circ$}} & \gptfour{\textbf{$<$0.001 $\circ$}} & \human{\textbf{$<$0.001 *}} \\ 
 & Vehicles & \gptthree{\textbf{$<$0.001 $\circ$}} & \gptfour{\textbf{$<$0.001 $\circ$}} & \human{\textbf{0.01 *}} \\ 
% [0.5pt] \\
\hline
Asymmetry & Mammals & 0.09 & 0.40 & 0.52 \\ 
 & Birds & 1.00 & 0.09 & \human{\textbf{$<$0.001 *}} \\ 
 & Vehicles & 0.68 & 1.00 & 0.05 \\ 
% [0.5pt] \\
\hline
Inclusion Fallacy & Mammals & 0.84 & \gptfour{\textbf{0.02 $\circ$}} & 0.06 \\ 
 & Birds & 0.84 & \gptfour{\textbf{$<$0.001 $\circ$}} & 1.00 \\ 
 & Vehicles & 0.84 & 0.54 & \human{\textbf{$<$0.001 $\circ$}} \\ 
\hline
\end{tabular}
\label{tbl:expt1stats}
\vspace{-3mm}
\end{table}