# Analysis for *Generating Event Descriptions*

In [1]:
import re
import numpy as np
import pandas as pd
import seaborn as sns
import arviz

from typing import Literal, Callable
from dataclasses import dataclass
from glob import glob
from scipy.stats import spearmanr
from patsy import dmatrix
from cmdstanpy import CmdStanModel, CmdStanMCMC, from_csv

## Experiment 1: Naturalness

We begin analysis of the naturalness experiment by loading and combining the naturalness datasets into a single `pandas.DataFrame`.

In [2]:
nat_fnames = [
    "naturalness_llama-propbanksenses.csv",
    "naturalness_llama-llamasenses.csv",
    "naturalness_reddit.csv",
    "naturalness_manual.csv"
]

# one frame per input file; concatenated into `data_nat` below
nat_frames = []

for fname in nat_fnames:
    df = pd.read_csv(f"data/{fname}")

    # Derive the list label from the file name. The labels used further down in
    # this notebook are singular ("llama-propbanksense", "llama-llamasense"), so
    # the extension and the plural "s" are removed explicitly. (`str.strip(".csv")`
    # only produced these labels by accident: it strips a *set of characters*
    # from both ends, not a suffix.)
    generation_method = (
        fname.removeprefix("naturalness_").removesuffix(".csv").removesuffix("s")
    )

    if generation_method == "manual":
        # keep only the target sentences; `.copy()` makes the result an
        # independent frame so the column assignments below do not trigger
        # pandas' SettingWithCopyWarning
        df = df.query("sentence_type == 'target'").copy()
        df["list_type"] = "manual"
        df["generation_method_type"] = "manual"
        df["generation_method_subtype"] = "manual"
        df["generation_method_subsubtype"] = "manual"
        df["generation_method"] = "manual_" + df.naturalness + "_" + df.typicality
    else:
        # calibration sentences inside the automated lists were written by hand,
        # so they are labelled as manual at every level of granularity
        is_calibration = df.sentence_type == "calibration"

        df["list_type"] = generation_method
        df["generation_method_type"] = np.where(is_calibration, "manual", "automated")
        df["generation_method_subtype"] = np.where(
            is_calibration, "manual", generation_method.split("-")[0]
        )
        df["generation_method_subsubtype"] = np.where(
            is_calibration, "manual", generation_method
        )
        df["generation_method"] = (
            "manual_" + df.naturalness + "_" + df.typicality
        ).where(is_calibration, generation_method)

    nat_frames.append(df)

data_nat = pd.concat(nat_frames)

# z-score the continuous predictors over the pooled data
data_nat["surprisal_z"] = (data_nat.surprisal - data_nat.surprisal.mean())/data_nat.surprisal.std()
data_nat["freq_z"] = (data_nat.freq - data_nat.freq.mean())/data_nat.freq.std()

data_nat

Unnamed: 0,rater_id,sentence,rating,sense,verb,typicality,naturalness,sentence_type,surprisal,freq,verbnet_class,list_type,generation_method_type,generation_method_subtype,generation_method_subsubtype,generation_method,surprisal_z,freq_z
0,83bda277-9b4f-4c6e-a23b-a79ece20cc5c,The baby arranged the something.,50.0,arrange.01: build,arrange,typical,unnatural,calibration,167.027270,2289,build,llama-propbanksense,manual,manual,manual,manual_unnatural_typical,0.865457,-0.240193
1,8bc64c84-3e28-4f83-abaf-e585ff40ffe5,The baby arranged the something.,0.0,arrange.01: build,arrange,typical,unnatural,calibration,167.027270,2289,build,llama-propbanksense,manual,manual,manual,manual_unnatural_typical,0.865457,-0.240193
2,f09582f5-d39c-41eb-ab67-ff696f2c3459,The baby arranged the something.,22.0,arrange.01: build,arrange,typical,unnatural,calibration,167.027270,2289,build,llama-propbanksense,manual,manual,manual,manual_unnatural_typical,0.865457,-0.240193
3,e41ede6f-b8a8-4424-a276-520d63d0af85,The baby arranged the something.,0.0,arrange.01: build,arrange,typical,unnatural,calibration,167.027270,2289,build,llama-propbanksense,manual,manual,manual,manual_unnatural_typical,0.865457,-0.240193
4,03eaf938-1208-40ce-9b0b-bb58cc6048c9,The baby arranged the something.,0.0,arrange.01: build,arrange,typical,unnatural,calibration,167.027270,2289,build,llama-propbanksense,manual,manual,manual,manual_unnatural_typical,0.865457,-0.240193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6815,48938107-ae25-4ac5-af1f-f5552d1228a9,The after tore the brick.,25.0,"tear.01: to pull apart, separate forcefully",tear,atypical,unnatural,target,139.264510,1030,hurt,manual,manual,manual,manual,manual_unnatural_atypical,-0.201797,-1.088339
6816,48938107-ae25-4ac5-af1f-f5552d1228a9,The athlete threw the javelin.,100.0,"throw.01: throw, sending through the air, manu...",throw,typical,natural,target,160.484510,3679,throw,manual,manual,manual,manual,manual_natural_typical,0.613941,0.696203
6817,48938107-ae25-4ac5-af1f-f5552d1228a9,The jellyfish threw the javelin.,92.0,"throw.01: throw, sending through the air, manu...",throw,atypical,natural,target,101.749570,3679,throw,manual,manual,manual,manual,manual_natural_atypical,-1.643943,0.696203
6818,48938107-ae25-4ac5-af1f-f5552d1228a9,The athlete threw the while.,38.0,"throw.01: throw, sending through the air, manu...",throw,typical,unnatural,target,169.375210,3679,throw,manual,manual,manual,manual,manual_unnatural_typical,0.955716,0.696203


### Participant filtration

Some participants did not complete the survey. We remove these participants.

In [3]:
def filter_participants(data, expected_counts: dict[str, int]) -> pd.DataFrame:
    """Filter participants who did not complete the task.

    Parameters
    ----------
    data
        Responses with (at least) the columns ``list_type`` and ``rater_id``.
    expected_counts
        Maps each list type to the number of responses a participant must have
        given for that list type.

    Returns
    -------
    pd.DataFrame
        The rows of ``data`` whose ``rater_id`` has exactly the expected number
        of responses for every list type that rater appears in.

    Raises
    ------
    KeyError
        If ``data`` contains a list type that is missing from ``expected_counts``.
    """
    # responses per (list type, participant); `size()` + `rename` gives a stable
    # "count" column regardless of the pandas version
    actual_counts = (
        data.groupby(["list_type", "rater_id"], observed=True)
        .size()
        .rename("count")
        .reset_index()
    )

    unknown_list_types = set(actual_counts["list_type"]) - set(expected_counts)
    if unknown_list_types:
        raise KeyError(
            "no expected response count for list type(s): "
            f"{sorted(map(str, unknown_list_types))}"
        )

    is_incomplete = (
        actual_counts["count"] != actual_counts["list_type"].map(expected_counts)
    )
    exclude_participants = actual_counts.loc[is_incomplete, "rater_id"]

    return data[~data.rater_id.isin(exclude_participants)]

In [4]:
# number of responses a participant must have given, per list type, in order
# to be kept by `filter_participants`
expected_counts_nat = {
    "llama-llamasense": 97,
    "llama-propbanksense": 97,
    "manual": 124,
    "reddit": 72,
}

data_nat = filter_participants(
    data=data_nat,
    expected_counts=expected_counts_nat,
)

data_nat

Unnamed: 0,rater_id,sentence,rating,sense,verb,typicality,naturalness,sentence_type,surprisal,freq,verbnet_class,list_type,generation_method_type,generation_method_subtype,generation_method_subsubtype,generation_method,surprisal_z,freq_z
0,83bda277-9b4f-4c6e-a23b-a79ece20cc5c,The baby arranged the something.,50.0,arrange.01: build,arrange,typical,unnatural,calibration,167.027270,2289,build,llama-propbanksense,manual,manual,manual,manual_unnatural_typical,0.865457,-0.240193
1,8bc64c84-3e28-4f83-abaf-e585ff40ffe5,The baby arranged the something.,0.0,arrange.01: build,arrange,typical,unnatural,calibration,167.027270,2289,build,llama-propbanksense,manual,manual,manual,manual_unnatural_typical,0.865457,-0.240193
2,f09582f5-d39c-41eb-ab67-ff696f2c3459,The baby arranged the something.,22.0,arrange.01: build,arrange,typical,unnatural,calibration,167.027270,2289,build,llama-propbanksense,manual,manual,manual,manual_unnatural_typical,0.865457,-0.240193
3,e41ede6f-b8a8-4424-a276-520d63d0af85,The baby arranged the something.,0.0,arrange.01: build,arrange,typical,unnatural,calibration,167.027270,2289,build,llama-propbanksense,manual,manual,manual,manual_unnatural_typical,0.865457,-0.240193
4,03eaf938-1208-40ce-9b0b-bb58cc6048c9,The baby arranged the something.,0.0,arrange.01: build,arrange,typical,unnatural,calibration,167.027270,2289,build,llama-propbanksense,manual,manual,manual,manual_unnatural_typical,0.865457,-0.240193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6815,48938107-ae25-4ac5-af1f-f5552d1228a9,The after tore the brick.,25.0,"tear.01: to pull apart, separate forcefully",tear,atypical,unnatural,target,139.264510,1030,hurt,manual,manual,manual,manual,manual_unnatural_atypical,-0.201797,-1.088339
6816,48938107-ae25-4ac5-af1f-f5552d1228a9,The athlete threw the javelin.,100.0,"throw.01: throw, sending through the air, manu...",throw,typical,natural,target,160.484510,3679,throw,manual,manual,manual,manual,manual_natural_typical,0.613941,0.696203
6817,48938107-ae25-4ac5-af1f-f5552d1228a9,The jellyfish threw the javelin.,92.0,"throw.01: throw, sending through the air, manu...",throw,atypical,natural,target,101.749570,3679,throw,manual,manual,manual,manual,manual_natural_atypical,-1.643943,0.696203
6818,48938107-ae25-4ac5-af1f-f5552d1228a9,The athlete threw the while.,38.0,"throw.01: throw, sending through the air, manu...",throw,typical,unnatural,target,169.375210,3679,throw,manual,manual,manual,manual,manual_unnatural_typical,0.955716,0.696203


### Model fitting

To analyze the data, we use a generalized linear mixed effects model with an ordered beta link. This model is implemented in `scripts/analysis/models/ordered-beta.stan`.

In [5]:
# Module-level model object for the ordered-beta Stan program; `fit_hmc` below
# calls `model.sample(...)` on it.
model = CmdStanModel(stan_file="scripts/analysis/models/ordered-beta.stan")

To fit this model to our data, we need to map it into the format assumed by the model's `data` block.

```stan
data {
  int<lower=1> N_resp;                             // number of responses
  int<lower=1> N_subj;                             // number of subjects
  int<lower=1> N_verb;                             // number of verbs
  int<lower=1> N_sense;                            // number of senses
  int<lower=1> N_item;                             // number of items
  int<lower=1> N_fixed;                            // number of fixed predictors
  int<lower=1> N_by_subj;                          // number of random by-subject predictors
  int<lower=1> N_by_verb;                          // number of random by-verb predictors
  int<lower=1> N_by_sense;                         // number of random by-sense predictors
  int<lower=1> N_by_item;                          // number of random by-item predictors
  matrix[N_resp,N_fixed] fixed_predictors;         // predictors including intercept
  matrix[N_resp,N_by_subj] by_subj_predictors;     // by-subject predictors including intercept
  matrix[N_resp,N_by_verb] by_verb_predictors;     // by-verb predictors including intercept
  matrix[N_resp,N_by_sense] by_sense_predictors;   // by-sense predictors including intercept
  matrix[N_resp,N_by_item] by_item_predictors;     // by-item predictors including intercept
  array[N_resp] int<lower=1,upper=N_subj> subj;    // subject who gave response n
  array[N_resp] int<lower=1,upper=N_verb> verb;    // verb corresponding to response n
  array[N_resp] int<lower=1,upper=N_sense> sense;  // sense corresponding to response n
  array[N_resp] int<lower=1,upper=N_item> item;    // item corresponding to response n
  array[N_resp] int<lower=1,upper=3> resp_bin;     // whether a response is 0=1, (0, 1)=2, or 1=3
  array[N_resp] real<lower=0,upper=1> resp;        // [0, 1] responses                                    
}
```

In [6]:
def bin_response(x: float) -> int:
    """Classify a [0, 1] response as lower endpoint, interior, or upper endpoint.

    Returns 1 when ``x`` equals 0, 3 when ``x`` equals 1, and 2 for every
    other value.
    """
    if x == 0.0:
        return 1
    return 3 if x == 1.0 else 2

def prepare_data(
    data: pd.DataFrame,
    fixed_formula: str,
    by_subj_formula: str,
    by_verb_formula: str,
    by_sense_formula: str,
    by_item_formula: str,
    item_cols: list[str],
    sense_cols: list[str],
    subj_cols: list[str] | None = None,
    verb_cols: list[str] | None = None,
    resp_col: str = "rating",
) -> tuple[
    dict[str, int | np.ndarray],
    np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray
]:
    """Map a response frame onto the `data` block of the ordered-beta Stan model.

    Parameters
    ----------
    data
        One row per response.
    fixed_formula, by_subj_formula, by_verb_formula, by_sense_formula, by_item_formula
        `patsy` formulas for the fixed-effects design matrix and for the
        by-subject, by-verb, by-sense and by-item random-effects design matrices.
    item_cols, sense_cols, subj_cols, verb_cols
        String-valued columns whose values are joined with "-" to identify the
        item, sense, subject and verb of each response. ``subj_cols`` defaults
        to ``["rater_id"]`` and ``verb_cols`` to ``["verb"]``.
    resp_col
        Column holding the response; it is divided by 100 before being passed
        to the model.

    Returns
    -------
    tuple
        ``(data_stan, fixed_names, by_subj_names, by_verb_names,
        by_sense_names, by_item_names)``: the dictionary to pass to the sampler,
        followed by the column names of the five design matrices.
    """
    # `None` sentinels instead of mutable list defaults
    if subj_cols is None:
        subj_cols = ["rater_id"]
    if verb_cols is None:
        verb_cols = ["verb"]

    # scale the responses to [0, 1]
    resp = data[resp_col] / 100

    # bin the responses
    resp_bin = resp.map(bin_response)

    # construct the design matrices
    fixed_predictors = dmatrix(fixed_formula, data, return_type="dataframe")
    by_subj_predictors = dmatrix(by_subj_formula, data, return_type="dataframe")
    by_verb_predictors = dmatrix(by_verb_formula, data, return_type="dataframe")
    by_sense_predictors = dmatrix(by_sense_formula, data, return_type="dataframe")
    by_item_predictors = dmatrix(by_item_formula, data, return_type="dataframe")

    # hash the items and subjects
    subjid = data[subj_cols].agg('-'.join, axis=1).astype("category")
    verbid = data[verb_cols].agg('-'.join, axis=1).astype("category")
    senseid = data[sense_cols].agg('-'.join, axis=1).astype("category")
    itemid = data[item_cols].agg('-'.join, axis=1).astype("category")

    data_stan = {
        "N_resp": data.shape[0],
        # number of distinct groups, as plain Python ints
        "N_subj": len(subjid.cat.categories),
        "N_verb": len(verbid.cat.categories),
        "N_sense": len(senseid.cat.categories),
        "N_item": len(itemid.cat.categories),
        "N_fixed": fixed_predictors.shape[1],
        "N_by_subj": by_subj_predictors.shape[1],
        "N_by_verb": by_verb_predictors.shape[1],
        "N_by_sense": by_sense_predictors.shape[1],
        "N_by_item": by_item_predictors.shape[1],
        "fixed_predictors": fixed_predictors.values,
        "by_subj_predictors": by_subj_predictors.values,
        "by_verb_predictors": by_verb_predictors.values,
        "by_sense_predictors": by_sense_predictors.values,
        "by_item_predictors": by_item_predictors.values,
        # category codes are 0-based; the Stan index arrays are declared lower=1
        "subj": subjid.cat.codes.values + 1,
        "verb": verbid.cat.codes.values + 1,
        "sense": senseid.cat.codes.values + 1,
        "item": itemid.cat.codes.values + 1,
        "resp_bin": resp_bin.values.astype(int),
        "resp": resp.values
    }

    return (
        data_stan,
        fixed_predictors.columns.values,
        by_subj_predictors.columns.values,
        by_verb_predictors.columns.values,
        by_sense_predictors.columns.values,
        by_item_predictors.columns.values,
    )

The estimates for the distribution over fixed and random effects reported in the paper are extracted from the model fits using:

In [7]:
def fixed_coef_stats(fit: CmdStanMCMC, coef_names: list[str]) -> pd.DataFrame:
    """Summarise the posterior draws of the fixed-effect coefficients.

    Returns one row per name in ``coef_names`` with the posterior mean, the
    2.5%, 5%, 95% and 97.5% quantiles, and ``p``: the proportion of draws
    whose sign differs from the sign of the posterior mean.
    """
    draws = fit.stan_variable("fixed_coefs")
    means = draws.mean(axis=0)

    q025, q05, q95, q975 = np.quantile(draws, [0.025, 0.05, 0.95, 0.975], axis=0)

    # broadcast the per-coefficient sign of the mean against every draw
    opposite_sign = np.sign(means) != np.sign(draws)

    return pd.DataFrame(
        {
            "post_mean": means,
            "2.5%": q025,
            "5%": q05,
            "95%": q95,
            "97.5%": q975,
            "p": opposite_sign.mean(axis=0),
        },
        index=coef_names,
    )

def random_coef_stats(
    fit: CmdStanMCMC, 
    var_name: Literal[
        "subj_cov", "subj_corr", 
        "verb_cov", "verb_corr", 
        "sense_cov", "sense_corr", 
        "item_cov", "item_corr"
    ], 
    coef_names: list[str]
) -> pd.DataFrame:
    """Average the draws of a random-effects matrix variable.

    ``var_name`` is read from ``fit``, averaged over its first axis, and
    returned as a frame whose rows and columns are both labelled by
    ``coef_names``.
    """
    mean_matrix = fit.stan_variable(var_name).mean(axis=0)

    return pd.DataFrame(mean_matrix, index=coef_names, columns=coef_names)


And the actual model fits are done using:

In [8]:
@dataclass
class ModelResults:
    """A fitted model bundled with its posterior summaries.

    Instances are built by `fit_hmc` and `load_fit`: `fixed_coefs` comes from
    `fixed_coef_stats`, and every `*_cov` / `*_corr` field comes from
    `random_coef_stats` called on the Stan variable of the same name.
    """
    # the underlying sampler output
    fit: CmdStanMCMC
    # per-coefficient posterior mean, quantiles and posterior p
    fixed_coefs: pd.DataFrame
    # posterior means of the random-effects covariance matrices
    subj_cov:  pd.DataFrame
    verb_cov:  pd.DataFrame
    sense_cov: pd.DataFrame
    item_cov:  pd.DataFrame
    # posterior means of the random-effects correlation matrices
    subj_corr: pd.DataFrame
    verb_corr: pd.DataFrame
    sense_corr: pd.DataFrame
    item_corr: pd.DataFrame

def fit_hmc(
    data: pd.DataFrame,
    fixed_formula: str,
    by_subj_formula: str,
    by_verb_formula: str,
    by_sense_formula: str,
    by_item_formula: str,
    item_cols: list[str],
    subj_cols: list[str],
    verb_cols: list[str],
    sense_cols: list[str],
    seed: int = 30298,
    **kwargs
) -> ModelResults:
    """Sample from the module-level `model` and summarise the fit.

    The data and formulas are converted with `prepare_data`; ``seed`` and any
    extra keyword arguments are forwarded to ``model.sample``. The fixed- and
    random-effect summaries are returned together with the fit as a
    `ModelResults`.
    """
    (
        data_stan,
        fixed_names,
        by_subj_names,
        by_verb_names,
        by_sense_names,
        by_item_names,
    ) = prepare_data(
        data,
        fixed_formula=fixed_formula,
        by_subj_formula=by_subj_formula,
        by_verb_formula=by_verb_formula,
        by_sense_formula=by_sense_formula,
        by_item_formula=by_item_formula,
        subj_cols=subj_cols,
        verb_cols=verb_cols,
        sense_cols=sense_cols,
        item_cols=item_cols
    )

    fit = model.sample(data=data_stan, seed=seed, **kwargs)

    fixed_summary = fixed_coef_stats(fit, fixed_names)

    # one summary per Stan variable "<level>_<kind>", e.g. "subj_cov"
    names_by_level = {
        "subj": by_subj_names,
        "verb": by_verb_names,
        "sense": by_sense_names,
        "item": by_item_names,
    }
    random_summaries = {
        f"{level}_{kind}": random_coef_stats(fit, f"{level}_{kind}", names)
        for kind in ("cov", "corr")
        for level, names in names_by_level.items()
    }

    return ModelResults(fit=fit, fixed_coefs=fixed_summary, **random_summaries)

Because we don't want to have to rerun the models if we already have them cached, we'll also define a method for loading a fit from CSVs dumped by `cmdstanpy`.

In [9]:
def load_fit(
    path: str,
    data: pd.DataFrame,
    fixed_formula: str,
    by_subj_formula: str,
    by_verb_formula: str,
    by_sense_formula: str,
    by_item_formula: str,
    item_cols: list[str],
    subj_cols: list[str],
    verb_cols: list[str],
    sense_cols: list[str],
) -> ModelResults:
    """Load a cached fit from CSV files and summarise it.

    `prepare_data` is run only to recover the design-matrix column names used
    to label the summaries; the Stan data dictionary it returns is discarded.
    The fit itself is read from ``path`` with ``from_csv``.
    """
    (
        _,
        fixed_names,
        by_subj_names,
        by_verb_names,
        by_sense_names,
        by_item_names,
    ) = prepare_data(
        data,
        fixed_formula=fixed_formula,
        by_subj_formula=by_subj_formula,
        by_verb_formula=by_verb_formula,
        by_sense_formula=by_sense_formula,
        by_item_formula=by_item_formula,
        subj_cols=subj_cols,
        verb_cols=verb_cols,
        sense_cols=sense_cols,
        item_cols=item_cols
    )

    fit = from_csv(path, method="sample")

    fixed_summary = fixed_coef_stats(fit, fixed_names)

    # one summary per Stan variable "<level>_<kind>", e.g. "subj_cov"
    names_by_level = {
        "subj": by_subj_names,
        "verb": by_verb_names,
        "sense": by_sense_names,
        "item": by_item_names,
    }
    random_summaries = {
        f"{level}_{kind}": random_coef_stats(fit, f"{level}_{kind}", names)
        for kind in ("cov", "corr")
        for level, names in names_by_level.items()
    }

    return ModelResults(fit=fit, fixed_coefs=fixed_summary, **random_summaries)

We use the following sampler parameters throughout.

In [11]:
# keyword arguments forwarded to the sampler by every `fit_hmc` call below
sampler_params = dict(
    iter_warmup=2500,
    iter_sampling=2500,
)

And to enforce a particular coding of the generation levels, we specify that ordering as a list that will be passed to `patsy`.

In [12]:
# order of the `generation_method` levels handed to patsy's C(..., levels=...)
generation_levels = [
    # manually constructed items
    'manual_natural_typical',
    'manual_natural_atypical',
    'manual_unnatural_typical',
    'manual_unnatural_atypical',
    # automatically generated items
    'reddit',
    'llama-propbanksense',
    'llama-llamasense',
]

Finally, we actually fit the model (or load it if we have a cached fit).

In [13]:
if glob("fits/nat/base/*.csv"):
    results_nat = load_fit(
        "fits/nat/base/*.csv", 
        data_nat,
        fixed_formula="~ 1 + C(generation_method, levels=generation_levels)",
        by_subj_formula="~ 1", # cannot fit anything bigger, because not all subjects saw items from every generation method
        by_verb_formula="~ 1", # cannot fit anything bigger, because not all verbs show up with each generation method
        by_sense_formula="~ 1", # cannot fit anything bigger, because not all verb senses show up with each generation method
        by_item_formula="~ 1", # cannot fit anything bigger, because items are specific to generation method
        item_cols = ["sentence"],
        subj_cols = ["rater_id"],
        verb_cols = ["verb"],
        sense_cols = ["verb", "sense"],
    )
else:
    results_nat = fit_hmc(
        data_nat,
        fixed_formula="~ 1 + C(generation_method, levels=generation_levels)",
        by_subj_formula="~ 1", # cannot fit anything bigger, because not all subjects saw items from every generation method
        by_verb_formula="~ 1", # cannot fit anything bigger, because not all verbs show up with each generation method
        by_sense_formula="~ 1", # cannot fit anything bigger, because not all verb senses show up with each generation method
        by_item_formula="~ 1", # cannot fit anything bigger, because items are specific to generation method
        item_cols = ["sentence"],
        subj_cols = ["rater_id"],
        verb_cols = ["verb"],
        sense_cols = ["verb", "sense"],
        output_dir="fits/nat/base",
        **sampler_params
    )

    results_nat.fit.diagnose()

    # handles a bug in cmdstanpy.from_csv that I suspect has to do with a version change in STAN
    # basically, I *think* STAN used to dump the save_warmup flag as an int, but now it uses a boolean
    !find . -type f -wholename './fits/nat/base/*.csv' | xargs sed -i 's/save_warmup = false/save_warmup = 0/g'

We can see the fixed effects coefficient estimates by looking at the `fixed_coefs` attribute of the `ModelResults`.

In [15]:
# fixed-effect summary (posterior mean, quantiles, posterior p) of the naturalness model
results_nat.fixed_coefs

Unnamed: 0,post_mean,2.5%,5%,95%,97.5%,p
Intercept,2.952102,2.610259,2.664675,3.244996,3.295697,0.0
"C(generation_method, levels=generation_levels)[T.manual_natural_atypical]",-2.151,-2.497434,-2.442919,-1.861324,-1.80863,0.0
"C(generation_method, levels=generation_levels)[T.manual_unnatural_typical]",-4.52374,-4.87066,-4.807511,-4.241509,-4.183922,0.0
"C(generation_method, levels=generation_levels)[T.manual_unnatural_atypical]",-4.75992,-5.103091,-5.046091,-4.47345,-4.414772,0.0
"C(generation_method, levels=generation_levels)[T.reddit]",-1.705771,-2.079491,-2.020847,-1.389939,-1.333382,0.0
"C(generation_method, levels=generation_levels)[T.llama-propbanksense]",-1.883348,-2.250912,-2.190697,-1.579477,-1.519272,0.0
"C(generation_method, levels=generation_levels)[T.llama-llamasense]",-1.840907,-2.198644,-2.14205,-1.541401,-1.4829,0.0


We generate the $\LaTeX$ table used in the paper using the following code.

In [36]:
def print_fixed_coef_tabular(fit: ModelResults, coef_map: Callable[[str], str]) -> None:
    """Print the fixed-effect summary of ``fit`` as a LaTeX ``tabular``.

    Parameters
    ----------
    fit
        Model results whose ``fixed_coefs`` frame provides the ``post_mean``,
        ``2.5%``, ``97.5%`` and ``p`` columns.
    coef_map
        Function that turns a coefficient name (the index of ``fixed_coefs``)
        into the row label to print.
    """
    print(r"\begin{tabular}{rcccr}")

    print(r"\toprule")
    print(r"      & \textbf{Post. mean} & \textbf{2.5\%} & \textbf{97.5\%} & \textbf{Post.} $p$ \\")
    print(r"\midrule")

    for coef, row in fit.fixed_coefs.iterrows():
        # The table shows two decimals, so anything below 0.01 is reported as
        # "< 0.01"; the threshold matches the printed label.
        p_cell = "$<$ 0.01" if row.p < 0.01 else f"{row.p:2.2f}"

        print(f"{coef_map(coef)} & {row.post_mean:2.2f} & {row['2.5%']:2.2f} & {row['97.5%']:2.2f} & {p_cell} \\\\")

    print(r"\bottomrule")
    print(r"\end{tabular}")

# generation-method level -> row label used in the LaTeX tables
coef_map = {
    # manually constructed items
    "manual_natural_typical": "Manual (Natural \\& Typical )",
    "manual_natural_atypical": "Manual (Natural \\& Atypical)",
    "manual_unnatural_typical": "Manual (Unnatural \\& Typical)",
    "manual_unnatural_atypical": "Manual (Unnatural \\& Atypical)",
    # automatically generated items
    "reddit": "Corpus",
    "llama": "LM",
    "llama-propbanksense": "LM with PropBank senses",
    "llama-llamasense": "LM with LM senses",
}

def process_coef_name_nat_typ(coef: str) -> str:
    """Turn a patsy coefficient name into its table label.

    ``"Intercept"`` is returned unchanged. For any other name, the level inside
    ``C(generation_method, levels=generation_levels)[T.<level>]`` is looked up
    in the module-level ``coef_map``; multiple matches are joined with
    ``" $\\times$ "``.
    """
    if coef == "Intercept":
        return coef

    # raw string: the pattern is full of backslash escapes meant for `re`
    levels = re.findall(
        r"C\(generation_method, levels=generation_levels\)\[T\.(.*)\]",
        coef
    )

    return " $\\times$ ".join(coef_map[level] for level in levels)

In [15]:
# print the LaTeX table for the naturalness model, labelling rows via `process_coef_name_nat_typ`
print_fixed_coef_tabular(results_nat, process_coef_name_nat_typ)

\begin{tabular}{rcccr}
\toprule
      & \textbf{Post. mean} & \textbf{2.5\%} & \textbf{97.5\%} & \textbf{Post.} $p$ \\
\midrule
Intercept & 2.95 & 2.61 & 3.30 & $<$ 0.01 \\
Manual (Natural \& Atypical) & -2.15 & -2.50 & -1.81 & $<$ 0.01 \\
Manual (Unnatural \& Typical) & -4.52 & -4.87 & -4.18 & $<$ 0.01 \\
Manual (Unnatural \& Atypical) & -4.76 & -5.10 & -4.41 & $<$ 0.01 \\
Corpus & -1.71 & -2.08 & -1.33 & $<$ 0.01 \\
LM with PropBank senses & -1.88 & -2.25 & -1.52 & $<$ 0.01 \\
LM with LM senses & -1.84 & -2.20 & -1.48 & $<$ 0.01 \\
\bottomrule
\end{tabular}


### Cutpoint distributions

In the caption of Table 8 of the paper, we report estimates of the posterior distributions over cutpoints. These estimates are calculated below.

In [14]:
def cutpoint_stats(fit: CmdStanMCMC) -> tuple[np.ndarray, np.ndarray]: 
    """Print and return posterior summaries of the two cutpoints.

    The first cutpoint is the Stan variable ``cutpoint0``; the second is
    ``cutpoint0 + exp(interval_size_logmean)``. Each summary is the
    (2.5%, 50%, 97.5%) quantiles rounded to two decimals.
    """
    lower_draws = fit.stan_variable("cutpoint0")
    upper_draws = lower_draws + np.exp(fit.stan_variable("interval_size_logmean"))

    def summarize(draws):
        # (2.5%, median, 97.5%), rounded for reporting
        return np.round(np.quantile(draws, [0.025, 0.5, 0.975]), 2)

    cutpoint0_stats = summarize(lower_draws)
    cutpoint1_stats = summarize(upper_draws)

    for label, (low, median, high) in (
        ("Cutpoint 0:", cutpoint0_stats),
        ("Cutpoint 1:", cutpoint1_stats),
    ):
        print(label, median, f"(95\\% CI = [{low}, {high}])")

    return cutpoint0_stats, cutpoint1_stats

In [None]:

# print the cutpoint summaries for the naturalness model; the returned arrays are discarded
_ = cutpoint_stats(results_nat.fit)

### Difference between manual generation and automatic generation

In Section 4.5 of the paper, we discuss the differences between the ratings for manually generated items that were constructed to be natural and those for automatically generated items. The difference between the manually generated natural, typical items and the automatically generated items can be read directly off the coefficient estimates, because the natural, typical items are the reference level. The difference between the manually generated natural, atypical items and the automatically generated items needs to be calculated from the posterior samples.

To assess whether the automatic generation methods produce examples that are more natural than the manually generated natural, atypical examples, we compute the posterior distribution of the difference between the automatic effects and the manual natural-atypical effect, then test whether it is greater than 0. The proportion of samples on which it is greater than 0 gives us the posterior $p$ that the automatic effects are indeed more positive.

In [18]:
# fetch the fixed-effect draws once; columns follow the order of `generation_levels`
fixed_coef_draws = results_nat.fit.stan_variable("fixed_coefs")

# column 1: manual natural-atypical effect (kept 2-D so it broadcasts)
natural_atypical_effect = fixed_coef_draws[:, [1]]
# columns 4 onward: the automatic generation methods
automatic_effects = fixed_coef_draws[:, 4:]

# per method, the proportion of draws where it exceeds the natural-atypical effect
effect_differences = automatic_effects - natural_atypical_effect
mean_difference_gt0 = (effect_differences > 0).mean(0)

for coef_name, d in zip(generation_levels[4:], mean_difference_gt0):
    p_text = (
        "(posterior $p >$ 0.99)"
        if d > 0.99
        else f"(posterior $p = $ {np.round(d, 2)})"
    )
    print(coef_map[coef_name], p_text)

Corpus (posterior $p >$ 0.99)
LM with PropBank senses (posterior $p = $ 0.94)
LM with LM senses (posterior $p = $ 0.97)


### Predicting naturalness from frequency

In Section 4.6 of the paper, we report regressions assessing whether our automated methods are frequency-sensitive with respect to naturalness, finding that they are not. These regressions are conducted below.

In [15]:
# keep every non-manual row plus the manual natural-typical rows
is_not_manual = data_nat.generation_method_type != "manual"
is_natural_typical = data_nat.generation_method == "manual_natural_typical"

data_nat_freqsub = data_nat[is_not_manual | is_natural_typical]

data_nat_freqsub

Unnamed: 0,rater_id,sentence,rating,sense,verb,typicality,naturalness,sentence_type,surprisal,freq,verbnet_class,list_type,generation_method_type,generation_method_subtype,generation_method_subsubtype,generation_method,surprisal_z,freq_z
305,83bda277-9b4f-4c6e-a23b-a79ece20cc5c,The washer shrunk the laundry.,100.0,shrink.01: become or make smaller,shrink,typical,natural,calibration,117.04286,1846,bend,llama-propbanksense,manual,manual,manual,manual_natural_typical,-1.056040,-0.538627
306,8bc64c84-3e28-4f83-abaf-e585ff40ffe5,The washer shrunk the laundry.,83.0,shrink.01: become or make smaller,shrink,typical,natural,calibration,117.04286,1846,bend,llama-propbanksense,manual,manual,manual,manual_natural_typical,-1.056040,-0.538627
307,f09582f5-d39c-41eb-ab67-ff696f2c3459,The washer shrunk the laundry.,100.0,shrink.01: become or make smaller,shrink,typical,natural,calibration,117.04286,1846,bend,llama-propbanksense,manual,manual,manual,manual_natural_typical,-1.056040,-0.538627
308,e41ede6f-b8a8-4424-a276-520d63d0af85,The washer shrunk the laundry.,100.0,shrink.01: become or make smaller,shrink,typical,natural,calibration,117.04286,1846,bend,llama-propbanksense,manual,manual,manual,manual_natural_typical,-1.056040,-0.538627
309,03eaf938-1208-40ce-9b0b-bb58cc6048c9,The washer shrunk the laundry.,100.0,shrink.01: become or make smaller,shrink,typical,natural,calibration,117.04286,1846,bend,llama-propbanksense,manual,manual,manual,manual_natural_typical,-1.056040,-0.538627
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6800,48938107-ae25-4ac5-af1f-f5552d1228a9,The cook smashed the potatoes.,100.0,"smash.02: move violently, so as to break apart",smash,typical,natural,target,157.26427,2442,"throw,hit",manual,manual,manual,manual,manual_natural_typical,0.490149,-0.137122
6804,48938107-ae25-4ac5-af1f-f5552d1228a9,The couple split the check.,100.0,"split.01: divide, separate",split,typical,natural,target,128.58272,2218,break,manual,manual,manual,manual,manual_natural_typical,-0.612425,-0.288023
6808,48938107-ae25-4ac5-af1f-f5552d1228a9,The thief stole the painting.,100.0,steal.01: to commit theft,steal,typical,natural,target,104.60133,5133,steal,manual,manual,manual,manual,manual_natural_typical,-1.534316,1.675713
6812,48938107-ae25-4ac5-af1f-f5552d1228a9,The kid tore the paper.,100.0,"tear.01: to pull apart, separate forcefully",tear,typical,natural,target,157.81557,1030,hurt,manual,manual,manual,manual,manual_natural_typical,0.511342,-1.088339


In [17]:
# order of the `generation_method_subtype` levels handed to patsy's C(..., levels=...)
generation_subtype_levels = [
    "manual",
    "reddit",
    "llama",
]

In [18]:
if glob("fits/nat/frequency/*.csv"):
    results_nat_freq = load_fit(
        "fits/nat/frequency/*.csv",
        data_nat_freqsub,
        fixed_formula="~ 1 + C(generation_method_subtype, levels=generation_subtype_levels) * freq_z",
        by_subj_formula="~ 1 + freq_z", # cannot fit anything bigger, because not all subjects saw items from every generation method
        by_verb_formula="~ 1", # cannot fit anything bigger, because not all verbs show up with each generation method
        by_sense_formula="~ 1", # cannot fit anything bigger, because not all verb senses show up with each generation method
        by_item_formula="~ 1", # cannot fit anything bigger, because items are specific to generation method
        item_cols = ["sentence"],
        subj_cols = ["rater_id"],
        verb_cols = ["verb"],
        sense_cols = ["verb", "sense"]
    )
else:
    results_nat_freq = fit_hmc(
        data_nat_freqsub,
        fixed_formula="~ 1 + C(generation_method_subtype, levels=generation_subtype_levels) * freq_z",
        by_subj_formula="~ 1 + freq_z", # cannot fit anything bigger, because not all subjects saw items from every generation method
        by_verb_formula="~ 1", # cannot fit anything bigger, because not all verbs show up with each generation method
        by_sense_formula="~ 1", # cannot fit anything bigger, because not all verb senses show up with each generation method
        by_item_formula="~ 1", # cannot fit anything bigger, because items are specific to generation method
        item_cols = ["sentence"],
        subj_cols = ["rater_id"],
        verb_cols = ["verb"],
        sense_cols = ["verb", "sense"],
        output_dir="fits/nat/frequency",
        **sampler_params
    )

    results_nat_freq.fit.diagnose()

    # handles a bug in cmdstanpy.from_csv that I suspect has to do with a version change in STAN
    # basically, I *think* STAN used to dump the save_warmup flag as an int, but now it uses a boolean
    !find . -type f -wholename './fits/nat/frequency/*.csv' | xargs sed -i 's/save_warmup = false/save_warmup = 0/g'

15:01:51 - cmdstanpy - INFO - created output directory: /home/jovyan/fits/nat/frequency
15:01:51 - cmdstanpy - INFO - CmdStan start processing


chain 1 |          | 00:00 Status

chain 2 |          | 00:00 Status

chain 3 |          | 00:00 Status

chain 4 |          | 00:00 Status

                                                                                                                                                                                                                                                                                                                                

17:18:27 - cmdstanpy - INFO - CmdStan done processing.
Exception: lkj_corr_lpdf: Correlation matrix is not positive definite. (in '/home/jovyan/scripts/analysis/models/ordered-beta.stan', line 121, column 2 to column 26)
	Exception: lkj_corr_lpdf: Correlation matrix is not positive definite. (in '/home/jovyan/scripts/analysis/models/ordered-beta.stan', line 121, column 2 to column 26)
	Exception: lkj_corr_lpdf: Correlation matrix is not positive definite. (in '/home/jovyan/scripts/analysis/models/ordered-beta.stan', line 121, column 2 to column 26)
	Exception: lkj_corr_lpdf: Correlation matrix is not positive definite. (in '/home/jovyan/scripts/analysis/models/ordered-beta.stan', line 121, column 2 to column 26)
	Exception: ordered_logistic: Cut-points is not a valid ordered vector. The element at 2 is -12918.2, but should be greater than the previous element, -12918.2 (in '/home/jovyan/scripts/analysis/models/ordered-beta.stan', line 169, column 4 to line 172, column 6)
	Exception: lk




In [21]:
# fixed-effect summary (posterior mean, quantiles, posterior p) of the frequency model
results_nat_freq.fixed_coefs

Unnamed: 0,post_mean,2.5%,5%,95%,97.5%,p
Intercept,2.261613,1.892828,1.95159,2.578408,2.640529,0.0
"C(generation_method_subtype, levels=generation_subtype_levels)[T.reddit]",-0.892174,-1.325125,-1.253843,-0.543067,-0.474151,0.0
"C(generation_method_subtype, levels=generation_subtype_levels)[T.llama]",-0.965871,-1.350809,-1.290174,-0.645476,-0.58363,0.0
freq_z,0.197306,-0.143748,-0.084003,0.481029,0.537089,0.1296
"C(generation_method_subtype, levels=generation_subtype_levels)[T.reddit]:freq_z",-0.089678,-0.458895,-0.400567,0.21883,0.27721,0.3156
"C(generation_method_subtype, levels=generation_subtype_levels)[T.llama]:freq_z",-0.095564,-0.45101,-0.392031,0.20092,0.259494,0.3007


In [55]:
# add a table label for the frequency predictor to the shared `coef_map` (mutates it in place)
coef_map["freq_z"] = "Frequency"

def process_coef_name_freq(coef: str) -> str:
    if coef == "Intercept":
        return coef
    else:
        return " $\\times$ ".join(
            coef_map[v] for vs in re.findall(
                "(?:C\(generation_method_subtype, levels=generation_subtype_levels\)\[T\.(.*?)\])?:?(.*)", 
                coef
            ) for v in vs if v
        )

print_fixed_coef_tabular(results_nat_freq, process_coef_name_freq)

\begin{tabular}{rcccr}
\toprule
      & \textbf{Post. mean} & \textbf{2.5\%} & \textbf{97.5\%} & \textbf{Post.} $p$ \\
\midrule
Intercept & 2.26 & 1.89 & 2.64 & $<$ 0.01 \\
Corpus & -0.89 & -1.33 & -0.47 & $<$ 0.01 \\
LM & -0.97 & -1.35 & -0.58 & $<$ 0.01 \\
Frequency & 0.20 & -0.14 & 0.54 & 0.13 \\
Corpus $\times$ Frequency & -0.09 & -0.46 & 0.28 & 0.32 \\
LM $\times$ Frequency & -0.10 & -0.45 & 0.26 & 0.30 \\
\bottomrule
\end{tabular}


In [20]:
# Print the cutpoint summaries for the frequency fit; the return value is deliberately discarded.
_ = cutpoint_stats(results_nat_freq.fit)

Cutpoint 0: -2.43 (95\% CI = [-2.53, -2.32])
Cutpoint 1: 0.05 (95\% CI = [-0.25, 0.37])


## Experiment 2: Typicality

We begin analysis of the typicality experiment by loading and combining the typicality datasets into a single `pandas.DataFrame`.

In [56]:
typ_fnames = [
    "typicality_llama-propbanksenses.csv",
    "typicality_llama-llamasenses.csv",
    "typicality_reddit.csv",
    "typicality_manual.csv"
]

# One annotated frame per file. Kept under its own name so that `data_typ` only ever
# refers to the combined DataFrame.
typ_frames = []

for fname in typ_fnames:
    df = pd.read_csv(f"data/{fname}")

    # NOTE: str.strip(".csv") removes any run of the characters ".", "c", "s", "v" from
    # both ends rather than the literal ".csv" suffix, so the trailing "s" of "...senses"
    # is removed as well (giving e.g. "llama-propbanksense"). The keys of
    # `expected_counts_typ` in the participant-filtration cell use that spelling, so the
    # call is intentionally left unchanged.
    generation_method = fname.replace("typicality_", "").strip(".csv")

    if generation_method == "manual":
        # Every level of the generation-method hierarchy is "manual" for this list.
        df["list_type"] = "manual"
        df["generation_method_type"] = "manual"
        df["generation_method_subtype"] = "manual"
        df["generation_method_subsubtype"] = "manual"
        # .copy() detaches the filtered rows from the frame read from disk, so the column
        # assignment below does not raise pandas' SettingWithCopyWarning.
        df = df.query("sentence_type == 'target'").copy()
        df["generation_method"] = "manual_" + df.naturalness + "_" + df.typicality
    else:
        # Calibration sentences inside an automated list are labelled "manual" at every
        # level; all other sentences are labelled with the list's generation method.
        df["list_type"] = generation_method
        df["generation_method_type"] = df.sentence_type.map(lambda x: "manual" if x == "calibration" else "automated")
        df["generation_method_subtype"] = df.sentence_type.map(
            lambda x: "manual" if x == "calibration" else generation_method.split("-")[0]
        )
        df["generation_method_subsubtype"] = df.sentence_type.map(
            lambda x: "manual" if x == "calibration" else generation_method
        )
        df["generation_method"] = df[["sentence_type", "naturalness", "typicality"]].agg(
            lambda x: "manual_" + x.naturalness + "_" + x.typicality if x.sentence_type == "calibration" else generation_method,
            axis=1
        )

    typ_frames.append(df)

data_typ = pd.concat(typ_frames)

# z-score ratings within each rater.
data_typ["rating_z"] = data_typ.groupby("rater_id").rating.transform(
    lambda x: (x - x.mean())/x.std()
)

# z-score surprisal and frequency over the whole combined dataset.
data_typ["surprisal_z"] = (data_typ.surprisal - data_typ.surprisal.mean())/data_typ.surprisal.std()
data_typ["freq_z"] = (data_typ.freq - data_typ.freq.mean())/data_typ.freq.std()

data_typ

Unnamed: 0,rater_id,sentence,rating,sense,verb,typicality,naturalness,sentence_type,surprisal,freq,verbnet_class,list_type,generation_method_type,generation_method_subtype,generation_method_subsubtype,generation_method,rating_z,surprisal_z,freq_z
0,56122d99-f13f-4674-a58a-5ece8f7973f6,The chef drove the limo.,37.0,drive.01: drive or be driven around in a vehicle,drive,atypical,natural,calibration,113.467090,3239,throw,llama-propbanksense,manual,manual,manual,manual_natural_atypical,-0.179108,-1.121964,0.348878
1,80e45189-916a-4f4a-9690-e83ef3ef5c2d,The chef drove the limo.,35.0,drive.01: drive or be driven around in a vehicle,drive,atypical,natural,calibration,113.467090,3239,throw,llama-propbanksense,manual,manual,manual,manual_natural_atypical,-0.477253,-1.121964,0.348878
2,58c5437e-7879-4200-9cf3-b1b892d47fd6,The chef drove the limo.,3.0,drive.01: drive or be driven around in a vehicle,drive,atypical,natural,calibration,113.467090,3239,throw,llama-propbanksense,manual,manual,manual,manual_natural_atypical,-1.294155,-1.121964,0.348878
3,6ad341a1-12e3-4a39-9ab0-77a32b7592bd,The chef drove the limo.,16.0,drive.01: drive or be driven around in a vehicle,drive,atypical,natural,calibration,113.467090,3239,throw,llama-propbanksense,manual,manual,manual,manual_natural_atypical,-1.101403,-1.121964,0.348878
4,0d04029d-6889-4a8f-a56c-c7f81cdb0b9a,The chef drove the limo.,51.0,drive.01: drive or be driven around in a vehicle,drive,atypical,natural,calibration,113.467090,3239,throw,llama-propbanksense,manual,manual,manual,manual_natural_atypical,-0.257870,-1.121964,0.348878
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6567,e3482363-c474-4acf-af4b-10c3bfdeba1f,The after tore the brick.,0.0,"tear.01: to pull apart, separate forcefully",tear,atypical,unnatural,target,139.264510,1030,hurt,manual,manual,manual,manual,manual_unnatural_atypical,-0.640775,-0.122589,-1.098299
6568,e3482363-c474-4acf-af4b-10c3bfdeba1f,The athlete threw the javelin.,100.0,"throw.01: throw, sending through the air, manu...",throw,typical,natural,target,160.484510,3679,throw,manual,manual,manual,manual,manual_natural_typical,1.723989,0.699460,0.637134
6569,e3482363-c474-4acf-af4b-10c3bfdeba1f,The jellyfish threw the javelin.,0.0,"throw.01: throw, sending through the air, manu...",throw,atypical,natural,target,101.749570,3679,throw,manual,manual,manual,manual,manual_natural_atypical,-0.640775,-1.575893,0.637134
6570,e3482363-c474-4acf-af4b-10c3bfdeba1f,The athlete threw the while.,0.0,"throw.01: throw, sending through the air, manu...",throw,typical,unnatural,target,169.375210,3679,throw,manual,manual,manual,manual,manual_unnatural_typical,-0.640775,1.043880,0.637134


### Participant filtration

Some participants did not complete the survey. We remove these participants.

In [57]:
# Expected counts per list type (the keys match the `list_type` labels assigned when the
# data were loaded). Presumably the number of ratings a participant who finished the
# survey provides for that list — confirm against `filter_participants`.
expected_counts_typ = {
    "llama-llamasense": 97,
    "llama-propbanksense": 97,
    "manual": 124,
    "reddit": 72
}

# Rebind `data_typ` to the frame returned by `filter_participants` (defined elsewhere in
# this notebook); the unfiltered frame is no longer reachable under this name afterwards.
data_typ = filter_participants(data_typ, expected_counts_typ)

data_typ

Unnamed: 0,rater_id,sentence,rating,sense,verb,typicality,naturalness,sentence_type,surprisal,freq,verbnet_class,list_type,generation_method_type,generation_method_subtype,generation_method_subsubtype,generation_method,rating_z,surprisal_z,freq_z
0,56122d99-f13f-4674-a58a-5ece8f7973f6,The chef drove the limo.,37.0,drive.01: drive or be driven around in a vehicle,drive,atypical,natural,calibration,113.467090,3239,throw,llama-propbanksense,manual,manual,manual,manual_natural_atypical,-0.179108,-1.121964,0.348878
1,80e45189-916a-4f4a-9690-e83ef3ef5c2d,The chef drove the limo.,35.0,drive.01: drive or be driven around in a vehicle,drive,atypical,natural,calibration,113.467090,3239,throw,llama-propbanksense,manual,manual,manual,manual_natural_atypical,-0.477253,-1.121964,0.348878
2,58c5437e-7879-4200-9cf3-b1b892d47fd6,The chef drove the limo.,3.0,drive.01: drive or be driven around in a vehicle,drive,atypical,natural,calibration,113.467090,3239,throw,llama-propbanksense,manual,manual,manual,manual_natural_atypical,-1.294155,-1.121964,0.348878
3,6ad341a1-12e3-4a39-9ab0-77a32b7592bd,The chef drove the limo.,16.0,drive.01: drive or be driven around in a vehicle,drive,atypical,natural,calibration,113.467090,3239,throw,llama-propbanksense,manual,manual,manual,manual_natural_atypical,-1.101403,-1.121964,0.348878
4,0d04029d-6889-4a8f-a56c-c7f81cdb0b9a,The chef drove the limo.,51.0,drive.01: drive or be driven around in a vehicle,drive,atypical,natural,calibration,113.467090,3239,throw,llama-propbanksense,manual,manual,manual,manual_natural_atypical,-0.257870,-1.121964,0.348878
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6567,e3482363-c474-4acf-af4b-10c3bfdeba1f,The after tore the brick.,0.0,"tear.01: to pull apart, separate forcefully",tear,atypical,unnatural,target,139.264510,1030,hurt,manual,manual,manual,manual,manual_unnatural_atypical,-0.640775,-0.122589,-1.098299
6568,e3482363-c474-4acf-af4b-10c3bfdeba1f,The athlete threw the javelin.,100.0,"throw.01: throw, sending through the air, manu...",throw,typical,natural,target,160.484510,3679,throw,manual,manual,manual,manual,manual_natural_typical,1.723989,0.699460,0.637134
6569,e3482363-c474-4acf-af4b-10c3bfdeba1f,The jellyfish threw the javelin.,0.0,"throw.01: throw, sending through the air, manu...",throw,atypical,natural,target,101.749570,3679,throw,manual,manual,manual,manual,manual_natural_atypical,-0.640775,-1.575893,0.637134
6570,e3482363-c474-4acf-af4b-10c3bfdeba1f,The athlete threw the while.,0.0,"throw.01: throw, sending through the air, manu...",throw,typical,unnatural,target,169.375210,3679,throw,manual,manual,manual,manual,manual_unnatural_typical,-0.640775,1.043880,0.637134


### Model fitting

We conduct the typicality model fits using the same setup as we used for naturalness.

In [28]:
if glob("fits/typ/base/*.csv"):
    results_typ = load_fit(
        "fits/typ/base/*.csv", 
        data_nat,
        fixed_formula="~ 1 + C(generation_method, levels=generation_levels)",
        by_subj_formula="~ 1", # cannot fit anything bigger, because not all subjects saw items from every generation method
        by_verb_formula="~ 1", # cannot fit anything bigger, because not all verbs show up with each generation method
        by_sense_formula="~ 1", # cannot fit anything bigger, because not all verb senses show up with each generation method
        by_item_formula="~ 1", # cannot fit anything bigger, because items are specific to generation method
        item_cols = ["sentence"],
        subj_cols = ["rater_id"],
        verb_cols = ["verb"],
        sense_cols = ["verb", "sense"],
    )
else:
    results_typ = fit_hmc(
        data_typ,
        fixed_formula="~ 1 + C(generation_method, levels=generation_levels)",
        by_subj_formula="~ 1", # cannot fit anything bigger, because not all subjects saw items from every generation method
        by_verb_formula="~ 1", # cannot fit anything bigger, because not all verbs show up with each generation method
        by_sense_formula="~ 1", # cannot fit anything bigger, because not all verb senses show up with each generation method
        by_item_formula="~ 1", # cannot fit anything bigger, because items are specific to generation method
        item_cols = ["sentence"],
        subj_cols = ["rater_id"],
        verb_cols = ["verb"],
        sense_cols = ["verb", "sense"],
        output_dir="fits/typ/base/",
        **sampler_params
    )

    results_typ.fit.diagnose()

    # handles a bug in cmdstanpy.from_csv that I suspect has to do with a version change in STAN
    # basically, I *think* STAN used to dump the save_warmup flag as an int, but now it uses a boolean
    !find . -type f -wholename './fits/typ/base/*.csv' | xargs sed -i 's/save_warmup = false/save_warmup = 0/g'

17:21:59 - cmdstanpy - INFO - created output directory: /home/jovyan/fits/typ/base
17:21:59 - cmdstanpy - INFO - CmdStan start processing


chain 1 |          | 00:00 Status

chain 2 |          | 00:00 Status

chain 3 |          | 00:00 Status

chain 4 |          | 00:00 Status

                                                                                                                                                                                                                                                                                                                                

21:26:01 - cmdstanpy - INFO - CmdStan done processing.
Exception: multi_normal_lpdf: LDLT_Factor of covariance parameter is not positive definite.  last conditional variance is 0. (in '/home/jovyan/scripts/analysis/models/ordered-beta.stan', line 163, column 4 to line 165, column 6)
	Exception: multi_normal_lpdf: LDLT_Factor of covariance parameter is not positive definite.  last conditional variance is 0. (in '/home/jovyan/scripts/analysis/models/ordered-beta.stan', line 163, column 4 to line 165, column 6)
	Exception: ordered_logistic: Final cut-point is inf, but must be finite! (in '/home/jovyan/scripts/analysis/models/ordered-beta.stan', line 169, column 4 to line 172, column 6)
	Exception: multi_normal_lpdf: LDLT_Factor of covariance parameter is not positive definite.  last conditional variance is 0. (in '/home/jovyan/scripts/analysis/models/ordered-beta.stan', line 139, column 4 to line 141, column 6)
	Exception: multi_normal_lpdf: LDLT_Factor of covariance parameter is not posi




In [29]:
# Display the fixed-effect coefficient summary (posterior mean, quantiles, p) for the base typicality fit.
results_typ.fixed_coefs

Unnamed: 0,post_mean,2.5%,5%,95%,97.5%,p
Intercept,1.815261,1.426413,1.483588,2.143694,2.205287,0.0
"C(generation_method, levels=generation_levels)[T.manual_natural_atypical]",-3.443556,-3.856891,-3.793083,-3.088313,-3.024366,0.0
"C(generation_method, levels=generation_levels)[T.manual_unnatural_typical]",-3.596408,-4.01811,-3.946522,-3.242247,-3.17306,0.0
"C(generation_method, levels=generation_levels)[T.manual_unnatural_atypical]",-4.310754,-4.729321,-4.660474,-3.962269,-3.892445,0.0
"C(generation_method, levels=generation_levels)[T.reddit]",-1.388818,-1.81013,-1.743691,-1.026215,-0.95233,0.0
"C(generation_method, levels=generation_levels)[T.llama-propbanksense]",-1.348166,-1.761532,-1.694527,-0.993306,-0.923644,0.0
"C(generation_method, levels=generation_levels)[T.llama-llamasense]",-1.478187,-1.886689,-1.8193,-1.12169,-1.059498,0.0


In [31]:
# Emit the coefficient summary as a LaTeX tabular, relabelling rows with `process_coef_name_nat_typ`.
print_fixed_coef_tabular(results_typ, process_coef_name_nat_typ)

\begin{tabular}{rcccr}
\toprule
      & \textbf{Post. mean} & \textbf{2.5\%} & \textbf{97.5\%} & \textbf{Post.} $p$ \\
\midrule
Intercept & 1.82 & 1.43 & 2.21 & $<$ 0.01 \\
Manual (Natural \& Atypical) & -3.44 & -3.86 & -3.02 & $<$ 0.01 \\
Manual (Unnatural \& Typical) & -3.60 & -4.02 & -3.17 & $<$ 0.01 \\
Manual (Unnatural \& Atypical) & -4.31 & -4.73 & -3.89 & $<$ 0.01 \\
Corpus & -1.39 & -1.81 & -0.95 & $<$ 0.01 \\
LM with PropBank senses & -1.35 & -1.76 & -0.92 & $<$ 0.01 \\
LM with LM senses & -1.48 & -1.89 & -1.06 & $<$ 0.01 \\
\bottomrule
\end{tabular}


In [None]:
# Report cutpoint statistics for the base typicality fit; the return value is deliberately discarded.
_ = cutpoint_stats(results_typ.fit)

### Predicting typicality from frequency

In Section 5.6 of the paper, we report regressions assessing whether our automated methods are frequency-sensitive with respect to typicality, finding that they are not. These regressions are conducted below. 

In [58]:
# Keep every row that is not manually generated, plus the manual sentences labelled
# natural and typical.
data_typ_freqsub = data_typ[
    (data_typ["generation_method_type"] != "manual")
    | (data_typ["generation_method"] == "manual_natural_typical")
]

data_typ_freqsub

Unnamed: 0,rater_id,sentence,rating,sense,verb,typicality,naturalness,sentence_type,surprisal,freq,verbnet_class,list_type,generation_method_type,generation_method_subtype,generation_method_subsubtype,generation_method,rating_z,surprisal_z,freq_z
60,56122d99-f13f-4674-a58a-5ece8f7973f6,The baby arranged the blocks.,50.0,arrange.01: build,arrange,typical,natural,calibration,151.75505,2289,build,llama-propbanksense,manual,manual,manual,manual_natural_typical,0.156486,0.361287,-0.273493
61,80e45189-916a-4f4a-9690-e83ef3ef5c2d,The baby arranged the blocks.,93.0,arrange.01: build,arrange,typical,natural,calibration,151.75505,2289,build,llama-propbanksense,manual,manual,manual,manual_natural_typical,0.733854,0.361287,-0.273493
62,58c5437e-7879-4200-9cf3-b1b892d47fd6,The baby arranged the blocks.,60.0,arrange.01: build,arrange,typical,natural,calibration,151.75505,2289,build,llama-propbanksense,manual,manual,manual,manual_natural_typical,0.385907,0.361287,-0.273493
63,6ad341a1-12e3-4a39-9ab0-77a32b7592bd,The baby arranged the blocks.,100.0,arrange.01: build,arrange,typical,natural,calibration,151.75505,2289,build,llama-propbanksense,manual,manual,manual,manual_natural_typical,0.951258,0.361287,-0.273493
64,0d04029d-6889-4a8f-a56c-c7f81cdb0b9a,The baby arranged the blocks.,80.0,arrange.01: build,arrange,typical,natural,calibration,151.75505,2289,build,llama-propbanksense,manual,manual,manual,manual_natural_typical,0.439619,0.361287,-0.273493
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6552,e3482363-c474-4acf-af4b-10c3bfdeba1f,The cook smashed the potatoes.,100.0,"smash.02: move violently, so as to break apart",smash,typical,natural,target,157.26427,2442,"throw,hit",manual,manual,manual,manual,manual_natural_typical,1.723989,0.574710,-0.173259
6556,e3482363-c474-4acf-af4b-10c3bfdeba1f,The couple split the check.,100.0,"split.01: divide, separate",split,typical,natural,target,128.58272,2218,break,manual,manual,manual,manual,manual_natural_typical,1.723989,-0.536394,-0.320007
6560,e3482363-c474-4acf-af4b-10c3bfdeba1f,The thief stole the painting.,100.0,steal.01: to commit theft,steal,typical,natural,target,104.60133,5133,steal,manual,manual,manual,manual,manual_natural_typical,1.723989,-1.465418,1.589690
6564,e3482363-c474-4acf-af4b-10c3bfdeba1f,The kid tore the paper.,100.0,"tear.01: to pull apart, separate forcefully",tear,typical,natural,target,157.81557,1030,hurt,manual,manual,manual,manual,manual_natural_typical,1.723989,0.596067,-1.098299


In [59]:
if glob("fits/typ/frequency/*.csv"):
    results_typ_freq = load_fit(
        "fits/typ/frequency/*.csv",
        data_typ_freqsub,
        fixed_formula="~ 1 + C(generation_method_subtype, levels=generation_subtype_levels) * freq_z",
        by_subj_formula="~ 1 + freq_z", # cannot fit anything bigger, because not all subjects saw items from every generation method
        by_verb_formula="~ 1", # cannot fit anything bigger, because not all verbs show up with each generation method
        by_sense_formula="~ 1", # cannot fit anything bigger, because not all verb senses show up with each generation method
        by_item_formula="~ 1", # cannot fit anything bigger, because items are specific to generation method
        item_cols = ["sentence"],
        subj_cols = ["rater_id"],
        verb_cols = ["verb"],
        sense_cols = ["verb", "sense"]
    )
else:
    results_typ_freq = fit_hmc(
        data_typ_freqsub,
        fixed_formula="~ 1 + C(generation_method_subtype, levels=generation_subtype_levels) * freq_z",
        by_subj_formula="~ 1 + freq_z", # cannot fit anything bigger, because not all subjects saw items from every generation method
        by_verb_formula="~ 1", # cannot fit anything bigger, because not all verbs show up with each generation method
        by_sense_formula="~ 1", # cannot fit anything bigger, because not all verb senses show up with each generation method
        by_item_formula="~ 1", # cannot fit anything bigger, because items are specific to generation method
        item_cols = ["sentence"],
        subj_cols = ["rater_id"],
        verb_cols = ["verb"],
        sense_cols = ["verb", "sense"],
        output_dir="fits/typ/frequency",
        **sampler_params
    )

    results_typ_freq.fit.diagnose()

    # handles a bug in cmdstanpy.from_csv that I suspect has to do with a version change in STAN
    # basically, I *think* STAN used to dump the save_warmup flag as an int, but now it uses a boolean
    !find . -type f -wholename './fits/typ/frequency/*.csv' | xargs sed -i 's/save_warmup = false/save_warmup = 0/g'

18:07:32 - cmdstanpy - INFO - created output directory: /home/jovyan/fits/typ/frequency
18:07:32 - cmdstanpy - INFO - CmdStan start processing


chain 1 |          | 00:00 Status

chain 2 |          | 00:00 Status

chain 3 |          | 00:00 Status

chain 4 |          | 00:00 Status

                                                                                                                                                                                                                                                                                                                                

20:41:57 - cmdstanpy - INFO - CmdStan done processing.
Exception: lkj_corr_lpdf: Correlation matrix is not positive definite. (in '/home/jovyan/scripts/analysis/models/ordered-beta.stan', line 121, column 2 to column 26)
	Exception: lkj_corr_lpdf: Correlation matrix is not positive definite. (in '/home/jovyan/scripts/analysis/models/ordered-beta.stan', line 121, column 2 to column 26)
	Exception: lkj_corr_lpdf: Correlation matrix is not positive definite. (in '/home/jovyan/scripts/analysis/models/ordered-beta.stan', line 121, column 2 to column 26)
	Exception: lkj_corr_lpdf: Correlation matrix is not positive definite. (in '/home/jovyan/scripts/analysis/models/ordered-beta.stan', line 121, column 2 to column 26)
	Exception: lkj_corr_lpdf: Correlation matrix is not positive definite. (in '/home/jovyan/scripts/analysis/models/ordered-beta.stan', line 121, column 2 to column 26)
	Exception: lkj_corr_lpdf: Correlation matrix is not positive definite. (in '/home/jovyan/scripts/analysis/mode




In [60]:
# Display the fixed-effect coefficient summary (posterior mean, quantiles, p) for the typicality frequency fit.
results_typ_freq.fixed_coefs

Unnamed: 0,post_mean,2.5%,5%,95%,97.5%,p
Intercept,1.714823,1.301704,1.369044,2.055504,2.117254,0.0
"C(generation_method_subtype, levels=generation_subtype_levels)[T.reddit]",-1.343744,-1.793905,-1.722022,-0.96398,-0.886188,0.0
"C(generation_method_subtype, levels=generation_subtype_levels)[T.llama]",-1.233091,-1.655909,-1.581553,-0.875249,-0.806489,0.0
freq_z,0.187281,-0.256607,-0.189873,0.561571,0.628587,0.2035
"C(generation_method_subtype, levels=generation_subtype_levels)[T.reddit]:freq_z",-0.112771,-0.585738,-0.513312,0.291118,0.363925,0.3222
"C(generation_method_subtype, levels=generation_subtype_levels)[T.llama]:freq_z",-0.020354,-0.484105,-0.407205,0.363044,0.439159,0.4643


In [61]:
# Emit the coefficient summary as a LaTeX tabular, relabelling rows with `process_coef_name_freq`.
print_fixed_coef_tabular(results_typ_freq, process_coef_name_freq)

\begin{tabular}{rcccr}
\toprule
      & \textbf{Post. mean} & \textbf{2.5\%} & \textbf{97.5\%} & \textbf{Post.} $p$ \\
\midrule
Intercept & 1.71 & 1.30 & 2.12 & $<$ 0.01 \\
Corpus & -1.34 & -1.79 & -0.89 & $<$ 0.01 \\
LM & -1.23 & -1.66 & -0.81 & $<$ 0.01 \\
Frequency & 0.19 & -0.26 & 0.63 & 0.20 \\
Corpus $\times$ Frequency & -0.11 & -0.59 & 0.36 & 0.32 \\
LM $\times$ Frequency & -0.02 & -0.48 & 0.44 & 0.46 \\
\bottomrule
\end{tabular}


In [62]:
# Print the cutpoint summaries for the typicality frequency fit; the return value is deliberately discarded.
_ = cutpoint_stats(results_typ_freq.fit)

Cutpoint 0: -2.23 (95\% CI = [-2.3, -2.16])
Cutpoint 1: 0.79 (95\% CI = [0.5, 1.11])


### Predicting naturalness from typicality

In Section 5.6 of the paper, we investigate the hypothesis that typicality plays a role in the degraded naturalness of the automatically generated sentences, finding that it does. These regressions are conducted below.

In [18]:
# Average the per-rater z-scored typicality ratings within each sentence, giving one
# `typicality_rating` per sentence.
# The aggregation is named by string: passing the NumPy callable (np.mean) to pandas
# aggregation is deprecated in recent pandas and emits a FutureWarning.
data_typ_mean = data_typ.pivot_table(
    index="sentence", values="rating_z", aggfunc="mean"
).rename(
    columns={"rating_z": "typicality_rating"}
).reset_index()

data_typ_mean

Unnamed: 0,sentence,typicality_rating
0,The ALA cast the muppets.,-0.548768
1,The Americans abused the Mexicans.,0.205448
2,The Americans beat the Japanese.,0.611064
3,The BBC covered the inauguration.,1.029906
4,The Baudelaires encountered the unknown.,-0.048643
...,...,...
1835,The youngest met the queen.,0.561288
1836,The youngster pinched the cat.,0.711491
1837,The youngster reached the age.,-0.121412
1838,The youngster saw the girl.,0.771561


In [19]:
# Attach each sentence's mean typicality rating to the naturalness ratings.
# - how="left" keeps every naturalness row; sentences without a typicality rating get NaN.
# - on="sentence" states the join key explicitly instead of relying on whichever column
#   names the two frames happen to share.
# - validate="many_to_one" raises if `data_typ_mean` ever holds more than one row per
#   sentence, which would otherwise silently duplicate naturalness rows.
data_nat_typ = pd.merge(
    data_nat,
    data_typ_mean,
    how="left",
    on="sentence",
    validate="many_to_one"
)

data_nat_typ

Unnamed: 0,rater_id,sentence,rating,sense,verb,typicality,naturalness,sentence_type,surprisal,freq,verbnet_class,list_type,generation_method_type,generation_method_subtype,generation_method_subsubtype,generation_method,surprisal_z,freq_z,typicality_rating
0,83bda277-9b4f-4c6e-a23b-a79ece20cc5c,The baby arranged the something.,50.0,arrange.01: build,arrange,typical,unnatural,calibration,167.027270,2289,build,llama-propbanksense,manual,manual,manual,manual_unnatural_typical,0.865457,-0.240193,0.087812
1,8bc64c84-3e28-4f83-abaf-e585ff40ffe5,The baby arranged the something.,0.0,arrange.01: build,arrange,typical,unnatural,calibration,167.027270,2289,build,llama-propbanksense,manual,manual,manual,manual_unnatural_typical,0.865457,-0.240193,0.087812
2,f09582f5-d39c-41eb-ab67-ff696f2c3459,The baby arranged the something.,22.0,arrange.01: build,arrange,typical,unnatural,calibration,167.027270,2289,build,llama-propbanksense,manual,manual,manual,manual_unnatural_typical,0.865457,-0.240193,0.087812
3,e41ede6f-b8a8-4424-a276-520d63d0af85,The baby arranged the something.,0.0,arrange.01: build,arrange,typical,unnatural,calibration,167.027270,2289,build,llama-propbanksense,manual,manual,manual,manual_unnatural_typical,0.865457,-0.240193,0.087812
4,03eaf938-1208-40ce-9b0b-bb58cc6048c9,The baby arranged the something.,0.0,arrange.01: build,arrange,typical,unnatural,calibration,167.027270,2289,build,llama-propbanksense,manual,manual,manual,manual_unnatural_typical,0.865457,-0.240193,0.087812
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24533,48938107-ae25-4ac5-af1f-f5552d1228a9,The after tore the brick.,25.0,"tear.01: to pull apart, separate forcefully",tear,atypical,unnatural,target,139.264510,1030,hurt,manual,manual,manual,manual,manual_unnatural_atypical,-0.201797,-1.088339,-0.753087
24534,48938107-ae25-4ac5-af1f-f5552d1228a9,The athlete threw the javelin.,100.0,"throw.01: throw, sending through the air, manu...",throw,typical,natural,target,160.484510,3679,throw,manual,manual,manual,manual,manual_natural_typical,0.613941,0.696203,0.995130
24535,48938107-ae25-4ac5-af1f-f5552d1228a9,The jellyfish threw the javelin.,92.0,"throw.01: throw, sending through the air, manu...",throw,atypical,natural,target,101.749570,3679,throw,manual,manual,manual,manual,manual_natural_atypical,-1.643943,0.696203,-0.633328
24536,48938107-ae25-4ac5-af1f-f5552d1228a9,The athlete threw the while.,38.0,"throw.01: throw, sending through the air, manu...",throw,typical,unnatural,target,169.375210,3679,throw,manual,manual,manual,manual,manual_unnatural_typical,0.955716,0.696203,-0.786691


In [20]:
if glob("fits/nat/typicality/*.csv"):
    results_nat_typ = load_fit(
        "fits/nat/typicality/*.csv", 
        data_nat_typ,
        fixed_formula="~ 1 + typicality_rating",
        by_subj_formula="~ 1 + typicality_rating",
        by_verb_formula="~ 1 + typicality_rating",
        by_sense_formula="~ 1 + typicality_rating", 
        by_item_formula="~ 1", # cannot fit anything bigger, because each sentence has only one typicality rating
        item_cols = ["sentence"],
        subj_cols = ["rater_id"],
        verb_cols = ["verb"],
        sense_cols = ["verb", "sense"],
    )
else:
    results_nat_typ = fit_hmc(
        data_nat_typ,
        fixed_formula="~ 1 + typicality_rating",
        by_subj_formula="~ 1 + typicality_rating",
        by_verb_formula="~ 1 + typicality_rating",
        by_sense_formula="~ 1 + typicality_rating", 
        by_item_formula="~ 1", # cannot fit anything bigger, because each sentence has only one typicality rating
        item_cols = ["sentence"],
        subj_cols = ["rater_id"],
        verb_cols = ["verb"],
        sense_cols = ["verb", "sense"],
        output_dir="fits/nat/typicality",
        **sampler_params
    )

    results_nat_typ.fit.diagnose()

    # handles a bug in cmdstanpy.from_csv that I suspect has to do with a version change in STAN
    # basically, I *think* STAN used to dump the save_warmup flag as an int, but now it uses a boolean
    !find . -type f -wholename './fits/nat/typicality/*.csv' | xargs sed -i 's/save_warmup = false/save_warmup = 0/g'

18:05:12 - cmdstanpy - INFO - created output directory: /home/jovyan/fits/nat/typicality
18:05:12 - cmdstanpy - INFO - CmdStan start processing


chain 1 |          | 00:00 Status

chain 2 |          | 00:00 Status

chain 3 |          | 00:00 Status

chain 4 |          | 00:00 Status

                                                                                                                                                                                                                                                                                                                                

23:20:15 - cmdstanpy - INFO - CmdStan done processing.
Exception: lkj_corr_lpdf: Correlation matrix is not positive definite. (in '/home/jovyan/scripts/analysis/models/ordered-beta.stan', line 121, column 2 to column 26)
	Exception: lkj_corr_lpdf: Correlation matrix is not positive definite. (in '/home/jovyan/scripts/analysis/models/ordered-beta.stan', line 121, column 2 to column 26)
	Exception: lkj_corr_lpdf: Correlation matrix is not positive definite. (in '/home/jovyan/scripts/analysis/models/ordered-beta.stan', line 121, column 2 to column 26)
	Exception: lkj_corr_lpdf: Correlation matrix is not positive definite. (in '/home/jovyan/scripts/analysis/models/ordered-beta.stan', line 121, column 2 to column 26)
	Exception: beta_proportion_lpdf: Precision parameter is 0, but must be positive finite! (in '/home/jovyan/scripts/analysis/models/ordered-beta.stan', line 176, column 6 to line 179, column 8)
	Exception: lkj_corr_lpdf: Correlation matrix is not positive definite. (in '/home/jo




In [21]:
# Display the fixed-effect coefficient summary (posterior mean, quantiles, p) for the naturalness ~ typicality fit.
results_nat_typ.fixed_coefs

Unnamed: 0,post_mean,2.5%,5%,95%,97.5%,p
Intercept,1.051758,0.924804,0.94659,1.153381,1.17446,0.0
typicality_rating,1.467693,1.32742,1.349403,1.586463,1.609221,0.0


In [34]:
# Pull the second row of the coefficient table (the row after Intercept, i.e. the
# typicality_rating slope in the table displayed above) and round for the in-text report.
# .iloc makes the positional lookup explicit: integer keys passed to [] on a
# label-indexed Series are deprecated in recent pandas.
post_mean = np.round(results_nat_typ.fixed_coefs["post_mean"].iloc[1], 2)
cilo = np.round(results_nat_typ.fixed_coefs["2.5%"].iloc[1], 2)
cihi = np.round(results_nat_typ.fixed_coefs["97.5%"].iloc[1], 2)

# LaTeX snippet reporting the slope and its 95% credible interval.
print(f"($\\beta=${post_mean}, 95\\% CrI=[{cilo}, {cihi}])")

($\beta=$1.47, 95\% CrI=[1.33, 1.61])


## Experiment 3: Distinctiveness

In [35]:
diff_fnames = [
    "difference_llama-propbanksenses.csv",
    "difference_llama-llamasenses.csv",
    "difference_reddit.csv",
    "difference_manual.csv"
]

# One annotated frame per file. Kept under its own name so that `data_diff` only ever
# refers to the combined DataFrame.
diff_frames = []

for fname in diff_fnames:
    df = pd.read_csv(f"data/{fname}")

    # NOTE: str.strip(".csv") removes any run of the characters ".", "c", "s", "v" from
    # both ends rather than the literal ".csv" suffix, so the trailing "s" of "...senses"
    # is removed as well (giving e.g. "llama-propbanksense"). Left unchanged so the
    # resulting labels stay the same as before.
    generation_method = fname.replace("difference_", "").strip(".csv")

    if generation_method == "manual":
        # Every level of the generation-method hierarchy is "manual" for this list.
        df["list_type"] = "manual"
        df["generation_method_type"] = "manual"
        df["generation_method_subtype"] = "manual"
        df["generation_method_subsubtype"] = "manual"
        # .copy() detaches the filtered rows from the frame read from disk, so the column
        # assignment below does not raise pandas' SettingWithCopyWarning.
        df = df.query("pair_type == 'target'").copy()
        df["generation_method"] = "manual"
    else:
        # Calibration pairs inside an automated list are labelled "manual" at every level;
        # all other pairs are labelled with the list's generation method.
        df["list_type"] = generation_method
        df["generation_method_type"] = df.pair_type.map(lambda x: "manual" if x == "calibration" else "automated")
        df["generation_method_subtype"] = df.pair_type.map(
            lambda x: "manual" if x == "calibration" else generation_method.split("-")[0]
        )
        df["generation_method_subsubtype"] = df.pair_type.map(
            lambda x: "manual" if x == "calibration" else generation_method
        )
        df["generation_method"] = df["generation_method_subsubtype"]

    diff_frames.append(df)

data_diff = pd.concat(diff_frames)

# z-score the surprisal of each sentence in the pair, and the frequency, over the whole dataset.
data_diff["surprisal1_z"] = (data_diff.surprisal1 - data_diff.surprisal1.mean())/data_diff.surprisal1.std()
data_diff["surprisal2_z"] = (data_diff.surprisal2 - data_diff.surprisal2.mean())/data_diff.surprisal2.std()

data_diff["freq_z"] = (data_diff.freq - data_diff.freq.mean())/data_diff.freq.std()

# log(1 + freq), with missing frequencies treated as 0, then z-scored.
data_diff["logfreq"] = np.log1p(data_diff["freq"].fillna(0))
data_diff["logfreq_z"] = (data_diff.logfreq - data_diff.logfreq.mean())/data_diff.logfreq.std()

if data_diff.sense1.isnull().any() or data_diff.sense2.isnull().any():
    # If any sense label is missing, BOTH sense columns are overwritten for ALL rows with
    # random integer labels (as strings) drawn from 0-99.
    # NOTE(review): the draw is unseeded, so these labels differ between runs — confirm
    # this is acceptable for fits that are cached on disk.
    print("Inserting random dummy senses.")
    data_diff["sense1"] = np.random.choice(100, size=data_diff.shape[0]).astype(str)
    data_diff["sense2"] = np.random.choice(100, size=data_diff.shape[0]).astype(str)

# NOTE(review): the guard tests `comparison == "filler"` while the row selection below
# tests `pair_type == "filler"` — confirm both columns flag the same rows.
if (data_diff.comparison == "filler").any():
    data_diff_automated_calibration = data_diff[data_diff.pair_type=="filler"].drop(columns=["comparison"])

    # Recover the comparison label of each filler pair from the manual list by matching on
    # the sentence pair. This is an inner join, so filler pairs without a matching manual
    # pair are dropped.
    data_diff_automated_calibration = pd.merge(
        data_diff_automated_calibration,
        data_diff[data_diff.generation_method=="manual"][["sentence1", "sentence2", "comparison"]].drop_duplicates(),
        on=["sentence1", "sentence2"]
    )

    data_diff = pd.concat(
        [data_diff[data_diff.pair_type != "filler"], data_diff_automated_calibration],
        axis=0
    )

data_diff

Unnamed: 0,rater_id,pair_type,comparison,verbnet_class,verb,sense1,sense2,sentence1,sentence2,surprisal1,...,list_type,generation_method_type,generation_method_subtype,generation_method_subsubtype,generation_method,surprisal1_z,surprisal2_z,freq_z,logfreq,logfreq_z
0,5756d4c8-c91e-4800-8c0c-e7bdb3f7cf8b,calibration,same,fire,dismiss,"dismiss.01: remove, release (abstract)","dismiss.01: remove, release (abstract)",The company dismissed the worker.,The teacher dismissed the class.,170.07234,...,llama-propbanksense,manual,manual,manual,manual,1.334127,-1.528673,0.594319,8.233503,0.559515
1,8ca93e95-9546-4060-a947-60ad0376f5b3,calibration,same,fire,dismiss,"dismiss.01: remove, release (abstract)","dismiss.01: remove, release (abstract)",The company dismissed the worker.,The teacher dismissed the class.,170.07234,...,llama-propbanksense,manual,manual,manual,manual,1.334127,-1.528673,0.594319,8.233503,0.559515
2,10e1fad2-1604-4f2c-8386-fa6b866e0fc0,calibration,same,fire,dismiss,"dismiss.01: remove, release (abstract)","dismiss.01: remove, release (abstract)",The company dismissed the worker.,The teacher dismissed the class.,170.07234,...,llama-propbanksense,manual,manual,manual,manual,1.334127,-1.528673,0.594319,8.233503,0.559515
3,3914e1b3-b31c-45c7-9e21-35a89d01491a,calibration,same,fire,dismiss,"dismiss.01: remove, release (abstract)","dismiss.01: remove, release (abstract)",The company dismissed the worker.,The teacher dismissed the class.,170.07234,...,llama-propbanksense,manual,manual,manual,manual,1.334127,-1.528673,0.594319,8.233503,0.559515
4,342cd4af-6828-4a6a-9cb3-5ac47327a434,calibration,same,fire,dismiss,"dismiss.01: remove, release (abstract)","dismiss.01: remove, release (abstract)",The company dismissed the worker.,The teacher dismissed the class.,170.07234,...,llama-propbanksense,manual,manual,manual,manual,1.334127,-1.528673,0.594319,8.233503,0.559515
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2537,c4df03a8-6960-40cf-b460-725db767f891,target,different,steal,steal,steal.01: to commit theft (physical),steal.01: to commit theft (abstract),The thief stole the painting.,The dancer stole the show.,104.60133,...,manual,manual,manual,manual,manual,-1.205378,-1.044352,1.468287,8.543640,0.838846
2538,c4df03a8-6960-40cf-b460-725db767f891,target,same,hurt,tear,"tear.01: to pull apart, separate forcefully (p...","tear.01: to pull apart, separate forcefully (p...",The kid tore the paper.,The nail tore the fabric.,157.81557,...,manual,manual,manual,manual,manual,0.858709,0.794185,-1.151063,6.938284,-0.607050
2539,c4df03a8-6960-40cf-b460-725db767f891,target,different,hurt,tear,"tear.01: to pull apart, separate forcefully (p...","tear.01: to pull apart, separate forcefully (a...",The kid tore the paper.,The poem tore the heartstrings.,157.81557,...,manual,manual,manual,manual,manual,0.858709,0.659621,-1.151063,6.938284,-0.607050
2540,c4df03a8-6960-40cf-b460-725db767f891,target,same,throw,throw,"throw.01: throw, sending through the air, manu...","throw.01: throw, sending through the air, manu...",The athlete threw the javelin.,The referee threw the flag.,160.48451,...,manual,manual,manual,manual,manual,0.962232,0.020273,0.540055,8.210668,0.538948


In [36]:
# Expected number of rows per participant, keyed by list type.
# presumably filter_participants uses these to drop raters whose row count
# does not match their list — TODO confirm against its definition.
expected_counts_diff = dict([
    ("llama-llamasense", 68),
    ("llama-propbanksense", 68),
    ("manual", 62),
    ("reddit", 68),
])

data_diff = filter_participants(data_diff, expected_counts_diff)

data_diff

Unnamed: 0,rater_id,pair_type,comparison,verbnet_class,verb,sense1,sense2,sentence1,sentence2,surprisal1,...,list_type,generation_method_type,generation_method_subtype,generation_method_subsubtype,generation_method,surprisal1_z,surprisal2_z,freq_z,logfreq,logfreq_z
0,5756d4c8-c91e-4800-8c0c-e7bdb3f7cf8b,calibration,same,fire,dismiss,"dismiss.01: remove, release (abstract)","dismiss.01: remove, release (abstract)",The company dismissed the worker.,The teacher dismissed the class.,170.07234,...,llama-propbanksense,manual,manual,manual,manual,1.334127,-1.528673,0.594319,8.233503,0.559515
1,8ca93e95-9546-4060-a947-60ad0376f5b3,calibration,same,fire,dismiss,"dismiss.01: remove, release (abstract)","dismiss.01: remove, release (abstract)",The company dismissed the worker.,The teacher dismissed the class.,170.07234,...,llama-propbanksense,manual,manual,manual,manual,1.334127,-1.528673,0.594319,8.233503,0.559515
2,10e1fad2-1604-4f2c-8386-fa6b866e0fc0,calibration,same,fire,dismiss,"dismiss.01: remove, release (abstract)","dismiss.01: remove, release (abstract)",The company dismissed the worker.,The teacher dismissed the class.,170.07234,...,llama-propbanksense,manual,manual,manual,manual,1.334127,-1.528673,0.594319,8.233503,0.559515
3,3914e1b3-b31c-45c7-9e21-35a89d01491a,calibration,same,fire,dismiss,"dismiss.01: remove, release (abstract)","dismiss.01: remove, release (abstract)",The company dismissed the worker.,The teacher dismissed the class.,170.07234,...,llama-propbanksense,manual,manual,manual,manual,1.334127,-1.528673,0.594319,8.233503,0.559515
4,342cd4af-6828-4a6a-9cb3-5ac47327a434,calibration,same,fire,dismiss,"dismiss.01: remove, release (abstract)","dismiss.01: remove, release (abstract)",The company dismissed the worker.,The teacher dismissed the class.,170.07234,...,llama-propbanksense,manual,manual,manual,manual,1.334127,-1.528673,0.594319,8.233503,0.559515
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2537,c4df03a8-6960-40cf-b460-725db767f891,target,different,steal,steal,steal.01: to commit theft (physical),steal.01: to commit theft (abstract),The thief stole the painting.,The dancer stole the show.,104.60133,...,manual,manual,manual,manual,manual,-1.205378,-1.044352,1.468287,8.543640,0.838846
2538,c4df03a8-6960-40cf-b460-725db767f891,target,same,hurt,tear,"tear.01: to pull apart, separate forcefully (p...","tear.01: to pull apart, separate forcefully (p...",The kid tore the paper.,The nail tore the fabric.,157.81557,...,manual,manual,manual,manual,manual,0.858709,0.794185,-1.151063,6.938284,-0.607050
2539,c4df03a8-6960-40cf-b460-725db767f891,target,different,hurt,tear,"tear.01: to pull apart, separate forcefully (p...","tear.01: to pull apart, separate forcefully (a...",The kid tore the paper.,The poem tore the heartstrings.,157.81557,...,manual,manual,manual,manual,manual,0.858709,0.659621,-1.151063,6.938284,-0.607050
2540,c4df03a8-6960-40cf-b460-725db767f891,target,same,throw,throw,"throw.01: throw, sending through the air, manu...","throw.01: throw, sending through the air, manu...",The athlete threw the javelin.,The referee threw the flag.,160.48451,...,manual,manual,manual,manual,manual,0.962232,0.020273,0.540055,8.210668,0.538948


In [37]:
# Level orderings passed to patsy's C(..., levels=...) in the model formulas.
# The first entry of each list is the level that does not get its own
# coefficient in the fitted model's coefficient table (the reference level).
generation_levels = ["manual", "reddit", "llama-propbanksense", "llama-llamasense"]
comparison_levels = ["same", "different"]

In [38]:
if glob("fits/pairdiff/base/*.csv"):
    results_diff = load_fit(
        "fits/pairdiff/base/*.csv", 
        data_diff,
        fixed_formula="~ 1 + C(comparison, levels=comparison_levels) * C(generation_method, levels=generation_levels)",
        by_subj_formula="~ 1 + C(comparison, levels=comparison_levels)",
        by_verb_formula="~ 1 + C(comparison, levels=comparison_levels)",
        by_sense_formula="~ 1", 
        by_item_formula="~ 1",
        item_cols=["sentence1", "sentence2"],
        sense_cols = ["verb", "sense1", "sense2"],
        subj_cols=["rater_id"],
        verb_cols = ["verb"],
    )
else:
    results_diff = fit_hmc(
        data_diff,
        fixed_formula="~ 1 + C(comparison, levels=comparison_levels) * C(generation_method, levels=generation_levels)",
        by_subj_formula="~ 1 + C(comparison, levels=comparison_levels)",
        by_verb_formula="~ 1 + C(comparison, levels=comparison_levels)",
        by_sense_formula="~ 1", 
        by_item_formula="~ 1",
        item_cols=["sentence1", "sentence2"],
        sense_cols = ["verb", "sense1", "sense2"],
        subj_cols=["rater_id"],
        verb_cols = ["verb"],
        output_dir="fits/pairdiff/base/",
        **sampler_params
    )

    results_diff.fit.diagnose()

    # handles a bug in cmdstanpy.from_csv that I suspect has to do with a version change in STAN
    # basically, I *think* STAN used to dump the save_warmup flag as an int, but now it uses a boolean
    !find . -type f -wholename './fits/pairdiff/base/*.csv' | xargs sed -i 's/save_warmup = false/save_warmup = 0/g'



13:33:51 - cmdstanpy - INFO - created output directory: /home/jovyan/fits/pairdiff/base
13:33:51 - cmdstanpy - INFO - CmdStan start processing


chain 1 |          | 00:00 Status

chain 2 |          | 00:00 Status

chain 3 |          | 00:00 Status

chain 4 |          | 00:00 Status

                                                                                                                                                                                                                                                                                                                                

15:10:37 - cmdstanpy - INFO - CmdStan done processing.
Exception: lkj_corr_lpdf: Correlation matrix is not positive definite. (in '/home/jovyan/scripts/analysis/models/ordered-beta.stan', line 121, column 2 to column 26)
	Exception: lkj_corr_lpdf: Correlation matrix is not positive definite. (in '/home/jovyan/scripts/analysis/models/ordered-beta.stan', line 121, column 2 to column 26)
	Exception: lkj_corr_lpdf: Correlation matrix is not positive definite. (in '/home/jovyan/scripts/analysis/models/ordered-beta.stan', line 133, column 2 to column 26)
	Exception: lkj_corr_lpdf: Correlation matrix is not positive definite. (in '/home/jovyan/scripts/analysis/models/ordered-beta.stan', line 133, column 2 to column 26)
	Exception: lkj_corr_lpdf: Correlation matrix is not positive definite. (in '/home/jovyan/scripts/analysis/models/ordered-beta.stan', line 133, column 2 to column 26)
	Exception: lkj_corr_lpdf: Correlation matrix is not positive definite. (in '/home/jovyan/scripts/analysis/mode




In [39]:
# Fixed-effect summary table of the base pair-difference fit (shown below).
results_diff.fixed_coefs

Unnamed: 0,post_mean,2.5%,5%,95%,97.5%,p
Intercept,-1.483902,-1.890388,-1.825255,-1.138627,-1.077533,0.0
"C(comparison, levels=comparison_levels)[T.different]",1.912189,1.404263,1.486103,2.346952,2.429082,0.0
"C(generation_method, levels=generation_levels)[T.reddit]",0.886713,0.429008,0.498029,1.26615,1.33609,0.0
"C(generation_method, levels=generation_levels)[T.llama-propbanksense]",-0.087111,-0.552729,-0.478299,0.294963,0.36463,0.3582
"C(generation_method, levels=generation_levels)[T.llama-llamasense]",0.123328,-0.304732,-0.240279,0.477559,0.54356,0.2833
"C(comparison, levels=comparison_levels)[T.different]:C(generation_method, levels=generation_levels)[T.reddit]",-1.135977,-1.818265,-1.706411,-0.576955,-0.469768,0.0005
"C(comparison, levels=comparison_levels)[T.different]:C(generation_method, levels=generation_levels)[T.llama-propbanksense]",-0.286722,-0.897247,-0.810152,0.231456,0.326751,0.1833
"C(comparison, levels=comparison_levels)[T.different]:C(generation_method, levels=generation_levels)[T.llama-llamasense]",-0.867139,-1.442781,-1.352772,-0.383299,-0.296597,0.0012


In [42]:
def process_coef_name_pairdiff(coef: str) -> str:
    """Convert a patsy coefficient name into a LaTeX-ready row label.

    Each ``C(<factor>, ...)[T.<level>]`` term in ``coef`` is replaced by the
    display name of ``<level>``; the terms of an interaction are joined with
    ``$\\times$``.

    Parameters
    ----------
    coef
        Coefficient name as produced by patsy, e.g.
        ``"C(comparison, levels=comparison_levels)[T.different]"``.

    Returns
    -------
    str
        ``"Intercept"`` unchanged; the joined display names for categorical
        terms; or ``coef`` itself when it contains no categorical term, so the
        row keeps a label instead of becoming blank.

    Raises
    ------
    KeyError
        If a categorical level has no entry in the display-name map.
    """
    coef_map = {
        "manual": "Manual",
        "reddit": "Corpus",
        "llama-propbanksense": "LM with PropBank senses",
        "llama-llamasense": "LM with LM senses",
        "different": "Different Sense",
        "same": "Same Sense"
    }

    if coef == "Intercept":
        return coef

    # Raw string: "\(", "\[" and "\." are not valid Python string escapes, and
    # newer Python versions warn about them in a plain string literal.
    levels = re.findall(r"C\(.*?\)\[T\.(.*?)\]", coef)

    if not levels:
        # No treatment-coded term in the name (e.g. a continuous covariate).
        return coef

    return " $\\times$ ".join(coef_map[level] for level in levels)

# Emit the fixed-effect summary as a LaTeX tabular, labelling rows with process_coef_name_pairdiff.
print_fixed_coef_tabular(results_diff, process_coef_name_pairdiff)

\begin{tabular}{rcccr}
\toprule
      & \textbf{Post. mean} & \textbf{2.5\%} & \textbf{97.5\%} & \textbf{Post.} $p$ \\
\midrule
Intercept & -1.48 & -1.89 & -1.08 & $<$ 0.01 \\
Different Sense & 1.91 & 1.40 & 2.43 & $<$ 0.01 \\
Corpus & 0.89 & 0.43 & 1.34 & $<$ 0.01 \\
LM with PropBank senses & -0.09 & -0.55 & 0.36 & 0.36 \\
LM with LM senses & 0.12 & -0.30 & 0.54 & 0.28 \\
Different Sense $\times$ Corpus & -1.14 & -1.82 & -0.47 & $<$ 0.01 \\
Different Sense $\times$ LM with PropBank senses & -0.29 & -0.90 & 0.33 & 0.18 \\
Different Sense $\times$ LM with LM senses & -0.87 & -1.44 & -0.30 & 0.00 \\
\bottomrule
\end{tabular}


In [54]:
# Posterior draws of the fixed effects, fetched once.
# NOTE(review): the column indices below assume the draws are ordered like the rows
# of results_diff.fixed_coefs (0 = intercept, 1 = different sense, 2 = corpus,
# 5 = different sense x corpus) — confirm against the design matrix.
fixed_coef_draws = results_diff.fit.stan_variable("fixed_coefs")

intercept = fixed_coef_draws[:, 0]
different_effect = fixed_coef_draws[:, 1]
corpus_effect = fixed_coef_draws[:, 2]
corpus_different_effect = fixed_coef_draws[:, 5]

# Linear predictor for different-sense pairs under each generation method.
manual_different = intercept + different_effect
corpus_different = manual_different + corpus_effect + corpus_different_effect

# Share of draws in which corpus different-sense pairs fall below manual ones.
np.mean((corpus_different - manual_different) < 0)

0.8001

In [56]:
# Fixed-effect draws for the two LM-based generation methods.
# NOTE(review): indices 3/4 (main effects) and 6/7 (interactions with different
# sense) assume the same ordering as the rows of results_diff.fixed_coefs — confirm.
lm_coef_draws = results_diff.fit.stan_variable("fixed_coefs")

pbsense_effect = lm_coef_draws[:, 3]
lmsense_effect = lm_coef_draws[:, 4]
pbsense_different_effect = lm_coef_draws[:, 6]
lmsense_different_effect = lm_coef_draws[:, 7]

# Shared part of the different-sense linear predictor (reference generation method).
different_baseline = intercept + different_effect

pbsense_different = different_baseline + pbsense_effect + pbsense_different_effect
lmsense_different = different_baseline + lmsense_effect + lmsense_different_effect

# Share of draws in which LM-sense different pairs fall below PropBank-sense ones.
np.mean((lmsense_different - pbsense_different) < 0)

0.9947

In [63]:
# Share of draws in which PropBank-sense different pairs fall below manual ones.
np.mean((pbsense_different - manual_different) < 0)

0.9172

In [None]:
# Share of draws in which corpus different pairs fall below PropBank-sense ones.
np.mean((corpus_different - pbsense_different) < 0)

In [43]:
# Report the cutpoint summaries for the base pair-difference fit; the return value is discarded.
_ = cutpoint_stats(results_diff.fit)

Cutpoint 0: -2.51 (95\% CI = [-2.58, -2.43])
Cutpoint 1: 1.12 (95\% CI = [0.52, 1.83])


In [None]:
# Second cutpoint = first cutpoint + exp(interval_size_logmean), computed per draw.
# NOTE(review): neither `cutpoint0_pairdiff` nor `fit_pairdiff` is defined in the
# visible cells of this section (the fit is held in `results_diff`), so this
# unexecuted cell likely fails on a fresh kernel; cutpoint_stats already reports
# both cutpoints — confirm whether this cell is still needed.
cutpoint1_pairdiff = cutpoint0_pairdiff + np.exp(fit_pairdiff["fit"].stan_variable("interval_size_logmean"))

# Mean over axis 0 and the 2.5% / 97.5% quantiles, rounded to two decimals.
print(
    np.round(cutpoint1_pairdiff.mean(0), 2), 
    f"[{np.round(np.quantile(cutpoint1_pairdiff, 0.025), 2)}, {np.round(np.quantile(cutpoint1_pairdiff, 0.975), 2)}]"
)

In [None]:
# Posterior densities of the fixed effects of the base pair-difference fit.
# The fit in this section is held in `results_diff` and accessed by attribute;
# `fit_pairdiff["fit"]` is not defined in the visible cells.
_ = arviz.plot_posterior(results_diff.fit, var_names=["fixed_coefs"])

In [None]:
# Posterior densities of the subject/item/verb scale parameters of the base fit.
# The fit in this section is held in `results_diff` and accessed by attribute;
# `fit_pairdiff["fit"]` is not defined in the visible cells.
_ = arviz.plot_posterior(results_diff.fit, var_names=["subj_scale", "item_scale", "verb_scale"])

In [None]:

# Posterior densities of the sample_size and interval_size parameters of the base fit.
# The fit in this section is held in `results_diff` and accessed by attribute;
# `fit_pairdiff["fit"]` is not defined in the visible cells.
_ = arviz.plot_posterior(results_diff.fit, var_names=["sample_size", "interval_size"])

In [None]:
# Follow-up pair-difference model: comparison (same/different sense) interacted with
# the two sentences' standardized surprisals, their typicalities, and verb frequency.
# The factor is `comparison`, as in the base model: `comparison_levels` lists its
# values, and no `PairType` column appears among the displayed `data_diff` columns.
# NOTE(review): `freq` enters unstandardized although `freq_z` / `logfreq_z` are
# computed during preprocessing — confirm which scale is intended.
fit_pairdiff_surprisal = fit_hmc(
    data_diff,
    fixed_formula=("~ 1 + C(comparison, levels=comparison_levels) * surprisal1_z * surprisal2_z"
                      " + C(comparison, levels=comparison_levels) * typicality1 * typicality2"
                      " + C(comparison, levels=comparison_levels) * freq"),
    by_subj_formula="~ 1 + C(comparison, levels=comparison_levels)",
    by_verb_formula="~ 1 + C(comparison, levels=comparison_levels)",
    by_sense_formula="~ 1",
    by_item_formula="~ 1",
    max_init_iter=0,
    item_cols=["sentence1", "sentence2"],
    sense_cols = ["verb", "sense1", "sense2"],
    subj_cols=["rater_id"],
    verb_cols = ["verb"],
    output_dir="fits/pairdiff/",
    **sampler_params
)

In [None]:
# Fixed-effect summary of the surprisal model. The object returned by fit_hmc is
# read by attribute elsewhere in this notebook (results_diff.fixed_coefs), so use
# the same access here instead of dict-style indexing.
fit_pairdiff_surprisal.fixed_coefs

In [None]:
# Emit the surprisal model's fixed-effect summary as a LaTeX tabular.
# NOTE(review): `process_coef_name_pairdiff_surprisal` is not defined in the visible
# cells — confirm it exists before this cell runs.
print_fixed_coef_tabular(fit_pairdiff_surprisal, process_coef_name_pairdiff_surprisal)

In [None]:
# Posterior densities of the surprisal model's fixed effects. The fit_hmc result
# exposes the fit as the `.fit` attribute (as used for results_diff.fit), not via
# dict-style indexing.
_ = arviz.plot_posterior(fit_pairdiff_surprisal.fit, var_names=["fixed_coefs"])

In [None]:
# Posterior densities of the surprisal model's subject/item/verb scale parameters.
# The fit_hmc result exposes the fit as the `.fit` attribute (as used for
# results_diff.fit), not via dict-style indexing.
_ = arviz.plot_posterior(fit_pairdiff_surprisal.fit, var_names=["subj_scale", "item_scale", "verb_scale"])

In [None]:

# Posterior densities of the surprisal model's sample_size and interval_size parameters.
# The fit_hmc result exposes the fit as the `.fit` attribute (as used for
# results_diff.fit), not via dict-style indexing.
_ = arviz.plot_posterior(fit_pairdiff_surprisal.fit, var_names=["sample_size", "interval_size"])