# Setup

In [1]:
!pip install inflect
!pip install -U kaleido
!pip install openai

Collecting inflect
  Using cached inflect-7.0.0-py3-none-any.whl (34 kB)
Collecting pydantic>=1.9.1
  Using cached pydantic-2.3.0-py3-none-any.whl (374 kB)
Collecting annotated-types>=0.4.0
  Using cached annotated_types-0.5.0-py3-none-any.whl (11 kB)
Collecting pydantic-core==2.6.3
  Downloading pydantic_core-2.6.3-cp311-cp311-macosx_11_0_arm64.whl (1.6 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m0m
[?25hCollecting typing-extensions
  Using cached typing_extensions-4.8.0-py3-none-any.whl (31 kB)
Installing collected packages: typing-extensions, annotated-types, pydantic-core, pydantic, inflect
  Attempting uninstall: typing-extensions
    Found existing installation: typing_extensions 4.5.0
    Uninstalling typing_extensions-4.5.0:
      Successfully uninstalled typing_extensions-4.5.0
Successfully installed annotated-types-0.5.0 inflect-7.0.0 pydantic-2.3.0 

In [8]:
import numpy as np
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import inflect
import os
import json
import kaleido
import tqdm
import time
import random
import math

from dataclasses import dataclass
from typing import Tuple, Dict, Callable
from abc import ABC, abstractmethod
from collections import defaultdict, Counter

# Seed both NumPy's global RNG and the stdlib `random` module so that the
# random sampling done later in the notebook is repeatable.
np.random.seed(1)
random.seed(1)

# Shared inflect engine; used throughout the notebook to pluralise category names.
ie = inflect.engine()

#### Helper functions

In [4]:
def save_map(m: object, filename: str) -> None:
    """Write `m` to `filename` as JSON, replacing any existing file content.

    Args:
        m: Any JSON-serialisable object.
        filename: Path of the file to (over)write.
    """
    with open(filename, mode='w') as sink:
        json.dump(obj=m, fp=sink)

def load_map(filename: str) -> object:
    """Read the JSON document stored in `filename` and return the parsed object.

    Args:
        filename: Path of a JSON file, e.g. one written by `save_map`.

    Returns:
        Whatever `json.load` produces for the file (dict, list, str, number, ...).
        Note that JSON object keys always come back as strings.
    """
    with open(filename, 'r') as f:
        return json.load(f)

def get_similarity(a: str, b: str, sim_map: dict) -> float:
    """Look up the similarity of `a` and `b` in a nested `{x: {y: value}}` map.

    The pair is tried as (a, b) first and then as (b, a), so the map only needs
    to store one direction. Returns 0.0 when neither direction is present.
    """
    for outer, inner in ((a, b), (b, a)):
        if outer in sim_map and inner in sim_map[outer]:
            return sim_map[outer][inner]
    return 0.0

def scm(premises: list, c: str, all_c: list, sim_map: dict, specific: bool = True, alpha: float = 0.5) -> float:
    """Score an argument as a weighted sum of two max-similarity terms.

    For a set of target categories, the helper below takes, per target, the
    largest similarity between that target and any non-empty premise, and then
    averages those maxima over the targets.

    Args:
        premises: Premise category names; falsy entries are ignored.
        c: The conclusion category.
        all_c: All categories over which the second term is averaged.
        sim_map: Nested similarity map understood by `get_similarity`.
        specific: When True the first term uses only `c` as its target; when
            False it uses `all_c`, making both terms identical.
        alpha: Weight of the first term; the second term gets `1 - alpha`.

    Returns:
        `alpha * a + (1 - alpha) * b`.
    """

    def mean_of_best_matches(targets: list) -> float:
        # Average, over the targets, of the best premise-to-target similarity.
        best_per_target = []
        for target in targets:
            best_per_target.append(
                np.max([get_similarity(p, target, sim_map) for p in premises if p])
            )
        return np.mean(best_per_target)

    # First term: the conclusion alone (specific) or every category (general).
    a = mean_of_best_matches([c] if specific else all_c)

    # Second term: always computed over every category.
    b = mean_of_best_matches(all_c)

    return alpha*a + (1-alpha)*b

def pluralise_category(category):
    """Return the display form of a category name.

    Domain-level names (e.g. "Mammals") become "All <lowercased name>"; every
    other name is pluralised with the shared inflect engine and re-capitalised.
    """
    domain_level_names = ("Mammals", "Vehicles", "Birds", "Animals", "Things", "Objects")
    if category in domain_level_names:
        return "All " + category.lower()
    return ie.plural(category).lower().capitalize()


#### Configs

In [5]:
# Base directory of the setup data, relative to the notebook's working directory.
ROOT = "../data/setup"

# Raw inputs are read from RAW_DATA; derived JSON maps are written to / read from PROCESSED_DATA.
RAW_DATA = f"{ROOT}/raw"
PROCESSED_DATA = f"{ROOT}/processed"

# Not referenced in the cells visible here — presumably consumed when trials are batched (TODO confirm).
NUM_PARTICIPANTS_PER_TRIAL = 10

# Not referenced in the cells visible here — presumably the size of a single-premise batch (TODO confirm).
NUM_SINGLE_PREMISE_ARGS_PER_BATCH = 24

# Control items for Experiment 1. Each key is a pair of arguments, and each
# argument is a (premises tuple, conclusion) tuple.
# NOTE(review): the meaning of the boolean is not visible in this part of the
# notebook — presumably whether the second argument is the expected stronger one; confirm.
EXPERIMENT_1_CONTROLS = {
    ((('Hippos',), 'Chickens'), (('Hippos',), 'Rhinos')): True,
    ((('Lions',), 'Penguins'), (('Lions',), 'Tigers')): True,
    ((('Mopeds',), 'Motorbikes'), (('Mopeds',), 'Screwdrivers')): False,
    ((('Robins',), 'Gorillas'), (('Robins',), 'Sparrows')): True
}
# Single-premise control arguments for Experiment 2, keyed by argument type
# ("General"/"Specific") and then by domain. Each entry is a (premise, conclusion) pair.
EXPERIMENT_2_SINGLE_PREMISE_CONTROLS = {
  "General": {
    "Mammals": [("All animals", "All mammals"), ("All living things", "All mammals"), ("All plants and animals", "All mammals"), ("All things that are alive", "All mammals")],
    "Birds": [("All animals", "All birds"), ("All living things", "All birds"), ("All plants and animals", "All birds"), ("All things that are alive", "All birds")],
    "Vehicles": [("All man-made things", "All vehicles"), ("All things", "All vehicles"), ("All things that can move", "All vehicles"), ("All things that can carry people", "All vehicles")],
  },
  "Specific": {
    "Mammals": [("Dogs", "Canines"), ("Cats", "Felines"), ("Horses", "Ponies"), ("Cows", "Cattle")],
    "Birds": [("Crows", "Ravens"), ("Hawks", "Falcons"), ("Pigeons", "Doves"), ("Finches", "Sparrows")],
    "Vehicles": [("Trucks", "Lorries"), ("Taxis", "Cabs"), ("Mopeds", "Scooters"), ("Trains", "Locomotives")],
  },
}
# Multi-premise control arguments for Experiment 2, keyed the same way. Each
# entry is a two-element list: [list of premises, conclusion].
EXPERIMENT_2_MULTI_PREMISE_CONTROLS = {
    "General": {
        "Mammals": [["All animals", "All organisms"], "All mammals"],
        "Birds": [["All animals", "All organisms"], "All birds"],
        "Vehicles": [["All transport", "All moving things"], "All vehicles"],
    },
    "Specific": {
      "Mammals": [["Canines", "Puppies"], "Dogs"],
      "Birds": [["Hawks", "Eagles"], "Falcons"],
      "Vehicles": [["Locomotives", "Trams"], "Trains"]
    }
}

# Identifiers of the training trials, grouped by argument type and premise count.
# NOTE(review): how these identifiers are resolved is not shown here.
EXPERIMENT_2_TRAINING_TRIALS = {
    "General": {
        "single_premise": ["training_1_0", "training_1_1",],
        "multi_premise": ["training_2_0", "training_2_1",],
    },
    "Specific": {
        "single_premise": ["training_3_0", "training_3_1",],
        "multi_premise": ["training_4_0", "training_4_1",],
    }
}

# Category names (plural, capitalised) grouped by domain.
# NOTE(review): presumably the categories appearing in the original Osherson arguments — confirm.
OSHERSON_CATEGORIES = {
      "Mammals": ["Hippos", "Hamsters", "Rhinos", "Lions", "Giraffes", "Rabbits", "Tigers", "Foxes", "Pigs", "Wolves", "Gorillas", "Mice", "Bats", "Horses", "Cows", "Chimps", "Dolphins", "Squirrels", "Seals"],
      "Birds": ["Robins", "Penguins", "Bluejays", "Sparrows", "Hawks", "Falcons", "Geese", "Crows", "Peacocks", "Ostriches"],
    }
# Singular, lower-case category names; not referenced in the cells visible here.
OSHERSON_E2_CATEGORIES = ["horse", "cow", "chimp", "gorilla", "mouse", "squirrel", "dolphin", "seal", "elephant", "rhino"]

# Phenomenon number -> full phenomenon name. The notebook inverts this mapping
# to attach a number to each row of the Osherson argument-pair data.
OSHERSON_PHENOMENON_NUMBERS = {
    1: "Premise-conclusion Similarity",
    2: "Premise Typicality",
    3: "Conclusion Specificity",
    4: "Premise Monotonicity (General)",
    5: "Premise Monotonicity (Specific)",
    6: "Premise Diversity (General)",
    7: "Premise Diversity (Specific)",
    8: "Nonmonotonicity (General)",
    9: "Nonmonotonicity (Specific)",
    10: "Premise-conclusion Asymmetry",
    11: "Inclusion Fallacy",
}

# Short phenomenon name -> the same 1..11 numbering as above.
PHENOMENON_ORDER = {
    "Similarity": 1,
    "Typicality": 2,
    "Specificity": 3,
    "Monotonicity (General)": 4,
    "Monotonicity (Specific)": 5,
    "Diversity (General)": 6,
    "Diversity (Specific)": 7,
    "Nonmonotonicity (General)": 8,
    "Nonmonotonicity (Specific)": 9,
    "Asymmetry": 10,
    "Inclusion Fallacy": 11,
}

# Short phenomenon name -> argument type label ("General", "Specific" or "Mixed").
PHENOMENON_TYPE = {
    "Similarity": "Specific",
    "Typicality": "General",
    "Specificity": "General",
    "Monotonicity (General)": "General",
    "Monotonicity (Specific)": "Specific",
    "Diversity (General)": "General",
    "Diversity (Specific)": "Specific",
    "Nonmonotonicity (General)": "Mixed",
    "Nonmonotonicity (Specific)": "Mixed",
    "Asymmetry": "Specific",
    "Inclusion Fallacy": "General",
}

# Domain name -> "all <singular noun>" string.
# NOTE(review): not used in the visible cells; presumably pluralised later to build "All ..." conclusions — confirm.
DOMAINS = {
    "Vehicles": "all vehicle",
    "Birds": "all bird",
    "Mammals": "all mammal",
    "Things": "all thing",
    "Animals": "all animal"
}

# Domain name -> name of a broader parent grouping.
# NOTE(review): the key here is "Fruits" whereas DD_DOMAINS below spells it "Fruit";
# a lookup by DD_DOMAINS name would miss this entry — confirm which spelling is intended.
DOMAIN_PARENTS = {
    "Mammals": "living things",
    "Birds": "living things",
    "Vehicles": "objects",
    "Reptiles": "living things",
    "Insects": "living things",
    "Fruits": "foods",
}

# The three domains whose category counts are balanced later in the notebook.
MAIN_DOMAINS = ["Mammals", "Birds", "Vehicles"]
# Every De Deyne domain that is loaded; each name is interpolated into the raw CSV paths.
DD_DOMAINS = ["Mammals", "Birds", "Vehicles", "Insects", "Reptiles", "Tools", "Professions", "Sports", "KitchenUtensils", "Vegetables", "Clothing", "MusicalInstruments", "Weapons", "Fruit", "Fish"]
# Integer code for each main domain; not referenced in the cells visible here.
NUMBERED_DOMAINS = {"Mammals": 0, "Birds": 1, "Vehicles": 2}

# One time setup
## Clean DeDeyne data

In [10]:
# Rename certain dedeyne categories into more common forms
# CATEGORY_RENAME = {"dromedary": "camel", "rhinocerous": "rhino", "hippopotamus": "hippo", "parakeet": "parrot", "cuckoo": "", "pheasant": "", "chickadee": "", "bison": "", "file": "", "caiman": "", "blindworm": "", "boa": "boa constrictor", "bat": ""}
# Active rename table (De Deyne label -> preferred label). A label mapped to ""
# is dropped by the cleaning function because empty names fail its keep-filter;
# a label mapped to a multi-word name (e.g. "boa constrictor") is dropped too,
# since names containing a space are also filtered out.
CATEGORY_RENAME = {"dromedary": "camel", "rhinocerous": "rhino", "hippopotamus": "hippo", "parakeet": "parrot", "boa": "boa constrictor", "cuckoo": "", "pheasant": "", "chickadee": "", "bison": "", "caiman": "", "blindworm": "",}

# English category labels for each De Deyne dataframe, in row order. When a
# domain is listed here, these labels overwrite the category column of the raw
# typicality data and label the similarity matrix.
# NOTE(review): some labels repeat ("motorbike" and "truck" in Vehicles, "crowbar"
# in Tools); repeated labels are removed by the cleaning step. Whether the repeats
# mirror the raw files or are transcription slips cannot be told from here — confirm.
DD_CATEGORIES = {
    "Vehicles": ["car", "boat", "moped", "motorbike", "bus", "truck", "van", "caravan", "submarine", "bicycle", "go-cart", "helicopter", "hovercraft", "jeep", "cart", "carriage", "hot air balloon", "subway train", "motorbike", "rocket", "skateboard", "sled", "kick scooter", "taxi", "tractor", "tram", "train", "airplane", "truck", "zeppelin"],
    "Birds": ["eagle", "dove", "duck", "magpie", "pheasant", "vulture", "rooster", "turkey", "canary", "chicken", "cuckoo", "crow", "chickadee", "seagull", "blackbird", "sparrow", "stork", "parrot", "parakeet", "peacock", "pelican", "penguin", "heron", "robin", "woodpecker", "ostrich", "owl", "falcon", "swan", "swallow"],
    "Mammals": ["monkey", "beaver", "bison", "camel", "squirrel", "hedgehog", "donkey", "giraffe", "hamster", "deer", "dog", "polar bear", "kangaroo", "cat", "cow", "rabbit", "llama", "lion", "mouse", "rhinocerous", "hippopotamus", "elephant", "horse", "sheep", "tiger", "pig", "bat", "fox", "wolf", "zebra"],
    "Insects": ["bee", "leech", "horsefly", "centipede", "fruit fly", "bumblebee", "cockroach", "beetle", "cricket", "dragonfly", "ladybug", "louse", "cockchafer", "ant", "moth", "mosquito", "earwig", "wood louse", "caterpillar", "spider", "grasshopper", "fly", "butterfly", "flee", "wasp", "worm"],
    "Reptiles": ["viper", "alligator", "boa", "cobra", "dinosaur", "gecko", "lizard", "blindworm", "caiman", "chameleon", "frog", "crocodile", "iguana", "toad", "python", "salamander", "tortoise", "snake", "monitor lizard", "turtle"],
    "Tools": ["anvil", "chisel", "axe", "drill", "crowbar", "screw wrench", "lawn mower", "hammer", "spanner", "pickaxe", "crowbar", 'wheelbarrow', "knife", "wrench", "oil can", "filling knife", "plough", "plane", "shovel", "screwdriver", "grinding disc", "nail", "wire brush", "vacuum cleaner", "tongs", "rope", "paint brush", "file", "level", "saw"]
}

# Pluralised, capitalised version of the above (same keys, same order)
DD_CATEGORIES_PLURAL = {k: [ie.plural(x).capitalize() for x in v] for k,v in DD_CATEGORIES.items()}

# Stuff to fix the inconsistencies in the DD datasets
@dataclass
class MatrixTranslationMixup:
    """Per-domain flags for De Deyne CSVs whose Dutch and English column or row labels are mislabelled and must be corrected by hand."""
    # When True, the loader rebuilds the column headers: it keeps the first two
    # headers and takes the remaining ones from the first data row.
    columns_are_switched: bool
    # When True, the loader overwrites the "exemplar ENGLISH" column with the
    # values of the first column.
    rows_are_switched: bool
        
# Hand-coded correction flags per De Deyne domain, looked up by domain name
# while the pairwise-similarity CSVs are read. The positional arguments are
# (columns_are_switched, rows_are_switched).
mixups = {
    "Birds": MatrixTranslationMixup(False, True),
    "Clothing": MatrixTranslationMixup(False, True),
    "Fish": MatrixTranslationMixup(False, True),
    "Fruit": MatrixTranslationMixup(True,False),
    "Insects": MatrixTranslationMixup(False, False),
    "KitchenUtensils": MatrixTranslationMixup(False, False),
    "Mammals": MatrixTranslationMixup(False, False),
    "MusicalInstruments": MatrixTranslationMixup(False, True),
    "Professions": MatrixTranslationMixup(True, False),
    "Reptiles": MatrixTranslationMixup(False, False),
    "Sports": MatrixTranslationMixup(True, False),
    "Vegetables": MatrixTranslationMixup(True, False),
    "Vehicles": MatrixTranslationMixup(False, True),
    "Weapons": MatrixTranslationMixup(False, False),
    "Tools": MatrixTranslationMixup(False, False),
}

In [13]:
def process_category(category: str) -> str:
    """Strip surrounding whitespace, then keep only letters and spaces.

    Digits, punctuation and any other non-alphabetic characters are removed;
    interior spaces are kept as they are.
    """
    trimmed = category.strip()
    return "".join(ch for ch in trimmed if ch == " " or ch.isalpha())


def generate_similarity_typicality_dataframes(balance_osherson: bool = False) -> Tuple[Dict[str, pd.DataFrame], Dict[str, pd.DataFrame]]:
    """Load and clean the De Deyne typicality and pairwise-similarity data for every domain in DD_DOMAINS.

    For each domain the raw typicality CSV and all pairwise-similarity CSVs are
    read, the similarity matrices are averaged and symmetrised, and both frames
    are reduced to the same set of single-word, non-repeated categories after
    applying CATEGORY_RENAME.

    Args:
        balance_osherson: Accepted but never read inside this function.

    Returns:
        (similarity_dataframes, typicality_dataframes), both keyed by domain name.
        Each similarity frame has a "category" column followed by one column per
        kept category; each typicality frame has "category" and "mean_typicality".

    Raises:
        AssertionError: if the kept categories of the two frames, or the rows and
            columns of the similarity frame, do not line up.
    """

    similarity_dataframes, typicality_dataframes = {}, {}

    for domain in DD_DOMAINS:

        # Read raw typicality dataframe
        tdf = pd.read_csv(f"{RAW_DATA}/de_deyne_typicality/exemplarTypicalityRatings-{domain.lower()}.CSV", index_col=0, encoding = "ISO-8859-1").rename({"Unnamed: 1":"category", "mean": "mean_typicality"}, axis=1)[["category", "mean_typicality"]].reset_index(drop=True).dropna()
        tdf["category"] = [process_category(c) for c in tdf["category"]]
        # Hand-curated English labels, when available, replace the labels read from the file
        # (this requires DD_CATEGORIES[domain] to have exactly one entry per remaining row).
        if domain in DD_CATEGORIES:
            tdf["category"] = DD_CATEGORIES[domain]

        # Read raw similarity dataframes, average across all participants and fix column/row mixups
        folder = f"{RAW_DATA}/de_deyne_pairwise/pairwiseSimilarities{domain}"
        similarity_dfs = []
        # NOTE(review): the file count comes from os.listdir, so any extra file in the
        # folder (e.g. a hidden OS file) would make the loop request a CSV that does not exist.
        for i in range(len(os.listdir(folder))):
            df = pd.read_csv(f"{folder}/pairwiseSimilarities{domain}-{i+1}.CSV", encoding = "ISO-8859-1")
            if mixups[domain].columns_are_switched:
                # Keep the first two headers; take the rest from the first data row.
                df.columns = df.columns.tolist()[:2] + df.iloc[0,2:].tolist()
            if mixups[domain].rows_are_switched:
                # Copy the first column's labels into the "exemplar ENGLISH" column.
                df["exemplar ENGLISH"] = df.iloc[:,0].tolist()
            # Drop the first row and the first column before storing.
            similarity_dfs.append(df.iloc[1:,1:])
        # Element-wise mean over all files of the numeric part (label column removed, values cast to int).
        average_similarities = np.mean([df.iloc[:,1:].values.astype(int) for df in similarity_dfs], axis=0)
        # rot90(fliplr(M)) is the transpose of M, so this adds the matrix to its transpose
        # (presumably filling in the empty half of a triangular matrix — TODO confirm against the raw files).
        average_similarities = average_similarities + np.rot90(np.fliplr(average_similarities))
        average_similarity_df = pd.DataFrame(average_similarities, columns=DD_CATEGORIES[domain] if domain in DD_CATEGORIES else tdf["category"].tolist())
        sdf = average_similarity_df
        if domain in DD_CATEGORIES:
            sdf["category"] = DD_CATEGORIES[domain]
        else:
            sdf["category"] = tdf["category"].tolist()
        # Move the label column to the front so that data column k sits at position k + 1.
        col = sdf.pop("category")
        sdf.insert(0, col.name, col)

        categories = [CATEGORY_RENAME[c] if c in CATEGORY_RENAME else c for c in sdf["category"].tolist()]
        keep_indices = []
        seen = set()

        # Cut all multiword or repeated categories
        # (names renamed to "" and names containing non-letters are cut as well; only the
        # first occurrence of a repeated name can be kept)
        for i, category in enumerate(categories):
            if category and " " not in category and category not in seen and category.isalpha():
                keep_indices.append(i)
            seen.add(category)

        typicality_dataframes[domain] = tdf.iloc[keep_indices].reset_index(drop=True)
        typicality_dataframes[domain]["category"] = [CATEGORY_RENAME[c] if c in CATEGORY_RENAME else c for c in typicality_dataframes[domain]["category"].tolist()]
        # NOTE(review): this frame's columns are "category" and "mean_typicality", so a
        # column rename by CATEGORY_RENAME looks like a no-op here — confirm.
        typicality_dataframes[domain] = typicality_dataframes[domain].rename(CATEGORY_RENAME, axis=1)

        # Keep the selected rows plus the label column (0) and the matching data columns (1 + index).
        similarity_dataframes[domain] = sdf.iloc[keep_indices, [0] + [1 + ki for ki in keep_indices]].reset_index(drop=True)
        similarity_dataframes[domain]["category"] = [CATEGORY_RENAME[c] if c in CATEGORY_RENAME else c for c in similarity_dataframes[domain]["category"].tolist()]
        similarity_dataframes[domain] = similarity_dataframes[domain].rename(CATEGORY_RENAME, axis=1)

        # Sanity checks: both frames list the same categories in the same order, and the
        # similarity frame's rows and columns are labelled identically.
        assert tuple(similarity_dataframes[domain]["category"]) == tuple(typicality_dataframes[domain]["category"])
        assert tuple(similarity_dataframes[domain]["category"]) == tuple(similarity_dataframes[domain].columns[1:])

    return similarity_dataframes, typicality_dataframes

In [14]:
# Build the cleaned per-domain frames, then derive the category list of each domain
# from the "category" column of its similarity frame.
similarity_dataframes, typicality_dataframes = generate_similarity_typicality_dataframes()
domain_categories = {domain: df["category"].tolist() for domain, df in similarity_dataframes.items()}

# Pluralise categories
# ("Sports" names are only capitalised, never pluralised). The same transformation is
# applied to the category lists, to the similarity frames (label column and column
# headers) and to the typicality frames, so that all three keep using identical names.
for d,cs in domain_categories.items():
    if d != "Sports":
        domain_categories[d] = [ie.plural(c).capitalize() for c in cs]
    else:
        domain_categories[d] = [c.capitalize() for c in cs]
for d, sdf in similarity_dataframes.items():
    if d != "Sports":
        sdf["category"] = sdf["category"].apply(lambda x: ie.plural(x).capitalize())
        similarity_dataframes[d] = sdf.rename({c: ie.plural(c).capitalize() for c in sdf.columns[1:]}, axis=1)
    else:
        sdf["category"] = sdf["category"].apply(lambda x: x.capitalize())
        similarity_dataframes[d] = sdf.rename({c: c.capitalize() for c in sdf.columns[1:]}, axis=1)
for d, tdf in typicality_dataframes.items():
    if d != "Sports":
        tdf["category"] = tdf["category"].apply(lambda x: ie.plural(x).capitalize())
    else:
        tdf["category"] = tdf["category"].apply(lambda x: x.capitalize())

# Similarity map for category pairs in each domain
# Layout: similarity_map[domain][category1][category2] -> similarity value.
similarity_map = {}
for domain in domain_categories:
    df = similarity_dataframes[domain]
    similarity_map[domain] = {}
    for category1 in domain_categories[domain]:
        similarity_map[domain][category1] = {}
        for category2 in domain_categories[domain]:
            if category1 == category2:
                # Self-similarity is hard-coded to 20
                # (presumably the top of the rating scale — TODO confirm).
                similarity_map[domain][category1][category2] = 20
            else:
                # Value in column `category1` of the first row labelled `category2`.
                similarity_map[domain][category1][category2] = df[df["category"] == category2][category1].iloc[0]

# Typicality map for categories in each domain
# Layout: typicality_map[domain][category] -> mean typicality.
typicality_map = {domain: typicality_dataframes[domain].set_index("category").to_dict()["mean_typicality"] for domain in typicality_dataframes}

# List of categories per domain, ordered by similarity to a particular category
# (most similar first; the category itself is excluded from its own list)
similarity_rank_map = {}
for domain, cat in domain_categories.items():
    categories = [c for c in cat if c]
    similarity_rank_map[domain] = {}
    for category in categories:
        cs = [(oc, similarity_map[domain][category][oc]) for oc in categories if oc != category]
        cs = sorted(cs, key=lambda x: x[1], reverse=True)
        s, _ = zip(*cs)
        similarity_rank_map[domain][category] = list(s)

# List of categories per domain, ordered by typicality
# (highest mean typicality first; names are stripped of surrounding whitespace afterwards)
typicality_rank_map = {domain: tdf.sort_values(by="mean_typicality", ascending=False)["category"].tolist() for domain, tdf in typicality_dataframes.items()}
typicality_rank_map = {domain: [c.strip() for c in dl] for domain, dl in typicality_rank_map.items()}

  tdf = pd.read_csv(f"{RAW_DATA}/de_deyne_typicality/exemplarTypicalityRatings-{domain.lower()}.CSV", index_col=0, encoding = "ISO-8859-1").rename({"Unnamed: 1":"category", "mean": "mean_typicality"}, axis=1)[["category", "mean_typicality"]].reset_index(drop=True).dropna()


In [15]:
# List of high dis/similarity categories for each category
# For every category, list the other categories of its domain that are unusually
# similar / dissimilar to it: more than 0.75 standard deviations above / below the
# domain-wide mean similarity.
high_similarity_map = {}
high_dissimilarity_map = {}
for domain, cat in domain_categories.items():
    high_similarity_map[domain] = {}
    high_dissimilarity_map[domain] = {}
    categories = [c for c in cat if c]

    # Domain-wide mean and standard deviation, with zero entries treated as missing.
    # NOTE(review): `.values` may return a view of the frame's data, in which case the
    # NaN assignment below also changes similarity_dataframes[domain] — confirm.
    sims = similarity_dataframes[domain].iloc[:,1:].values
    sims[sims == 0] = np.nan
    mean = np.nanmean(sims)
    std = np.nanstd(sims)

    for category in categories:
        other_categories = [c for c in categories if c != category]
        cs = [(oc, similarity_map[domain][category][oc]) for oc in other_categories]
        # Most similar first; pairs without a similarity value are dropped.
        cdf = pd.DataFrame(cs, columns=["category", "similarity"]).sort_values(by="similarity", ascending=False).dropna()
        similar = cdf[cdf["similarity"] > mean + 0.75*std]["category"].tolist()
        dissimilar = cdf[cdf["similarity"] < mean - 0.75*std]["category"].tolist()

        high_similarity_map[domain][category] = similar
        high_dissimilarity_map[domain][category] = dissimilar

## Cut DeDeyne categories

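Cut some categories so that each of the main domains (Mammals, Birds, Vehicles) keeps the same number of categories. Within each domain, the categories with the fewest highly similar neighbours are cut first.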

In [16]:
# 'Utility' of a category = number of categories in its domain that are highly similar to it.
category_utility_map = {}
for domain in MAIN_DOMAINS:
    hsm = high_similarity_map[domain]
    hdm = high_dissimilarity_map[domain]

    # Both maps must describe exactly the same categories.
    assert set(hsm.keys()) == set(hdm.keys())

    category_utility_map[domain] = {}
    for category in hsm:
        category_utility_map[domain][category] = len(hsm[category]) #(len(hsm[category]) + len(hdm[category])) / 2


# Cut based on 'utility', ie. how many highly similar categories there are in the domain set
# Every main domain is trimmed to the size of the smallest one; within a domain the
# lowest-utility categories are the ones that get cut.
num_arguments = min([len(category_utility_map[d]) for d in MAIN_DOMAINS])
cut_categories = set()
for domain in MAIN_DOMAINS:
    categories = [x[0] for x in sorted(category_utility_map[domain].items(), key=lambda x: x[1], reverse=True)]
    cut_categories = cut_categories.union(set(categories[num_arguments:]))

# The cut names are removed from every domain's list, not only from the main domains.
# NOTE(review): the similarity, typicality and rank maps built earlier are not
# filtered here and still contain the cut categories — confirm this is intended.
domain_categories = {d: [c for c in v if c not in cut_categories] for d,v in domain_categories.items()}

assert all(len(domain_categories[domain]) == num_arguments for domain in MAIN_DOMAINS)

In [18]:
# The set printed here may differ from the one behind the actual experiment data,
# because the original notebook was run in Colab.
print(cut_categories)

{'Monkeys', 'Kangaroos', 'Penguins', 'Owls', 'Pigs', 'Bats'}


## Osherson Similarity data

In [19]:
# Read the Osherson pairwise similarities (headerless CSV: c1, c2, similarity) and
# store each pair in both directions under pluralised, capitalised category names.
osherson_similarity_df = pd.read_csv(f"{RAW_DATA}/osherson/osherson_similarities.csv", names=["c1", "c2", "similarity"])
osherson_similarity_map = {}
for _, row in osherson_similarity_df.iterrows():
    c1, c2 = (ie.plural(row[side]).capitalize() for side in ("c1", "c2"))
    pair_similarity = float(row["similarity"])
    # setdefault creates the inner dict on first sight of a category.
    osherson_similarity_map.setdefault(c1, {})[c2] = pair_similarity
    osherson_similarity_map.setdefault(c2, {})[c1] = pair_similarity

## Save processed data

In [20]:
# Persist every processed lookup as <PROCESSED_DATA>/<name>.json.
_processed_maps = {
    "domain_categories": domain_categories,
    "similarity_map": similarity_map,
    "osherson_similarity_map": osherson_similarity_map,
    "typicality_map": typicality_map,
    "similarity_rank_map": similarity_rank_map,
    "typicality_rank_map": typicality_rank_map,
    "high_similarity_map": high_similarity_map,
    "high_dissimilarity_map": high_dissimilarity_map,
}
for _stem, _payload in _processed_maps.items():
    save_map(_payload, f"{PROCESSED_DATA}/{_stem}.json")

# Load in Osherson and DeDeyne data

In [21]:
# Reload every processed lookup from disk so the rest of the notebook can run
# without repeating the one-time setup above.
domain_categories = load_map(f"{PROCESSED_DATA}/domain_categories.json")
similarity_map = load_map(f"{PROCESSED_DATA}/similarity_map.json")
osherson_similarity_map = load_map(f"{PROCESSED_DATA}/osherson_similarity_map.json")
typicality_map = load_map(f"{PROCESSED_DATA}/typicality_map.json")
similarity_rank_map = load_map(f"{PROCESSED_DATA}/similarity_rank_map.json")
typicality_rank_map = load_map(f"{PROCESSED_DATA}/typicality_rank_map.json")
high_similarity_map = load_map(f"{PROCESSED_DATA}/high_similarity_map.json")
high_dissimilarity_map = load_map(f"{PROCESSED_DATA}/high_dissimilarity_map.json")

# Read in original Osherson argument pair data
osherson_df = pd.read_csv(f"{RAW_DATA}/osherson/osherson_argument_pairs.csv", index_col=0)
# Invert number -> name so each row's phenomenon name can be mapped to its number.
number_map = {v:k for k,v in OSHERSON_PHENOMENON_NUMBERS.items()}
osherson_df["phenomenon_number"] = osherson_df["phenomenon_name"].map(number_map)
# SECURITY NOTE(review): eval executes whatever expression is stored in the CSV cell.
# This is only acceptable while the file is trusted; ast.literal_eval would be the
# safe choice if the cells hold plain Python literals — confirm before switching.
osherson_df["arg1_premises"] = osherson_df["arg1_premises"].apply(eval)
osherson_df["arg2_premises"] = osherson_df["arg2_premises"].apply(eval)
osherson_df["is_osherson"] = [True]*osherson_df.shape[0]

# Drop phenomenon number 9 because it uses insects
osherson_df = osherson_df[osherson_df["phenomenon_number"] != 9].reset_index(drop=True)

# Read in original Osherson argument ranking data
osherson_number_df = pd.read_csv(f"{RAW_DATA}/osherson/premisenumbering.txt")
# number -> category name, used to decode the numeric argument files below.
oshmap = osherson_number_df.set_index("number").to_dict()["category"]

# Whitespace-delimited files are read with sep=r"\s+", the documented replacement
# for the deprecated delim_whitespace=True.
# Specific arguments: two premises and a conclusion, each given as a category number.
osdf = pd.read_csv(f"{RAW_DATA}/osherson/specificarguments.txt", names=["p1","p2","c","strength"], sep=r"\s+")
osherson_specific_df = pd.DataFrame([("Mammals","Specific",(oshmap[row["p1"]],oshmap[row["p2"]]),oshmap[row["c"]],row["strength"]) for _, row in osdf.iterrows()], columns=["domain","argtype","premises","conclusion","human_rating"])
osherson_specific_df["is_osherson"] = [True]*osherson_specific_df.shape[0]

# General arguments: three premises; the conclusion is always the "Mammals" domain.
osdf = pd.read_csv(f"{RAW_DATA}/osherson/generalarguments.txt", names=["p1","p2","p3","strength"], sep=r"\s+")
osherson_general_df = pd.DataFrame([("Mammals","General",(oshmap[row["p1"]],oshmap[row["p2"]],oshmap[row["p3"]]),"Mammals",row["strength"]) for _, row in osdf.iterrows()], columns=["domain","argtype","premises","conclusion","human_rating"])
osherson_general_df["is_osherson"] = [True]*osherson_general_df.shape[0]

# Generate Experiment 1 argument pairs

In [22]:
# Generate samples of possible arguments

# Number of sampling attempts per (domain, phenomenon) combination; attempts that
# cannot satisfy a phenomenon's constraints are skipped, so fewer rows may result.
NUM_SAMPLES = 20000
rows = (
    []
)  # (domain, phenomenon_number, phenomenon_name, sample_num, arg1_premises, arg1_conclusion, arg2_premises, arg2_conclusion)
domains = list(domain_categories.keys())
# The keys of this dict are the domains that arguments are generated for.
# NOTE(review): each value is the category list of a different domain, presumably a
# pool of out-of-domain categories; its use is not visible in this part — confirm.
other_categories = {
    "Mammals": domain_categories["Reptiles"],
    "Birds": domain_categories["Insects"],
    "Vehicles": domain_categories["Tools"],
}

for domain in other_categories:
    dl = domain_categories[domain]

    for phenomenon_number in OSHERSON_PHENOMENON_NUMBERS:
        for i in range(NUM_SAMPLES):
            if phenomenon_number == 1:
                # Premise-conclusion Similarity
                # X|Y vs Z|Y
                # Three categories total

                c = random.choice(dl)
                cti = typicality_rank_map[domain].index(c)
                dissimilar = [
                    p
                    for p in high_dissimilarity_map[domain][c]
                    if typicality_rank_map[domain].index(p) > cti
                ]
                similar = [
                    p
                    for p in high_similarity_map[domain][c]
                    if typicality_rank_map[domain].index(p) > cti
                ]
                if not dissimilar or not similar:
                    continue
                p1 = random.choice(similar)
                p2 = random.choice(dissimilar)
                rows.append(
                    (
                        domain,
                        phenomenon_number,
                        OSHERSON_PHENOMENON_NUMBERS[phenomenon_number],
                        i,
                        [p1],
                        c,
                        [p2],
                        c,
                    )
                )

            elif phenomenon_number == 2:
                # Premise Typicality
                # X | D vs Y | D
                # Two categories total, conclusion is domain

                p1 = random.choice(typicality_rank_map[domain][:5])
                p2 = random.choice(typicality_rank_map[domain][-5:])
                rows.append(
                    (
                        domain,
                        phenomenon_number,
                        OSHERSON_PHENOMENON_NUMBERS[phenomenon_number],
                        i,
                        [p1],
                        domain,
                        [p2],
                        domain,
                    )
                )

            elif phenomenon_number == 3:
                # Conclusion Specificity
                # X, Y | D vs X, Y | A
                # Two categories total, conc1 is domain, conclusion2 is some larger domain
                sample_categories = random.sample(dl, 2)
                p1, p2 = sample_categories
                if domain == "Vehicles":
                    c2 = "Things"
                else:
                    c2 = "Animals"
                rows.append(
                    (
                        domain,
                        phenomenon_number,
                        OSHERSON_PHENOMENON_NUMBERS[phenomenon_number],
                        i,
                        [p1, p2],
                        domain,
                        [p1, p2],
                        c2,
                    )
                )

            elif phenomenon_number == 4:
                # Premise Monotonicity (General)
                # X, Y, Z | D vs X, Y | D
                # Three categories total, conclusion is domain
                # Sample so that third premise of first argument is not more typical than either of the first two premises
                sample_categories = random.sample(dl, 2)
                p1, p2 = sample_categories
                min_typicality_index, max_typicality_index = sorted(
                    [
                        typicality_rank_map[domain].index(p1),
                        typicality_rank_map[domain].index(p2),
                    ]
                )
                if max_typicality_index >= len(typicality_rank_map[domain]) - 3:
                    continue
                p3 = random.choice(
                    typicality_rank_map[domain][max_typicality_index + 1 :]
                )
                rows.append(
                    (
                        domain,
                        phenomenon_number,
                        OSHERSON_PHENOMENON_NUMBERS[phenomenon_number],
                        i,
                        [p1, p2, p3],
                        domain,
                        [p1, p2],
                        domain,
                    )
                )

            elif phenomenon_number == 5:
                # Premise Monotonicity (Specific)
                # X, Y, Z | W vs X, Y | W
                # Four categories total
                # Sample so that third premise of first argument is not more similar to the conclusion than either of the first two premises
                c = random.choice(dl)
                sample_categories = random.sample(similarity_rank_map[domain][c], 2)
                p1, p2 = sample_categories
                min_similar_index, max_similar_index = sorted(
                    [
                        similarity_rank_map[domain][c].index(p1),
                        similarity_rank_map[domain][c].index(p2),
                    ]
                )
                min_typicality_index, max_typicality_index = sorted(
                    [
                        typicality_rank_map[domain].index(p1),
                        typicality_rank_map[domain].index(p2),
                    ]
                )
                if (
                    max_similar_index >= len(similarity_rank_map[domain][c]) - 3
                    or max_typicality_index >= len(typicality_rank_map[domain]) - 3
                ):
                    continue
                options = list(
                    set(
                        similarity_rank_map[domain][c][max_similar_index + 1 :]
                    ).intersection(
                        set(typicality_rank_map[domain][max_typicality_index + 1 :])
                    )
                )
                if not options:
                    continue
                p3 = random.choice(options)
                rows.append(
                    (
                        domain,
                        phenomenon_number,
                        OSHERSON_PHENOMENON_NUMBERS[phenomenon_number],
                        i,
                        [p1, p2, p3],
                        c,
                        [p1, p2],
                        c,
                    )
                )

            elif phenomenon_number == 6:
                # Premise Diversity (General)
                # X, Y | D vs X, Z | D
                # Three categories total, conclusion is domain
                # Sample second premise of first argument so that it is similar to first premise and not more typical than first premise
                # Sample second premise of second argument so that it is dissimilar to first premise and not more typical than first premise
                p1 = random.choice(dl)

                dissimilar = high_dissimilarity_map[domain][p1]
                similar = high_similarity_map[domain][p1]
                if not similar or not dissimilar:
                    continue
                i1 = typicality_rank_map[domain].index(p1)
                dissimilar = [
                    ci
                    for ci in dissimilar
                    if typicality_rank_map[domain].index(ci) > i1
                ]  # p2 must be less typical than p1
                similar = [
                    ci for ci in similar if typicality_rank_map[domain].index(ci) > i1
                ]  # p3 must be less typical than p1
                if not similar or not dissimilar:
                    continue
                p2 = random.choice(dissimilar)
                p3 = random.choice(similar)

                rows.append(
                    (
                        domain,
                        phenomenon_number,
                        OSHERSON_PHENOMENON_NUMBERS[phenomenon_number],
                        i,
                        [p1, p2],
                        domain,
                        [p1, p3],
                        domain,
                    )
                )

            elif phenomenon_number == 7:
                # Premise Diversity (Specific)
                # X, Y | W vs X, Z | W
                # Four categories total
                # Sample second premise of first argument so that it is dissimilar to first premise and not more similar to conclusion than first premise
                # Sample second premise of second argument so that it is similar to first premise and not more similar to conclusion than first premise or second premise of first argument
                c = random.choice(dl)
                cti = typicality_rank_map[domain].index(c)
                if cti == len(typicality_rank_map[domain]) - 1:
                    continue
                p1 = random.choice(typicality_rank_map[domain][cti + 1 :])

                dissimilar = [
                    p
                    for p in high_dissimilarity_map[domain][p1]
                    if typicality_rank_map[domain].index(p) > cti
                ]
                similar = [
                    p
                    for p in high_similarity_map[domain][p1]
                    if typicality_rank_map[domain].index(p) > cti
                ]
                if not similar or not dissimilar:
                    continue
                i1 = similarity_rank_map[domain][c].index(p1)
                dissimilar = [
                    ci
                    for ci in dissimilar
                    if ci != c and similarity_rank_map[domain][c].index(ci) > i1
                ]  # p2 must be less similar to c than p1
                similar = [
                    ci
                    for ci in similar
                    if ci != c and similarity_rank_map[domain][c].index(ci) > i1
                ]  # p3 must be less similar to c than p1
                if not similar or not dissimilar:
                    continue
                p2 = random.choice(dissimilar)
                i2 = similarity_rank_map[domain][c].index(p2)
                similar = [
                    ci
                    for ci in similar
                    if ci != c and similarity_rank_map[domain][c].index(ci) > i2
                ]  # p3 must be less similar to c than p2
                if not similar:
                    continue
                p3 = random.choice(similar)

                rows.append(
                    (
                        domain,
                        phenomenon_number,
                        OSHERSON_PHENOMENON_NUMBERS[phenomenon_number],
                        i,
                        [p1, p2],
                        c,
                        [p1, p3],
                        c,
                    )
                )

            elif phenomenon_number == 8:
                # Nonmonotonicity (General)
                # X, Y | D vs X, Y, Z | D
                # Three categories total, conclusion is domain, third category is from another domain
                sample_categories = random.sample(dl, 2)
                p1, p2 = sample_categories
                p3 = random.choice(other_categories[domain])

                rows.append(
                    (
                        domain,
                        phenomenon_number,
                        OSHERSON_PHENOMENON_NUMBERS[phenomenon_number],
                        i,
                        [p1, p2],
                        domain,
                        [p1, p2, p3],
                        domain,
                    )
                )

            elif phenomenon_number == 9:
                # Nonmonotonicity (Specific)
                # X | Z vs X, Y | Z
                # Three categories total, second category is from another domain
                c = random.choice(dl)
                cti = typicality_rank_map[domain].index(c)
                if cti == len(typicality_rank_map[domain]) - 1:
                    continue
                p1 = random.choice(typicality_rank_map[domain][cti + 1 :])
                p2 = random.choice(other_categories[domain])

                rows.append(
                    (
                        domain,
                        phenomenon_number,
                        OSHERSON_PHENOMENON_NUMBERS[phenomenon_number],
                        i,
                        [p1],
                        c,
                        [p1, p2],
                        c,
                    )
                )

            elif phenomenon_number == 10:
                # Premise-conclusion Asymmetry
                # X | Y vs Y | X
                # Two categories total
                p1 = random.choice(typicality_rank_map[domain][:10])
                p2 = random.choice(typicality_rank_map[domain][-10:])
                rows.append(
                    (
                        domain,
                        phenomenon_number,
                        OSHERSON_PHENOMENON_NUMBERS[phenomenon_number],
                        i,
                        [p1],
                        p2,
                        [p2],
                        p1,
                    )
                )

            elif phenomenon_number == 11:
                # Inclusion Fallacy
                # X | D vs X | Y
                # Two categories, first conclusion is domain
                p1 = random.choice(typicality_rank_map[domain][:10])
                c2 = random.choice(typicality_rank_map[domain][-10:])
                rows.append(
                    (
                        domain,
                        phenomenon_number,
                        OSHERSON_PHENOMENON_NUMBERS[phenomenon_number],
                        i,
                        [p1],
                        domain,
                        [p1],
                        c2,
                    )
                )

# Gather every sampled argument pair into one frame (one row per pair) and
# persist it so the selection step can be re-run from disk.
pair_sample_columns = [
    "domain",
    "phenomenon_number",
    "phenomenon_name",
    "sample_num",
    "arg1_premises",
    "arg1_conclusion",
    "arg2_premises",
    "arg2_conclusion",
]
sample_df = pd.DataFrame(rows, columns=pair_sample_columns)
sample_df.to_csv(f"{PROCESSED_DATA}/experiment/argument_pair_samples.csv")


In [23]:
# Pick up to NUM_PAIRS pairs per (domain, phenomenon), scanning the candidates
# in the order they appear in sample_df.
# NOTE(review): the original note said pairs are picked "according to SCM
# difference"; nothing in this cell sorts by SCM, so that depends on the order
# of sample_df — confirm.
# Pairs found in osherson_df for the same (domain, phenomenon) are always kept
# and count towards the NUM_PAIRS quota.
# A candidate is rejected once either of its premise sets has been counted
# MAX_PSET times, or once any of its (non-domain) categories has been counted
# MAX_C times.

NUM_PAIRS = 24
MAX_PSET = 10
MAX_C = 10

sample_df["is_osherson"] = [False] * sample_df.shape[0]
e1_df = pd.DataFrame([], columns=sample_df.columns)
for dpn, sdf in tqdm.tqdm(sample_df.groupby(["domain", "phenomenon_number"])):
    domain, phenomenon_number = dpn
    row_indices = []  # positions (within sdf) of the accepted candidates

    pset_counts = defaultdict(int)
    category_counts = defaultdict(int)
    seen = set()  # (premises, conclusion, premises, conclusion) in both orders

    tdf = pd.DataFrame([], columns=sdf.columns)
    osh = osherson_df[
        (osherson_df["phenomenon_number"] == phenomenon_number)
        & (osherson_df["domain"] == domain)
    ]
    if osh.shape[0] > 0:
        # Seed the bookkeeping with the Osherson pairs so sampled pairs cannot
        # duplicate them.
        tdf = osh
        for _, row in osh.iterrows():
            a1p = tuple(sorted(row["arg1_premises"]))
            a2p = tuple(sorted(row["arg2_premises"]))
            pset_counts[a1p] += 1
            pset_counts[a2p] += 1
            seen.add((a1p, row["arg1_conclusion"], a2p, row["arg2_conclusion"]))
            seen.add((a2p, row["arg2_conclusion"], a1p, row["arg1_conclusion"]))

    i = 0
    while i < sdf.shape[0] and len(row_indices) < NUM_PAIRS - osh.shape[0]:
        row = sdf.iloc[i]
        categories = (
            row["arg1_premises"]
            + row["arg2_premises"]
            + [row["arg1_conclusion"], row["arg2_conclusion"]]
        )
        # A category shared by both arguments is counted twice per accepted
        # pair, so its count can jump past MAX_C; test with >= (as the
        # Experiment 2 sampler does) so the cap cannot be skipped over.
        category_limit_hit = any(
            category_counts[c] >= MAX_C and c not in DOMAINS for c in categories
        )
        a1p = tuple(sorted(row["arg1_premises"]))
        a2p = tuple(sorted(row["arg2_premises"]))
        if (
            pset_counts[a1p] < MAX_PSET
            and pset_counts[a2p] < MAX_PSET
            and (a1p, row["arg1_conclusion"], a2p, row["arg2_conclusion"]) not in seen
            and (a2p, row["arg2_conclusion"], a1p, row["arg1_conclusion"]) not in seen
            and row["arg1_conclusion"] not in a1p
            and row["arg2_conclusion"] not in a2p
            and len(set(a1p)) == len(a1p)
            and len(set(a2p)) == len(a2p)
            and not category_limit_hit
        ):
            pset_counts[a1p] += 1
            pset_counts[a2p] += 1
            seen.add((a1p, row["arg1_conclusion"], a2p, row["arg2_conclusion"]))
            seen.add((a2p, row["arg2_conclusion"], a1p, row["arg1_conclusion"]))
            for category in categories:
                category_counts[category] += 1

            row_indices.append(i)

        i += 1

    tdf = pd.concat([tdf, sdf.iloc[row_indices]], axis=0)
    e1_df = pd.concat([e1_df, tdf], axis=0)


# Shuffle Pairs
# For every pair, flip a coin to decide whether the two arguments trade places.
# The coin flips are stored in the "arg2_is_stronger" column.
swaps = [random.choice([True, False]) for _ in range(e1_df.shape[0])]

a1_p, a2_p, a1_c, a2_c = [], [], [], []
pair_fields = zip(
    swaps,
    e1_df["arg1_premises"].tolist(),
    e1_df["arg2_premises"].tolist(),
    e1_df["arg1_conclusion"].tolist(),
    e1_df["arg2_conclusion"].tolist(),
)
for swap, first_p, second_p, first_c, second_c in pair_fields:
    if swap:
        first_p, second_p = second_p, first_p
        first_c, second_c = second_c, first_c
    a1_p.append(first_p)
    a2_p.append(second_p)
    a1_c.append(first_c)
    a2_c.append(second_c)

e1_df["arg2_is_stronger"] = swaps
e1_df["arg1_premises"] = a1_p
e1_df["arg2_premises"] = a2_p
e1_df["arg1_conclusion"] = a1_c
e1_df["arg2_conclusion"] = a2_c

e1_df.reset_index(drop=True).to_csv(f"{PROCESSED_DATA}/experiment/experiment_1.csv")


100%|█████████████████████████████████████████████████████████████████████████████████████| 33/33 [00:00<00:00, 269.22it/s]


In [24]:
e1_df = pd.read_csv(f"{PROCESSED_DATA}/experiment/experiment_1.csv", index_col=0)

# Premise lists come back from the CSV as strings; eval() rebuilds them.
# NOTE(review): eval() on file contents is only acceptable because this
# notebook wrote the file itself.
x = []
for _, row in e1_df.iterrows():
    all_premises = tuple(
        sorted(eval(row["arg1_premises"]) + eval(row["arg2_premises"]))
    )
    both_conclusions = tuple(sorted([row["arg1_conclusion"], row["arg2_conclusion"]]))
    x.append((all_premises, both_conclusions))

# No two pairs share the same combined premises and conclusions
assert len(x) == len(set(x))
# Expected total number of pairs
assert e1_df.shape[0] == 792
# Every (domain, phenomenon) group holds exactly 24 pairs
assert (e1_df.groupby(["domain", "phenomenon_number"]).size() == 24).all()

# Generate Experiment 2 arguments

In [26]:
NUM_ARGUMENTS = 100
NUM_BINS = 10
NUM_PREMISES = {"Specific": 2, "General": 2}

# Sample sets of arguments first
# For each domain and argument type, draw num_premises + 1 distinct categories:
# all but the last become premises; the last is the conclusion of a "Specific"
# argument, while any other argument type concludes about the whole domain.
rows = []
for domain in MAIN_DOMAINS:
    dl = domain_categories[domain]

    for argtype, num_premises in NUM_PREMISES.items():
        is_specific = argtype == "Specific"
        for _ in range(NUM_ARGUMENTS * 100):
            *premises, last_category = random.sample(dl, num_premises + 1)
            conclusion = last_category if is_specific else domain
            rows.append((domain, argtype, tuple(premises), conclusion))

sample_df = pd.DataFrame(
    rows, columns=["domain", "argtype", "premises", "conclusion"]
).drop_duplicates()

# Calculate SCM
scm_scores = [
    scm(
        row["premises"],
        row["conclusion"],
        domain_categories[row["domain"]],
        similarity_map[row["domain"]],
        row["argtype"] == "Specific",
        alpha=0.5,
    )
    for _, row in tqdm.tqdm(sample_df.iterrows())
]
sample_df["scm"] = scm_scores

sample_df.to_csv(f"{PROCESSED_DATA}/experiment/argument_samples.csv")


22133it [00:01, 12712.04it/s]


In [27]:
# Stratified sampling
# Each premise set cannot appear more than four times
# Each category cannot appear more than fifteen times


def generate_experiment2_arguments(num_bins: int) -> pd.DataFrame:
    """Select NUM_ARGUMENTS arguments per (domain, argtype), stratified by SCM.

    Reads the module-level ``sample_df`` (columns: domain, argtype, premises,
    conclusion, scm). Within each (domain, argtype) group the SCM scores are
    cut into ``num_bins`` equal-width bins with ``pd.cut``. The bins are then
    visited repeatedly in a freshly shuffled order, and on each visit the first
    argument of the bin that satisfies all constraints is taken:

    * the (sorted premises, conclusion) combination was not taken before,
    * the premise set was taken fewer than ``MAX_PSET`` times,
    * none of its categories (domains excluded) was already used
      ``MAX_CATEGORY`` times.

    For the ("Mammals", "Specific") group the rows of ``osherson_specific_df``
    are appended. Each group is shuffled before being added to the result.

    Args:
        num_bins: number of SCM bins to stratify over.

    Returns:
        All selected arguments with an ``is_osherson`` column (0 for sampled
        rows) and a fresh 0..n-1 index.

    Raises:
        ValueError: if a full pass over all bins adds no argument, i.e. the
            candidate pool cannot supply NUM_ARGUMENTS arguments under the
            constraints (previously this looped forever).
    """
    MAX_PSET = 4
    MAX_CATEGORY = 15

    cols = ["domain", "argtype", "premises", "conclusion", "scm"]
    group_frames = []
    for (domain, argtype), group_df in sample_df.groupby(["domain", "argtype"]):
        pset_counts = defaultdict(int)
        category_counts = defaultdict(int)
        seen = set()
        rows = []

        # Split arguments into bins (assign() avoids writing into the groupby
        # slice, which triggers SettingWithCopyWarning)
        binned_df = group_df.assign(
            bin=pd.cut(group_df["scm"], num_bins, labels=False)
        )

        while len(rows) < NUM_ARGUMENTS:
            rows_before_pass = len(rows)
            binlabels = list(range(num_bins))
            random.shuffle(binlabels)

            for binlabel in binlabels:
                bin_df = binned_df[binned_df["bin"] == binlabel]

                for _, row in bin_df.iterrows():
                    premises, conclusion = row["premises"], row["conclusion"]
                    sorted_premises = tuple(sorted(premises))
                    categories = list(premises) + [conclusion]
                    category_limit_hit = any(
                        category_counts[c] >= MAX_CATEGORY and c not in DOMAINS
                        for c in categories
                    )
                    if (
                        (sorted_premises, conclusion) not in seen
                        and pset_counts[sorted_premises] < MAX_PSET
                        and not category_limit_hit
                    ):
                        seen.add((sorted_premises, conclusion))
                        pset_counts[sorted_premises] += 1
                        for c in categories:
                            category_counts[c] += 1
                        rows.append((domain, argtype, premises, conclusion, row["scm"]))
                        # Only one argument per bin per pass
                        break

                if len(rows) == NUM_ARGUMENTS:
                    break

            if len(rows) == rows_before_pass:
                # No bin could contribute anything: further passes would be
                # identical, so fail loudly instead of spinning forever.
                raise ValueError(
                    f"Only {len(rows)} of {NUM_ARGUMENTS} arguments could be "
                    f"selected for domain={domain!r}, argtype={argtype!r}"
                )

        selected_df = pd.DataFrame(rows, columns=cols)
        selected_df["is_osherson"] = [0] * selected_df.shape[0]

        # Sanity-check the usage caps on what was actually selected
        pset_usage = Counter(tuple(sorted(p)) for p in selected_df["premises"])
        assert all(c <= MAX_PSET for c in pset_usage.values())
        premise_usage = Counter(a for b in selected_df["premises"] for a in b)
        assert all(c <= MAX_CATEGORY for c in premise_usage.values())

        if domain == "Mammals":
            # if argtype == "General":
            #   selected_df = pd.concat([selected_df, osherson_general_df], axis=0)
            if argtype == "Specific":
                selected_df = pd.concat([selected_df, osherson_specific_df], axis=0)
        selected_df = selected_df.sample(frac=1)

        group_frames.append(selected_df)

    if not group_frames:
        return pd.DataFrame([], columns=cols)
    return pd.concat(group_frames, axis=0).reset_index(drop=True)


e2_df = generate_experiment2_arguments(NUM_BINS)

# Rows flagged is_osherson get their category names pluralised (inflect) and
# capitalised; all other rows are left untouched. The conclusion is only
# rewritten for "Specific" arguments.
e2_df["premises"] = [
    tuple(ie.plural(p).capitalize() for p in premises)
    if is_osherson == 1
    else premises
    for premises, is_osherson in zip(e2_df["premises"], e2_df["is_osherson"])
]
e2_df["conclusion"] = [
    ie.plural(conclusion).capitalize()
    if is_osherson == 1 and argtype == "Specific"
    else conclusion
    for conclusion, is_osherson, argtype in zip(
        e2_df["conclusion"], e2_df["is_osherson"], e2_df["argtype"]
    )
]

e2_sorted = e2_df.sort_values(by=["domain", "argtype", "scm"])
e2_sorted.to_csv(f"{PROCESSED_DATA}/experiment/experiment_2.csv")


In [28]:
e2_df = pd.read_csv(f"{PROCESSED_DATA}/experiment/experiment_2.csv", index_col=0)
# Premises are stored as their Python repr; eval() rebuilds the tuples.
e2_df["premises"] = e2_df["premises"].apply(eval)

for (domain, argtype, is_osherson), tdf in e2_df.groupby(
    ["domain", "argtype", "is_osherson"]
):
    # Distinct categories used by this group, ignoring domain-level conclusions
    categories_used = {a for b in tdf["premises"] for a in b}
    categories_used |= {c for c in tdf["conclusion"] if c not in DOMAINS}
    assert len(categories_used) <= 24

    # Expected group size depends on where the arguments came from
    if not is_osherson:
        expected_rows = NUM_ARGUMENTS
    elif argtype == "Specific":
        expected_rows = osherson_specific_df.shape[0]
    else:  # General
        expected_rows = osherson_general_df.shape[0]
    assert tdf.shape[0] == expected_rows


# Generate experiment JSONs

Here we divide our sets of argument pairs and arguments into batches of stimuli for every individual participant, packaging all of this up into JSON files that can be used to run the behavioural experiment.

In [29]:
e1_df = pd.read_csv(f"{PROCESSED_DATA}/experiment/experiment_1.csv", index_col=0)
e2_df = pd.read_csv(f"{PROCESSED_DATA}/experiment/experiment_2.csv", index_col=0)

# Premise collections are serialised as strings in the CSVs; rebuild them.
for premise_column in ("arg1_premises", "arg2_premises"):
    e1_df[premise_column] = e1_df[premise_column].apply(eval)
e2_df["premises"] = [list(eval(premises)) for premises in e2_df["premises"]]

# Fixed-seed shuffle of the Experiment 2 arguments
e2_df = e2_df.sample(frac=1, random_state=0).reset_index(drop=True)


def generate_mturk_index_file(out, filename):
    """Write a CSV that maps each entry's position in ``out`` to its "tid".

    Args:
        out: sequence of dicts, each holding a "tid" key.
        filename: destination path of the CSV (columns: index, tid).
    """
    records = [(position, entry["tid"]) for position, entry in enumerate(out)]
    index_frame = pd.DataFrame(records, columns=["index", "tid"])
    index_frame.to_csv(filename)


## Generate property file

In [30]:
PROPERTY_ID = "property_p"


class PropertyGenerator(ABC):
    """Interface for objects that describe the property used in the stimuli."""

    @abstractmethod
    def property_id(self):
        """Return the identifier under which the property is referenced."""

    @abstractmethod
    def generate_property(self):
        """Return the property text that is inserted into the stimuli."""


class PropertyPGenerator(PropertyGenerator):
    """Generator for the single blank property, "property P"."""

    def property_id(self):
        """Return the fixed identifier of property P."""
        return "property_p"

    def generate_property(self):
        """Return the fixed display text of property P."""
        return "property P"


In [31]:
property_generator = PropertyPGenerator()

# Render the property definition consumed by the experiment front end.
# Doubled braces are literal braces inside the f-string.
# The negative template previously rendered as `'$c don't property P'`: the
# verb was missing and the apostrophe sat unescaped inside a single-quoted
# string. It now mirrors the positive template and is double-quoted.
# NOTE(review): the file carries a .json extension but its content is not
# strict JSON — confirm what the consumer expects.
out = f"""
PROPERTY_SET = {{
  {property_generator.property_id()}: {{
    id: {property_generator.property_id()},
    positive: '$c have {property_generator.generate_property()}',
    negative: "$c don't have {property_generator.generate_property()}"
  }}
}}
"""

with open(f"{PROCESSED_DATA}/experiment_json/jsons/csr_properties.json", 'w') as file:
    file.write(out)

## Experiment 1

In [32]:
SIZE = 2  # Number of pairs per phenomena/domain that each participant should see
NUM_PAIRS = 24  # Total number of pairs per phenomena/domain

# Each split takes a consecutive window of SIZE pairs from every
# (phenomenon, domain) group. The group is shuffled with the same fixed seed on
# every pass, so the windows of different splits never overlap.
num_splits = math.ceil(NUM_PAIRS / SIZE)
rows = []
for uid in range(num_splits):
    window_start = uid * SIZE
    indices = []
    for _, pdf in e1_df.groupby(["phenomenon_number", "domain"]):
        shuffled_group = pdf.sample(frac=1, random_state=0)
        window = shuffled_group.iloc[window_start : window_start + SIZE]
        indices.extend(window.index.tolist())
    random.shuffle(indices)
    rows.append((uid, indices))

index_df = pd.DataFrame(rows, columns=["participant_id", "indices"])
index_df.to_csv(f"{PROCESSED_DATA}/experiment_json/experiment_1_participant_splits.csv")

# 2 pairs per phenomena (11 total) and domain (3 total) combo
assert all(len(i) == SIZE * 11 * 3 for i in index_df["indices"])

# Flattened list of every pair index across all splits
all_indices = [a for b in index_df["indices"].tolist() for a in b]

# No overlaps between splits
assert len(set(all_indices)) == len(all_indices)

# Final splits include all pairs
assert sorted(all_indices) == list(range(e1_df.shape[0]))

# All phenomena/domain pairs are present in the right amount in each split
# (this previously indexed .iloc[0], so only the first split was ever checked)
assert all(
    all(
        vc == SIZE
        for vc in e1_df.iloc[i].value_counts(["domain", "phenomenon_number"])
    )
    for i in index_df["indices"]
)

# Control pairs: (shared premise, conclusion of argument 1, conclusion of
# argument 2). The dict value of EXPERIMENT_1_CONTROLS decides whether the two
# conclusions trade places.
test_cases = []
for control_pair, flipped in EXPERIMENT_1_CONTROLS.items():
    shared_premise = control_pair[0][0][0]
    first_conclusion, second_conclusion = control_pair[0][1], control_pair[1][1]
    if flipped:
        first_conclusion, second_conclusion = second_conclusion, first_conclusion
    test_cases.append((shared_premise, first_conclusion, second_conclusion))

index_df = pd.read_csv(
    f"{PROCESSED_DATA}/experiment_json/experiment_1_participant_splits.csv", index_col=0
)
index_df["indices"] = [eval(indices) for indices in index_df["indices"]]

# Every split is shown to NUM_PARTICIPANTS_PER_TRIAL participants: stack that
# many copies of the split table and renumber the participants.
split_copies = [index_df]
for _ in range(NUM_PARTICIPANTS_PER_TRIAL - 1):
    split_copies.append(index_df)
participant_df = pd.concat(split_copies, axis=0).reset_index(drop=True)
participant_df["participant_id"] = list(range(participant_df.shape[0]))

assert participant_df.shape[0] == NUM_PARTICIPANTS_PER_TRIAL * NUM_PAIRS / SIZE
assert len(set([tuple(i) for i in participant_df["indices"]])) == index_df.shape[0]

# Build one trial configuration per participant: every pair of the
# participant's split (shuffled with the participant's row number as seed),
# with the control pairs from test_cases interleaved at regular intervals.
out = []
for j, irow in participant_df.iterrows():
    uid = irow["participant_id"]
    user_trials = {"tid": f"tid_experiment1_{uid}"}
    trial_config = []
    # c: number of control pairs inserted so far; i: running trial id (control
    # trials advance it as well)
    c, i = 0, 0
    # A control pair is inserted whenever the running id reaches a multiple of t
    t = len(irow["indices"]) // 4
    for _, row in (
        e1_df.iloc[irow["indices"]]
        .sample(frac=1, random_state=j)
        .reset_index(drop=True)
        .iterrows()
    ):
        # Regular trial: the two arguments of the pair, in stored order
        user_trial = {"id": f"tc{i}"}
        arguments = []
        arguments.append(
            {
                "property": property_generator.property_id(),
                "premises": [p for p in row["arg1_premises"]],
                "conclusion": row["arg1_conclusion"],
            }
        )
        arguments.append(
            {
                "property": property_generator.property_id(),
                "premises": [p for p in row["arg2_premises"]],
                "conclusion": row["arg2_conclusion"],
            }
        )
        user_trial["arguments"] = arguments
        trial_config.append(user_trial)
        i += 1

        if i % t == 0 and c < len(test_cases):
            # Add test case
            # Both arguments share the same single premise and differ only in
            # their conclusion
            user_trial = {"id": f"tc{i}"}
            arguments = []
            arguments.append(
                {
                    "property": property_generator.property_id(),
                    "premises": [test_cases[c][0]],
                    "conclusion": test_cases[c][1],
                }
            )
            arguments.append(
                {
                    "property": property_generator.property_id(),
                    "premises": [test_cases[c][0]],
                    "conclusion": test_cases[c][2],
                }
            )
            user_trial["arguments"] = arguments
            trial_config.append(user_trial)
            c += 1
            i += 1

    # Every regular pair plus every control pair must have been emitted
    assert len(trial_config) == 11 * SIZE * 3 + len(test_cases)

    user_trials["cbi_compare"] = {"trialConfig": trial_config}
    out.append(user_trials)


# Persist the participant configurations and the position -> tid lookup table
with open(f"{PROCESSED_DATA}/experiment_json/experiment_1.json", "w") as file:
    file.write(json.dumps(out, indent=2))
generate_mturk_index_file(out, f"{PROCESSED_DATA}/experiment_json/experiment_1.csv")


## Experiment 2

### Generate trial arguments

In [33]:
# Training trials keyed by trial id. Each trial holds one argument about
# "property_p"; the rows below list (trial id, intro label, premises,
# conclusion) in the order the trials are stored.
PREDEFINED_TRIALS = {
    trial_id: {
        "id": trial_id,
        "intro": intro,
        "arguments": [
            {
                "property": "property_p",
                "premises": premises,
                "conclusion": conclusion,
            }
        ],
    }
    for trial_id, intro, premises, conclusion in (
        ("training_1_0", "intro 1", ["Papayas"], "All fruits"),
        ("training_1_1", "outro 1", ["Apples"], "All fruits"),
        ("training_2_0", "intro 2", ["Lemons", "Limes"], "All fruits"),
        ("training_2_1", "outro 2", ["Bananas", "Watermelons"], "All fruits"),
        ("training_3_0", "intro 1", ["Blackberries"], "Raspberries"),
        ("training_3_1", "outro 1", ["Pineapples"], "Raspberries"),
        ("training_4_0", "intro 2", ["Lemons", "Limes"], "Cherries"),
        ("training_4_1", "outro 2", ["Bananas", "Watermelons"], "Cherries"),
    )
}

# Write the training trials to disk as JSON
predefined_trials_path = (
    f"{PROCESSED_DATA}/experiment_json/jsons/experiment_2_predefined_trials.json"
)
save_map(PREDEFINED_TRIALS, predefined_trials_path)


### Generate splits

In [38]:
# Split arguments into quantile based bins based on SCM
# (groups flagged is_osherson are binned on their human rating instead)
NUM_BINS = 10
binned_groups = []
for _, tdf in e2_df.groupby(["domain", "argtype", "is_osherson"]):
    bin_source = "human_rating" if tdf["is_osherson"].iloc[0] == 1 else "scm"
    # assign() returns a new frame instead of writing into the groupby slice,
    # which avoids SettingWithCopyWarning
    binned_groups.append(
        tdf.assign(bin=pd.qcut(tdf[bin_source], NUM_BINS, labels=False))
    )

# Concatenate once instead of growing a frame inside the loop
if binned_groups:
    df = pd.concat(binned_groups, axis=0)
else:
    df = pd.DataFrame([], columns=e2_df.columns.tolist())

# Turn bins into argument batches
# Batch i takes the i-th argument of every bin, so each batch spans all bins.
batches = {}  # {(domain, argtype, is_osherson, batchnum): [list of arguments]}
for ado, ddf in df.groupby(["argtype", "domain", "is_osherson"]):
    argtype, domain, is_osherson = ado
    max_batch_size = ddf["bin"].value_counts().max()

    for i in range(max_batch_size):
        batch = []
        for b, bdf in ddf.groupby("bin"):
            try:
                batch.append(bdf.iloc[i][["premises", "conclusion"]].values.tolist())
            except IndexError:
                # This bin holds fewer than i + 1 arguments: pad the batch with
                # a random argument from the same bin. Only the out-of-bounds
                # case is caught so that unrelated errors are not swallowed.
                batch.append(
                    bdf.sample(1).iloc[0][["premises", "conclusion"]].values.tolist()
                )
        batches[(domain, argtype, is_osherson, i)] = batch

# Flatten the batches into one row per (batch, argument)
rows = [
    (batch_num, domain, argtype, is_osherson, argument)
    for (domain, argtype, is_osherson, batch_num), arguments in batches.items()
    for argument in arguments
]
batch_df = pd.DataFrame(
    rows, columns=["batch_num", "domain", "argtype", "is_osherson", "argument"]
)
# String form of each argument, used for uniqueness checks
batch_df["argstr"] = batch_df["argument"].apply(str)

# Every batch holds 10 distinct arguments
assert all(
    tdf["argstr"].nunique() == 10
    for _, tdf in batch_df.groupby(["batch_num", "domain", "argtype", "is_osherson"])
)
# Every non-Osherson (domain, argtype) group covers 100 distinct arguments
non_osherson_batches = batch_df[batch_df["is_osherson"] == 0]
assert all(
    tdf["argstr"].nunique() == 100
    for _, tdf in non_osherson_batches.groupby(["domain", "argtype"])
)


# Generate single premise arguments
def generate_single_prem_args(batch):
    """
    Break multi-premise arguments down into unique single-premise arguments.

    batch: [[[p1,p2,p3], c], ...]

    Returns a list of distinct (premise, conclusion) tuples in order of first
    appearance. De-duplicating through a dict instead of a set keeps the order
    independent of string hash randomisation, so seeded shuffles of the result
    are reproducible across kernel restarts.
    """
    return list(dict.fromkeys((px, c) for p, c in batch for px in p))


# Pool of single premise arguments per group, in shuffled order
all_single_prem_args = {}  # {(domain, argtype, is_osherson): [list of arguments]}
for dao, tdf in batch_df.groupby(["domain", "argtype", "is_osherson"]):
    pooled_args = generate_single_prem_args(tdf["argument"].tolist())
    random.shuffle(pooled_args)
    all_single_prem_args[dao] = pooled_args

# Add single premise arguments into batch_df
# Lists that feed random.sample / random.shuffle are put in sorted order first:
# their incoming order may stem from set iteration, which changes between
# kernel sessions and would make the seeded draws irreproducible.
augmented_frames = []
for bdao, bdf in batch_df.groupby(["batch_num", "domain", "argtype", "is_osherson"]):
    batch_num, domain, argtype, is_osherson = bdao

    # Generate single premise arguments corresponding to this batch's multi premise arguments
    single_prem_args = sorted(generate_single_prem_args(bdf["argument"].tolist()))
    num_single_prem_args = len(single_prem_args)

    # Number of single premise arguments that we want
    n = NUM_SINGLE_PREMISE_ARGS_PER_BATCH
    assert num_single_prem_args <= n

    if num_single_prem_args < n:
        # Need to pad our single premise args with single premise args from other batches

        if not is_osherson:
            # Randomly sample single premise arguments from other batches
            padding_pool = sorted(
                set(all_single_prem_args[(domain, argtype, is_osherson)]).difference(
                    single_prem_args
                )
            )
            single_prem_args += random.sample(
                padding_pool, k=n - len(single_prem_args)
            )

            assert len(single_prem_args) == n

        elif argtype == "Specific":
            # Use the whole pool for this group; work on a sorted copy so the
            # shuffle below does not reorder the shared list in place
            single_prem_args = sorted(all_single_prem_args[(domain, argtype, 1)])

    random.shuffle(single_prem_args)
    tdf = pd.DataFrame(
        [
            (batch_num, domain, argtype, is_osherson, [[spa[0]], spa[1]])
            for spa in single_prem_args
        ],
        columns=["batch_num", "domain", "argtype", "is_osherson", "argument"],
    )
    tdf["argstr"] = tdf["argument"].apply(str)
    augmented_frames.extend([bdf, tdf])

# Concatenate once instead of growing a frame inside the loop
if augmented_frames:
    df = pd.concat(augmented_frames, axis=0)
else:
    df = pd.DataFrame([], columns=batch_df.columns)

# Every non-Osherson batch holds 34 rows (multi premise plus single premise)
non_osherson_rows = df[df["is_osherson"] == 0]
assert all(
    bdf.shape[0] == 34
    for _, bdf in non_osherson_rows.groupby(
        ["batch_num", "domain", "argtype", "is_osherson"]
    )
)

# An argument is [premises, conclusion]; flag those with exactly one premise
df["is_single_premise"] = [len(argument[0]) == 1 for argument in df["argument"]]

# Multi premise, non-Osherson arguments: 100 distinct ones per (domain, argtype)
multi_premise_rows = df[~df["is_single_premise"]]
assert all(
    tdf.loc[tdf["is_osherson"] == 0, "argstr"].nunique() == 100
    for _, tdf in multi_premise_rows.groupby(["domain", "argtype"])
)

# Osherson "Specific" multi premise arguments: all of osherson_specific_df
osherson_rows = df["is_osherson"] == 1
osherson_specific_multi = df[
    osherson_rows & (df["argtype"] == "Specific") & (~df["is_single_premise"])
]
assert osherson_specific_multi["argstr"].nunique() == osherson_specific_df.shape[0]

# No Osherson "General" arguments are present
osherson_general = df[osherson_rows & (df["argtype"] == "General")]
assert osherson_general["argstr"].nunique() == 0

# Persist the splits, then read them back so the steps below work from the
# on-disk version.
split_columns = [
    "batch_num",
    "domain",
    "argtype",
    "is_osherson",
    "is_single_premise",
    "argument",
]
df[split_columns].to_csv(
    f"{PROCESSED_DATA}/experiment_json/experiment_2_participant_splits.csv"
)

batch_df = pd.read_csv(
    f"{PROCESSED_DATA}/experiment_json/experiment_2_participant_splits.csv",
    index_col=0,
)
# Arguments are stored as their Python repr; eval() rebuilds the nested lists
batch_df["argument"] = batch_df["argument"].apply(eval)


def generate_argument_trial(tid, premises, conclusion, breakAfter=False):
    """Build the JSON-serialisable config for one argument-rating trial.

    Args:
        tid: Trial counter; it is embedded in the trial id as ``tc<tid>``.
        premises: Premises of the argument (stored as given, not copied).
        conclusion: Conclusion of the argument.
        breakAfter: Value stored under the ``"breakAfter"`` key.

    Returns:
        A dict with the keys ``"id"``, ``"breakAfter"`` and ``"arguments"``;
        ``"arguments"`` is a one-element list describing the argument.
    """
    argument = dict(
        property="property_p",
        premises=premises,
        conclusion=conclusion,
    )
    return dict(id=f"tc{tid}", breakAfter=breakAfter, arguments=[argument])


# Show a 'test' (control) trial every TEST_SPACING trials, at most NUM_TESTS times
TEST_SPACING = 8
NUM_TESTS = 4

# One tuple per generated trial (regular and control), collected across both groups.
rows = []
# Group by the bare column name, not a one-element list: since pandas 2.0 a list
# of length one yields 1-tuple keys such as ``(0,)``. Those are always truthy, so
# every group would be labelled "osherson" and tuples would end up in `rows`.
for is_osherson, o_df in batch_df.groupby("is_osherson"):
    # Depends only on the outer key, so bind it here; it is also needed for the
    # output file names after the inner loops have finished.
    experiment_grouping = "osherson" if is_osherson else "dedeyne"
    output = []

    pid = 0

    # Half of the participants of every batch see the single-premise section
    # first, the other half see the multi-premise section first.
    for single_premise_first in (True, False):
        for bda, batch in o_df.groupby(["batch_num", "domain", "argtype"]):
            batch_num, domain, argtype = bda

            for _ in range(NUM_PARTICIPANTS_PER_TRIAL // 2):
                multi_premise_args = batch[~batch["is_single_premise"]][
                    "argument"
                ].tolist()
                single_premise_args = batch[batch["is_single_premise"]][
                    "argument"
                ].tolist()
                # Fresh random order per participant (multi first, then single,
                # so the sequence of random draws stays as before).
                random.shuffle(multi_premise_args)
                random.shuffle(single_premise_args)

                p_trial = {
                    "tid": f"tid_experiment2_{experiment_grouping}_participant{pid}",
                    "cbi_rate": {"trialConfig": []},
                }

                if single_premise_first:
                    arglist = [single_premise_args, multi_premise_args]
                else:
                    arglist = [multi_premise_args, single_premise_args]

                # `tid` numbers the trials of this participant; `t` counts the
                # single-premise control trials shown so far. Both run across
                # the two sections.
                tid, t = 0, 0
                is_single_premise = single_premise_first
                for args in arglist:
                    # Place training trials first
                    pt = "single_premise" if is_single_premise else "multi_premise"
                    p_trial["cbi_rate"]["trialConfig"] += EXPERIMENT_2_TRAINING_TRIALS[
                        argtype
                    ][pt]

                    for arg in args:
                        premises = [p.capitalize() for p in arg[0]]
                        if argtype == "General":
                            conclusion = f"All {arg[1].lower()}"
                        else:
                            conclusion = arg[1].capitalize()

                        a_trial = generate_argument_trial(tid, premises, conclusion)
                        p_trial["cbi_rate"]["trialConfig"].append(a_trial)
                        rows.append(
                            (
                                pid,
                                tid,
                                domain,
                                argtype,
                                is_osherson,
                                batch_num,
                                premises,
                                conclusion,
                            )
                        )
                        tid += 1

                        if tid % TEST_SPACING == 0 and t < NUM_TESTS:
                            if is_single_premise:
                                # One control per slot: the t-th entry is a
                                # (premise, conclusion) pair.
                                test_arg = EXPERIMENT_2_SINGLE_PREMISE_CONTROLS[
                                    argtype
                                ][domain]
                                control_premises = [test_arg[t][0]]
                                control_conclusion = test_arg[t][1]
                                t += 1
                            else:
                                # NOTE(review): `t` is not advanced here, so the
                                # same multi-premise control repeats at every
                                # spacing point, and none is shown when the
                                # single-premise section already used up
                                # NUM_TESTS. Kept as in the original - confirm
                                # this is intended.
                                test_arg = EXPERIMENT_2_MULTI_PREMISE_CONTROLS[argtype][
                                    domain
                                ]
                                control_premises = test_arg[0]
                                control_conclusion = test_arg[1]

                            a_trial = generate_argument_trial(
                                tid, control_premises, control_conclusion
                            )
                            rows.append(
                                (
                                    pid,
                                    tid,
                                    domain,
                                    argtype,
                                    is_osherson,
                                    batch_num,
                                    control_premises,
                                    control_conclusion,
                                )
                            )
                            p_trial["cbi_rate"]["trialConfig"].append(a_trial)
                            tid += 1

                    is_single_premise = not is_single_premise

                # Set breakAfter on the last trial before the section switch
                # (single -> multi premise or the reverse).
                arg_is_single = single_premise_first
                prev_trial = None
                for trial in p_trial["cbi_rate"]["trialConfig"]:
                    # Only dict entries are inspected; anything else is skipped.
                    if type(trial) == dict:
                        current_arg_is_single = (
                            len(trial["arguments"][0]["premises"]) == 1
                        )
                        if current_arg_is_single != arg_is_single:
                            # NOTE(review): assumes a dict trial of the first
                            # section precedes the switch; otherwise
                            # `prev_trial` is still None here.
                            prev_trial["breakAfter"] = True
                            break

                        arg_is_single = current_arg_is_single
                        prev_trial = trial

                output.append(p_trial)
                pid += 1

    with open(
        f"{PROCESSED_DATA}/experiment_json/jsons/{experiment_grouping}_experiment_2.json",
        "w",
    ) as file:
        file.write(json.dumps(output, indent=2))
    generate_mturk_index_file(
        output,
        f"{PROCESSED_DATA}/experiment_json/jsons/{experiment_grouping}_experiment_2_index.csv",
    )

# Flat table of every generated Experiment 2 trial, one row per trial.
df = pd.DataFrame(
    rows,
    columns=[
        "pid",
        "tid",
        "domain",
        "conclusion_type",
        "is_osherson",
        "batch_number",
        "premises",
        "conclusion",
    ],
)
df.to_csv(f"{PROCESSED_DATA}/experiment_json/experiment_2.csv")

# Reload the saved table and keep only the non-Osherson trials.
df = pd.read_csv(f"{PROCESSED_DATA}/experiment_json/experiment_2.csv", index_col=0)
df = df.loc[df["is_osherson"].eq(0)]

# Write one index file per (domain, conclusion type) listing its participant ids.
for (domain, conclusion_type), ddf in df.groupby(["domain", "conclusion_type"]):
    participant_ids = [pid for pid, _ in ddf.groupby("pid")]
    tids = [f"tid_experiment2_dedeyne_participant{pid}" for pid in participant_ids]
    pd.DataFrame({"tid": tids}).to_csv(
        f"{PROCESSED_DATA}/experiment_json/jsons/dedeyne_experiment_2_{domain.lower()}_{conclusion_type.lower()}_index.csv"
    )


# Clean Experiment 1 human data

In [39]:
def pluralise_osherson_conclusion(conclusion: str):
    """Format an Osherson conclusion the way the experiment displays it.

    The category names "Mammals", "Birds" and "Animals" become
    ``"All <lower-cased name>"``; any other conclusion is pluralised with the
    module-level inflect engine ``ie`` and capitalised.
    """
    general_categories = ("Mammals", "Birds", "Animals")
    if conclusion not in general_categories:
        return ie.plural(conclusion).capitalize()
    return f"All {conclusion.lower()}"


# Osherson pairs written as (arg2, arg1), each argument being
# ("Premise:Premise:...", conclusion) in display form. `arg2_is_stronger` tests
# membership of a rated pair in this set.
osherson_arg2_stronger = {
    (
        (
            ":".join(ie.plural(noun).capitalize() for noun in osh_row["arg2_premises"]),
            pluralise_osherson_conclusion(osh_row["arg2_conclusion"]),
        ),
        (
            ":".join(ie.plural(noun).capitalize() for noun in osh_row["arg1_premises"]),
            pluralise_osherson_conclusion(osh_row["arg1_conclusion"]),
        ),
    )
    for _, osh_row in osherson_df.iterrows()
}

# Categories of the three domains listed here; `arg2_is_stronger` uses them to
# tell a specific conclusion from a general one for the inclusion fallacy.
included_categories = {
    *domain_categories["Vehicles"],
    *domain_categories["Mammals"],
    *domain_categories["Birds"],
}
# Categories of the other three domains; `classify_pair` treats a premise from
# this set as the marker of a nonmonotonicity pair.
nonmon_categories = {
    *domain_categories["Insects"],
    *domain_categories["Reptiles"],
    *domain_categories["Tools"],
}


def get_domain(premise: str, conclusion: str):
    """Work out which domain an argument belongs to.

    A conclusion containing ``"All "`` names the domain itself (``"All birds"``
    gives ``"Birds"``). Otherwise the first domain in ``domain_categories``
    whose category list contains the conclusion or the premise is returned.
    When nothing matches, ``"Mammals"`` is returned (the original comment
    attributes this case to the Osherson stimuli).
    """
    if "All " in conclusion:
        return conclusion.replace("All ", "").capitalize()

    return next(
        (
            domain_name
            for domain_name, members in domain_categories.items()
            if conclusion in members or premise in members
        ),
        # is osherson
        "Mammals",
    )


def remove_participant_e1(df: pd.DataFrame):
    """Decide whether a participant should be excluded from Experiment 1.

    Args:
        df: The rows of one participant; needs a boolean ``is_control`` column
            and a numeric ``human_rating`` column.

    Returns:
        True when more than one control trial was rated above 2.5.
    """
    control_ratings = df.loc[df["is_control"], "human_rating"]
    failed_controls = sum(1 for rating in control_ratings if rating > 2.5)
    return failed_controls > 1


def classify_pair(pair):
    """Name the phenomenon an argument pair belongs to.

    Args:
        pair: ``((premises1, conclusion1), (premises2, conclusion2))``. The
            premises must be tuples for the control lookup and the hard-coded
            Osherson pairs to match.

    Returns:
        One of "Similarity", "Specificity", "Asymmetry", "Inclusion Fallacy",
        "Typicality", or "Nonmonotonicity", "Monotonicity" or "Diversity"
        followed by "(General)" or "(Specific)". The checks run in a fixed
        order and the first match wins.
    """
    (first_premises, first_conclusion), (second_premises, second_conclusion) = pair

    # Figure out general or specific
    general_conclusions = ("All mammals", "All birds", "All vehicles")
    first_is_general = first_conclusion in general_conclusions
    second_is_general = second_conclusion in general_conclusions
    scope = "(General)" if first_is_general or second_is_general else "(Specific)"

    # Control pairs are always reported as similarity trials.
    if pair in EXPERIMENT_1_CONTROLS:
        return "Similarity"

    # Special cases for osherson examples (matched in either presentation order)
    crows_peacocks = (("Crows", "Peacocks"), "All birds")
    crows_peacocks_rabbits = (("Crows", "Peacocks", "Rabbits"), "All birds")
    if pair in (
        (crows_peacocks, crows_peacocks_rabbits),
        (crows_peacocks_rabbits, crows_peacocks),
    ):
        return "Nonmonotonicity (General)"

    robins_to_geese = (("Robins", "Bluejays"), "Geese")
    robins_to_sparrows = (("Robins", "Bluejays"), "Sparrows")
    if pair in (
        (robins_to_geese, robins_to_sparrows),
        (robins_to_sparrows, robins_to_geese),
    ):
        return "Similarity"

    first_premises, second_premises = list(first_premises), list(second_premises)
    every_premise = first_premises + second_premises

    # Specificity? (a conclusion mentions "things" or "animals")
    if any(
        marker in stated
        for marker in ("things", "animals")
        for stated in (first_conclusion, second_conclusion)
    ):
        return "Specificity"

    # nonmon? (some premise comes from the nonmonotonicity categories)
    if any(premise in nonmon_categories for premise in every_premise):
        return f"Nonmonotonicity {scope}"

    # monotonicity? (the arguments differ in their number of premises)
    n_first, n_second = len(first_premises), len(second_premises)
    if n_first != n_second:
        return f"Monotonicity {scope}"

    # asymmetry/inclusion fallacy/similarity/typicality?
    if n_first == 1 and n_second == 1:
        distinct_terms = {*every_premise, first_conclusion, second_conclusion}
        if len(distinct_terms) == 2:
            # The same two categories, swapped between premise and conclusion.
            return "Asymmetry"
        if first_is_general != second_is_general:
            return "Inclusion Fallacy"
        if first_is_general or second_is_general:
            return "Typicality"
        return "Similarity"

    # Diversity
    return f"Diversity {scope}"


def arg2_is_stronger(pair, phenomenon_name, domain, is_osherson, is_control):
    """Tell whether the second argument of a pair is the expected stronger one.

    Args:
        pair: ``((premises1, conclusion1), (premises2, conclusion2))`` with the
            premises given as sequences of category names.
        phenomenon_name: Label as produced by ``classify_pair``.
        domain: Key into ``similarity_map`` and ``typicality_rank_map``.
        is_osherson: Truthy for pairs from the Osherson set; these are looked
            up in ``osherson_arg2_stronger``.
        is_control: Truthy for control pairs; these are looked up in
            ``EXPERIMENT_1_CONTROLS`` in either presentation order.

    Returns:
        A boolean, or ``None`` when a regular pair carries a phenomenon name
        that none of the rules below handles.
    """
    (a1_p, a1_c), (a2_p, a2_c) = pair

    if is_osherson:
        joined = ((":".join(a1_p), a1_c), (":".join(a2_p), a2_c))
        return joined in osherson_arg2_stronger

    if is_control:
        # Controls are keyed by one-premise tuples; a pair stored in the
        # opposite order carries the opposite answer.
        keyed = (((a1_p[0],), a1_c), ((a2_p[0],), a2_c))
        if keyed in EXPERIMENT_1_CONTROLS:
            return EXPERIMENT_1_CONTROLS[keyed]
        return not EXPERIMENT_1_CONTROLS[(keyed[1], keyed[0])]

    if phenomenon_name == "Similarity":
        # Stronger argument: premise more similar to its conclusion.
        sims = similarity_map[domain]
        return sims[a1_p[0]][a1_c] < sims[a2_p[0]][a2_c]

    if phenomenon_name in ("Typicality", "Asymmetry"):
        # Arg 2 is stronger when its premise sits earlier in the domain ranking
        # (presumably earlier means more typical - confirm against the map).
        ranking = typicality_rank_map[domain]
        return ranking.index(a1_p[0]) > ranking.index(a2_p[0])

    if phenomenon_name == "Specificity":
        # Arg 1 has the broader conclusion, so arg 2 is the stronger one.
        return "animal" in a1_c or "thing" in a1_c

    # The capitalised "Monotonicity" is not a substring of "Nonmonotonicity",
    # so these two checks cannot shadow each other.
    if "Monotonicity" in phenomenon_name:
        return len(a1_p) < len(a2_p)
    if "Nonmonotonicity" in phenomenon_name:
        return len(a1_p) > len(a2_p)

    if "Diversity" in phenomenon_name:
        # Arg 2 is stronger when arg 1's two premises are more alike.
        sims = similarity_map[domain]
        return sims[a1_p[0]][a1_p[1]] > sims[a2_p[0]][a2_p[1]]

    if phenomenon_name == "Inclusion Fallacy":
        # Arg 2 is stronger when its conclusion is not one of the listed categories.
        return a2_c not in included_categories

    # Unhandled phenomenon name: no verdict.
    return None


In [40]:
# Raw Experiment 1 ratings, one row per rated argument pair.
human_e1 = pd.read_csv(f"{RAW_DATA}/experiment_1_ratings.csv")

# Order-independent key for every Osherson pair: the two arguments, each written
# as ("Premise:Premise:...", conclusion) in display form, sorted into a tuple.
osherson_df["trial_args"] = [
    tuple(
        sorted(
            (
                (
                    ":".join(ie.plural(noun).capitalize() for noun in premises_a),
                    pluralise_osherson_conclusion(conclusion_a),
                ),
                (
                    ":".join(ie.plural(noun).capitalize() for noun in premises_b),
                    pluralise_osherson_conclusion(conclusion_b),
                ),
            )
        )
    )
    for premises_a, conclusion_a, premises_b, conclusion_b in zip(
        osherson_df["arg1_premises"],
        osherson_df["arg1_conclusion"],
        osherson_df["arg2_premises"],
        osherson_df["arg2_conclusion"],
    )
]
osherson_trials = set(osherson_df["trial_args"])

# The same key for the human data; here the premises are still the raw
# colon-separated strings from the ratings file.
human_e1["trial_args"] = [
    tuple(sorted(((premises_a, conclusion_a), (premises_b, conclusion_b))))
    for premises_a, conclusion_a, premises_b, conclusion_b in zip(
        human_e1["premises0"],
        human_e1["conclusion0"],
        human_e1["premises1"],
        human_e1["conclusion1"],
    )
]
# --- Derived columns for the Experiment 1 ratings -----------------------------
# Order matters: `trial_args` was built from the raw colon-separated premise
# strings, while everything from `is_control` onwards needs premise lists.

# Flag trials that come from the Osherson stimulus set.
human_e1["is_osherson"] = human_e1["trial_args"].apply(lambda x: x in osherson_trials)
human_e1["human_rating"] = human_e1["rating"]
# Trial ids look like "tc<N>"; sort numerically within each `tid`.
human_e1["trialId_sort"] = human_e1["trialId"].apply(lambda x: int(x.replace("tc", "")))
human_e1 = human_e1.sort_values(by=["tid", "trialId_sort"], ascending=True).reset_index(
    drop=True
)
# Split the colon-separated premise strings into lists.
human_e1["premises0"] = human_e1["premises0"].apply(lambda x: x.split(":"))
human_e1["premises1"] = human_e1["premises1"].apply(lambda x: x.split(":"))
# (The former self-assignments of "conclusion0" / "conclusion1" were no-ops and
# have been dropped.)

# A pair is a control if it is listed in EXPERIMENT_1_CONTROLS in either order.
human_e1["is_control"] = [
    (
        (tuple(row["premises0"]), row["conclusion0"]),
        (tuple(row["premises1"]), row["conclusion1"]),
    )
    in EXPERIMENT_1_CONTROLS
    or (
        (tuple(row["premises1"]), row["conclusion1"]),
        (tuple(row["premises0"]), row["conclusion0"]),
    )
    in EXPERIMENT_1_CONTROLS
    for _, row in human_e1.iterrows()
]
human_e1["phenomenon"] = [
    classify_pair(
        (
            (tuple(row["premises0"]), row["conclusion0"]),
            (tuple(row["premises1"]), row["conclusion1"]),
        )
    )
    for _, row in human_e1.iterrows()
]
# Take the domain from the second argument first...
human_e1["domain"] = [
    get_domain(row["premises1"][0], row["conclusion1"])
    for _, row in human_e1.iterrows()
]
# ...and fall back to the first argument when that gave "Animals" or "Things",
# which are not domains of their own.
human_e1["domain"] = [
    get_domain(row["premises0"][0], row["conclusion0"])
    if row["domain"] in ("Animals", "Things")
    else row["domain"]
    for _, row in human_e1.iterrows()
]
human_e1["is_arg2_stronger"] = [
    arg2_is_stronger(
        (
            (row["premises0"], row["conclusion0"]),
            (row["premises1"], row["conclusion1"]),
        ),
        row["phenomenon"],
        row["domain"],
        row["is_osherson"],
        row["is_control"],
    )
    for _, row in human_e1.iterrows()
]
human_e1["phenomenon_type"] = human_e1["phenomenon"].map(lambda x: PHENOMENON_TYPE[x])

# Expose the raw columns under the names used by the master file.
human_e1["arg1_premises"] = human_e1["premises0"]
human_e1["arg1_conclusion"] = human_e1["conclusion0"]
human_e1["arg2_premises"] = human_e1["premises1"]
human_e1["arg2_conclusion"] = human_e1["conclusion1"]
human_e1["trial_id"] = human_e1["trialId"]

# find participants who didn't pass the control trials
remove_p = {
    uid: remove_participant_e1(uid_df) for uid, uid_df in human_e1.groupby("uid")
}
human_e1["is_removed_participant"] = human_e1["uid"].apply(lambda uid: remove_p[uid])
print(
    f"Number of removed participants: {len(human_e1[human_e1['is_removed_participant']]['uid'].unique())}"
)

# Canonical id of an argument pair: the string form of
# [premises..., conclusion, premises..., conclusion], with arg2 placed first
# when it is the stronger argument, so both presentation orders share one id.
human_e1["argpair_id"] = [
    str(
        row["arg2_premises"]
        + [row["arg2_conclusion"]]
        + row["arg1_premises"]
        + [row["arg1_conclusion"]]
        if row["is_arg2_stronger"]
        else row["arg1_premises"]
        + [row["arg1_conclusion"]]
        + row["arg2_premises"]
        + [row["arg2_conclusion"]]
    )
    for _, row in human_e1.iterrows()
]

# Keep only the columns of the master file, in their final order.
human_e1 = human_e1.loc[
    :,
    [
        "uid",
        "tid",
        "trial_id",
        "argpair_id",
        "phenomenon",
        "phenomenon_type",
        "domain",
        "arg1_premises",
        "arg1_conclusion",
        "arg2_premises",
        "arg2_conclusion",
        "human_rating",
        "is_arg2_stronger",
        "is_control",
        "is_osherson",
        "is_removed_participant",
    ],
]

# Expected number of ratings per group: (row filter, grouping keys, group size).
for row_filter, group_keys, expected_size in (
    (~human_e1["is_control"], ["argpair_id"], 10),
    (human_e1["is_control"], ["argpair_id"], 120),
    (human_e1["is_osherson"], ["domain", "phenomenon"], 10),
    (~human_e1["is_control"], ["domain", "phenomenon"], 240),
):
    assert all(
        len(group) == expected_size
        for _, group in human_e1[row_filter].groupby(group_keys)
    )

human_e1.to_csv(f"{PROCESSED_DATA}/experiment_1_master.csv")


Number of removed participants: 10


In [41]:
# Reload the master file and drop the participants flagged for removal.
human_e1 = pd.read_csv(f"{PROCESSED_DATA}/experiment_1_master.csv", index_col=0)
human_e1 = human_e1.loc[~human_e1["is_removed_participant"]].reset_index(drop=True)
# The premise lists come back from the CSV as their string form.
# NOTE(review): `eval` executes whatever text is in the file; fine for a file
# this notebook wrote itself, but `ast.literal_eval` would be the safer choice.
for premise_column in ("arg1_premises", "arg2_premises"):
    human_e1[premise_column] = human_e1[premise_column].apply(eval)

# Reorder e1 pairs so that stronger pair is always arg1. When the two arguments
# are swapped the rating is mirrored as 5 - rating.
reordered = []
for _, row in human_e1.iterrows():
    first_arg = (row["arg1_premises"], row["arg1_conclusion"])
    second_arg = (row["arg2_premises"], row["arg2_conclusion"])
    if row["is_arg2_stronger"]:
        reordered.append((*second_arg, *first_arg, 5 - row["human_rating"]))
    else:
        reordered.append((*first_arg, *second_arg, row["human_rating"]))

arg1_premises = [record[0] for record in reordered]
arg1_conclusion = [record[1] for record in reordered]
arg2_premises = [record[2] for record in reordered]
arg2_conclusion = [record[3] for record in reordered]
human_rating = [record[4] for record in reordered]

human_e1["arg1_premises"] = arg1_premises
human_e1["arg1_conclusion"] = arg1_conclusion
human_e1["arg2_premises"] = arg2_premises
human_e1["arg2_conclusion"] = arg2_conclusion
human_e1["human_rating"] = human_rating
# After the swap arg1 is the stronger argument in every row.
human_e1["is_arg2_stronger"] = [False] * len(human_e1)

# Aggregate e1 argument pairs: one row per pair holding the mean human rating.
# The argument columns and flags are taken from the first row of each group.
rows = []
for (argpair_id, phenomenon, phenomenon_type, domain), pair_df in human_e1.groupby(
    ["argpair_id", "phenomenon", "phenomenon_type", "domain"]
):
    representative = pair_df.iloc[0]
    rows.append(
        (
            argpair_id,
            phenomenon,
            phenomenon_type,
            domain,
            representative["arg1_premises"],
            representative["arg1_conclusion"],
            representative["arg2_premises"],
            representative["arg2_conclusion"],
            np.mean(pair_df["human_rating"]),
            representative["is_control"],
            representative["is_osherson"],
        )
    )

# arg1 / arg2 are renamed to "stronger" / "weaker" here.
aggregated_e1 = pd.DataFrame(
    rows,
    columns=[
        "argpair_id",
        "phenomenon",
        "phenomenon_type",
        "domain",
        "stronger_arg_premises",
        "stronger_arg_conclusion",
        "weaker_arg_premises",
        "weaker_arg_conclusion",
        "average_human_rating",
        "is_control",
        "is_osherson",
    ],
)

# Add SCM scores
# Only these phenomena get an SCM score; Osherson pairs and controls never do.
scmable_phenomena = {
    "Similarity",
    "Typicality",
    "Monotonicity (General)",
    "Monotonicity (Specific)",
    "Diversity (General)",
    "Diversity (Specific)",
    "Asymmetry",
    "Inclusion Fallacy",
}
strong_scm_scores, weak_scm_scores = [], []
for _, pair_row in aggregated_e1.iterrows():
    scorable = (
        not pair_row["is_osherson"]
        and not pair_row["is_control"]
        and pair_row["phenomenon"] in scmable_phenomena
    )
    if not scorable:
        strong_scm_scores.append(None)
        weak_scm_scores.append(None)
        continue

    # Both arguments of a pair are scored against the same domain data.
    categories = domain_categories[pair_row["domain"]]
    similarities = similarity_map[pair_row["domain"]]
    is_specific = pair_row["phenomenon_type"] == "Specific"

    strong_scm_scores.append(
        scm(
            pair_row["stronger_arg_premises"],
            pair_row["stronger_arg_conclusion"],
            categories,
            similarities,
            is_specific,
        )
    )
    weak_scm_scores.append(
        scm(
            pair_row["weaker_arg_premises"],
            pair_row["weaker_arg_conclusion"],
            categories,
            similarities,
            is_specific,
        )
    )

aggregated_e1["stronger_arg_scm"] = strong_scm_scores
aggregated_e1["weaker_arg_scm"] = weak_scm_scores

# Add in original osherson ratings
# Each rating is a count out of 80 (presumably the number of original
# participants preferring the stronger argument - confirm against the source).
OSHERSON_E1_RATINGS = {
    phenomenon_name: count / 80
    for phenomenon_name, count in {
        "Typicality": 73,
        "Diversity (General)": 59,
        "Specificity": 75,
        "Monotonicity (General)": 75,
        "Similarity": 76,
        "Diversity (Specific)": 52,
        "Monotonicity (Specific)": 66,
        "Asymmetry": 40,
        "Nonmonotonicity (General)": 68,
        "Inclusion Fallacy": 52,
    }.items()
}
# Only Osherson pairs get a rating; every other row is left empty.
aggregated_e1["osherson_rating"] = [
    OSHERSON_E1_RATINGS[phenomenon_name] if from_osherson else None
    for phenomenon_name, from_osherson in zip(
        aggregated_e1["phenomenon"], aggregated_e1["is_osherson"]
    )
]
aggregated_e1["conclusion_type"] = aggregated_e1["phenomenon_type"]

# Final column order; "phenomenon_type" is kept only under its new name.
aggregated_e1 = aggregated_e1.loc[
    :,
    [
        "argpair_id",
        "phenomenon",
        "conclusion_type",
        "domain",
        "stronger_arg_premises",
        "stronger_arg_conclusion",
        "weaker_arg_premises",
        "weaker_arg_conclusion",
        "average_human_rating",
        "osherson_rating",
        "stronger_arg_scm",
        "weaker_arg_scm",
        "is_control",
        "is_osherson",
    ],
]
aggregated_e1.to_csv(f"{PROCESSED_DATA}/experiment_1_aggregated.csv")
