In [99]:
import ast

import pandas as pd
import tqdm

import config

In [133]:
def determine_if_row_is_control(row: pd.Series, control_df: pd.DataFrame) -> bool:
    """
    Decide whether one trial row of the MTurk results is a control trial.

    A trial counts as a control when either
      * its first premise contains the substring "All", or
      * `control_df` holds a row with the same conclusion type, domain and
        single-premise flag whose premises and conclusion equal the trial's.

    Args:
        row: One trial; read keys are "premises" (compared against a tuple),
            "conclusion", "conclusion_type", "domain" and "is_single_premise".
        control_df: Table of known control trials with the same columns.

    Returns:
        True if the trial is a control trial, False otherwise.
    """
    trial_premises = row["premises"]

    # Shortcut: a first premise mentioning "All" marks the trial as a control.
    if "All" in trial_premises[0]:
        return True

    # Narrow the control table down to the trial's own split before comparing.
    same_split = (
        (control_df["conclusion_type"] == row["conclusion_type"])
        & (control_df["domain"] == row["domain"])
        & (control_df["is_single_premise"] == row["is_single_premise"])
    )
    candidates = control_df[same_split]

    trial_conclusion = row["conclusion"]
    return any(
        tuple(premises) == trial_premises and conclusion == trial_conclusion
        for premises, conclusion in zip(candidates["premises"], candidates["conclusion"])
    )


def identify_participants_to_cut(
    df: pd.DataFrame,
    control_pass_rating: float = 50,
    max_incorrect_controls: int = 1,
    medium_same_rating_count: int = 12,
    hard_same_rating_count: int = 8,
) -> pd.DataFrame:
    """
    Identify participants that we should exclude from our analysis.

    With the default thresholds:
    'Light cut': participants who got more than one control trial incorrect.
    'Medium cut': Same as above, or those who gave at least 12 trials the exact same rating.
    'Hard cut': Same as above, or those who gave at least 8 trials the exact same rating.

    Args:
        df: Trial-level dataframe with at least the columns "pid", "rating"
            and a boolean "is_control".
        control_pass_rating: A control trial rated strictly below this value
            counts as incorrect.
        max_incorrect_controls: Participants with strictly more incorrect
            control trials than this are cut at every level.
        medium_same_rating_count: Minimum number of trials sharing one
            identical rating that triggers the medium cut.
        hard_same_rating_count: Minimum number of trials sharing one
            identical rating that triggers the hard cut.

    Returns:
        A copy of `df` (via merge on "pid") with three added boolean columns:
        "light_cut", "medium_cut" and "hard_cut".

    Raises:
        AssertionError: if merging the per-participant flags changes the
            number of rows.

    Side effects:
        Prints the number of participants flagged by each cut.
    """

    rows = []
    for pid, pid_df in df.groupby("pid"):
        failed_controls = pid_df[pid_df["is_control"] & (pid_df["rating"] < control_pass_rating)]
        num_incorrect_controls = failed_controls.shape[0]
        # Despite the name, this is the *count* of the participant's most frequent rating.
        ratings_mode = pid_df["rating"].value_counts().max()

        rows.append((pid, num_incorrect_controls, ratings_mode))

    pid_cut_df = pd.DataFrame(rows, columns=["pid", "num_incorrect_controls", "ratings_mode"])

    # Each stricter cut is the light cut OR a lower bar on repeated ratings.
    failed_too_many = pid_cut_df["num_incorrect_controls"] > max_incorrect_controls
    pid_cut_df["light_cut"] = failed_too_many
    pid_cut_df["medium_cut"] = failed_too_many | (pid_cut_df["ratings_mode"] >= medium_same_rating_count)
    pid_cut_df["hard_cut"] = failed_too_many | (pid_cut_df["ratings_mode"] >= hard_same_rating_count)

    cut_types = ["light_cut", "medium_cut", "hard_cut"]
    output_df = df.merge(pid_cut_df[["pid"] + cut_types], on="pid")
    # Every trial must have received exactly one set of flags.
    assert output_df.shape[0] == df.shape[0]

    for cut_type in cut_types:
        print(f"Number of {cut_type} participants: {pid_cut_df[pid_cut_df[cut_type]].shape[0]}/{pid_cut_df.shape[0]}")
    print()

    return output_df

In [143]:
# Load original experiment split labels
experiment_df = pd.read_csv(f"{config.DATA}/experiment_trials.csv", index_col=0)
experiment_df["pid"] = experiment_df["pid"].apply(str)
experiment_df["tid"] = experiment_df["tid"].apply(str)
# Premises are read as strings and parsed into tuples (presumably the CSV stores
# the repr of a list/tuple of names). ast.literal_eval only accepts Python
# literals, so unlike eval it cannot execute arbitrary code found in the file.
experiment_df["premises"] = experiment_df["premises"].apply(lambda x: tuple(ast.literal_eval(x)))
experiment_df = experiment_df[experiment_df["is_osherson"] == 0].reset_index(drop=True)

# Load human ratings
human_df = pd.read_csv(f"{config.DATA}/raw_human_ratings.csv")
# Order matters: "pid" is derived from the raw "tid" column before "tid" is overwritten.
human_df["pid"] = human_df["tid"].apply(lambda x: x.split("participant")[-1])
human_df["tid"] = human_df["trialId"].apply(lambda x: x.replace("tc", ""))
human_df["premises"] = human_df["premises0"].apply(lambda x: tuple(x.split(":")))
human_df["conclusion"] = human_df["conclusion0"]
human_df["conclusion_type"] = human_df["conclusionType"].apply(lambda x: x.capitalize())

# Drop participants who were not paid
unpaid_participants_df = pd.read_csv(f"{config.DATA}/unpaid_participants.csv", index_col=0)
drop_uids = unpaid_participants_df[~unpaid_participants_df["paid"]]["uid"].tolist()
human_df = human_df[~human_df["uid"].isin(drop_uids)].reset_index(drop=True)

# Join human_df and experiment_df, keeping experiment_df's split labels
df = human_df.merge(experiment_df, on=["pid", "tid", "conclusion_type", "premises", "conclusion"])
df["is_single_premise"] = df["premises"].apply(lambda x: len(x) == 1)
df = df[["pid", "tid", "domain", "conclusion_type", "is_single_premise", "premises", "conclusion", "rating"]].sort_values(by=["pid", "tid"]).reset_index(drop=True)

# Do some checks
print(f"Merged df nrows: {df.shape[0]}, human_df nrows: {human_df.shape[0]}, experiment_df nrows: {experiment_df.shape[0]}")
print(f"Total number of PIDs: {len(human_df['pid'].unique())}")
missing_pids = set(experiment_df["pid"]).difference(set(human_df["pid"]))
print(f"Missing PIDs that are in experiment_df but not human_df: {missing_pids}")
# The merge must neither drop nor duplicate any human rating, and must cover
# every experiment trial of the participants that are present.
assert df.shape[0] == human_df.shape[0] == experiment_df[~experiment_df["pid"].isin(missing_pids)].shape[0]
assert len(experiment_df["pid"].unique()) == len(df["pid"].unique()) + len(missing_pids)
# Every participant is expected to have exactly 38 trials.
assert all(gdf.shape[0] == 38 for _,gdf in df.groupby("pid"))
print()

# Label control trials
control_df = pd.read_csv(f"{config.DATA}/control_trials.csv", index_col=0)
# Same literal-only parsing as for experiment_df above.
control_df["premises"] = control_df["premises"].apply(lambda x: tuple(ast.literal_eval(x)))
print("Labelling trials as control or not control...")
# Passing total lets tqdm show a real progress bar instead of a bare counter.
df["is_control"] = [determine_if_row_is_control(row, control_df) for _, row in tqdm.tqdm(df.iterrows(), total=len(df))]
# Every participant must have exactly 4 control trials, and nobody may have none.
assert all(p == 4 for p in df[df["is_control"]]["pid"].value_counts().tolist())
assert df[df["is_control"]]["pid"].value_counts().shape[0] == len(df["pid"].unique())

df = identify_participants_to_cut(df)
# An argument is identified by its (premises, conclusion) pair.
df["argument"] = list(zip(df["premises"], df["conclusion"]))
df = df[["pid", "tid",  "argument", "domain", "conclusion_type", "is_single_premise", "is_control", "premises", "conclusion", "rating", "light_cut", "medium_cut", "hard_cut"]]

134it [00:00, 1339.99it/s]

Merged df nrows: 23028, human_df nrows: 23028, experiment_df nrows: 23180
Total number of PIDs: 606
Missing PIDs that are in experiment_df but not human_df: {'58', '526', '381', '538'}

Labelling trials as control or not control...


23028it [00:18, 1260.85it/s]


Number of light_cut participants: 86/606
Number of medium_cut participants: 152/606
Number of hard_cut participants: 210/606



In [144]:
# Preview the first five labelled trials, including the per-participant cut flags.
df.head(n=5)

Unnamed: 0,pid,tid,argument,domain,conclusion_type,is_single_premise,is_control,premises,conclusion,rating,light_cut,medium_cut,hard_cut
0,0,0,"((Eagles,), All birds)",Birds,General,True,False,"(Eagles,)",All birds,0,False,False,False
1,0,1,"((Crows,), All birds)",Birds,General,True,False,"(Crows,)",All birds,7,False,False,False
2,0,10,"((Vultures,), All birds)",Birds,General,True,False,"(Vultures,)",All birds,3,False,False,False
3,0,11,"((Falcons,), All birds)",Birds,General,True,False,"(Falcons,)",All birds,3,False,False,False
4,0,12,"((Herons,), All birds)",Birds,General,True,False,"(Herons,)",All birds,7,False,False,False


In [157]:
# Keep only participants that survive the medium cut. The explicit copy makes
# the filtered frame independent of the original, so adding a column below does
# not assign into a slice (which raises SettingWithCopyWarning).
df = df[~df["medium_cut"]].copy()
print(f"Number of participants left after medium cut: {len(df['pid'].unique())}")

# Dense rank of each rating within a participant's single- or multi-premise
# trials; rank 1 is that participant's highest rating.
df["ratings_rank"] = df.groupby(["pid", "is_single_premise"])["rating"].rank(method="dense", ascending=False)

# Aggregate over participants: one row per distinct argument.
rows = []
argument_labels = ["argument", "domain", "conclusion_type", "is_single_premise", "is_control", "premises", "conclusion"]
for al, arg_df in df.groupby(argument_labels):

    avg_rank = arg_df["ratings_rank"].mean()
    avg_rating = arg_df["rating"].mean()

    num_ratings = arg_df.shape[0]

    rows.append(al + (avg_rating, avg_rank, num_ratings))

aggregated_df = pd.DataFrame(rows, columns=argument_labels + ["average_rating", "average_ranking", "num_ratings"])
# Control trials are only used for participant exclusion, not for the aggregate.
aggregated_df = aggregated_df[~aggregated_df["is_control"]].reset_index(drop=True)

# Check that number of arguments is correct and all are unique
for split, split_df in aggregated_df.groupby(["domain", "conclusion_type", "is_single_premise"]):
    domain, conclusion_type, is_single_premise = split
    n_arguments = split_df.shape[0]
    if not is_single_premise:
        # Multi-premise splits hold exactly 100 arguments for either conclusion type.
        assert n_arguments == 100, f"{split}: expected 100 arguments, got {n_arguments}"
    elif conclusion_type == "General":
        assert n_arguments == 24, f"{split}: expected 24 arguments, got {n_arguments}"
    else:
        assert n_arguments > 100, f"{split}: expected more than 100 arguments, got {n_arguments}"
    assert len(set(split_df["argument"].tolist())) == n_arguments, f"{split}: duplicate arguments"

# Print statistics on number of arguments per split
print("Number of ratings per argument per split")
for split, split_df in aggregated_df.groupby(["domain", "conclusion_type", "is_single_premise"]):
    r = split_df["num_ratings"]
    minr, maxr, meanr, sdr = r.min(), r.max(), r.mean(), r.std()
    print(f"    {split}: min - {minr}, max - {maxr}, mean - {meanr:.2f}, sd - {sdr:.2f}")

Number of participants left after medium cut: 454
Number of ratings per argument per split
    ('Birds', 'General', False): min - 4, max - 13, mean - 6.50, sd - 2.13
    ('Birds', 'General', True): min - 65, max - 65, mean - 65.00, sd - 0.00
    ('Birds', 'Specific', False): min - 7, max - 10, mean - 8.50, sd - 0.81
    ('Birds', 'Specific', True): min - 7, max - 34, mean - 12.29, sd - 5.59
    ('Mammals', 'General', False): min - 4, max - 8, mean - 5.80, sd - 1.33
    ('Mammals', 'General', True): min - 58, max - 58, mean - 58.00, sd - 0.00
    ('Mammals', 'Specific', False): min - 8, max - 10, mean - 8.90, sd - 0.83
    ('Mammals', 'Specific', True): min - 8, max - 36, mean - 13.43, sd - 6.52
    ('Vehicles', 'General', False): min - 4, max - 8, mean - 6.60, sd - 1.29
    ('Vehicles', 'General', True): min - 66, max - 66, mean - 66.00, sd - 0.00
    ('Vehicles', 'Specific', False): min - 7, max - 10, mean - 9.10, sd - 1.14
    ('Vehicles', 'Specific', True): min - 7, max - 35, mean -

In [153]:
# Display the per-argument aggregate: one row per non-control argument with its
# average rating, average ranking and number of ratings.
aggregated_df

Unnamed: 0,argument,domain,conclusion_type,is_single_premise,is_control,premises,conclusion,average_rating,average_ranking,num_ratings
0,"((Airplanes,), All vehicles)",Vehicles,General,True,False,"(Airplanes,)",All vehicles,38.136364,9.257576,66
1,"((Airplanes,), Bicycles)",Vehicles,Specific,True,False,"(Airplanes,)",Bicycles,22.285714,14.142857,7
2,"((Airplanes,), Helicopters)",Vehicles,Specific,True,False,"(Airplanes,)",Helicopters,77.875000,4.750000,8
3,"((Airplanes,), Jeeps)",Vehicles,Specific,True,False,"(Airplanes,)",Jeeps,34.250000,14.250000,8
4,"((Airplanes,), Skateboards)",Vehicles,Specific,True,False,"(Airplanes,)",Skateboards,23.187500,14.625000,16
...,...,...,...,...,...,...,...,...,...,...
1156,"((Zeppelins, Rockets), All vehicles)",Vehicles,General,False,False,"(Zeppelins, Rockets)",All vehicles,56.142857,7.000000,7
1157,"((Zeppelins, Sleds), Caravans)",Vehicles,Specific,False,False,"(Zeppelins, Sleds)",Caravans,53.250000,6.250000,8
1158,"((Zeppelins, Submarines), All vehicles)",Vehicles,General,False,False,"(Zeppelins, Submarines)",All vehicles,47.333333,6.666667,6
1159,"((Zeppelins, Tractors), All vehicles)",Vehicles,General,False,False,"(Zeppelins, Tractors)",All vehicles,51.000000,4.625000,8
