## Common Sense: Individual Level

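This notebook computes the individual-level commonsensicality of each model in two ways: the default calculation, where the majority vote is based on human ratings only, and a slightly different calculation, where the model's own rating is also included in the majority vote.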


In [1]:
# Change to home directory
# NOTE: the target is relative to the current working directory, so running this
# cell more than once in the same kernel session moves up one more level each time.
import os

# Later cells read their inputs through paths relative to this directory (e.g. "data/...")
os.chdir("..")

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import json
import matplotlib

# Set default font to Arial
# rcParams holds matplotlib's process-wide defaults, so both the generic font
# family and the sans-serif fallback entry are pointed at Arial here.
matplotlib.rcParams["font.family"] = "Arial"
matplotlib.rcParams["font.sans-serif"] = "Arial"

## Statements


In [3]:
# Read the statements file and keep only the "statement" column
statements = pd.read_csv("data/statements_and_prompts.csv")["statement"]

## Common Sense of Humans


### Load Human Data


In [4]:
# Human ratings, read with the first CSV column as the row index.
# individual: "Do you agree with this statement?"
# group:      "Do you think most people would agree with this statement?"
individual = pd.read_csv("data/results/individual_ratings.csv", index_col=0)
group = pd.read_csv("data/results/group_ratings.csv", index_col=0)

# Row-wise mean rating for each question; participants who didn't answer are skipped
avg_vote_per_q = individual.mean(axis=1, skipna=True)
avg_vote_per_q_others = group.mean(axis=1, skipna=True)

# Majority votes: 1 when the mean rating is at least 0.5 (a tie counts as 1), else 0
maj_i = avg_vote_per_q.ge(0.5).astype(int)
# Majority of "others agree" answers by humans
maj_others = avg_vote_per_q_others.ge(0.5).astype(int)

### Statement-Level Commonsensicality of Humans

This calculation uses **human ratings** only.


In [5]:
# Statement: consensus (0 when the mean rating is 0.5, 1 when it is 0 or 1)
c_i = (individual.mean(axis=1) - 0.5).abs() * 2

# Statement: awareness (share of non-missing "others agree" answers equal to the majority vote)
a_i = pd.Series(
    [
        group.loc[statement].dropna().astype(int).eq(maj_i[statement]).mean()
        for statement in maj_i.index
    ],
    index=maj_i.index,
)

# Statement: commonsensicality (geometric mean of consensus and awareness)
m_i = np.sqrt(c_i * a_i)

### Individual-Level Commonsensicality of Humans

This calculation uses **human ratings** only.


In [6]:
def _agreement_with_majority(ratings):
    """Return, per column of `ratings`, the share of non-missing answers equal to `maj_i`."""
    shares = []
    for person in ratings.columns:
        answered = ratings.loc[:, person].dropna()
        # Compare only on the statements this person actually rated
        shares.append((answered.astype(int) == maj_i.loc[answered.index]).mean())
    return pd.Series(shares, index=ratings.columns)


# Person: consensus
C_j = _agreement_with_majority(individual)

# Person: awareness
A_j = _agreement_with_majority(group)

# Person: commonsensicality (geometric mean of consensus and awareness)
M_j = np.sqrt(C_j * A_j)

## Common Sense of LLMs


### Load Model Data


In [7]:
# Models that share a color, listed in the order they should appear in the mapping
_color_families = [
    ("forestgreen", ["GPT-3.5", "GPT-4-0125", "GPT-4-0409", "GPT-4o", "GPT-5"]),
    (
        "indianred",
        ["LLaMA-2-7B", "LLaMA-2-13B", "LLaMA-2-70B", "LLaMA-3-8B", "LLaMA-3-70B"],
    ),
    (
        "royalblue",
        ["Flan-T5-Small", "Flan-T5-Base", "Flan-T5-Large", "Flan-T5-XL", "Flan-T5-XXL"],
    ),
    ("chocolate", ["Gemma-2B", "Gemma-7B", "Gemini Pro 1.0"]),
    ("darkviolet", ["Mistral-7B", "Mixtral-8x7B", "Mixtral-8x22B", "Mistral-Large"]),
    ("goldenrod", ["OLMo-7B"]),
    ("teal", ["Falcon-7B", "Falcon-40B", "Falcon-180B"]),
    ("olivedrab", ["Claude 3 Haiku", "Claude 3 Sonnet", "Claude 3 Opus"]),
    ("crimson", ["DBRX"]),
    (
        "lightseagreen",
        ["Qwen2-0.5B", "Qwen2-1.5B", "Qwen2-7B", "Qwen2-57B", "Qwen2-72B"],
    ),
]

# Display name of each model -> color name
model_to_color = {
    model: color for color, members in _color_families for model in members
}

In [8]:
from src.utilities import load_annotations_gpt
from src.utilities import load_results_hf
from src.utilities import load_results_freq
from src.utilities import load_results_gpt5

# One entry per model, in loading order:
# (display name, loader function, model_name passed to the loader, extra keyword arguments)
_model_sources = [
    ("GPT-3.5", load_annotations_gpt, "gpt-3.5-turbo-0125", {"trial_no": 1}),
    ("GPT-4-0125", load_annotations_gpt, "gpt-4-0125-preview", {"trial_no": 1}),
    ("GPT-4-0409", load_annotations_gpt, "gpt-4-turbo-2024-04-09", {"trial_no": 1}),
    ("GPT-4o", load_annotations_gpt, "gpt-4o-2024-05-13", {"trial_no": 1}),
    ("GPT-5", load_results_gpt5, "gpt-5-2025-08-07", {}),
    ("LLaMA-2-7B", load_results_hf, "meta-llama--Llama-2-7b-chat-hf", {}),
    ("LLaMA-2-13B", load_results_hf, "meta-llama--Llama-2-13b-chat-hf", {}),
    ("LLaMA-2-70B", load_results_hf, "meta-llama--Llama-2-70b-chat-hf", {}),
    ("LLaMA-3-8B", load_results_hf, "meta-llama--Meta-Llama-3-8B-Instruct", {}),
    ("LLaMA-3-70B", load_results_hf, "meta-llama--Meta-Llama-3-70B-Instruct", {}),
    ("Flan-T5-Small", load_results_hf, "google--flan-t5-small", {}),
    ("Flan-T5-Base", load_results_hf, "google--flan-t5-base", {}),
    ("Flan-T5-Large", load_results_hf, "google--flan-t5-large", {}),
    ("Flan-T5-XL", load_results_hf, "google--flan-t5-xl", {}),
    ("Flan-T5-XXL", load_results_hf, "google--flan-t5-xxl", {}),
    ("Gemma-2B", load_results_hf, "google--gemma-2b-it", {}),
    ("Gemma-7B", load_results_hf, "google--gemma-7b-it", {}),
    ("Gemini Pro 1.0", load_results_freq, "gemini-pro", {}),
    ("Mistral-7B", load_results_hf, "mistralai--Mistral-7B-Instruct-v0.2", {}),
    ("Mixtral-8x7B", load_results_hf, "mistralai--Mixtral-8x7B-Instruct-v0.1", {}),
    ("Mixtral-8x22B", load_results_hf, "mistralai--Mixtral-8x22B-Instruct-v0.1", {}),
    ("Mistral-Large", load_results_freq, "mistral-large-latest", {}),
    ("OLMo-7B", load_results_hf, "allenai--OLMo-7B-Instruct", {}),
    ("Falcon-7B", load_results_hf, "tiiuae--falcon-7b-instruct", {}),
    ("Falcon-40B", load_results_hf, "tiiuae--falcon-40b-instruct", {}),
    ("Falcon-180B", load_results_hf, "tiiuae--falcon-180B-chat", {}),
    ("Claude 3 Opus", load_results_freq, "claude-3-opus", {}),
    ("Claude 3 Sonnet", load_results_freq, "claude-3-sonnet", {}),
    ("Claude 3 Haiku", load_results_freq, "claude-3-haiku", {}),
    ("DBRX", load_results_hf, "databricks--dbrx-instruct", {}),
    ("Qwen2-0.5B", load_results_hf, "Qwen--Qwen2-0.5B-Instruct", {}),
    ("Qwen2-1.5B", load_results_hf, "Qwen--Qwen2-1.5B-Instruct", {}),
    ("Qwen2-7B", load_results_hf, "Qwen--Qwen2-7B-Instruct", {}),
    ("Qwen2-57B", load_results_hf, "Qwen--Qwen2-57B-A14B-Instruct", {}),
    ("Qwen2-72B", load_results_hf, "Qwen--Qwen2-72B-Instruct", {}),
]

# Display name -> loaded results, filled one model at a time in the order above
all_models = {}
for _display_name, _loader, _checkpoint, _extra_kwargs in _model_sources:
    all_models[_display_name] = _loader(
        model_name=_checkpoint, verbose=True, **_extra_kwargs
    )

                                       

Question q1 has 50 repetitions


                                       

Question q2 has 50 repetitions


                                       

Question q3 has 50 repetitions


                                       

Question q1 has 23 repetitions


                                       

Question q2 has 23 repetitions


                                       

Question q3 has 23 repetitions


                                       

Question q1 has 23 repetitions


                                       

Question q2 has 23 repetitions


                                       

Question q3 has 23 repetitions


                                       

Question q1 has 23 repetitions


                                       

Question q2 has 23 repetitions


                                       

Question q3 has 23 repetitions


                                       

Question q1 has 23 repetitions


                                       

Question q2 has 23 repetitions


                                       

Question q3 has 23 repetitions


                                       

In [9]:
# Model votes
def get_model_probs(model_name, q="q1"):
    """Return a model's per-statement "yes"/"no" probabilities for one question.

    Parameters
    ----------
    model_name : str
        Key into the module-level ``all_models`` dictionary.
    q : str, default "q1"
        Question key inside ``all_models[model_name]``; the entry must be a
        DataFrame with the columns "yes", "no" and "other".

    Returns
    -------
    pandas.DataFrame
        Columns "yes" and "no", one row per statement, indexed by
        ``individual.index``. Each row sums to 1.
    """
    # Work on an explicit float copy: the stored frame is never modified, and the
    # arithmetic below also works when `to_numpy` would otherwise hand back a
    # read-only view or an integer array.
    values = all_models[model_name][q][["yes", "no", "other"]].to_numpy(
        dtype=float, copy=True
    )

    # Split the probability mass of "other" evenly between "yes" and "no"
    # (it is redistributed, not discarded)
    yes_no = values[:, 0:2] + values[:, 2:3] / 2

    # Ensure that the probabilities of "yes" and "no" sum to 1
    yes_no /= yes_no.sum(axis=1, keepdims=True)

    return pd.DataFrame(yes_no, columns=["yes", "no"], index=individual.index)

In [10]:
# Human votes
# Per-statement counts of 1 ("yes") and 0 ("no") answers; missing answers match neither
humans_agree = individual.eq(1).sum(axis=1, skipna=True)
humans_disagree = individual.eq(0).sum(axis=1, skipna=True)
humans_others_agree = group.eq(1).sum(axis=1, skipna=True)
humans_others_disagree = group.eq(0).sum(axis=1, skipna=True)


def _to_vote_shares(yes_counts, no_counts):
    """Turn per-statement yes/no counts into a "yes"/"no" frame whose rows sum to 1."""
    counts = np.column_stack((yes_counts, no_counts)).astype(float)
    shares = counts / counts.sum(axis=1, keepdims=True)
    return pd.DataFrame(shares, columns=["yes", "no"], index=individual.index)


humans_q1_soft = _to_vote_shares(humans_agree, humans_disagree)
humans_q2_soft = _to_vote_shares(humans_others_agree, humans_others_disagree)
humans_q2_soft.head()

Unnamed: 0,yes,no
0,1.0,0.0
1,0.666667,0.333333
2,0.954545,0.045455
3,1.0,0.0
4,1.0,0.0


In [11]:
# "yes"/"no" probabilities of every loaded model, for q1 and for q2
all_models_q1_probs = {name: get_model_probs(name, "q1") for name in all_models}
all_models_q2_probs = {name: get_model_probs(name, "q2") for name in all_models}

In [12]:
# GPT-4o's q2 answers, sorted from least to most decided (|yes - no|).
# `assign` returns a new frame, so the ratings stored in `all_models` do not
# silently gain a "diff" column that later cells would then receive.
df = all_models["GPT-4o"]["q2"].assign(
    diff=lambda frame: np.abs(frame["yes"] - frame["no"])
)
df.sort_values("diff")

Unnamed: 0,yes,no,other,diff
3569,4.998673e-01,4.998673e-01,2.653495e-04,3.803709e-08
1529,4.999913e-01,4.999912e-01,1.746399e-05,6.462529e-08
3362,4.999973e-01,4.999973e-01,5.371852e-06,6.573800e-08
1886,4.999896e-01,4.999895e-01,2.093080e-05,6.901112e-08
2337,4.999897e-01,4.999896e-01,2.069824e-05,8.234505e-08
...,...,...,...,...
632,9.999999e-01,9.237450e-09,6.291377e-08,9.999999e-01
449,9.999999e-01,8.592168e-10,7.365126e-08,9.999999e-01
2914,4.363463e-09,9.999999e-01,6.301088e-08,9.999999e-01
596,9.999999e-01,2.335593e-09,5.015960e-08,9.999999e-01


## Calculate Individual-Level Commonsensicality of Models


In [13]:
def get_binary_answers(q_answers):
    """Convert per-statement answer probabilities into hard 0/1 answers.

    Parameters
    ----------
    q_answers : pandas.DataFrame
        The first column must be "yes" and the second "no" (case-insensitive).
        A third column, if present, must be "other". Any further columns are
        ignored. The frame is not modified.

    Returns
    -------
    numpy.ndarray
        One integer per row: 1 (yes) when the "yes" share is at least as large
        as the "no" share, otherwise 0 (no).
    """
    assert q_answers.columns[0].lower() == "yes"
    assert q_answers.columns[1].lower() == "no"
    has_other = len(q_answers.columns) > 2
    if has_other:
        assert q_answers.columns[2].lower() == "other"

    # Take an explicit float copy of the relevant columns: `to_numpy()` may return
    # a view of the caller's data (or a read-only array), so in-place arithmetic
    # on it would either alter the caller's frame or fail.
    n_used = 3 if has_other else 2
    values = q_answers.iloc[:, :n_used].to_numpy(dtype=float, copy=True)

    # Split the "other" mass evenly between "yes" and "no" (when it exists),
    # then rescale so "yes" + "no" = 1
    yes_no = values[:, 0:2]
    if has_other:
        yes_no = yes_no + values[:, 2:3] / 2
    yes_no = yes_no / yes_no.sum(axis=1, keepdims=True)

    # argmax yields 0 for "yes" and 1 for "no" (ties resolve to "yes");
    # flip it so that No = 0, Yes = 1
    return 1 - yes_no.argmax(axis=1)

### The default way

This version only uses human ratings in calculating the majority vote.


In [14]:
from sklearn.metrics import accuracy_score


def compute_commonsensicality(answers, binary=False, return_ca=False):
    """Score one set of answers against the human-only majority vote ``maj_i``.

    Parameters
    ----------
    answers : mapping
        Must provide the keys "q1" and "q2". With ``binary=False`` each value is
        converted with ``get_binary_answers``; with ``binary=True`` the values
        are used as the 0/1 answers directly.
    binary : bool, default False
        Whether ``answers`` already holds 0/1 answers.
    return_ca : bool, default False
        If True, return ``(consensus, awareness, commonsensicality)`` instead of
        the commonsensicality alone.

    Returns
    -------
    float or tuple of float
        Consensus is the accuracy of the q1 answers against ``maj_i``, awareness
        the accuracy of the q2 answers against ``maj_i``, and commonsensicality
        the square root of their product.
    """
    q1_answers = answers["q1"]
    q2_answers = answers["q2"]
    if not binary:
        # Hand over copies so the conversion cannot modify the ratings stored by
        # the caller (same precaution as compute_commonsensicality_with_model)
        q1_answers = get_binary_answers(q1_answers.copy())
        q2_answers = get_binary_answers(q2_answers.copy())

    consensus = accuracy_score(y_true=maj_i, y_pred=q1_answers)
    awareness = accuracy_score(y_true=maj_i, y_pred=q2_answers)

    commonsensicality = np.sqrt(consensus * awareness)

    if return_ca:
        return consensus, awareness, commonsensicality

    return commonsensicality

In [15]:
# Commonsensicality for all models
# Model name -> consensus / awareness / commonsensicality (human-only majority vote)
all_model_comm, all_model_cons, all_model_awar = {}, {}, {}
for model_name, model_answers in all_models.items():
    # compute_commonsensicality returns (consensus, awareness, commonsensicality)
    (
        all_model_cons[model_name],
        all_model_awar[model_name],
        all_model_comm[model_name],
    ) = compute_commonsensicality(model_answers, return_ca=True)

In [16]:
# One Series per metric, indexed by model name
all_model_cons_pd, all_model_awar_pd, all_model_comm_pd = map(
    pd.Series, (all_model_cons, all_model_awar, all_model_comm)
)
# Show commonsensicality from highest to lowest
all_model_comm_pd.sort_values(ascending=False)

Mixtral-8x22B      0.823416
Mistral-Large      0.812750
Qwen2-72B          0.811413
Qwen2-57B          0.809153
GPT-4-0409         0.805780
Mistral-7B         0.804397
Flan-T5-XXL        0.804046
Qwen2-7B           0.803691
Falcon-180B        0.799179
Gemini Pro 1.0     0.797248
GPT-4-0125         0.784279
Flan-T5-Large      0.768995
GPT-3.5            0.768421
Mixtral-8x7B       0.763886
DBRX               0.762877
GPT-5              0.756795
Claude 3 Opus      0.753990
GPT-4o             0.751618
Falcon-40B         0.750548
LLaMA-2-7B         0.749877
Qwen2-1.5B         0.745926
LLaMA-3-70B        0.743530
Flan-T5-XL         0.729969
OLMo-7B            0.726615
Gemma-7B           0.720467
Qwen2-0.5B         0.667794
Falcon-7B          0.663373
Gemma-2B           0.658804
LLaMA-2-70B        0.635449
LLaMA-3-8B         0.616844
Claude 3 Sonnet    0.615466
Claude 3 Haiku     0.614012
Flan-T5-Base       0.581310
LLaMA-2-13B        0.464967
Flan-T5-Small      0.341722
dtype: float64

### The alternative way

For each model, this version adds one more rating to every statement, so that the majority vote includes the model's own rating as well.


In [17]:
from sklearn.metrics import accuracy_score


def compute_commonsensicality_with_model(answers, binary=False, return_ca=False):
    """Score one set of answers against a majority vote that includes its own q1 answer.

    ``answers`` must provide "q1" and "q2". With ``binary=False`` both are
    converted with ``get_binary_answers``; with ``binary=True`` they are used as
    0/1 answers directly. For every statement the q1 answer is added as one
    extra rating to the human ``individual`` ratings before the majority vote
    is taken. Returns the commonsensicality, or
    ``(consensus, awareness, commonsensicality)`` when ``return_ca`` is True.
    """
    # Copies, so the caller's stored answers are left alone
    q1_answers, q2_answers = answers["q1"].copy(), answers["q2"].copy()
    if not binary:
        q1_answers = get_binary_answers(q1_answers)
        q2_answers = get_binary_answers(q2_answers)

    # Human ratings per statement, plus one additional rating from the model itself
    yes_votes = individual.sum(axis=1, skipna=True) + q1_answers
    n_votes = individual.notna().sum(axis=1, skipna=True) + 1
    maj_i_with_model = (yes_votes / n_votes >= 0.5).astype(int)

    consensus = accuracy_score(y_true=maj_i_with_model, y_pred=q1_answers)
    awareness = accuracy_score(y_true=maj_i_with_model, y_pred=q2_answers)
    commonsensicality = np.sqrt(consensus * awareness)

    return (consensus, awareness, commonsensicality) if return_ca else commonsensicality

In [18]:
# Commonsensicality for all models
# Model name -> consensus / awareness / commonsensicality (majority vote includes the model)
all_model_comm_with_model, all_model_cons_with_model, all_model_awar_with_model = (
    {},
    {},
    {},
)
for model_name, model_answers in all_models.items():
    # The function returns (consensus, awareness, commonsensicality)
    (
        all_model_cons_with_model[model_name],
        all_model_awar_with_model[model_name],
        all_model_comm_with_model[model_name],
    ) = compute_commonsensicality_with_model(model_answers, return_ca=True)

In [19]:
# One Series per metric, indexed by model name
(
    all_model_cons_with_model_pd,
    all_model_awar_with_model_pd,
    all_model_comm_with_model_pd,
) = map(
    pd.Series,
    (all_model_cons_with_model, all_model_awar_with_model, all_model_comm_with_model),
)
# Show commonsensicality from highest to lowest
all_model_comm_with_model_pd.sort_values(ascending=False)

Mixtral-8x22B      0.835720
Mistral-Large      0.827290
Qwen2-72B          0.824589
Qwen2-57B          0.822780
Mistral-7B         0.818696
Flan-T5-XXL        0.818126
GPT-4-0409         0.817890
Qwen2-7B           0.816191
Falcon-180B        0.813971
Gemini Pro 1.0     0.811136
GPT-4-0125         0.796783
Flan-T5-Large      0.783964
GPT-3.5            0.781273
DBRX               0.775012
Mixtral-8x7B       0.774907
GPT-5              0.769660
Claude 3 Opus      0.766343
GPT-4o             0.764466
LLaMA-2-7B         0.763971
Falcon-40B         0.762948
Qwen2-1.5B         0.758388
LLaMA-3-70B        0.757027
Flan-T5-XL         0.745618
OLMo-7B            0.740655
Gemma-7B           0.736986
Qwen2-0.5B         0.682995
Falcon-7B          0.679030
Gemma-2B           0.673339
LLaMA-2-70B        0.648942
LLaMA-3-8B         0.631567
Claude 3 Sonnet    0.631355
Claude 3 Haiku     0.629512
Flan-T5-Base       0.595902
LLaMA-2-13B        0.481271
Flan-T5-Small      0.358286
dtype: float64

### Comparing results


In [20]:
# Row order of the comparison table
model_name_order = [
    "Claude 3 Haiku",
    "Claude 3 Sonnet",
    "Claude 3 Opus",
    "DBRX",
    "Falcon-7B",
    "Falcon-40B",
    "Falcon-180B",
    "Flan-T5-Small",
    "Flan-T5-Base",
    "Flan-T5-Large",
    "Flan-T5-XL",
    "Flan-T5-XXL",
    "Gemma-2B",
    "Gemma-7B",
    "Gemini Pro 1.0",
    "GPT-3.5",
    "GPT-4-0125",
    "GPT-4-0409",
    "GPT-4o",
    "GPT-5",
    "LLaMA-2-7B",
    "LLaMA-3-8B",
    "LLaMA-2-13B",
    "LLaMA-2-70B",
    "LLaMA-3-70B",
    "Mistral-7B",
    "Mixtral-8x7B",
    "Mixtral-8x22B",
    "Mistral-Large",
    "OLMo-7B",
    "Qwen2-0.5B",
    "Qwen2-1.5B",
    "Qwen2-7B",
    "Qwen2-57B",
    "Qwen2-72B",
]

# Each metric contributes two adjacent columns: human-only majority vote first,
# then the variant whose majority vote includes the model's own rating
_metric_pairs = [
    ("Consensus", all_model_cons_pd, all_model_cons_with_model_pd),
    ("Awareness", all_model_awar_pd, all_model_awar_with_model_pd),
    ("Commonsensicality", all_model_comm_pd, all_model_comm_with_model_pd),
]
_table_columns = {}
for _metric, _human_only, _with_model in _metric_pairs:
    _table_columns[_metric] = _human_only.loc[model_name_order]
    _table_columns[f"{_metric} (with model)"] = _with_model.loc[model_name_order]

model_comm_table = pd.DataFrame(_table_columns)

In [21]:
def process_col(col):
    """Format a column of fractions as "<percentage> (<rank>)" strings.

    The percentage is rounded to one decimal place. The rank is computed on the
    unrounded values, with 1 for the largest value and tied values sharing the
    smallest rank of their group.
    """
    pct = col * 100
    rank_str = pct.rank(ascending=False, method="min").astype(int).astype(str)
    return pct.round(1).astype(str) + " (" + rank_str + ")"


# Same table, with every column formatted by process_col
# (the numeric table itself stays untouched)
model_comm_table_with_ranking = model_comm_table.apply(process_col)

In [22]:
# Display the formatted table: each cell holds "<percentage> (<rank>)" as built by process_col
model_comm_table_with_ranking

Unnamed: 0,Consensus,Consensus (with model),Awareness,Awareness (with model),Commonsensicality,Commonsensicality (with model)
Claude 3 Haiku,58.8 (31),60.5 (31),64.1 (30),65.5 (30),61.4 (32),63.0 (32)
Claude 3 Sonnet,60.9 (30),62.6 (30),62.2 (31),63.7 (31),61.5 (31),63.1 (31)
Claude 3 Opus,73.4 (19),75.1 (20),77.4 (15),78.2 (15),75.4 (17),76.6 (17)
DBRX,73.7 (18),75.2 (18),79.0 (13),79.9 (13),76.3 (15),77.5 (14)
Falcon-7B,66.6 (27),68.1 (27),66.1 (29),67.7 (28),66.3 (27),67.9 (27)
Falcon-40B,73.0 (22),74.8 (22),77.2 (16),77.9 (17),75.1 (19),76.3 (20)
Falcon-180B,78.6 (8),80.3 (8),81.3 (6),82.5 (5),79.9 (9),81.4 (9)
Flan-T5-Small,34.4 (35),36.1 (35),33.9 (35),35.6 (35),34.2 (35),35.8 (35)
Flan-T5-Base,56.8 (33),58.6 (33),59.5 (33),60.6 (33),58.1 (33),59.6 (33)
Flan-T5-Large,77.3 (14),78.9 (14),76.5 (18),77.9 (16),76.9 (12),78.4 (12)


In [23]:
# Print the LaTeX source of the formatted table (plain text, so it can be copied as-is)
print(model_comm_table_with_ranking.to_latex())

\begin{tabular}{lllllll}
\toprule
 & Consensus & Consensus (with model) & Awareness & Awareness (with model) & Commonsensicality & Commonsensicality (with model) \\
\midrule
Claude 3 Haiku & 58.8 (31) & 60.5 (31) & 64.1 (30) & 65.5 (30) & 61.4 (32) & 63.0 (32) \\
Claude 3 Sonnet & 60.9 (30) & 62.6 (30) & 62.2 (31) & 63.7 (31) & 61.5 (31) & 63.1 (31) \\
Claude 3 Opus & 73.4 (19) & 75.1 (20) & 77.4 (15) & 78.2 (15) & 75.4 (17) & 76.6 (17) \\
DBRX & 73.7 (18) & 75.2 (18) & 79.0 (13) & 79.9 (13) & 76.3 (15) & 77.5 (14) \\
Falcon-7B & 66.6 (27) & 68.1 (27) & 66.1 (29) & 67.7 (28) & 66.3 (27) & 67.9 (27) \\
Falcon-40B & 73.0 (22) & 74.8 (22) & 77.2 (16) & 77.9 (17) & 75.1 (19) & 76.3 (20) \\
Falcon-180B & 78.6 (8) & 80.3 (8) & 81.3 (6) & 82.5 (5) & 79.9 (9) & 81.4 (9) \\
Flan-T5-Small & 34.4 (35) & 36.1 (35) & 33.9 (35) & 35.6 (35) & 34.2 (35) & 35.8 (35) \\
Flan-T5-Base & 56.8 (33) & 58.6 (33) & 59.5 (33) & 60.6 (33) & 58.1 (33) & 59.6 (33) \\
Flan-T5-Large & 77.3 (14) & 78.9 (14) & 76.5 (1

In [24]:
from scipy.stats import spearmanr

# Rank agreement between the human-only scores and the "with model" scores, per metric
cols = ["Consensus", "Awareness", "Commonsensicality"]
for col in cols:
    corr, p_value = spearmanr(
        model_comm_table[col], model_comm_table[f"{col} (with model)"]
    )
    print(f"Spearman correlation for {col}: {corr:.4f} (p-value: {p_value})")

Spearman correlation for Consensus: 0.9989 (p-value: 2.6169541956072737e-45)
Spearman correlation for Awareness: 0.9974 (p-value: 2.6392264837837585e-39)
Spearman correlation for Commonsensicality: 0.9986 (p-value: 1.0373418768072296e-43)
