In [11]:
import pandas as pd
import statsmodels.stats.contingency_tables as ctt
import numpy as np

# Label values that get_contingency_table compares against column 0 of the
# answer and prediction frames.
pos, neg = 1, 0

# Each dataset lives in its own folder; the variable suffix (_c, _d, _j, _u, _v)
# is the folder's initial. Per dataset: three model frames, then the answers.

# commonsense/
mistral_c = pd.read_csv('commonsense/mistral_500.csv')
chatgpt_c = pd.read_csv('commonsense/Common_final_test_chatgpt.csv')
gemini_c = pd.read_csv('commonsense/gemini_500.csv')
answers_c = pd.read_csv('commonsense/cm_test_hard_answers.csv')

# deontology/
mistral_d = pd.read_csv('deontology/mistral_500.csv')
chatgpt_d = pd.read_csv('deontology/deontology_final_test_chatgpt.csv')
gemini_d = pd.read_csv('deontology/gemini_500.csv')
answers_d = pd.read_csv('deontology/deontology_test_hard_answers_500.csv')

# justice/
mistral_j = pd.read_csv('justice/mistral_500.csv')
chatgpt_j = pd.read_csv('justice/justice_final_test_chatgpt.csv')
gemini_j = pd.read_csv('justice/gemini_500.csv')
answers_j = pd.read_csv('justice/justice_test_hard_answers_500.csv')

# utilitarianism/ (note the Mistral file carries a "_2" suffix)
mistral_u = pd.read_csv('utilitarianism/mistral_500_2.csv')
chatgpt_u = pd.read_csv('utilitarianism/util_final_test_chatgpt.csv')
gemini_u = pd.read_csv('utilitarianism/gemini_500.csv')
answers_u = pd.read_csv('utilitarianism/util_test_hard_answers_500.csv')

# virtue/
mistral_v = pd.read_csv('virtue/mistral_500.csv')
chatgpt_v = pd.read_csv('virtue/virtue_final_test_chatgpt_cleaned.csv')
gemini_v = pd.read_csv('virtue/gemini_500.csv')
answers_v = pd.read_csv('virtue/virtue_test_hard_shuffled_answers.csv')



In [12]:
def get_contingency_table(model1, model2, answers):
    """Cross-tabulate the correctness of two models on pos/neg labelled rows.

    Only column 0 of each frame is read, for row positions
    ``0 .. len(model1) - 1``. A row's expected label is the module-level
    ``pos`` when the answer equals ``pos``; any other answer value is scored
    against the module-level ``neg``.

    Args:
        model1: DataFrame whose first column holds the first model's outputs.
        model2: DataFrame whose first column holds the second model's outputs.
        answers: DataFrame whose first column holds the reference labels.

    Returns:
        A 2x2 integer numpy array of row counts:
        ``[0][0]`` both models match the expected label,
        ``[0][1]`` only model1 matches,
        ``[1][0]`` only model2 matches,
        ``[1][1]`` neither matches.
    """
    table = np.zeros((2, 2), dtype=int)

    for row in range(len(model1)):
        # Anything that is not `pos` in the answers is treated as `neg`.
        expected = pos if answers.iloc[row, 0] == pos else neg

        # Index 0 means "matches the expected label", index 1 means "does not".
        first_miss = 0 if model1.iloc[row, 0] == expected else 1
        second_miss = 0 if model2.iloc[row, 0] == expected else 1

        table[first_miss, second_miss] += 1

    return table

In [13]:
def get_contingency_table_nonbinary(model1, model2, answers):
    """Cross-tabulate the correctness of two models by exact match to the answer.

    Only column 0 of each frame is read, for row positions
    ``0 .. len(model1) - 1``. A model is counted as correct on a row when its
    value equals the answer value on that row.

    Args:
        model1: DataFrame whose first column holds the first model's outputs.
        model2: DataFrame whose first column holds the second model's outputs.
        answers: DataFrame whose first column holds the reference answers.

    Returns:
        A 2x2 integer numpy array of row counts:
        ``[0][0]`` both models match the answer,
        ``[0][1]`` only model1 matches,
        ``[1][0]`` only model2 matches,
        ``[1][1]`` neither matches.
    """
    table = np.zeros((2, 2), dtype=int)

    for row in range(len(model1)):
        # Index 0 means "equals the answer", index 1 means "differs".
        first_miss = int(not (model1.iloc[row, 0] == answers.iloc[row, 0]))
        second_miss = int(not (model2.iloc[row, 0] == answers.iloc[row, 0]))
        table[first_miss, second_miss] += 1

    return table

In [20]:
def sample_size(table, z_alpha=1.96, z_beta=0.8):
    """Estimate the number of paired examples needed to detect the observed gap.

    Computes ``(p01 + p10) * (z_alpha + z_beta) ** 2 / (p10 - p01) ** 2``
    where ``p01`` and ``p10`` are the off-diagonal (discordant) cells of the
    2x2 table divided by the table's total count.

    The total is taken from the table itself rather than assumed to be 500,
    so tables built from a different number of rows give correct proportions.

    Args:
        table: 2x2 indexable (numpy array or nested sequence) of counts.
        z_alpha: First normal-score term; defaults to the original constant
            1.96 (presumably two-sided 5% significance; confirm).
        z_beta: Second normal-score term; defaults to the original constant
            0.8. NOTE(review): the standard-normal quantile for 80% power is
            about 0.8416, so 0.8 may be the power value itself; confirm
            before relying on the absolute numbers.

    Returns:
        The estimated sample size as a float (not rounded). ``inf`` when the
        two discordant counts are equal, since the observed difference is zero
        and the formula would divide by zero.
    """
    only_first = table[0][1]
    only_second = table[1][0]

    # Zero observed difference: no finite sample size satisfies the formula.
    if only_first == only_second:
        return float("inf")

    total = table[0][0] + table[0][1] + table[1][0] + table[1][1]
    p01 = only_first / total
    p10 = only_second / total

    return float(((p01 + p10) * (z_alpha + z_beta) ** 2) / (p10 - p01) ** 2)

In [23]:
import math as Math

# sample_sizes[dataset][model_a][model_b] holds the sample_size() estimate for
# that pair of models on that dataset; both orderings of a pair are filled.
sample_sizes = {
    dataset: {'mistral': {}, 'chat': {}, 'gemini': {}}
    for dataset in ['c', 'd', 'j', 'u', 'v']
}

# One (first key, second key) entry per comparison, in the same order as `labels`.
pair_keys = [('mistral', 'chat'), ('mistral', 'gemini'), ('chat', 'gemini')]
labels = [
    "Contingency table for Mistral vs ChatGPT",
    "Contingency table for Mistral vs Gemini",
    "Contingency table for ChatGPT vs Gemini"
]

for i in ['c', 'd', 'j', 'u', 'v']:
    # Self-comparison entries are fixed at 1.0.
    for model_key in ['mistral', 'chat', 'gemini']:
        sample_sizes[i][model_key][model_key] = 1.0

    # The per-dataset frames are module-level variables named <model>_<dataset letter>.
    mistral = globals()[f"mistral_{i}"]
    chatgpt = globals()[f"chatgpt_{i}"]
    gemini = globals()[f"gemini_{i}"]
    answers = globals()[f"answers_{i}"]

    # 'v' is scored by exact match to the answer; every other dataset uses
    # the pos/neg scoring. Only the table builder differs between the two.
    build_table = get_contingency_table_nonbinary if i == 'v' else get_contingency_table
    table1 = np.array(build_table(mistral, chatgpt, answers))
    table2 = np.array(build_table(mistral, gemini, answers))
    table3 = np.array(build_table(chatgpt, gemini, answers))

    tables = [table1, table2, table3]
    for idx, tbl in enumerate(tables):
        print(f"{labels[idx]} ({tbl}):")

        first_key, second_key = pair_keys[idx]
        required = sample_size(tbl)  # computed once, stored under both orderings
        sample_sizes[i][first_key][second_key] = required
        sample_sizes[i][second_key][first_key] = required

        # math.ceil raises on inf/nan (possible when a table's off-diagonal
        # counts are equal), so non-finite estimates are printed as-is.
        print(Math.ceil(required) if Math.isfinite(required) else required)

print(sample_sizes)


Contingency table for Mistral vs ChatGPT ([[449  21]
 [ 18  12]]):
16505
Contingency table for Mistral vs Gemini ([[463   7]
 [ 22   8]]):
491
Contingency table for ChatGPT vs Gemini ([[458   9]
 [ 27   6]]):
424
Contingency table for Mistral vs ChatGPT ([[286 105]
 [ 34  75]]):
106
Contingency table for Mistral vs Gemini ([[361  30]
 [ 53  56]]):
598
Contingency table for ChatGPT vs Gemini ([[292  28]
 [122  58]]):
65
Contingency table for Mistral vs ChatGPT ([[282 154]
 [ 41  23]]):
59
Contingency table for Mistral vs Gemini ([[405  31]
 [ 46  18]]):
1304
Contingency table for ChatGPT vs Gemini ([[297  26]
 [154  23]]):
42
Contingency table for Mistral vs ChatGPT ([[245  43]
 [ 99 113]]):
173
Contingency table for Mistral vs Gemini ([[243  45]
 [ 58 154]]):
2322
Contingency table for ChatGPT vs Gemini ([[263  81]
 [ 38 118]]):
246
Contingency table for Mistral vs ChatGPT ([[312  78]
 [ 35  75]]):
233
Contingency table for Mistral vs Gemini ([[373  17]
 [ 61  49]]):
154
Contingency ta