In [5]:
import numpy as np
from scipy.stats import ttest_ind

# Accuracy data for the models
accuracy_data = {
    'Llama2 7B Chat': {
        'ARC': 0.527,
        'TruthfulQA': 0.453,
        'HellaSwag': 0.785,
        'MMLU': 0.470,
        'MNLI': 0.495
    },
    'Llama2 7B Chat Uncensored': {
        'ARC': 0.532,
        'TruthfulQA': 0.427,
        'HellaSwag': 0.786,
        'MMLU': 0.353,
        'MNLI': 0.447
    },
    'Llama2 13B Chat': {
        'ARC': 0.592,
        'TruthfulQA': 0.439,
        'HellaSwag': 0.819,
        'MMLU': 0.461,
        'MNLI': 0.474
    },
    'Llama2 13B Chat Uncensored': {
        'ARC': 0.600,
        'TruthfulQA': 0.408,
        'HellaSwag': 0.824,
        'MMLU': 0.530,
        'MNLI': 0.459
    }
}

# Define significance level
alpha = 0.05

# Perform t-test for each benchmark
benchmarks = ['ARC', 'TruthfulQA', 'HellaSwag', 'MMLU', 'MNLI']
model_combinations = [('Llama2 7B Chat', 'Llama2 7B Chat Uncensored'),
                      ('Llama2 7B Chat', 'Llama2 13B Chat'),
                      ('Llama2 7B Chat', 'Llama2 13B Chat Uncensored'),
                      ('Llama2 7B Chat Uncensored', 'Llama2 13B Chat Uncensored'),
                      ('Llama2 13B Chat', 'Llama2 13B Chat Uncensored')]

for model1, model2 in model_combinations:
    print(f"Statistical comparison between {model1} and {model2}:")
    for benchmark in benchmarks:
        print(f"\tBenchmark: {benchmark}")
        data1 = accuracy_data[model1][benchmark]
        data2 = accuracy_data[model2][benchmark]
        
        if np.var(data1) < 1e-9 or np.var(data2) < 1e-9:
            print("\t\tLow variability detected, skipping test.")
            continue
        
        t_statistic, p_value = ttest_ind(data1, data2)
        print("\t\tp-value:", p_value)
        if p_value < alpha:
            print("\t\tSignificant difference exists.")
        else:
            print("\t\tNo significant difference.")
    print()



Statistical comparison between Llama2 7B Chat and Llama2 7B Chat Uncensored:
	Benchmark: ARC
		Low variability detected, skipping test.
	Benchmark: TruthfulQA
		Low variability detected, skipping test.
	Benchmark: HellaSwag
		Low variability detected, skipping test.
	Benchmark: MMLU
		Low variability detected, skipping test.
	Benchmark: MNLI
		Low variability detected, skipping test.

Statistical comparison between Llama2 7B Chat and Llama2 13B Chat:
	Benchmark: ARC
		Low variability detected, skipping test.
	Benchmark: TruthfulQA
		Low variability detected, skipping test.
	Benchmark: HellaSwag
		Low variability detected, skipping test.
	Benchmark: MMLU
		Low variability detected, skipping test.
	Benchmark: MNLI
		Low variability detected, skipping test.

Statistical comparison between Llama2 7B Chat and Llama2 13B Chat Uncensored:
	Benchmark: ARC
		Low variability detected, skipping test.
	Benchmark: TruthfulQA
		Low variability detected, skipping test.
	Benchmark: HellaSwag
		Low var