In [13]:
import pandas as pd
from pathlib import Path
from scipy import stats
import json

In [3]:
def stat_test(data1, data2, shapiro_threshold=0.05, p_value_threshold=0.05):
    """Run a paired significance test on two equally long samples.

    Each sample is first checked with the Shapiro-Wilk test. If both Shapiro
    p-values are above ``shapiro_threshold`` a paired t-test
    (``stats.ttest_rel``) is used, otherwise the Wilcoxon signed-rank test
    (``stats.wilcoxon``). A verdict is printed, followed by extra lines when
    the p-value is also below 0.01 and 0.001.

    Both tests are paired: element ``i`` of ``data1`` is compared with element
    ``i`` of ``data2``, so the caller must pass the samples in matching order.

    Parameters
    ----------
    data1, data2 : sequence of numbers (e.g. list or pandas Series)
        The paired samples. Must have the same length.
    shapiro_threshold : float, default 0.05
        A sample is treated as normally distributed when its Shapiro-Wilk
        p-value is strictly greater than this value.
    p_value_threshold : float, default 0.05
        The difference is reported as significant when the test p-value is
        strictly smaller than this value.

    Returns
    -------
    tuple
        ``(p1, p2, p)``: the Shapiro-Wilk p-values of ``data1`` and ``data2``
        and the p-value of the paired test that was selected.

    Raises
    ------
    ValueError
        If the samples differ in length. SciPy may also raise it for inputs
        it cannot handle.
    """
    # Fail early with a clear message instead of an error from deep inside
    # SciPy after the normality checks have already run.
    if len(data1) != len(data2):
        raise ValueError(
            "stat_test needs paired samples of equal length, "
            f"got {len(data1)} and {len(data2)}"
        )

    _, p1 = stats.shapiro(data1)
    _, p2 = stats.shapiro(data2)

    # NOTE(review): the paired t-test assumes the pairwise differences are
    # normal; here each sample is checked on its own. Kept unchanged so that
    # previously reported numbers stay reproducible.
    is_normal = p1 > shapiro_threshold and p2 > shapiro_threshold

    if is_normal:
        _, p = stats.ttest_rel(data1, data2)
    else:
        p = stats.wilcoxon(data1, data2).pvalue

    # A NaN p-value compares False here and is reported as "no difference".
    if p < p_value_threshold:
        print(">>> Significant difference, p-value:", p)
        for stricter_level in (0.01, 0.001):
            if p < stricter_level:
                print(f">>> Significant difference at {stricter_level}")
    else:
        print("No significant difference")

    return p1, p2, p

In [4]:
# Summary CSV lives in ../analysis_results relative to the working directory.
result_path = Path.cwd().parent.joinpath("analysis_results", "summary.csv")

In [5]:
# The first three CSV columns are used as the row MultiIndex.
summary_index_columns = (0, 1, 2)
df = pd.read_csv(result_path, index_col=summary_index_columns)
df.head()

Unnamed: 0,Unnamed: 1,Unnamed: 2,model,task,pe_technique,correct,total,accuracy,created_at,mean_length,median_length,std_length,min_length,max_length,mean_time_taken,median_time_taken,std_time_taken,min_time_taken,max_time_taken,total_time_taken
pythia-160m,math-number,zero-shot,pythia-160m,math-number,zero-shot,0,540,0.0,2024-05-04-08-53-45,10366.17037,10000,2502.600344,4331,17701,19.871409,19.951003,0.506602,14.162299,20.882381,10730.560791
pythia-160m,math-number,null-shot,pythia-160m,math-number,null-shot,0,540,0.0,2024-05-04-11-52-37,11041.246296,11183,2716.999606,5558,16882,19.775824,19.806719,0.174244,18.703898,20.600824,10678.945112
pythia-160m,math-algebra,zero-shot,pythia-160m,math-algebra,zero-shot,0,1187,0.0,2024-05-03-09-25-22,10021.422072,9558,2816.592556,3172,27789,19.980845,19.958036,1.788766,14.680008,77.402771,23717.263572
pythia-160m,math-algebra,null-shot,pythia-160m,math-algebra,null-shot,0,1187,0.0,2024-05-03-16-00-41,10236.326032,9825,2907.029382,4342,23504,19.751473,19.790684,0.319809,15.446048,20.999895,23444.998943
pythia-160m,anli,zero-shot,pythia-160m,anli,zero-shot,90,1200,7.5,2024-03-02-10-13-32,13467.106667,13640,2102.988893,212,21359,32.997478,33.507348,2.462416,0.39839,34.808553,39596.973934


In [6]:
# Drop every row whose first index level (the model name) mentions one of the
# excluded model families.
excluded_model_pattern = "pythia|qwen|llama"
model_level = df.index.get_level_values(0)
is_excluded_model = model_level.str.contains(excluded_model_pattern)
main_df = df[~is_excluded_model]
main_df.head()

Unnamed: 0,Unnamed: 1,Unnamed: 2,model,task,pe_technique,correct,total,accuracy,created_at,mean_length,median_length,std_length,min_length,max_length,mean_time_taken,median_time_taken,std_time_taken,min_time_taken,max_time_taken,total_time_taken
gemini-pro-chat,math-number,zero-shot,gemini-pro-chat,math-number,zero-shot,87,540,16.111111,2024-04-30-11-33-56,33.053704,14,183.17232,9,2054,1.123837,0.862738,2.492253,0.740963,29.687569,606.871817
gemini-pro-chat,math-number,null-shot-cot,gemini-pro-chat,math-number,null-shot-cot,102,540,18.888889,2024-04-30-12-20-47,480.907407,329,625.793667,0,5980,4.188187,2.892331,5.092414,0.805594,30.439521,2261.621232
gemini-pro-chat,math-number,zero-shot-cot,gemini-pro-chat,math-number,zero-shot-cot,90,540,16.666667,2024-04-30-11-59-20,199.811111,15,512.343358,0,5970,2.215503,0.928071,3.917813,0.749846,30.596659,1196.371361
gemini-pro-chat,math-number,null-shot,gemini-pro-chat,math-number,null-shot,87,540,16.111111,2024-04-30-11-46-18,41.933333,14,218.417878,13,2924,1.226178,0.88891,2.764307,0.775016,29.742396,662.136272
gemini-pro-chat,math-algebra,zero-shot,gemini-pro-chat,math-algebra,zero-shot,295,1187,24.85257,2024-04-30-06-13-02,40.142376,14,131.776185,9,3585,1.070296,0.868555,1.044572,0.734635,29.855753,1270.441327


# Overall comparison

In [7]:
# Compare each baseline technique with its null-shot counterpart over all rows.
# NOTE(review): stat_test pairs observations by position, so this assumes both
# selections list the same (model, task) rows in the same order — confirm.
overall_pairs = (
    ("Overall", "zero-shot", "null-shot"),
    ("CoT Overall", "zero-shot-cot", "null-shot-cot"),
)
for heading, baseline_technique, candidate_technique in overall_pairs:
    baseline_accuracy = main_df.loc[main_df['pe_technique'] == baseline_technique, 'accuracy']
    candidate_accuracy = main_df.loc[main_df['pe_technique'] == candidate_technique, 'accuracy']

    print(heading)
    print(stat_test(baseline_accuracy, candidate_accuracy))
    print('---')

Overall
No significant difference
(1.6096597689222078e-07, 4.131741743893041e-06, 0.1768256570066088)
---
CoT Overall
>>> Significant difference, p-value: 0.003188695957047323
>>> Significant difference at 0.01
(6.504913765049724e-05, 1.0607436453866692e-05, 0.003188695957047323)
---


# Task-wise comparison

In [8]:
# Repeat the zero-shot vs null-shot comparison separately for every task.
for task in main_df['task'].unique():
    task_rows = main_df[main_df['task'] == task]
    technique = task_rows['pe_technique']

    baseline_accuracy = task_rows.loc[technique == 'zero-shot', 'accuracy']
    candidate_accuracy = task_rows.loc[technique == 'null-shot', 'accuracy']
    print(f"Task: {task}")
    print(stat_test(baseline_accuracy, candidate_accuracy))
    print('---')

    cot_baseline_accuracy = task_rows.loc[technique == 'zero-shot-cot', 'accuracy']
    cot_candidate_accuracy = task_rows.loc[technique == 'null-shot-cot', 'accuracy']
    # The CoT comparison only runs when both CoT variants have rows.
    if len(cot_baseline_accuracy) == 0 or len(cot_candidate_accuracy) == 0:
        continue
    print(f"CoT Task: {task}")
    print(stat_test(cot_baseline_accuracy, cot_candidate_accuracy))
    print('---')

Task: math-number
No significant difference
(0.2781368585651082, 0.2094574406653062, 0.6541594063104574)
---
CoT Task: math-number
No significant difference
(0.4087176572439697, 0.227423424986529, 0.5764310464876028)
---
Task: math-algebra
No significant difference
(0.8827268831349387, 0.698948624057447, 0.32797529270606574)
---
CoT Task: math-algebra
No significant difference
(0.9025053623228496, 0.36492941500539017, 0.6450656465169533)
---
Task: anli
No significant difference
(0.6353684617909103, 0.8392527527572857, 0.5143109531804305)
---
CoT Task: anli
No significant difference
(0.9380405922496455, 0.5026690381899934, 0.17291391351419813)
---
Task: triviaqa
>>> Significant difference, p-value: 0.037109375
(0.5110642658838829, 0.03577279067929675, 0.037109375)
---
CoT Task: triviaqa
>>> Significant difference, p-value: 0.015328820631749764
(0.5159365336843068, 0.07290366720242526, 0.015328820631749764)
---
Task: halueval-summarization
No significant difference
(0.14951939613454146, 

# Model-wise comparison

In [9]:
# Repeat the zero-shot vs null-shot comparison separately for every model.
for model in main_df['model'].unique():
    model_rows = main_df[main_df['model'] == model]
    technique = model_rows['pe_technique']

    baseline_accuracy = model_rows.loc[technique == 'zero-shot', 'accuracy']
    candidate_accuracy = model_rows.loc[technique == 'null-shot', 'accuracy']
    print(f"Model: {model}")
    print(stat_test(baseline_accuracy, candidate_accuracy))
    print('---')

    cot_baseline_accuracy = model_rows.loc[technique == 'zero-shot-cot', 'accuracy']
    cot_candidate_accuracy = model_rows.loc[technique == 'null-shot-cot', 'accuracy']
    # The CoT comparison only runs when both CoT variants have rows.
    if len(cot_baseline_accuracy) == 0 or len(cot_candidate_accuracy) == 0:
        continue
    print(f"CoT Model: {model}")
    print(stat_test(cot_baseline_accuracy, cot_candidate_accuracy))
    print('---')

Model: gemini-pro-chat
No significant difference
(0.0176134888567654, 0.04594345893066334, 0.28597919066405675)
---
CoT Model: gemini-pro-chat
No significant difference
(0.12673781646172858, 0.22979146121049404, 0.12203725546742168)
---
Model: palm-2-chat
>>> Significant difference, p-value: 7.62939453125e-06
>>> Significant difference at 0.01
>>> Significant difference at 0.001
(0.0005348800843300856, 0.0012115609880377538, 7.62939453125e-06)
---
CoT Model: palm-2-chat
No significant difference
(0.0024478077222137395, 0.003056653456976515, 0.389404296875)
---
Model: claude-3-opus
>>> Significant difference, p-value: 0.005329132080078125
>>> Significant difference at 0.01
(0.013366676855067696, 0.0669391011193735, 0.005329132080078125)
---
CoT Model: claude-3-opus
No significant difference
(0.5264871702023641, 0.17941366388660962, 0.050954679371706016)
---
Model: claude-2.1
>>> Significant difference, p-value: 0.02533294744205989
(0.7982121749595039, 0.6841827126206819, 0.0253329474420

# Breakdown comparison

## Data Preparation

In [10]:
# Raw per-instance results live in ../results relative to the working directory.
raw_result_path = Path.cwd().parent.joinpath("results")

In [26]:
# Collect per-instance correctness (1 = correct, 0 = incorrect) from the raw
# result files laid out as <raw_result_path>/<model>/<task>/<pe_technique>/*.json.
breakdown_data = {}
for model_path in raw_result_path.iterdir():
    if not model_path.is_dir():
        continue
    model = model_path.name
    breakdown_data[model] = {}

    for task_path in model_path.iterdir():
        if not task_path.is_dir():
            continue
        task = task_path.name
        breakdown_data[model][task] = {}

        for pe_path in task_path.iterdir():
            if not pe_path.is_dir():
                continue
            pe_technique = pe_path.name
            breakdown_data[model][task][pe_technique] = []

            # Sorted because iterdir() yields entries in arbitrary order, while
            # the paired tests applied to these lists match observations by
            # position; sorting makes the per-instance order deterministic.
            # NOTE(review): assumes a file name identifies the same instance
            # under every technique — confirm against the result writer.
            #
            # The loop variable is not called ``result_path`` on purpose: that
            # notebook-level name holds the summary CSV path and must not be
            # overwritten here.
            for result_file in sorted(pe_path.iterdir()):
                # Skip sub-directories, non-JSON files and the summary file.
                if not result_file.is_file() or result_file.suffix != ".json" or result_file.stem == "summary":
                    continue
                with result_file.open() as f:
                    result = json.load(f)

                breakdown_data[model][task][pe_technique].append(1 if result["is_correct"] else 0)

In [27]:
# Persist the collected outcomes as JSON in ../analysis_results.
breakdown_data_path = Path.cwd().parent.joinpath("analysis_results", "breakdown_data.json")
with breakdown_data_path.open("w") as json_file:
    json.dump(breakdown_data, json_file)

## Analysis

In [28]:
# Reload the outcomes from the JSON file.
with breakdown_data_path.open() as json_file:
    serialized_breakdown = json_file.read()
breakdown_data = json.loads(serialized_breakdown)

In [35]:
# Column-oriented accumulator: one list per output column, one entry per
# (model, task, comparison) row.
result_columns = (
    "model",
    "task",
    "pe_technique",
    "significance at 0.05",
    "significance at 0.01",
    "significance at 0.001",
    "is_improved",
)
breakdown_result_csv = {column: [] for column in result_columns}

skipped_model_prefixes = ("pythia", "qwen", "llama")
# (printed header prefix, baseline technique, candidate technique, optional?)
# The plain pair is always evaluated; the CoT pair only when both are present.
technique_pairs = (
    ("", "zero-shot", "null-shot", False),
    ("CoT ", "zero-shot-cot", "null-shot-cot", True),
)

for model, outcomes_by_task in breakdown_data.items():
    if model.startswith(skipped_model_prefixes):
        continue
    for task, outcomes in outcomes_by_task.items():
        for header_prefix, baseline_key, candidate_key, optional in technique_pairs:
            if optional and (baseline_key not in outcomes or candidate_key not in outcomes):
                continue

            print(f"{header_prefix}Model: {model}, Task: {task}")
            baseline = outcomes[baseline_key]
            candidate = outcomes[candidate_key]
            _, _, p = stat_test(baseline, candidate)
            # Improved means the candidate answered more instances correctly.
            improved = sum(candidate) > sum(baseline)
            print('---')

            row = {
                "model": model,
                "task": task,
                "pe_technique": f"{baseline_key} vs {candidate_key}",
                "significance at 0.05": p < 0.05,
                "significance at 0.01": p < 0.01,
                "significance at 0.001": p < 0.001,
                "is_improved": improved,
            }
            for column, value in row.items():
                breakdown_result_csv[column].append(value)

Model: gemini-pro-chat, Task: math-number
No significant difference
---
CoT Model: gemini-pro-chat, Task: math-number
No significant difference
---
Model: gemini-pro-chat, Task: math-algebra
>>> Significant difference, p-value: 0.014019277113959935
---
CoT Model: gemini-pro-chat, Task: math-algebra
>>> Significant difference, p-value: 3.9819925703018625e-12
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: gemini-pro-chat, Task: anli
No significant difference
---
CoT Model: gemini-pro-chat, Task: anli
>>> Significant difference, p-value: 0.0001179298745081001
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: gemini-pro-chat, Task: triviaqa
>>> Significant difference, p-value: 1.6062991055102736e-95
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
CoT Model: gemini-pro-chat, Task: triviaqa
>>> Significant difference, p-value: 2.460551470591261e-132
>>> Significant difference at 0.01
>>> Significant

  res = hypotest_fun_out(*samples, **kwds)


>>> Significant difference, p-value: 3.904607785009623e-15
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: gpt-3.5-turbo, Task: math-geometry
>>> Significant difference, p-value: 0.01748467441052132
---
CoT Model: gpt-3.5-turbo, Task: math-geometry
No significant difference
---
Model: gpt-3.5-turbo, Task: math-count-prob
>>> Significant difference, p-value: 0.005564237030557803
>>> Significant difference at 0.01
---
CoT Model: gpt-3.5-turbo, Task: math-count-prob
No significant difference
---
Model: gpt-3.5-turbo, Task: math-pre-calc
No significant difference
---
CoT Model: gpt-3.5-turbo, Task: math-pre-calc
No significant difference
---
Model: claude-3-sonnet, Task: math-number
No significant difference
---
CoT Model: claude-3-sonnet, Task: math-number
>>> Significant difference, p-value: 2.1625566553790284e-23
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: claude-3-sonnet, Task: math-algebra
No significant differenc

In [36]:
# Tabulate the collected rows and write them to ../analysis_results.
breakdown_result_df = pd.DataFrame(data=breakdown_result_csv)
breakdown_result_csv_path = Path.cwd().parent / "analysis_results" / "breakdown_result.csv"
breakdown_result_df.to_csv(breakdown_result_csv_path)

In [37]:
# Preview the first five result rows.
breakdown_result_df.head(n=5)

Unnamed: 0,model,task,pe_technique,significance at 0.05,significance at 0.01,significance at 0.001,is_improved
0,gemini-pro-chat,math-number,zero-shot vs null-shot,False,False,False,False
1,gemini-pro-chat,math-number,zero-shot-cot vs null-shot-cot,False,False,False,True
2,gemini-pro-chat,math-algebra,zero-shot vs null-shot,True,False,False,True
3,gemini-pro-chat,math-algebra,zero-shot-cot vs null-shot-cot,True,True,True,True
4,gemini-pro-chat,anli,zero-shot vs null-shot,False,False,False,True
