In [24]:
import pandas as pd
from pathlib import Path
from scipy import stats
import json
import dabest
import pingouin as pg

In [43]:
def stat_test(data1, data2):
    _, p1 = stats.shapiro(data1)
    _, p2 = stats.shapiro(data2)
    
    shapiro_threshold = 0.05
    is_normal = p1 > shapiro_threshold and p2 > shapiro_threshold
    
    if is_normal:
        _, p = stats.ttest_rel(data1, data2)
        eff_size = dabest.effsize.cohens_h(data1, data2)
        eff_desc = "small" if eff_size >= 0.2 else "medium" if eff_size >= 0.5 else "large" if eff_size >= 0.8 else str(eff_size)
    else:
        val = pg.wilcoxon(data1, data2, alternative='two-sided')
        p = val['p-val'].values[0]
        eff_size = val['RBC'].values[0]
        eff_desc = "small" if abs(eff_size) < 0.3 else "medium" if abs(eff_size) < 0.5 else "large"
        
    p_value_threshold = 0.05
    
    if p < p_value_threshold:
        print(">>> Significant difference, p-value:", p)
        print(">>> Effect size:", eff_size)
        if p < 0.01:
            print(">>> Significant difference at 0.01")
        if p < 0.001:
            print(">>> Significant difference at 0.001")
    else:
        print("No significant difference")
        
    eff_size = eff_size if p < p_value_threshold else None
    eff_desc = eff_desc if p < p_value_threshold else None
    
    return p1, p2, p, eff_size, eff_desc

In [26]:
result_path = Path.cwd().parent / "analysis_results" / "summary.csv"
breakdown_data_path = Path.cwd().parent / "analysis_results" / "breakdown_data.json"

In [27]:
df = pd.read_csv(result_path, index_col=(0, 1, 2))
df.head()

Unnamed: 0,Unnamed: 1,Unnamed: 2,model,task,pe_technique,correct,total,accuracy,created_at,mean_length,median_length,std_length,min_length,max_length,mean_time_taken,median_time_taken,std_time_taken,min_time_taken,max_time_taken,total_time_taken
pythia-160m,math-number,zero-shot,pythia-160m,math-number,zero-shot,0,540,0.0,2024-05-04-08-53-45,10366.17037,10000,2502.600344,4331,17701,19.871409,19.951003,0.506602,14.162299,20.882381,10730.560791
pythia-160m,math-number,null-shot,pythia-160m,math-number,null-shot,0,540,0.0,2024-05-04-11-52-37,11041.246296,11183,2716.999606,5558,16882,19.775824,19.806719,0.174244,18.703898,20.600824,10678.945112
pythia-160m,math-algebra,zero-shot,pythia-160m,math-algebra,zero-shot,0,1187,0.0,2024-05-03-09-25-22,10021.422072,9558,2816.592556,3172,27789,19.980845,19.958036,1.788766,14.680008,77.402771,23717.263572
pythia-160m,math-algebra,null-shot,pythia-160m,math-algebra,null-shot,0,1187,0.0,2024-05-03-16-00-41,10236.326032,9825,2907.029382,4342,23504,19.751473,19.790684,0.319809,15.446048,20.999895,23444.998943
pythia-160m,anli,zero-shot,pythia-160m,anli,zero-shot,90,1200,7.5,2024-03-02-10-13-32,13467.106667,13640,2102.988893,212,21359,32.997478,33.507348,2.462416,0.39839,34.808553,39596.973934


In [28]:
main_df = df[~df.index.get_level_values(0).str.contains("pythia|qwen|llama")]
main_df.head()

Unnamed: 0,Unnamed: 1,Unnamed: 2,model,task,pe_technique,correct,total,accuracy,created_at,mean_length,median_length,std_length,min_length,max_length,mean_time_taken,median_time_taken,std_time_taken,min_time_taken,max_time_taken,total_time_taken
gemini-pro-chat,math-number,zero-shot,gemini-pro-chat,math-number,zero-shot,87,540,16.111111,2024-04-30-11-33-56,33.053704,14,183.17232,9,2054,1.123837,0.862738,2.492253,0.740963,29.687569,606.871817
gemini-pro-chat,math-number,null-shot-cot,gemini-pro-chat,math-number,null-shot-cot,102,540,18.888889,2024-04-30-12-20-47,480.907407,329,625.793667,0,5980,4.188187,2.892331,5.092414,0.805594,30.439521,2261.621232
gemini-pro-chat,math-number,zero-shot-cot,gemini-pro-chat,math-number,zero-shot-cot,90,540,16.666667,2024-04-30-11-59-20,199.811111,15,512.343358,0,5970,2.215503,0.928071,3.917813,0.749846,30.596659,1196.371361
gemini-pro-chat,math-number,null-shot,gemini-pro-chat,math-number,null-shot,87,540,16.111111,2024-04-30-11-46-18,41.933333,14,218.417878,13,2924,1.226178,0.88891,2.764307,0.775016,29.742396,662.136272
gemini-pro-chat,math-algebra,zero-shot,gemini-pro-chat,math-algebra,zero-shot,295,1187,24.85257,2024-04-30-06-13-02,40.142376,14,131.776185,9,3585,1.070296,0.868555,1.044572,0.734635,29.855753,1270.441327


# Overall comparison

In [29]:
zero_shot = main_df[main_df['pe_technique'] == 'zero-shot']['accuracy']
null_shot = main_df[main_df['pe_technique'] == 'null-shot']['accuracy']

print("Overall")
print(stat_test(zero_shot, null_shot))
print('---')

zero_shot_cot = main_df[main_df['pe_technique'] == 'zero-shot-cot']['accuracy']
null_shot_cot = main_df[main_df['pe_technique'] == 'null-shot-cot']['accuracy']

print("CoT Overall")
print(stat_test(zero_shot_cot, null_shot_cot))
print('---')

Overall
No significant difference
(1.6096597689222078e-07, 4.131741743893041e-06, 0.17704197844605563, 0.11389236545682108, 'small')
---
CoT Overall
>>> Significant difference, p-value: 0.0031985852513144518
>>> Effect size: 0.2794304371485579
>>> Significant difference at 0.01
(6.504913765049724e-05, 1.0607436453866692e-05, 0.0031985852513144518, 0.2794304371485579, 'small')
---


# Task-wise comparison

In [30]:
for task in main_df['task'].unique():
    zero_shot = main_df[(main_df['task'] == task) & (main_df['pe_technique'] == 'zero-shot')]['accuracy']
    null_shot = main_df[(main_df['task'] == task) & (main_df['pe_technique'] == 'null-shot')]['accuracy']
    
    print(f"Task: {task}")
    print(stat_test(zero_shot, null_shot))
    print('---')
    
    zero_shot_cot = main_df[(main_df['task'] == task) & (main_df['pe_technique'] == 'zero-shot-cot')]['accuracy']
    null_shot_cot = main_df[(main_df['task'] == task) & (main_df['pe_technique'] == 'null-shot-cot')]['accuracy']
    
    if len(zero_shot_cot) > 0 and len(null_shot_cot) > 0:
        print(f"CoT Task: {task}")
        print(stat_test(zero_shot_cot, null_shot_cot))
        print('---')

Task: math-number


ValueError: Input data must be binary.

# Model-wise comparison

In [31]:
for model in main_df['model'].unique():
    zero_shot = main_df[(main_df['model'] == model) & (main_df['pe_technique'] == 'zero-shot')]['accuracy']
    null_shot = main_df[(main_df['model'] == model) & (main_df['pe_technique'] == 'null-shot')]['accuracy']
    
    print(f"Model: {model}")
    print(stat_test(zero_shot, null_shot))
    print('---')
    
    zero_shot_cot = main_df[(main_df['model'] == model) & (main_df['pe_technique'] == 'zero-shot-cot')]['accuracy']
    null_shot_cot = main_df[(main_df['model'] == model) & (main_df['pe_technique'] == 'null-shot-cot')]['accuracy']
    
    if len(zero_shot_cot) > 0 and len(null_shot_cot) > 0:
        print(f"CoT Model: {model}")
        print(stat_test(zero_shot_cot, null_shot_cot))
        print('---')

Model: gemini-pro-chat
No significant difference
(0.0176134888567654, 0.04594345893066334, 0.2959265995411936, -0.28654970760233917, 'small')
---
CoT Model: gemini-pro-chat


ValueError: Input data must be binary.

# Breakdown comparison

## Data Preparation

In [32]:
raw_result_path = Path.cwd().parent / "results"

In [33]:
breakdown_data = {}
for model_path in raw_result_path.iterdir():
    if not model_path.is_dir():
        continue
    model = model_path.name
    breakdown_data[model] = {}
    
    for task_path in model_path.iterdir():
        if not task_path.is_dir():
            continue
        task = task_path.name
        breakdown_data[model][task] = {}
        
        for pe_path in task_path.iterdir():
            if not pe_path.is_dir():
                continue
            pe_technique = pe_path.name
            breakdown_data[model][task][pe_technique] = []
            
            for result_path in pe_path.iterdir():
                if not result_path.is_file() or result_path.suffix != ".json" or result_path.stem == "summary":
                    continue
                with result_path.open() as f:
                    result = json.load(f)
                    
                breakdown_data[model][task][pe_technique].append(1 if result["is_correct"] else 0)

In [34]:
with breakdown_data_path.open("w") as f:
    json.dump(breakdown_data, f)

## Analysis

In [35]:
with breakdown_data_path.open() as f:
    breakdown_data = json.load(f)

In [36]:
val = pg.wilcoxon(breakdown_data['gpt-3.5-turbo']['gsm8k']['zero-shot'], breakdown_data['gpt-3.5-turbo']['gsm8k']['null-shot'], alternative='two-sided')
val

Unnamed: 0,W-val,alternative,p-val,RBC,CLES
Wilcoxon,25393.0,two-sided,1.535776e-08,-0.291005,0.458302


In [37]:
val['p-val'].values[0]

1.5357764580459134e-08

In [38]:
dabest.effsize.cohens_h(breakdown_data['gpt-3.5-turbo']['gsm8k']['zero-shot'], breakdown_data['gpt-3.5-turbo']['gsm8k']['null-shot'])

0.16982534439935937

In [44]:
breakdown_result_csv = {
    "model": [],
    "task": [],
    "pe_technique": [],
    "significance at 0.05": [],
    "significance at 0.01": [],
    "significance at 0.001": [],
    "eff_size": [],
    "eff_desc": [],
    "is_improved": []
}
for model in breakdown_data:
    if model.startswith("pythia") or model.startswith("qwen") or model.startswith("llama"):
        continue
    for task in breakdown_data[model]:
        print(f"Model: {model}, Task: {task}")
        zero_shot = breakdown_data[model][task]['zero-shot']
        null_shot = breakdown_data[model][task]['null-shot']
        _, _, p, eff_size, eff_desc = stat_test(zero_shot, null_shot)
        is_improved = sum(breakdown_data[model][task]['null-shot']) > sum(breakdown_data[model][task]['zero-shot'])
        print('---')
        breakdown_result_csv["model"].append(model)
        breakdown_result_csv["task"].append(task)
        breakdown_result_csv["pe_technique"].append("zero-shot vs null-shot")
        breakdown_result_csv["significance at 0.05"].append(p < 0.05)
        breakdown_result_csv["significance at 0.01"].append(p < 0.01)
        breakdown_result_csv["significance at 0.001"].append(p < 0.001)
        breakdown_result_csv["eff_size"].append(eff_size)
        breakdown_result_csv["eff_desc"].append(eff_desc if eff_desc is not None else "")
        breakdown_result_csv["is_improved"].append(is_improved)
        
        if 'zero-shot-cot' in breakdown_data[model][task] and 'null-shot-cot' in breakdown_data[model][task]:
            zero_shot_cot = breakdown_data[model][task]['zero-shot-cot']
            null_shot_cot = breakdown_data[model][task]['null-shot-cot']
            print(f"CoT Model: {model}, Task: {task}")
            _, _, p, eff_size, eff_desc = stat_test(zero_shot_cot, null_shot_cot)
            is_improved = sum(breakdown_data[model][task]['null-shot-cot']) > sum(breakdown_data[model][task]['zero-shot-cot'])
            print('---')
            breakdown_result_csv["model"].append(model)
            breakdown_result_csv["task"].append(task)
            breakdown_result_csv["pe_technique"].append("zero-shot-cot vs null-shot-cot")
            breakdown_result_csv["significance at 0.05"].append(p < 0.05)
            breakdown_result_csv["significance at 0.01"].append(p < 0.01)
            breakdown_result_csv["significance at 0.001"].append(p < 0.001)
            breakdown_result_csv["eff_size"].append(eff_size)
            breakdown_result_csv["eff_desc"].append(eff_desc if eff_desc is not None else "")
            breakdown_result_csv["is_improved"].append(is_improved)

Model: gemini-pro-chat, Task: math-number
No significant difference
---
CoT Model: gemini-pro-chat, Task: math-number
No significant difference
---
Model: gemini-pro-chat, Task: math-algebra
>>> Significant difference, p-value: 0.014084670857345055
>>> Effect size: -0.23214285714285715
---
CoT Model: gemini-pro-chat, Task: math-algebra
>>> Significant difference, p-value: 3.994668363690491e-12
>>> Effect size: -0.4222222222222223
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: gemini-pro-chat, Task: anli
No significant difference
---
CoT Model: gemini-pro-chat, Task: anli
>>> Significant difference, p-value: 0.00011845696117386576
>>> Effect size: 0.3154362416107383
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: gemini-pro-chat, Task: triviaqa
>>> Significant difference, p-value: 1.6127895061806714e-95
>>> Effect size: 0.9534883720930232
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
CoT Mo

  res = _add_reduced_axes(res, reduced_axes, keepdims)


No significant difference
---
Model: gemini-pro-chat, Task: race-m
No significant difference
---
CoT Model: gemini-pro-chat, Task: race-m
>>> Significant difference, p-value: 0.0022695643944767733
>>> Effect size: 0.3333333333333333
>>> Significant difference at 0.01
---
Model: gemini-pro-chat, Task: strategyqa
>>> Significant difference, p-value: 3.288406215761168e-58
>>> Effect size: 0.6582914572864322
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
CoT Model: gemini-pro-chat, Task: strategyqa
>>> Significant difference, p-value: 0.0
>>> Effect size: 0.9986657771847898
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: gemini-pro-chat, Task: winogrande
No significant difference
---
CoT Model: gemini-pro-chat, Task: winogrande
>>> Significant difference, p-value: 2.1988922589396707e-173
>>> Effect size: 1.0
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: gemini-pro-chat, Task: math-pre-algebra


In [45]:
breakdown_result_df = pd.DataFrame(breakdown_result_csv)
breakdown_result_df.to_csv(Path.cwd().parent / "analysis_results" / "breakdown_result.csv")

In [46]:
breakdown_result_df.head()

Unnamed: 0,model,task,pe_technique,significance at 0.05,significance at 0.01,significance at 0.001,eff_size,eff_desc,is_improved
0,gemini-pro-chat,math-number,zero-shot vs null-shot,False,False,False,,,False
1,gemini-pro-chat,math-number,zero-shot-cot vs null-shot-cot,False,False,False,,,True
2,gemini-pro-chat,math-algebra,zero-shot vs null-shot,True,False,False,-0.232143,small,True
3,gemini-pro-chat,math-algebra,zero-shot-cot vs null-shot-cot,True,True,True,-0.422222,medium,True
4,gemini-pro-chat,anli,zero-shot vs null-shot,False,False,False,,,True
