In [1]:
import pandas as pd
from pathlib import Path
from scipy import stats
import json
import dabest
import pingouin as pg

In [2]:
def stat_test(data1, data2):
    _, p1 = stats.shapiro(data1)
    _, p2 = stats.shapiro(data2)
    
    val = pg.wilcoxon(data1, data2, alternative='two-sided')
    p = val['p-val'].values[0]
    eff_size = val['RBC'].values[0]
    eff_desc = "small" if abs(eff_size) < 0.3 else "medium" if abs(eff_size) < 0.5 else "large"
        
    p_value_threshold = 0.05
    
    if p < p_value_threshold:
        print(">>> Significant difference, p-value:", p)
        print(">>> Effect size:", eff_size)
        if p < 0.01:
            print(">>> Significant difference at 0.01")
        if p < 0.001:
            print(">>> Significant difference at 0.001")
    else:
        print("No significant difference")
        
    eff_size = eff_size if p < p_value_threshold else None
    eff_desc = eff_desc if p < p_value_threshold else None
    
    return p1, p2, p, eff_size, eff_desc

In [3]:
result_path = Path.cwd().parent / "analysis_results" / "summary.csv"
breakdown_data_path = Path.cwd().parent / "analysis_results" / "breakdown_data.json"

In [4]:
df = pd.read_csv(result_path, index_col=(0, 1, 2))
df.head()

Unnamed: 0,Unnamed: 1,Unnamed: 2,model,task,pe_technique,correct,total,accuracy,created_at,mean_length,median_length,std_length,min_length,max_length,mean_time_taken,median_time_taken,std_time_taken,min_time_taken,max_time_taken,total_time_taken
pythia-160m,math-number,zero-shot,pythia-160m,math-number,zero-shot,0,540,0.0,2024-05-04-08-53-45,10366.17037,10000,2502.600344,4331,17701,19.871409,19.951003,0.506602,14.162299,20.882381,10730.560791
pythia-160m,math-number,null-shot,pythia-160m,math-number,null-shot,0,540,0.0,2024-05-04-11-52-37,11041.246296,11183,2716.999606,5558,16882,19.775824,19.806719,0.174244,18.703898,20.600824,10678.945112
pythia-160m,math-algebra,zero-shot,pythia-160m,math-algebra,zero-shot,0,1187,0.0,2024-05-03-09-25-22,10021.422072,9558,2816.592556,3172,27789,19.980845,19.958036,1.788766,14.680008,77.402771,23717.263572
pythia-160m,math-algebra,null-shot,pythia-160m,math-algebra,null-shot,0,1187,0.0,2024-05-03-16-00-41,10236.326032,9825,2907.029382,4342,23504,19.751473,19.790684,0.319809,15.446048,20.999895,23444.998943
pythia-160m,anli,zero-shot,pythia-160m,anli,zero-shot,90,1200,7.5,2024-03-02-10-13-32,13467.106667,13640,2102.988893,212,21359,32.997478,33.507348,2.462416,0.39839,34.808553,39596.973934


In [5]:
main_df = df[~df.index.get_level_values(0).str.contains("pythia|qwen|llama")]
main_df.head()

Unnamed: 0,Unnamed: 1,Unnamed: 2,model,task,pe_technique,correct,total,accuracy,created_at,mean_length,median_length,std_length,min_length,max_length,mean_time_taken,median_time_taken,std_time_taken,min_time_taken,max_time_taken,total_time_taken
gemini-pro-chat,math-number,zero-shot,gemini-pro-chat,math-number,zero-shot,87,540,16.111111,2024-04-30-11-33-56,33.053704,14,183.17232,9,2054,1.123837,0.862738,2.492253,0.740963,29.687569,606.871817
gemini-pro-chat,math-number,null-shot-cot,gemini-pro-chat,math-number,null-shot-cot,102,540,18.888889,2024-04-30-12-20-47,480.907407,329,625.793667,0,5980,4.188187,2.892331,5.092414,0.805594,30.439521,2261.621232
gemini-pro-chat,math-number,zero-shot-cot,gemini-pro-chat,math-number,zero-shot-cot,90,540,16.666667,2024-04-30-11-59-20,199.811111,15,512.343358,0,5970,2.215503,0.928071,3.917813,0.749846,30.596659,1196.371361
gemini-pro-chat,math-number,null-shot,gemini-pro-chat,math-number,null-shot,87,540,16.111111,2024-04-30-11-46-18,41.933333,14,218.417878,13,2924,1.226178,0.88891,2.764307,0.775016,29.742396,662.136272
gemini-pro-chat,math-algebra,zero-shot,gemini-pro-chat,math-algebra,zero-shot,295,1187,24.85257,2024-04-30-06-13-02,40.142376,14,131.776185,9,3585,1.070296,0.868555,1.044572,0.734635,29.855753,1270.441327


# Overall comparison

In [6]:
zero_shot = main_df[main_df['pe_technique'] == 'zero-shot']['accuracy']
null_shot = main_df[main_df['pe_technique'] == 'null-shot']['accuracy']

print("Overall")
print(stat_test(zero_shot, null_shot))
print('---')

zero_shot_cot = main_df[main_df['pe_technique'] == 'zero-shot-cot']['accuracy']
null_shot_cot = main_df[main_df['pe_technique'] == 'null-shot-cot']['accuracy']

print("CoT Overall")
print(stat_test(zero_shot_cot, null_shot_cot))
print('---')

Overall
No significant difference
(1.60969150897472e-07, 4.131803507334553e-06, 0.17704197844605585, None, None)
---
CoT Overall
>>> Significant difference, p-value: 0.0031985852513144496
>>> Effect size: 0.2794304371485579
>>> Significant difference at 0.01
(6.504447810584679e-05, 1.0607135664031375e-05, 0.0031985852513144496, 0.2794304371485579, 'small')
---


# Task-wise comparison

In [7]:
for task in main_df['task'].unique():
    zero_shot = main_df[(main_df['task'] == task) & (main_df['pe_technique'] == 'zero-shot')]['accuracy']
    null_shot = main_df[(main_df['task'] == task) & (main_df['pe_technique'] == 'null-shot')]['accuracy']
    
    print(f"Task: {task}")
    print(stat_test(zero_shot, null_shot))
    print('---')
    
    zero_shot_cot = main_df[(main_df['task'] == task) & (main_df['pe_technique'] == 'zero-shot-cot')]['accuracy']
    null_shot_cot = main_df[(main_df['task'] == task) & (main_df['pe_technique'] == 'null-shot-cot')]['accuracy']
    
    if len(zero_shot_cot) > 0 and len(null_shot_cot) > 0:
        print(f"CoT Task: {task}")
        print(stat_test(zero_shot_cot, null_shot_cot))
        print('---')

Task: math-number
No significant difference
(0.2781377136707306, 0.20945793390274048, 1.0, None, None)
---
CoT Task: math-number
No significant difference
(0.4087183475494385, 0.22742445766925812, 0.375, None, None)
---
Task: math-algebra
No significant difference
(0.8827275633811951, 0.6989490985870361, 0.625, None, None)
---
CoT Task: math-algebra
No significant difference
(0.9025053381919861, 0.3649294972419739, 0.76953125, None, None)
---
Task: anli
No significant difference
(0.6353700757026672, 0.8392505645751953, 0.6953125, None, None)
---
CoT Task: anli
No significant difference
(0.9380416870117188, 0.5026674866676331, 0.16015625, None, None)
---
Task: triviaqa
>>> Significant difference, p-value: 0.037109375
>>> Effect size: 0.7454545454545454
(0.511067271232605, 0.03577284887433052, 0.037109375, 0.7454545454545454, 'large')
---
CoT Task: triviaqa
>>> Significant difference, p-value: 0.00390625
>>> Effect size: 0.9636363636363636
>>> Significant difference at 0.01
(0.5159384608



# Model-wise comparison

In [8]:
for model in main_df['model'].unique():
    zero_shot = main_df[(main_df['model'] == model) & (main_df['pe_technique'] == 'zero-shot')]['accuracy']
    null_shot = main_df[(main_df['model'] == model) & (main_df['pe_technique'] == 'null-shot')]['accuracy']
    
    print(f"Model: {model}")
    print(stat_test(zero_shot, null_shot))
    print('---')
    
    zero_shot_cot = main_df[(main_df['model'] == model) & (main_df['pe_technique'] == 'zero-shot-cot')]['accuracy']
    null_shot_cot = main_df[(main_df['model'] == model) & (main_df['pe_technique'] == 'null-shot-cot')]['accuracy']
    
    if len(zero_shot_cot) > 0 and len(null_shot_cot) > 0:
        print(f"CoT Model: {model}")
        print(stat_test(zero_shot_cot, null_shot_cot))
        print('---')

Model: gemini-pro-chat
No significant difference
(0.017613448202610016, 0.04594356566667557, 0.2959265995411936, None, None)
---
CoT Model: gemini-pro-chat
No significant difference
(0.12673737108707428, 0.2297903150320053, 0.67877197265625, None, None)
---
Model: palm-2-chat
>>> Significant difference, p-value: 7.62939453125e-06
>>> Effect size: -0.9894736842105263
>>> Significant difference at 0.01
>>> Significant difference at 0.001
(0.0005348818958736956, 0.0012115583522245288, 7.62939453125e-06, -0.9894736842105263, 'large')
---
CoT Model: palm-2-chat
No significant difference
(0.0024478016421198845, 0.0030566509813070297, 0.389404296875, None, None)
---
Model: claude-3-opus
>>> Significant difference, p-value: 0.005329132080078125
>>> Effect size: 0.7052631578947368
>>> Significant difference at 0.01
(0.01336666289716959, 0.06693920493125916, 0.005329132080078125, 0.7052631578947368, 'large')
---
CoT Model: claude-3-opus
No significant difference
(0.5264863967895508, 0.1794133633



# Breakdown comparison

## Data Preparation

In [9]:
raw_result_path = Path.cwd().parent / "results"

In [10]:
breakdown_data = {}
for model_path in raw_result_path.iterdir():
    if not model_path.is_dir():
        continue
    model = model_path.name
    breakdown_data[model] = {}
    
    for task_path in model_path.iterdir():
        if not task_path.is_dir():
            continue
        task = task_path.name
        breakdown_data[model][task] = {}
        
        for pe_path in task_path.iterdir():
            if not pe_path.is_dir():
                continue
            pe_technique = pe_path.name
            breakdown_data[model][task][pe_technique] = []
            
            for result_path in pe_path.iterdir():
                if not result_path.is_file() or result_path.suffix != ".json" or result_path.stem == "summary":
                    continue
                with result_path.open() as f:
                    result = json.load(f)
                    
                breakdown_data[model][task][pe_technique].append(1 if result["is_correct"] else 0)

In [11]:
with breakdown_data_path.open("w") as f:
    json.dump(breakdown_data, f)

## Analysis

In [12]:
with breakdown_data_path.open() as f:
    breakdown_data = json.load(f)

In [13]:
val = pg.wilcoxon(breakdown_data['gpt-3.5-turbo']['gsm8k']['zero-shot'], breakdown_data['gpt-3.5-turbo']['gsm8k']['null-shot'], alternative='two-sided')
val

Unnamed: 0,W-val,alternative,p-val,RBC,CLES
Wilcoxon,25393.0,two-sided,1.535776e-08,-0.291005,0.458302


In [14]:
val['p-val'].values[0]

1.535776458045913e-08

In [15]:
dabest.effsize.cohens_h(breakdown_data['gpt-3.5-turbo']['gsm8k']['zero-shot'], breakdown_data['gpt-3.5-turbo']['gsm8k']['null-shot'])

0.16982534439935937

In [16]:
breakdown_result_csv = {
    "model": [],
    "task": [],
    "pe_technique": [],
    "significance at 0.05": [],
    "significance at 0.01": [],
    "significance at 0.001": [],
    "eff_size": [],
    "eff_desc": [],
    "is_improved": []
}
for model in breakdown_data:
    if model.startswith("pythia") or model.startswith("qwen") or model.startswith("llama"):
        continue
    for task in breakdown_data[model]:
        print(f"Model: {model}, Task: {task}")
        zero_shot = breakdown_data[model][task]['zero-shot']
        null_shot = breakdown_data[model][task]['null-shot']
        _, _, p, eff_size, eff_desc = stat_test(zero_shot, null_shot)
        is_improved = sum(breakdown_data[model][task]['null-shot']) > sum(breakdown_data[model][task]['zero-shot'])
        print('---')
        breakdown_result_csv["model"].append(model)
        breakdown_result_csv["task"].append(task)
        breakdown_result_csv["pe_technique"].append("zero-shot vs null-shot")
        breakdown_result_csv["significance at 0.05"].append(p < 0.05)
        breakdown_result_csv["significance at 0.01"].append(p < 0.01)
        breakdown_result_csv["significance at 0.001"].append(p < 0.001)
        breakdown_result_csv["eff_size"].append(eff_size)
        breakdown_result_csv["eff_desc"].append(eff_desc if eff_desc is not None else "")
        breakdown_result_csv["is_improved"].append(is_improved)
        
        if 'zero-shot-cot' in breakdown_data[model][task] and 'null-shot-cot' in breakdown_data[model][task]:
            zero_shot_cot = breakdown_data[model][task]['zero-shot-cot']
            null_shot_cot = breakdown_data[model][task]['null-shot-cot']
            print(f"CoT Model: {model}, Task: {task}")
            _, _, p, eff_size, eff_desc = stat_test(zero_shot_cot, null_shot_cot)
            is_improved = sum(breakdown_data[model][task]['null-shot-cot']) > sum(breakdown_data[model][task]['zero-shot-cot'])
            print('---')
            breakdown_result_csv["model"].append(model)
            breakdown_result_csv["task"].append(task)
            breakdown_result_csv["pe_technique"].append("zero-shot-cot vs null-shot-cot")
            breakdown_result_csv["significance at 0.05"].append(p < 0.05)
            breakdown_result_csv["significance at 0.01"].append(p < 0.01)
            breakdown_result_csv["significance at 0.001"].append(p < 0.001)
            breakdown_result_csv["eff_size"].append(eff_size)
            breakdown_result_csv["eff_desc"].append(eff_desc if eff_desc is not None else "")
            breakdown_result_csv["is_improved"].append(is_improved)

Model: gemini-pro-chat, Task: math-number
No significant difference
---
CoT Model: gemini-pro-chat, Task: math-number
No significant difference
---
Model: gemini-pro-chat, Task: math-algebra
>>> Significant difference, p-value: 0.014084670857345076
>>> Effect size: -0.23214285714285715
---
CoT Model: gemini-pro-chat, Task: math-algebra
>>> Significant difference, p-value: 3.994668363690491e-12
>>> Effect size: -0.4222222222222223
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: gemini-pro-chat, Task: anli
No significant difference
---
CoT Model: gemini-pro-chat, Task: anli
>>> Significant difference, p-value: 0.00011845696117386572
>>> Effect size: 0.3154362416107383
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: gemini-pro-chat, Task: triviaqa
>>> Significant difference, p-value: 1.6127895061806714e-95
>>> Effect size: 0.9534883720930232
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
CoT Mo



No significant difference
---
Model: gemini-pro-chat, Task: race-m
No significant difference
---
CoT Model: gemini-pro-chat, Task: race-m
>>> Significant difference, p-value: 0.0022695643944767733
>>> Effect size: 0.3333333333333333
>>> Significant difference at 0.01
---
Model: gemini-pro-chat, Task: strategyqa
>>> Significant difference, p-value: 3.288406215761169e-58
>>> Effect size: 0.6582914572864322
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
CoT Model: gemini-pro-chat, Task: strategyqa
>>> Significant difference, p-value: 0.0
>>> Effect size: 0.9986657771847898
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: gemini-pro-chat, Task: winogrande
No significant difference
---
CoT Model: gemini-pro-chat, Task: winogrande
>>> Significant difference, p-value: 2.198892258939671e-173
>>> Effect size: 1.0
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: gemini-pro-chat, Task: math-pre-algebra
>



>>> Significant difference, p-value: 5.144142228741655e-08
>>> Effect size: -0.3333333333333333
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: gemini-pro-chat, Task: race-h
>>> Significant difference, p-value: 0.0005389059834773553
>>> Effect size: -0.26627218934911245
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
CoT Model: gemini-pro-chat, Task: race-h
No significant difference
---
Model: gemini-pro-chat, Task: halueval-general
No significant difference
---
Model: gemini-pro-chat, Task: halueval-dialogue




No significant difference
---
Model: gemini-pro-chat, Task: math-geometry
No significant difference
---
CoT Model: gemini-pro-chat, Task: math-geometry
>>> Significant difference, p-value: 0.03252798168069465
>>> Effect size: -0.2698412698412698
---
Model: gemini-pro-chat, Task: math-count-prob
No significant difference
---
CoT Model: gemini-pro-chat, Task: math-count-prob
>>> Significant difference, p-value: 0.00919377535526991
>>> Effect size: -0.3584905660377358
>>> Significant difference at 0.01
---
Model: gemini-pro-chat, Task: math-pre-calc
No significant difference
---
CoT Model: gemini-pro-chat, Task: math-pre-calc
No significant difference
---
Model: palm-2-chat, Task: math-number
>>> Significant difference, p-value: 0.0002729508421167817
>>> Effect size: -0.6666666666666667
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
CoT Model: palm-2-chat, Task: math-number
No significant difference
---
Model: palm-2-chat, Task: math-algebra
>>> Significant dif



>>> Significant difference, p-value: 6.896149172888035e-14
>>> Effect size: -0.6376811594202898
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: palm-2-chat, Task: race-m
No significant difference
---
CoT Model: palm-2-chat, Task: race-m
No significant difference
---
Model: palm-2-chat, Task: strategyqa
No significant difference
---
CoT Model: palm-2-chat, Task: strategyqa
>>> Significant difference, p-value: 7.9418780794543e-19
>>> Effect size: -0.44360902255639095
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: palm-2-chat, Task: winogrande
>>> Significant difference, p-value: 5.6235394295047173e-05
>>> Effect size: -0.3246753246753247
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
CoT Model: palm-2-chat, Task: winogrande
>>> Significant difference, p-value: 0.004074628261016221
>>> Effect size: 0.24285714285714288
>>> Significant difference at 0.01
---
Model: palm-2-chat, Task: math-pre-al



>>> Significant difference, p-value: 8.941972130258081e-105
>>> Effect size: -0.6446789797713282
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: palm-2-chat, Task: race-h
No significant difference
---
CoT Model: palm-2-chat, Task: race-h
No significant difference
---
Model: palm-2-chat, Task: halueval-general
>>> Significant difference, p-value: 2.812084344755682e-06
>>> Effect size: -0.2285714285714286
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: palm-2-chat, Task: halueval-dialogue




No significant difference
---
Model: palm-2-chat, Task: math-geometry
>>> Significant difference, p-value: 0.024273675794147815
>>> Effect size: -0.393939393939394
---
CoT Model: palm-2-chat, Task: math-geometry
No significant difference
---
Model: palm-2-chat, Task: math-count-prob
>>> Significant difference, p-value: 1.530000696393433e-05
>>> Effect size: -0.7222222222222223
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
CoT Model: palm-2-chat, Task: math-count-prob
No significant difference
---
Model: palm-2-chat, Task: math-pre-calc
>>> Significant difference, p-value: 0.007299680691008223
>>> Effect size: -0.4838709677419355
>>> Significant difference at 0.01
---
CoT Model: palm-2-chat, Task: math-pre-calc
No significant difference
---
Model: claude-3-opus, Task: math-number
>>> Significant difference, p-value: 0.022085255577415065
>>> Effect size: 0.19424460431654672
---
CoT Model: claude-3-opus, Task: math-number
No significant difference
---
Model: c



No significant difference
---
Model: claude-3-opus, Task: race-m
>>> Significant difference, p-value: 0.0006325135617878961
>>> Effect size: -0.20138888888888884
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
CoT Model: claude-3-opus, Task: race-m
>>> Significant difference, p-value: 9.417941537783497e-05
>>> Effect size: 0.2542372881355932
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: claude-3-opus, Task: strategyqa
>>> Significant difference, p-value: 4.687384427411473e-309
>>> Effect size: 0.9619921363040629
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
CoT Model: claude-3-opus, Task: strategyqa
>>> Significant difference, p-value: 1.2713431429381966e-141
>>> Effect size: 0.7752808988764044
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: claude-3-opus, Task: winogrande
>>> Significant difference, p-value: 2.41602571365644e-195
>>> Effect size: 0.997760358342



No significant difference
---
Model: claude-3-opus, Task: race-h
>>> Significant difference, p-value: 8.569289257883766e-11
>>> Effect size: -0.23018867924528297
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
CoT Model: claude-3-opus, Task: race-h
>>> Significant difference, p-value: 1.5636096050085343e-14
>>> Effect size: 0.25438596491228066
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: claude-3-opus, Task: halueval-general
No significant difference
---
Model: claude-3-opus, Task: halueval-dialogue




>>> Significant difference, p-value: 9.992770807791968e-20
>>> Effect size: 0.15528312901342672
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: claude-3-opus, Task: math-geometry
No significant difference
---
CoT Model: claude-3-opus, Task: math-geometry
No significant difference
---
Model: claude-3-opus, Task: math-count-prob
>>> Significant difference, p-value: 0.005640074686773696
>>> Effect size: 0.28421052631578947
>>> Significant difference at 0.01
---
CoT Model: claude-3-opus, Task: math-count-prob
No significant difference
---
Model: claude-3-opus, Task: math-pre-calc
No significant difference
---
CoT Model: claude-3-opus, Task: math-pre-calc
No significant difference
---
Model: claude-2.1, Task: math-number
No significant difference
---
CoT Model: claude-2.1, Task: math-number
No significant difference
---
Model: claude-2.1, Task: math-algebra
>>> Significant difference, p-value: 0.012372945407322762
>>> Effect size: 0.16049382716049387
---
Co



>>> Significant difference, p-value: 3.861895297310729e-16
>>> Effect size: 0.16331456154465007
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: claude-2.1, Task: race-m
No significant difference
---
CoT Model: claude-2.1, Task: race-m
>>> Significant difference, p-value: 0.02763272388566593
>>> Effect size: -0.09153713298791022
---
Model: claude-2.1, Task: strategyqa
>>> Significant difference, p-value: 6.19800703725699e-101
>>> Effect size: 0.6934460887949261
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
CoT Model: claude-2.1, Task: strategyqa
>>> Significant difference, p-value: 3.3762093270943895e-72
>>> Effect size: 0.6294478527607361
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: claude-2.1, Task: winogrande
>>> Significant difference, p-value: 3.6565908361335234e-06
>>> Effect size: 0.8620689655172413
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
CoT Mode



>>> Significant difference, p-value: 5.791037403212491e-51
>>> Effect size: 0.37330037082818296
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: claude-2.1, Task: race-h
No significant difference
---
CoT Model: claude-2.1, Task: race-h
No significant difference
---
Model: claude-2.1, Task: halueval-general
No significant difference
---
Model: claude-2.1, Task: halueval-dialogue




>>> Significant difference, p-value: 3.3482067760236706e-69
>>> Effect size: 0.47126436781609193
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: claude-2.1, Task: math-geometry
No significant difference
---
CoT Model: claude-2.1, Task: math-geometry
No significant difference
---
Model: claude-2.1, Task: math-count-prob
No significant difference
---
CoT Model: claude-2.1, Task: math-count-prob
No significant difference
---
Model: claude-2.1, Task: math-pre-calc
No significant difference
---
CoT Model: claude-2.1, Task: math-pre-calc
No significant difference
---
Model: palm-2-text, Task: math-number
No significant difference
---
CoT Model: palm-2-text, Task: math-number
No significant difference
---
Model: palm-2-text, Task: math-algebra
No significant difference
---
CoT Model: palm-2-text, Task: math-algebra
No significant difference
---
Model: palm-2-text, Task: anli
No significant difference
---
CoT Model: palm-2-text, Task: anli
No significant diffe



>>> Significant difference, p-value: 3.8205830617664246e-34
>>> Effect size: -0.4682422451994092
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: palm-2-text, Task: race-m
>>> Significant difference, p-value: 0.005271275568450326
>>> Effect size: -0.3548387096774193
>>> Significant difference at 0.01
---
CoT Model: palm-2-text, Task: race-m
>>> Significant difference, p-value: 5.202080213967457e-31
>>> Effect size: -0.812807881773399
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: palm-2-text, Task: strategyqa
>>> Significant difference, p-value: 2.5696403762323395e-17
>>> Effect size: -0.4777070063694267
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
CoT Model: palm-2-text, Task: strategyqa
>>> Significant difference, p-value: 0.0005672382519636846
>>> Effect size: 0.15430861723446898
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: palm-2-text, Task: winogra



No significant difference
---
Model: palm-2-text, Task: race-h
>>> Significant difference, p-value: 1.0613232004248148e-08
>>> Effect size: -0.35968379446640314
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
CoT Model: palm-2-text, Task: race-h
>>> Significant difference, p-value: 8.373345619744243e-71
>>> Effect size: -0.7464788732394365
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: palm-2-text, Task: halueval-general
>>> Significant difference, p-value: 7.080427141953646e-06
>>> Effect size: -0.3540372670807453
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: palm-2-text, Task: halueval-dialogue




>>> Significant difference, p-value: 3.433826946384226e-11
>>> Effect size: -0.32530120481927716
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: palm-2-text, Task: math-geometry
No significant difference
---
CoT Model: palm-2-text, Task: math-geometry
No significant difference
---
Model: palm-2-text, Task: math-count-prob
>>> Significant difference, p-value: 0.007736684809190524
>>> Effect size: 0.8181818181818181
>>> Significant difference at 0.01
---
CoT Model: palm-2-text, Task: math-count-prob
No significant difference
---
Model: palm-2-text, Task: math-pre-calc
No significant difference
---
CoT Model: palm-2-text, Task: math-pre-calc
No significant difference
---
Model: claude-3-haiku, Task: math-number
No significant difference
---
CoT Model: claude-3-haiku, Task: math-number
No significant difference
---
Model: claude-3-haiku, Task: math-algebra
No significant difference
---
CoT Model: claude-3-haiku, Task: math-algebra
No significant difference



>>> Significant difference, p-value: 4.3645211880089416e-13
>>> Effect size: -0.16657852987837124
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: claude-3-haiku, Task: race-m
>>> Significant difference, p-value: 3.483577554140189e-08
>>> Effect size: 0.29824561403508776
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
CoT Model: claude-3-haiku, Task: race-m
>>> Significant difference, p-value: 2.2115892670143177e-32
>>> Effect size: -0.48571428571428577
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: claude-3-haiku, Task: strategyqa
>>> Significant difference, p-value: 7.225940242726711e-31
>>> Effect size: 0.39716312056737585
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
CoT Model: claude-3-haiku, Task: strategyqa
No significant difference
---
Model: claude-3-haiku, Task: winogrande
>>> Significant difference, p-value: 4.758890867259583e-41
>>> Effect size: 0.7032



>>> Significant difference, p-value: 7.0342458473952414e-09
>>> Effect size: 0.14769030579050102
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: claude-3-haiku, Task: race-h
>>> Significant difference, p-value: 4.2172095045842536e-09
>>> Effect size: 0.19072708113804
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
CoT Model: claude-3-haiku, Task: race-h
>>> Significant difference, p-value: 9.029016858935683e-48
>>> Effect size: -0.4150326797385621
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: claude-3-haiku, Task: halueval-general
>>> Significant difference, p-value: 0.00017542184522085485
>>> Effect size: 0.4146341463414634
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: claude-3-haiku, Task: halueval-dialogue




>>> Significant difference, p-value: 1.9540143994364876e-06
>>> Effect size: -0.16525934861278646
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: claude-3-haiku, Task: math-geometry
No significant difference
---
CoT Model: claude-3-haiku, Task: math-geometry
No significant difference
---
Model: claude-3-haiku, Task: math-count-prob
No significant difference
---
CoT Model: claude-3-haiku, Task: math-count-prob
No significant difference
---
Model: claude-3-haiku, Task: math-pre-calc
No significant difference
---
CoT Model: claude-3-haiku, Task: math-pre-calc
>>> Significant difference, p-value: 0.02845958438710001
>>> Effect size: -0.25333333333333335
---
Model: gemini-pro-text, Task: math-number
No significant difference
---
CoT Model: gemini-pro-text, Task: math-number
No significant difference
---
Model: gemini-pro-text, Task: math-algebra
>>> Significant difference, p-value: 0.023444378034353848
>>> Effect size: -0.21428571428571425
---
CoT Model: ge



No significant difference
---
Model: gemini-pro-text, Task: race-m
>>> Significant difference, p-value: 0.008840244253781776
>>> Effect size: -0.2987012987012987
>>> Significant difference at 0.01
---
CoT Model: gemini-pro-text, Task: race-m
>>> Significant difference, p-value: 0.001020452006388926
>>> Effect size: 0.30000000000000004
>>> Significant difference at 0.01
---
Model: gemini-pro-text, Task: strategyqa
>>> Significant difference, p-value: 1.1064011128301779e-53
>>> Effect size: 0.6345177664974619
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
CoT Model: gemini-pro-text, Task: strategyqa
>>> Significant difference, p-value: 0.0
>>> Effect size: 1.0
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: gemini-pro-text, Task: winogrande
No significant difference
---
CoT Model: gemini-pro-text, Task: winogrande
>>> Significant difference, p-value: 1.7992469038405102e-174
>>> Effect size: 1.0
>>> Significant difference at 0.



>>> Significant difference, p-value: 3.071444869175189e-08
>>> Effect size: -0.3333333333333333
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: gemini-pro-text, Task: race-h
>>> Significant difference, p-value: 0.0002051840085127536
>>> Effect size: -0.23770491803278687
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
CoT Model: gemini-pro-text, Task: race-h
>>> Significant difference, p-value: 0.04698816162837693
>>> Effect size: 0.136150234741784
---
Model: gemini-pro-text, Task: halueval-general
No significant difference
---
Model: gemini-pro-text, Task: halueval-dialogue




No significant difference
---
Model: gemini-pro-text, Task: math-geometry
No significant difference
---
CoT Model: gemini-pro-text, Task: math-geometry
No significant difference
---
Model: gemini-pro-text, Task: math-count-prob
No significant difference
---
CoT Model: gemini-pro-text, Task: math-count-prob
>>> Significant difference, p-value: 0.01636542788596126
>>> Effect size: -0.3214285714285714
---
Model: gemini-pro-text, Task: math-pre-calc
No significant difference
---
CoT Model: gemini-pro-text, Task: math-pre-calc
No significant difference
---
Model: gpt-4-turbo, Task: math-number
No significant difference
---
CoT Model: gpt-4-turbo, Task: math-number
No significant difference
---
Model: gpt-4-turbo, Task: math-algebra
>>> Significant difference, p-value: 0.001065998631732887
>>> Effect size: 0.23809523809523814
>>> Significant difference at 0.01
---
CoT Model: gpt-4-turbo, Task: math-algebra
No significant difference
---
Model: gpt-4-turbo, Task: anli
No significant difference



No significant difference
---
Model: gpt-4-turbo, Task: race-m
No significant difference
---
CoT Model: gpt-4-turbo, Task: race-m
>>> Significant difference, p-value: 0.010077474065287162
>>> Effect size: 0.11417322834645671
---
Model: gpt-4-turbo, Task: strategyqa
>>> Significant difference, p-value: 1.2351597769451855e-41
>>> Effect size: 0.6131687242798354
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
CoT Model: gpt-4-turbo, Task: strategyqa
No significant difference
---
Model: gpt-4-turbo, Task: winogrande
>>> Significant difference, p-value: 7.645306108200638e-29
>>> Effect size: 0.5544554455445545
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
CoT Model: gpt-4-turbo, Task: winogrande
>>> Significant difference, p-value: 9.51473798420831e-14
>>> Effect size: -0.34029227557411273
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: gpt-4-turbo, Task: math-pre-algebra
No significant difference
---




>>> Significant difference, p-value: 9.01755468393317e-15
>>> Effect size: 0.19640564826700896
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: gpt-4-turbo, Task: race-h
No significant difference
---
CoT Model: gpt-4-turbo, Task: race-h
>>> Significant difference, p-value: 4.374608170131553e-07
>>> Effect size: 0.13795674869500374
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: gpt-4-turbo, Task: halueval-general
No significant difference
---
Model: gpt-4-turbo, Task: halueval-dialogue




No significant difference
---
Model: gpt-4-turbo, Task: math-geometry
>>> Significant difference, p-value: 0.032772913904737556
>>> Effect size: 0.240506329113924
---
CoT Model: gpt-4-turbo, Task: math-geometry
No significant difference
---
Model: gpt-4-turbo, Task: math-count-prob
No significant difference
---
CoT Model: gpt-4-turbo, Task: math-count-prob
No significant difference
---
Model: gpt-4-turbo, Task: math-pre-calc
>>> Significant difference, p-value: 0.03498177199069837
>>> Effect size: 0.21212121212121215
---
CoT Model: gpt-4-turbo, Task: math-pre-calc
No significant difference
---
Model: gpt-3.5-turbo, Task: math-number
>>> Significant difference, p-value: 1.1306217906538634e-07
>>> Effect size: -0.4468085106382979
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
CoT Model: gpt-3.5-turbo, Task: math-number
No significant difference
---
Model: gpt-3.5-turbo, Task: math-algebra
>>> Significant difference, p-value: 3.824100078994672e-25
>>> Effect si



No significant difference
---
Model: gpt-3.5-turbo, Task: race-m
>>> Significant difference, p-value: 0.01063868331841713
>>> Effect size: 0.2972972972972973
---
CoT Model: gpt-3.5-turbo, Task: race-m
>>> Significant difference, p-value: 0.004897537034152014
>>> Effect size: 0.19801980198019797
>>> Significant difference at 0.01
---
Model: gpt-3.5-turbo, Task: strategyqa
>>> Significant difference, p-value: 0.005125769063995782
>>> Effect size: -0.17037037037037034
>>> Significant difference at 0.01
---
CoT Model: gpt-3.5-turbo, Task: strategyqa
>>> Significant difference, p-value: 8.622191278260579e-12
>>> Effect size: 0.23310023310023315
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: gpt-3.5-turbo, Task: winogrande
No significant difference
---
CoT Model: gpt-3.5-turbo, Task: winogrande
>>> Significant difference, p-value: 0.003172436141958619
>>> Effect size: 0.12262521588946457
>>> Significant difference at 0.01
---
Model: gpt-3.5-turbo, Task: mat



>>> Significant difference, p-value: 1.1487011917056215e-27
>>> Effect size: -0.32732191163210095
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: gpt-3.5-turbo, Task: race-h
>>> Significant difference, p-value: 0.016777926358581073
>>> Effect size: 0.1683168316831683
---
CoT Model: gpt-3.5-turbo, Task: race-h
No significant difference
---
Model: gpt-3.5-turbo, Task: halueval-general
No significant difference
---
Model: gpt-3.5-turbo, Task: halueval-dialogue




>>> Significant difference, p-value: 3.905847064830899e-15
>>> Effect size: -0.21292217327459623
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: gpt-3.5-turbo, Task: math-geometry
>>> Significant difference, p-value: 0.01757599773640744
>>> Effect size: -0.23529411764705888
---
CoT Model: gpt-3.5-turbo, Task: math-geometry
No significant difference
---
Model: gpt-3.5-turbo, Task: math-count-prob
>>> Significant difference, p-value: 0.005597193231864035
>>> Effect size: -0.2745098039215686
>>> Significant difference at 0.01
---
CoT Model: gpt-3.5-turbo, Task: math-count-prob
No significant difference
---
Model: gpt-3.5-turbo, Task: math-pre-calc
No significant difference
---
CoT Model: gpt-3.5-turbo, Task: math-pre-calc
No significant difference
---
Model: claude-3-sonnet, Task: math-number
No significant difference
---
CoT Model: claude-3-sonnet, Task: math-number
>>> Significant difference, p-value: 2.179284069436163e-23
>>> Effect size: 0.72486772486



>>> Significant difference, p-value: 0.0004546709769085569
>>> Effect size: -0.06791744840525332
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: claude-3-sonnet, Task: race-m
>>> Significant difference, p-value: 8.426314328192002e-23
>>> Effect size: 0.4830917874396135
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
CoT Model: claude-3-sonnet, Task: race-m
>>> Significant difference, p-value: 1.1396626079862228e-09
>>> Effect size: -0.2808510638297872
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: claude-3-sonnet, Task: strategyqa
>>> Significant difference, p-value: 9.293496288971375e-115
>>> Effect size: 0.6803571428571429
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
CoT Model: claude-3-sonnet, Task: strategyqa
>>> Significant difference, p-value: 4.382429096537627e-202
>>> Effect size: 0.9055258467023173
>>> Significant difference at 0.01
>>> Significant diff



>>> Significant difference, p-value: 7.868655909081431e-47
>>> Effect size: 0.214016411621202
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: claude-3-sonnet, Task: race-h
>>> Significant difference, p-value: 1.3319362149167647e-37
>>> Effect size: 0.42116630669546434
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
CoT Model: claude-3-sonnet, Task: race-h
>>> Significant difference, p-value: 2.1271033670012279e-13
>>> Effect size: -0.20181405895691612
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: claude-3-sonnet, Task: halueval-general
No significant difference
---
Model: claude-3-sonnet, Task: halueval-dialogue




>>> Significant difference, p-value: 1.1522324765694896e-19
>>> Effect size: -0.2400279916025192
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: claude-3-sonnet, Task: math-geometry
>>> Significant difference, p-value: 0.015442095157822175
>>> Effect size: 0.29411764705882354
---
CoT Model: claude-3-sonnet, Task: math-geometry
>>> Significant difference, p-value: 8.88607392654154e-23
>>> Effect size: 0.8394160583941606
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: claude-3-sonnet, Task: math-count-prob
No significant difference
---
CoT Model: claude-3-sonnet, Task: math-count-prob
>>> Significant difference, p-value: 2.1663622326265138e-27
>>> Effect size: 0.8709677419354838
>>> Significant difference at 0.01
>>> Significant difference at 0.001
---
Model: claude-3-sonnet, Task: math-pre-calc
No significant difference
---
CoT Model: claude-3-sonnet, Task: math-pre-calc
>>> Significant difference, p-value: 2.2975625870

In [17]:
breakdown_result_df = pd.DataFrame(breakdown_result_csv)
breakdown_result_df.to_csv(Path.cwd().parent / "analysis_results" / "breakdown_result.csv")

In [18]:
breakdown_result_df.head()

Unnamed: 0,model,task,pe_technique,significance at 0.05,significance at 0.01,significance at 0.001,eff_size,eff_desc,is_improved
0,gemini-pro-chat,math-number,zero-shot vs null-shot,False,False,False,,,False
1,gemini-pro-chat,math-number,zero-shot-cot vs null-shot-cot,False,False,False,,,True
2,gemini-pro-chat,math-algebra,zero-shot vs null-shot,True,False,False,-0.232143,small,True
3,gemini-pro-chat,math-algebra,zero-shot-cot vs null-shot-cot,True,True,True,-0.422222,medium,True
4,gemini-pro-chat,anli,zero-shot vs null-shot,False,False,False,,,True


In [19]:
breakdown_result_df = pd.read_csv(Path.cwd().parent / "analysis_results" / "breakdown_result.csv", index_col=0)

In [20]:
breakdown_result_df.head()

Unnamed: 0,model,task,pe_technique,significance at 0.05,significance at 0.01,significance at 0.001,eff_size,eff_desc,is_improved
0,gemini-pro-chat,math-number,zero-shot vs null-shot,False,False,False,,,False
1,gemini-pro-chat,math-number,zero-shot-cot vs null-shot-cot,False,False,False,,,True
2,gemini-pro-chat,math-algebra,zero-shot vs null-shot,True,False,False,-0.232143,small,True
3,gemini-pro-chat,math-algebra,zero-shot-cot vs null-shot-cot,True,True,True,-0.422222,medium,True
4,gemini-pro-chat,anli,zero-shot vs null-shot,False,False,False,,,True


In [21]:
breakdown_result_df.pe_technique.unique()

array(['zero-shot vs null-shot', 'zero-shot-cot vs null-shot-cot'],
      dtype=object)

In [22]:
main_tasks = ['aqua', 'gsm8k', 'strategyqa', 'winogrande', 'race-m', 'race-h', 'anli', 'triviaqa']
math_tasks = ['math-number', 'math-algebra', 'math-pre-algebra', 'math-int-algebra', 'math-geometry', 'math-count-prob', 'math-pre-calc']
halueval_tasks = ['halueval-general', 'halueval-dialogue', 'halueval-summarization', 'halueval-qa']

stat_sig_cols = ['significance at 0.05', 'significance at 0.01', 'significance at 0.001']

In [23]:
def get_significant_count(selected_tasks, prompt):
    bk_result_df = breakdown_result_df[breakdown_result_df['task'].isin(selected_tasks)]
    bk_result_df = bk_result_df[bk_result_df['pe_technique'] == prompt]
    print("Number of combinations: ", len(bk_result_df))
    
    improved_bk_result_df = bk_result_df[bk_result_df['is_improved']]
    bk_result_df = improved_bk_result_df[improved_bk_result_df['task'].isin(selected_tasks)]
    print("Number of improved combinations: ", len(bk_result_df))
    
    sig_bk_result_df = bk_result_df[bk_result_df[stat_sig_cols].any(axis=1)]
    print("Number of improved and significant combinations: ", len(sig_bk_result_df))
    
    eff_desc = sig_bk_result_df['eff_desc']
    print("Effect size distribution:")
    print(eff_desc.value_counts())

In [24]:
get_significant_count(main_tasks, 'zero-shot vs null-shot')

Number of combinations:  80
Number of improved combinations:  34
Number of improved and significant combinations:  20
Effect size distribution:
medium    12
small      7
large      1
Name: eff_desc, dtype: int64


In [25]:
get_significant_count(math_tasks, 'zero-shot vs null-shot')

Number of combinations:  70
Number of improved combinations:  33
Number of improved and significant combinations:  17
Effect size distribution:
medium    7
small     5
large     5
Name: eff_desc, dtype: int64


In [26]:
get_significant_count(halueval_tasks, 'zero-shot vs null-shot')

Number of combinations:  40
Number of improved combinations:  25
Number of improved and significant combinations:  14
Effect size distribution:
medium    6
small     6
large     2
Name: eff_desc, dtype: int64


In [27]:
get_significant_count(main_tasks, 'zero-shot-cot vs null-shot-cot')

Number of combinations:  80
Number of improved combinations:  19
Number of improved and significant combinations:  11
Effect size distribution:
medium    4
small     4
large     3
Name: eff_desc, dtype: int64


In [28]:
get_significant_count(math_tasks, 'zero-shot-cot vs null-shot-cot')

Number of combinations:  70
Number of improved combinations:  40
Number of improved and significant combinations:  10
Effect size distribution:
medium    7
small     3
Name: eff_desc, dtype: int64
