In [1]:
import os
from pathlib import Path
from typing import List, Tuple

import numpy as np
import pandas as pd

# Repository root, taken as two directory levels above the current working directory.
REPO_ABS_PATH = Path(os.path.abspath("")).parent.parent

# Directory (string with a trailing slash) holding one sub-directory of prediction files per model.
PREDICTIONS_DIR = str(REPO_ABS_PATH) + "/unstated_norms_llm_bias/prompt_based_classification/predictions/"

# Per-model sub-directories of PREDICTIONS_DIR, each with a trailing slash.
# Index-aligned with PREDICTION_PREFIXES: the report cells read MODEL_DIRS[i]
# together with PREDICTION_PREFIXES[i].
MODEL_DIRS = [
    f"{model_name}/"
    for model_name in (
        "opt6_7b",
        "llama_7b",
        "llama2_7b",
        "llama3_8b",
        "mistral_7b",
        "qwen2_5_3b",
        "qwen2_5_7b",
        "gemma_7b",
    )
]

# Run identifiers appended to every prediction file name: run_1 ... run_5.
RUNS = [f"run_{run_number}" for run_number in range(1, 6)]

# Dataset / prompting-technique tags embedded in the prediction file names.
DATASETS = [
    "ZeroShot",
    "SST5",
    "SemEval",
    "zero_cot",
]

# File-name prefixes, one per model, in the same order as MODEL_DIRS.
# Note the OPT prefix ("opt_6_7b") is spelled differently from its directory ("opt6_7b/").
PREDICTION_PREFIXES = [
    f"{model_tag}_prompt_predictions_"
    for model_tag in (
        "opt_6_7b",
        "llama_7b",
        "llama2_7b",
        "llama3_8b",
        "mistral_7b",
        "qwen2_5_3b",
        "qwen2_5_7b",
        "gemma_7b",
    )
]


def construct_prediction_paths(model_dir: str, dataset: str, prediction_prefix: str) -> List[str]:
    """Build the prediction file path for every run of one model/dataset pair.

    Each path is the concatenation of ``PREDICTIONS_DIR``, ``model_dir``,
    ``prediction_prefix``, ``dataset``, an underscore, the run identifier and
    the ``.tsv`` extension. No check is made that the files exist.

    Args:
        model_dir: Model sub-directory, including its trailing slash.
        dataset: Dataset tag embedded in the file name.
        prediction_prefix: File-name prefix for the model.

    Returns:
        One path string per entry of ``RUNS``, in the same order.
    """
    # Everything except the run identifier is shared by all paths.
    shared_stem = f"{PREDICTIONS_DIR}{model_dir}{prediction_prefix}{dataset}_"
    return [f"{shared_stem}{run_id}.tsv" for run_id in RUNS]


def compute_accuracy_for_path(prediction_path: str, expected_num_predictions: int = 31972) -> float:
    """Compute the accuracy stored in one tab-separated predictions file.

    The file is read with ``sep="\\t"`` and must provide ``y_true`` and ``y_pred``
    columns. The accuracy is printed as ``Total Accuracy: <value>`` and returned.

    Args:
        prediction_path: Path to the tab-separated predictions file.
        expected_num_predictions: Number of rows the file must contain. Defaults
            to 31972, the row count this check was previously hard-coded to.

    Returns:
        The number of rows where ``y_true == y_pred`` divided by the number of rows.

    Raises:
        AssertionError: If the row count differs from ``expected_num_predictions``.
        ValueError: If the file contains no rows.
    """
    predictions_df = pd.read_csv(prediction_path, sep="\t")
    total_preds = len(predictions_df)

    # Validate the row count before computing or printing anything, so an
    # incomplete file never has an accuracy reported for it. This is raised
    # explicitly (rather than via ``assert``) so the check survives ``python -O``
    # while keeping the exception type the original assertion produced.
    if total_preds != expected_num_predictions:
        raise AssertionError(
            f"{prediction_path}: expected {expected_num_predictions} predictions, found {total_preds}"
        )
    if total_preds == 0:
        raise ValueError(f"{prediction_path}: no predictions to score")

    # NOTE(review): earlier commented-out per-class counts treated y_true values
    # 0/1/2 as negative/neutral/positive; the accuracy does not depend on that encoding.
    correct_preds = int((predictions_df["y_true"] == predictions_df["y_pred"]).sum())
    accuracy = correct_preds / total_preds
    print(f"Total Accuracy: {accuracy}")
    return accuracy


def compute_accuracy_and_std_dev_for_all_preds(prediction_paths: List[str]) -> Tuple[float, float]:
    """Summarise the accuracies of several prediction files.

    Every path is scored with ``compute_accuracy_for_path`` (which also prints
    the per-file accuracy), in the order given.

    Args:
        prediction_paths: Paths of the prediction files to score.

    Returns:
        A ``(mean, standard deviation)`` pair over the per-file accuracies. The
        standard deviation is computed with ``ddof=1`` (sample standard deviation).
    """
    per_run_accuracy = [compute_accuracy_for_path(run_path) for run_path in prediction_paths]
    mean_accuracy = np.mean(per_run_accuracy)
    sample_std_dev = np.std(per_run_accuracy, ddof=1)
    return mean_accuracy, sample_std_dev

In [2]:
# Report OPT 6.7B accuracy (mean and sample standard deviation over the runs in RUNS)
# for each prompting technique evaluated for this model.
print("OPT 6.7B Accuracy Stats for various Prompting Techniques\n")
opt_model_dir, opt_prediction_prefix = MODEL_DIRS[0], PREDICTION_PREFIXES[0]

# (section heading, dataset tag used in the prediction file names)
opt_prompting_setups = (
    ("Zero-Shot Prompting", DATASETS[0]),
    ("Few-Shot SST5 Prompting", DATASETS[1]),
    ("Few-Shot SemEval Prompting", DATASETS[2]),
)
for setup_index, (heading, dataset) in enumerate(opt_prompting_setups):
    if setup_index:
        print("")  # blank separator between sections; none after the last one
    print(heading)
    paths = construct_prediction_paths(opt_model_dir, dataset, opt_prediction_prefix)
    acc, std_dev = compute_accuracy_and_std_dev_for_all_preds(paths)
    print(f"Average Accuracy: {acc}\nStandard Deviation: {std_dev}")

OPT 6.7B Accuracy Stats for various Prompting Techniques

Zero-Shot Prompting
Total Accuracy: 0.45167646690854496
Total Accuracy: 0.4528650068810209
Total Accuracy: 0.448486175403478
Total Accuracy: 0.45330289002877516
Total Accuracy: 0.4481734017265107
Average Accuracy: 0.45090078818966595
Standard Deviation: 0.002423784108293601

Few-Shot SST5 Prompting
Total Accuracy: 0.45987113724508943
Total Accuracy: 0.40982734893031403
Total Accuracy: 0.44595270862004255
Total Accuracy: 0.40560490429125484
Total Accuracy: 0.4445452270736895
Average Accuracy: 0.43316026523207807
Standard Deviation: 0.024033609525978772

Few-Shot SemEval Prompting
Total Accuracy: 0.47113098961591393
Total Accuracy: 0.49039784811710246
Total Accuracy: 0.4802327036156637
Total Accuracy: 0.4752596021518829
Total Accuracy: 0.4924308770173902
Average Accuracy: 0.4818904041035907
Standard Deviation: 0.00929998223704825


In [3]:
# Report LLaMA-7B accuracy (mean and sample standard deviation over the runs in RUNS)
# for each prompting technique.
print("LLaMA-7B Accuracy Stats for various Prompting Techniques\n")
llama_model_dir = MODEL_DIRS[1]
llama_prediction_prefix = PREDICTION_PREFIXES[1]

# (section heading, dataset tag used in the prediction file names)
llama_prompting_setups = [
    ("Zero-Shot Prompting", DATASETS[0]),
    ("Few-Shot SST5 Prompting", DATASETS[1]),
    ("Few-Shot SemEval Prompting", DATASETS[2]),
    ("Zero-Shot CoT Prompting", DATASETS[3]),
]
for setup_index, (heading, dataset) in enumerate(llama_prompting_setups):
    if setup_index > 0:
        print("")  # blank separator between sections; none after the last one
    print(heading)
    paths = construct_prediction_paths(llama_model_dir, dataset, llama_prediction_prefix)
    acc, std_dev = compute_accuracy_and_std_dev_for_all_preds(paths)
    # Every section, CoT included, reports the mean over runs, so all share one label
    # (the CoT section previously printed "Accuracy:" for the same quantity).
    print(f"Average Accuracy: {acc}\nStandard Deviation: {std_dev}")

LLaMA-7B Accuracy Stats for various Prompting Techniques

Zero-Shot Prompting
Total Accuracy: 0.4839547103715751
Total Accuracy: 0.48251595145752535
Total Accuracy: 0.4828600025021894
Total Accuracy: 0.48295383460527963
Total Accuracy: 0.48877142499687226
Average Accuracy: 0.48421118478668834
Standard Deviation: 0.0026048996614612273

Few-Shot SST5 Prompting
Total Accuracy: 0.515795070686851
Total Accuracy: 0.5145439759789816
Total Accuracy: 0.5216439384461403
Total Accuracy: 0.5322156887276367
Total Accuracy: 0.5313399224321281
Average Accuracy: 0.5231077192543475
Standard Deviation: 0.00836180990125923

Few-Shot SemEval Prompting
Total Accuracy: 0.5537345177029901
Total Accuracy: 0.5237082447141248
Total Accuracy: 0.5846678343550606
Total Accuracy: 0.5402539722256975
Total Accuracy: 0.5099774802952584
Average Accuracy: 0.5424684098586263
Standard Deviation: 0.028810695270862243

Zero-Shot CoT Prompting
Total Accuracy: 0.4518954084824221
Total Accuracy: 0.4507068685099462
Total Accura

In [4]:
# Report LLaMA2-7B accuracy (mean and sample standard deviation over the runs in RUNS)
# for each prompting technique.
print("LLaMA2-7B Accuracy Stats for various Prompting Techniques\n")
llama2_model_dir = MODEL_DIRS[2]
llama2_prediction_prefix = PREDICTION_PREFIXES[2]

# (section heading, dataset tag used in the prediction file names)
llama2_prompting_setups = [
    ("Zero-Shot Prompting", DATASETS[0]),
    ("Few-Shot SST5 Prompting", DATASETS[1]),
    ("Few-Shot SemEval Prompting", DATASETS[2]),
    ("Zero-Shot CoT Prompting", DATASETS[3]),
]
for setup_index, (heading, dataset) in enumerate(llama2_prompting_setups):
    if setup_index > 0:
        print("")  # blank separator between sections; none after the last one
    print(heading)
    paths = construct_prediction_paths(llama2_model_dir, dataset, llama2_prediction_prefix)
    acc, std_dev = compute_accuracy_and_std_dev_for_all_preds(paths)
    # Every section, CoT included, reports the mean over runs, so all share one label
    # (the CoT section previously printed "Accuracy:" for the same quantity).
    print(f"Average Accuracy: {acc}\nStandard Deviation: {std_dev}")

LLaMA2-7B Accuracy Stats for various Prompting Techniques

Zero-Shot Prompting
Total Accuracy: 0.48307894407606655
Total Accuracy: 0.4829225572375829
Total Accuracy: 0.4842674840485425
Total Accuracy: 0.4862692355811335
Total Accuracy: 0.48020142624796697
Average Accuracy: 0.48334792943825844
Standard Deviation: 0.00220982958745633

Few-Shot SST5 Prompting
Total Accuracy: 0.5919241836607031
Total Accuracy: 0.5924871762792443
Total Accuracy: 0.6039034154885525
Total Accuracy: 0.6545727511572627
Total Accuracy: 0.6366820968347304
Average Accuracy: 0.6159139246840987
Standard Deviation: 0.028259486836683334

Few-Shot SemEval Prompting
Total Accuracy: 0.6674903040160141
Total Accuracy: 0.6280808207181283
Total Accuracy: 0.7037407731765295
Total Accuracy: 0.6623608157137495
Total Accuracy: 0.6082822469660953
Average Accuracy: 0.6539909921181033
Standard Deviation: 0.03704035788652678

Zero-Shot CoT Prompting
Total Accuracy: 0.4925247091204804
Total Accuracy: 0.4960590516702114
Total Accurac

In [5]:
# Report LLaMA3-8B accuracy (mean and sample standard deviation over the runs in RUNS)
# for each prompting technique.
print("LLaMA3-8B Accuracy Stats for various Prompting Techniques\n")
llama3_model_dir = MODEL_DIRS[3]
llama3_prediction_prefix = PREDICTION_PREFIXES[3]

# (section heading, dataset tag used in the prediction file names)
llama3_prompting_setups = [
    ("Zero-Shot Prompting", DATASETS[0]),
    ("Few-Shot SST5 Prompting", DATASETS[1]),
    ("Few-Shot SemEval Prompting", DATASETS[2]),
    ("Zero-Shot CoT Prompting", DATASETS[3]),
]
for setup_index, (heading, dataset) in enumerate(llama3_prompting_setups):
    if setup_index > 0:
        print("")  # blank separator between sections; none after the last one
    print(heading)
    paths = construct_prediction_paths(llama3_model_dir, dataset, llama3_prediction_prefix)
    acc, std_dev = compute_accuracy_and_std_dev_for_all_preds(paths)
    # Every section, CoT included, reports the mean over runs, so all share one label
    # (the CoT section previously printed "Accuracy:" for the same quantity).
    print(f"Average Accuracy: {acc}\nStandard Deviation: {std_dev}")

LLaMA3-8B Accuracy Stats for various Prompting Techniques

Zero-Shot Prompting
Total Accuracy: 0.600775678718879
Total Accuracy: 0.5987113724508946
Total Accuracy: 0.5945514825472288
Total Accuracy: 0.6039659702239459
Total Accuracy: 0.6011510071312398
Average Accuracy: 0.5998311022144377
Standard Deviation: 0.0034949851856602766

Few-Shot SST5 Prompting
Total Accuracy: 0.7194420117602902
Total Accuracy: 0.6887276366820968
Total Accuracy: 0.7156261729012886
Total Accuracy: 0.702145627423996
Total Accuracy: 0.7542224446390592
Average Accuracy: 0.7160327786813462
Standard Deviation: 0.024539063419639417

Few-Shot SemEval Prompting
Total Accuracy: 0.6853184036031528
Total Accuracy: 0.6915738771424997
Total Accuracy: 0.7033967221318654
Total Accuracy: 0.6732766170399099
Total Accuracy: 0.6589515826348055
Average Accuracy: 0.6825034405104466
Standard Deviation: 0.017077399798945923

Zero-Shot CoT Prompting
Total Accuracy: 0.5402226948580008
Total Accuracy: 0.5394407606655823
Total Accuracy:

In [6]:
# Report Mistral 7B accuracy (mean and sample standard deviation over the runs in RUNS)
# for each prompting technique.
print("Mistral 7B Accuracy Stats for various Prompting Techniques\n")
mistral_model_dir = MODEL_DIRS[4]
mistral_prediction_prefix = PREDICTION_PREFIXES[4]

# (section heading, dataset tag used in the prediction file names)
mistral_prompting_setups = [
    ("Zero-Shot Prompting", DATASETS[0]),
    ("Few-Shot SST5 Prompting", DATASETS[1]),
    ("Few-Shot SemEval Prompting", DATASETS[2]),
    ("Zero-Shot CoT Prompting", DATASETS[3]),
]
for setup_index, (heading, dataset) in enumerate(mistral_prompting_setups):
    if setup_index > 0:
        print("")  # blank separator between sections; none after the last one
    print(heading)
    paths = construct_prediction_paths(mistral_model_dir, dataset, mistral_prediction_prefix)
    acc, std_dev = compute_accuracy_and_std_dev_for_all_preds(paths)
    # Every section, CoT included, reports the mean over runs, so all share one label
    # (the CoT section previously printed "Accuracy:" for the same quantity).
    print(f"Average Accuracy: {acc}\nStandard Deviation: {std_dev}")

Mistral 7B Accuracy Stats for various Prompting Techniques

Zero-Shot Prompting
Total Accuracy: 0.49724759164268734
Total Accuracy: 0.5039722256974853
Total Accuracy: 0.5031590141373702
Total Accuracy: 0.5036907293882147
Total Accuracy: 0.5025647441511323
Average Accuracy: 0.5021268610033779
Standard Deviation: 0.002780081222582416

Few-Shot SST5 Prompting
Total Accuracy: 0.6407168772676092
Total Accuracy: 0.6838170899537095
Total Accuracy: 0.7049605905167021
Total Accuracy: 0.6949205554860503
Total Accuracy: 0.6875078193419242
Average Accuracy: 0.682384586513199
Standard Deviation: 0.024654229096414126

Few-Shot SemEval Prompting
Total Accuracy: 0.7472788690103841
Total Accuracy: 0.6703678218441136
Total Accuracy: 0.7480608032028024
Total Accuracy: 0.6867571625172025
Total Accuracy: 0.6499124233704492
Average Accuracy: 0.7004754159889904
Standard Deviation: 0.045017236190890604

Zero-Shot CoT Prompting
Total Accuracy: 0.5161391217315151
Total Accuracy: 0.5202677342674841
Total Accurac

In [7]:
# Report Qwen 2.5 3B accuracy (mean and sample standard deviation over the runs in RUNS)
# for each prompting technique.
print("Qwen 2.5 3B Accuracy Stats for various Prompting Techniques\n")
qwen2_5_3b_model_dir = MODEL_DIRS[5]
qwen2_5_3b_prediction_prefix = PREDICTION_PREFIXES[5]

# (section heading, dataset tag used in the prediction file names)
qwen2_5_3b_prompting_setups = [
    ("Zero-Shot Prompting", DATASETS[0]),
    ("Few-Shot SST5 Prompting", DATASETS[1]),
    ("Few-Shot SemEval Prompting", DATASETS[2]),
    ("Zero-Shot CoT Prompting", DATASETS[3]),
]
for setup_index, (heading, dataset) in enumerate(qwen2_5_3b_prompting_setups):
    if setup_index > 0:
        print("")  # blank separator between sections; none after the last one
    print(heading)
    paths = construct_prediction_paths(qwen2_5_3b_model_dir, dataset, qwen2_5_3b_prediction_prefix)
    acc, std_dev = compute_accuracy_and_std_dev_for_all_preds(paths)
    # Every section, CoT included, reports the mean over runs, so all share one label
    # (the CoT section previously printed "Accuracy:" for the same quantity).
    print(f"Average Accuracy: {acc}\nStandard Deviation: {std_dev}")

Qwen 2.5 3B Accuracy Stats for various Prompting Techniques

Zero-Shot Prompting
Total Accuracy: 0.8757350181408733
Total Accuracy: 0.8752033028900288
Total Accuracy: 0.8753284123608157
Total Accuracy: 0.870574252470912
Total Accuracy: 0.8742962592268235
Average Accuracy: 0.8742274490178907
Standard Deviation: 0.002108776634598382

Few-Shot SST5 Prompting
Total Accuracy: 0.8144001000875767
Total Accuracy: 0.8904353809583385
Total Accuracy: 0.9048542474665332
Total Accuracy: 0.8912485925184537
Total Accuracy: 0.8956587013636932
Average Accuracy: 0.8793194044789191
Standard Deviation: 0.03674026188025966

Few-Shot SemEval Prompting
Total Accuracy: 0.8877768047041161
Total Accuracy: 0.8320718128362317
Total Accuracy: 0.8732641060928312
Total Accuracy: 0.8220005004378832
Total Accuracy: 0.8348554985612411
Average Accuracy: 0.8499937445264607
Standard Deviation: 0.028736097529211187

Zero-Shot CoT Prompting
Total Accuracy: 0.8005755035656199
Total Accuracy: 0.798448642562242
Total Accuracy:

In [8]:
# Report Qwen 2.5 7B accuracy (mean and sample standard deviation over the runs in RUNS)
# for each prompting technique.
print("Qwen 2.5 7B Accuracy Stats for various Prompting Techniques\n")
qwen2_5_7b_model_dir = MODEL_DIRS[6]
qwen2_5_7b_prediction_prefix = PREDICTION_PREFIXES[6]

# (section heading, dataset tag used in the prediction file names)
qwen2_5_7b_prompting_setups = [
    ("Zero-Shot Prompting", DATASETS[0]),
    ("Few-Shot SST5 Prompting", DATASETS[1]),
    ("Few-Shot SemEval Prompting", DATASETS[2]),
    ("Zero-Shot CoT Prompting", DATASETS[3]),
]
for setup_index, (heading, dataset) in enumerate(qwen2_5_7b_prompting_setups):
    if setup_index > 0:
        print("")  # blank separator between sections; none after the last one
    print(heading)
    paths = construct_prediction_paths(qwen2_5_7b_model_dir, dataset, qwen2_5_7b_prediction_prefix)
    acc, std_dev = compute_accuracy_and_std_dev_for_all_preds(paths)
    # Every section, CoT included, reports the mean over runs, so all share one label
    # (the CoT section previously printed "Accuracy:" for the same quantity).
    print(f"Average Accuracy: {acc}\nStandard Deviation: {std_dev}")

Qwen 2.5 7B Accuracy Stats for various Prompting Techniques

Zero-Shot Prompting
Total Accuracy: 0.8997873138996622
Total Accuracy: 0.8999749781058426
Total Accuracy: 0.9011635180783185
Total Accuracy: 0.8996309270611785
Total Accuracy: 0.8988802702364569
Average Accuracy: 0.8998874014762919
Standard Deviation: 0.0008255684754861913

Few-Shot SST5 Prompting
Total Accuracy: 0.8973789565870136
Total Accuracy: 0.916551982985112
Total Accuracy: 0.9128299762292006
Total Accuracy: 0.8998498686350557
Total Accuracy: 0.9049480795696234
Average Accuracy: 0.906311772801201
Standard Deviation: 0.008227392520388913

Few-Shot SemEval Prompting
Total Accuracy: 0.9337232578506193
Total Accuracy: 0.8960027524083574
Total Accuracy: 0.9342236957337671
Total Accuracy: 0.9215876391842862
Total Accuracy: 0.9293444263730765
Average Accuracy: 0.9229763543100212
Standard Deviation: 0.015906403552212723

Zero-Shot CoT Prompting
Total Accuracy: 0.823314149881146
Total Accuracy: 0.823439259351933
Total Accuracy:

In [9]:
# Report Gemma 7B accuracy (mean and sample standard deviation over the runs in RUNS)
# for each prompting technique.
print("Gemma 7B Accuracy Stats for various Prompting Techniques\n")
gemma_7b_model_dir = MODEL_DIRS[7]
gemma_7b_prediction_prefix = PREDICTION_PREFIXES[7]

# (section heading, dataset tag used in the prediction file names)
gemma_7b_prompting_setups = [
    ("Zero-Shot Prompting", DATASETS[0]),
    ("Few-Shot SST5 Prompting", DATASETS[1]),
    ("Few-Shot SemEval Prompting", DATASETS[2]),
    ("Zero-Shot CoT Prompting", DATASETS[3]),
]
for setup_index, (heading, dataset) in enumerate(gemma_7b_prompting_setups):
    if setup_index > 0:
        print("")  # blank separator between sections; none after the last one
    print(heading)
    paths = construct_prediction_paths(gemma_7b_model_dir, dataset, gemma_7b_prediction_prefix)
    acc, std_dev = compute_accuracy_and_std_dev_for_all_preds(paths)
    # Every section, CoT included, reports the mean over runs, so all share one label
    # (the CoT section previously printed "Accuracy:" for the same quantity).
    print(f"Average Accuracy: {acc}\nStandard Deviation: {std_dev}")

Gemma 7B Accuracy Stats for various Prompting Techniques

Zero-Shot Prompting
Total Accuracy: 0.829913674465157
Total Accuracy: 0.8300075065682472
Total Accuracy: 0.8291630176404354
Total Accuracy: 0.8284749155511072
Total Accuracy: 0.8302890028775178
Average Accuracy: 0.8295696234204929
Standard Deviation: 0.0007401576107968914

Few-Shot SST5 Prompting
Total Accuracy: 0.7867821844113599
Total Accuracy: 0.8037345177029901
Total Accuracy: 0.8573126485674966
Total Accuracy: 0.7919429500813212
Total Accuracy: 0.7809645940197673
Average Accuracy: 0.8041473789565871
Standard Deviation: 0.030881450461431

Few-Shot SemEval Prompting
Total Accuracy: 0.8781120980858251
Total Accuracy: 0.8352308269736018
Total Accuracy: 0.8724821719004129
Total Accuracy: 0.8463655698736394
Total Accuracy: 0.8397347679219317
Average Accuracy: 0.8543850869510822
Standard Deviation: 0.01959786798447536

Zero-Shot CoT Prompting
Total Accuracy: 0.7810584261228575
Total Accuracy: 0.7765857625422244
Total Accuracy: 0.7