In [3]:
import os
from typing import List, Optional, Tuple

import numpy as np
import pandas as pd

# Root directory that contains the reference implementations. The location is
# machine-specific, so it can be overridden with the PROMPT_PREDICTIONS_STUB
# environment variable; the original location remains the default.
# os.path.join(..., "") guarantees a trailing separator, which the plain string
# concatenation in construct_prediction_paths relies on.
PREDICTIONS_STUB = os.path.join(
    os.environ.get(
        "PROMPT_PREDICTIONS_STUB",
        "/Users/david/Desktop/VectorRepositories/PromptEngineeringLaboratory/src/reference_implementations/",
    ),
    "",
)
# Directory of the stored prediction files, relative to PREDICTIONS_STUB.
PREDICTIONS_DIR = "fairness_measurement/resources/prompt_tuning_fairness_paper_preds/"

# Per-model sub-directories; index-aligned with PREDICTION_PREFIXES below
# (the report cells pair MODEL_DIRS[i] with PREDICTION_PREFIXES[i]).
MODEL_DIRS = ["opt6_7b/", "llama_7b/", "llama2_7b/"]

# Run identifiers appended to every prediction file name.
RUNS = ["run_1", "run_2", "run_3", "run_4", "run_5"]
# Prompt-setting tags embedded in the prediction file names.
DATASETS = ["ZeroShot", "SST5", "SemEval", "zero_cot"]
# File-name prefixes, one per model, in the same order as MODEL_DIRS.
PREDICTION_PREFIXES = ["opt_6_7b_prompt_predictions_", "llama_7b_prompt_predictions_", "llama2_7b_prompt_predictions_"]

def construct_prediction_paths(model_dir: str, dataset: str, prediction_prefix: str) -> List[str]:
    """Build the prediction-file path of every run for one model/dataset pair.

    The pieces are joined by plain string concatenation, so ``model_dir`` must
    already carry its trailing slash.

    Args:
        model_dir: Model sub-directory placed after ``PREDICTIONS_DIR``.
        dataset: Tag embedded in the file name after ``prediction_prefix``.
        prediction_prefix: File-name prefix that precedes the dataset tag.

    Returns:
        One ``.tsv`` path per entry of ``RUNS``, in the same order as ``RUNS``.
    """
    # Everything except the run suffix is shared by all files of this pair.
    shared_stem = f"{PREDICTIONS_STUB}{PREDICTIONS_DIR}{model_dir}{prediction_prefix}{dataset}"
    return [f"{shared_stem}_{run_id}.tsv" for run_id in RUNS]

def compute_accuracy_for_path(prediction_path: str, expected_total: Optional[int] = 31972) -> float:
    """Compute the accuracy stored in one tab-separated prediction file.

    The file is read with pandas and must provide ``y_true`` and ``y_pred``
    columns. Accuracy is the fraction of rows in which the two columns are
    equal; it is printed as ``Total Accuracy: <value>`` and returned.

    Args:
        prediction_path: Path of the ``.tsv`` prediction file.
        expected_total: Number of rows the file is required to contain.
            Defaults to 31972, the count this function has always enforced.
            Pass ``None`` to skip the check (e.g. for other evaluation sets).

    Returns:
        The accuracy as a float in ``[0, 1]``.

    Raises:
        AssertionError: If ``expected_total`` is given and the row count differs.
        ValueError: If the file contains no rows and the row-count check did
            not already reject it.
    """
    predictions_df = pd.read_csv(prediction_path, sep="\t")
    total_preds = len(predictions_df)

    # Raised explicitly (rather than via ``assert``) so that the sanity check
    # still runs when Python is started with -O. It runs before anything is
    # printed so a truncated file never reports a misleading accuracy.
    if expected_total is not None and total_preds != expected_total:
        raise AssertionError(
            f"Expected {expected_total} predictions in {prediction_path}, found {total_preds}"
        )
    if total_preds == 0:
        raise ValueError(f"No predictions found in {prediction_path}")

    # int(...) keeps the result a plain Python number so the printed value and
    # the returned float look the same as with the previous implementation.
    correct_preds = int((predictions_df["y_true"] == predictions_df["y_pred"]).sum())
    accuracy = correct_preds / total_preds
    print(f"Total Accuracy: {accuracy}")
    return accuracy

def compute_accuracy_and_std_dev_for_all_preds(prediction_paths: List[str]) -> Tuple[float, float]:
    """Aggregate the accuracy of several prediction files.

    Each path is scored with ``compute_accuracy_for_path`` in the order given.

    Args:
        prediction_paths: Paths of the prediction files to score.

    Returns:
        A ``(mean, std_dev)`` pair over the per-file accuracies, where the
        standard deviation is the sample estimate (``ddof=1``).
    """
    per_run_accuracy = [compute_accuracy_for_path(run_path) for run_path in prediction_paths]
    mean_accuracy = np.mean(per_run_accuracy)
    # ddof=1: divide by (n - 1) because the runs are a sample, not a population.
    sample_std_dev = np.std(per_run_accuracy, ddof=1)
    return mean_accuracy, sample_std_dev

In [4]:
# Accuracy report for OPT 6.7B: one stanza per prompting technique.
print("OPT 6.7B Accuracy Stats for various Prompting Techniques\n")
opt_model_dir = MODEL_DIRS[0]
opt_prediction_prefix = PREDICTION_PREFIXES[0]

# (heading, dataset tag) for every technique reported for this model.
opt_techniques = [
    ("Zero-Shot Prompting", DATASETS[0]),
    ("Few-Shot SST5 Prompting", DATASETS[1]),
    ("Few-Shot SemEval Prompting", DATASETS[2]),
]
for stanza_idx, (heading, dataset_tag) in enumerate(opt_techniques):
    if stanza_idx > 0:
        # Blank separator between stanzas, none after the last one.
        print("")
    print(heading)
    paths = construct_prediction_paths(opt_model_dir, dataset_tag, opt_prediction_prefix)
    acc, std_dev = compute_accuracy_and_std_dev_for_all_preds(paths)
    print(f"Average Accuracy: {acc}\nStandard Deviation: {std_dev}")

OPT 6.7B Accuracy Stats for various Prompting Techniques

Zero-Shot Prompting
Total Accuracy: 0.45167646690854496
Total Accuracy: 0.4528650068810209
Total Accuracy: 0.448486175403478
Total Accuracy: 0.45330289002877516
Total Accuracy: 0.4481734017265107
Average Accuracy: 0.45090078818966595
Standard Deviation: 0.002423784108293601

Few-Shot SST5 Prompting
Total Accuracy: 0.45987113724508943
Total Accuracy: 0.40982734893031403
Total Accuracy: 0.44595270862004255
Total Accuracy: 0.40560490429125484
Total Accuracy: 0.4445452270736895
Average Accuracy: 0.43316026523207807
Standard Deviation: 0.024033609525978772

Few-Shot SemEval Prompting
Total Accuracy: 0.47113098961591393
Total Accuracy: 0.49039784811710246
Total Accuracy: 0.4802327036156637
Total Accuracy: 0.4752596021518829
Total Accuracy: 0.4924308770173902
Average Accuracy: 0.4818904041035907
Standard Deviation: 0.00929998223704825


In [5]:
# Accuracy report for LLaMA-7B: one stanza per prompting technique.
print("LLaMA-7B Accuracy Stats for various Prompting Techniques\n")
llama_model_dir = MODEL_DIRS[1]
llama_prediction_prefix = PREDICTION_PREFIXES[1]

# (heading, dataset tag) for every technique reported for this model.
llama_techniques = [
    ("Zero-Shot Prompting", DATASETS[0]),
    ("Few-Shot SST5 Prompting", DATASETS[1]),
    ("Few-Shot SemEval Prompting", DATASETS[2]),
    ("Zero-Shot CoT Prompting", DATASETS[3]),
]
for stanza_idx, (heading, dataset_tag) in enumerate(llama_techniques):
    if stanza_idx > 0:
        # Blank separator between stanzas, none after the last one.
        print("")
    print(heading)
    paths = construct_prediction_paths(llama_model_dir, dataset_tag, llama_prediction_prefix)
    acc, std_dev = compute_accuracy_and_std_dev_for_all_preds(paths)
    # Every stanza reports the mean over all runs, so the CoT stanza now uses
    # the same "Average Accuracy" label as the others (it previously printed
    # just "Accuracy").
    print(f"Average Accuracy: {acc}\nStandard Deviation: {std_dev}")

LLaMA-7B Accuracy Stats for various Prompting Techniques

Zero-Shot Prompting
Total Accuracy: 0.4839547103715751
Total Accuracy: 0.48251595145752535
Total Accuracy: 0.4828600025021894
Total Accuracy: 0.48295383460527963
Total Accuracy: 0.48877142499687226
Average Accuracy: 0.48421118478668834
Standard Deviation: 0.0026048996614612273

Few-Shot SST5 Prompting
Total Accuracy: 0.515795070686851
Total Accuracy: 0.5145439759789816
Total Accuracy: 0.5216439384461403
Total Accuracy: 0.5322156887276367
Total Accuracy: 0.5313399224321281
Average Accuracy: 0.5231077192543475
Standard Deviation: 0.00836180990125923

Few-Shot SemEval Prompting
Total Accuracy: 0.5537345177029901
Total Accuracy: 0.5237082447141248
Total Accuracy: 0.5846678343550606
Total Accuracy: 0.5402539722256975
Total Accuracy: 0.5099774802952584
Average Accuracy: 0.5424684098586263
Standard Deviation: 0.028810695270862243

Zero-Shot CoT Prompting
Total Accuracy: 0.4518954084824221
Total Accuracy: 0.4507068685099462
Total Accura

In [6]:
# Accuracy report for LLaMA2-7B: one stanza per prompting technique.
print("LLaMA2-7B Accuracy Stats for various Prompting Techniques\n")
llama2_model_dir = MODEL_DIRS[2]
llama2_prediction_prefix = PREDICTION_PREFIXES[2]

# (heading, dataset tag) for every technique reported for this model.
llama2_techniques = [
    ("Zero-Shot Prompting", DATASETS[0]),
    ("Few-Shot SST5 Prompting", DATASETS[1]),
    ("Few-Shot SemEval Prompting", DATASETS[2]),
    ("Zero-Shot CoT Prompting", DATASETS[3]),
]
for stanza_idx, (heading, dataset_tag) in enumerate(llama2_techniques):
    if stanza_idx > 0:
        # Blank separator between stanzas, none after the last one.
        print("")
    print(heading)
    paths = construct_prediction_paths(llama2_model_dir, dataset_tag, llama2_prediction_prefix)
    acc, std_dev = compute_accuracy_and_std_dev_for_all_preds(paths)
    # Every stanza reports the mean over all runs, so the CoT stanza now uses
    # the same "Average Accuracy" label as the others (it previously printed
    # just "Accuracy").
    print(f"Average Accuracy: {acc}\nStandard Deviation: {std_dev}")

LLaMA2-7B Accuracy Stats for various Prompting Techniques

Zero-Shot Prompting
Total Accuracy: 0.48307894407606655
Total Accuracy: 0.4829225572375829
Total Accuracy: 0.4842674840485425
Total Accuracy: 0.4862692355811335
Total Accuracy: 0.48020142624796697
Average Accuracy: 0.48334792943825844
Standard Deviation: 0.00220982958745633

Few-Shot SST5 Prompting
Total Accuracy: 0.5919241836607031
Total Accuracy: 0.5924871762792443
Total Accuracy: 0.6039034154885525
Total Accuracy: 0.6545727511572627
Total Accuracy: 0.6366820968347304
Average Accuracy: 0.6159139246840987
Standard Deviation: 0.028259486836683334

Few-Shot SemEval Prompting
Total Accuracy: 0.6674903040160141
Total Accuracy: 0.6280808207181283
Total Accuracy: 0.7037407731765295
Total Accuracy: 0.6623608157137495
Total Accuracy: 0.6082822469660953
Average Accuracy: 0.6539909921181033
Standard Deviation: 0.03704035788652678

Zero-Shot CoT Prompting
Total Accuracy: 0.4925247091204804
Total Accuracy: 0.4960590516702114
Total Accurac