In [1]:
import os
from pathlib import Path

import pandas as pd

In [2]:
def map_label_to_string(label: int) -> str:
    """Translate a numeric sentiment label into its display name.

    Args:
        label: Numeric class label.

    Returns:
        "Negative" when ``label`` equals 0, "Neutral" when it equals 1,
        and "Positive" for every other value.

    NOTE(review): the positive class is presumably encoded as 2, but any
    value that is not 0 or 1 (including missing values) is reported as
    "Positive" -- confirm that the inputs only ever contain three labels.
    """
    if label == 0:
        return "Negative"
    return "Neutral" if label == 1 else "Positive"

In [3]:
# File names of the tab-separated prediction dumps, one per dataset.
amazon_file_name, ns_prompts__file_name, regard_file_name = (
    "amazon_llama-2-7b.tsv",
    "NS-prompts_llama-2-7b.tsv",
    "regard_llama-2-7b.tsv",
)

# abspath("") resolves against the current working directory; the repository
# root is taken to be two levels above it.
working_dir = Path(os.path.abspath(""))
REPO_ABS_PATH = working_dir.parent.parent

# Kept as a plain string with a trailing slash so that a file name can be
# appended to it directly.
predictions_stub = (
    str(REPO_ABS_PATH)
    + "/unstated_norms_llm_bias/finetuning_classification/predictions/lora-fine-tuned/"
)

In [4]:
# Amazon predictions: breakdown of samples by true label and by group.
# NOTE(review): the same steps are repeated almost verbatim for the other
# datasets in this notebook; a shared helper would remove the duplication.
predictions_path = predictions_stub + amazon_file_name

dataframe = pd.read_csv(predictions_path, sep="\t")
# Replace the numeric labels with readable names before counting.
dataframe["y_true"] = dataframe["y_true"].apply(map_label_to_string)

# Only the true label and the group column are needed for the statistics.
winnowed_dataframe = dataframe.loc[:, ["y_true", "group"]]
sample_count = winnowed_dataframe.shape[0]

# The marginal counts use the default frequency ordering; the joint
# label/group table is left unsorted by count (sort=False).
label_counts = winnowed_dataframe[["y_true"]].value_counts()
group_counts = winnowed_dataframe[["group"]].value_counts()
label_group_counts = winnowed_dataframe.value_counts(sort=False)

print("Amazon Dataset Stats")
print(f"Number of total samples: {sample_count}")
for heading, counts in (
    ("Label Counts", label_counts),
    ("Group Counts", group_counts),
    ("Label-Group Pairs", label_group_counts),
):
    print(heading)
    print(counts)

Amazon Dataset Stats
Number of total samples: 3300
Label Counts
y_true  
Negative    1100
Neutral     1100
Positive    1100
dtype: int64
Group Counts
group           
american_indian     900
african_american    600
hispanic            600
pacific_islander    600
asian               300
white               300
dtype: int64
Label-Group Pairs
y_true    group           
Negative  african_american    200
          american_indian     300
          asian               100
          hispanic            200
          pacific_islander    200
          white               100
Neutral   african_american    200
          american_indian     300
          asian               100
          hispanic            200
          pacific_islander    200
          white               100
Positive  african_american    200
          american_indian     300
          asian               100
          hispanic            200
          pacific_islander    200
          white               100
dtype: int64


In [5]:
# NS Prompts predictions: breakdown of samples by true label and by group.
# NOTE(review): the same steps are repeated almost verbatim for the other
# datasets in this notebook; a shared helper would remove the duplication.
predictions_path = predictions_stub + ns_prompts__file_name

dataframe = pd.read_csv(predictions_path, sep="\t")
# Replace the numeric labels with readable names before counting.
dataframe["y_true"] = dataframe["y_true"].apply(map_label_to_string)

# Only the true label and the group column are needed for the statistics.
winnowed_dataframe = dataframe.loc[:, ["y_true", "group"]]
sample_count = winnowed_dataframe.shape[0]

# The marginal counts use the default frequency ordering; the joint
# label/group table is left unsorted by count (sort=False).
label_counts = winnowed_dataframe[["y_true"]].value_counts()
group_counts = winnowed_dataframe[["group"]].value_counts()
label_group_counts = winnowed_dataframe.value_counts(sort=False)

print("NS Prompts Dataset Stats")
print(f"Number of total samples: {sample_count}")
for heading, counts in (
    ("Label Counts", label_counts),
    ("Group Counts", group_counts),
    ("Label-Group Pairs", label_group_counts),
):
    print(heading)
    print(counts)

NS Prompts Dataset Stats
Number of total samples: 48840
Label Counts
y_true 
Neutral    48840
dtype: int64
Group Counts
group           
american_indian     13320
african_american     8880
hispanic             8880
pacific_islander     8880
asian                4440
white                4440
dtype: int64
Label-Group Pairs
y_true   group           
Neutral  african_american     8880
         american_indian     13320
         asian                4440
         hispanic             8880
         pacific_islander     8880
         white                4440
dtype: int64


In [6]:
# Regard predictions: breakdown of samples by true label and by group.
# NOTE(review): the same steps are repeated almost verbatim for the other
# datasets in this notebook; a shared helper would remove the duplication.
predictions_path = predictions_stub + regard_file_name

dataframe = pd.read_csv(predictions_path, sep="\t")
# Replace the numeric labels with readable names before counting.
dataframe["y_true"] = dataframe["y_true"].apply(map_label_to_string)

# Only the true label and the group column are needed for the statistics.
winnowed_dataframe = dataframe.loc[:, ["y_true", "group"]]
sample_count = winnowed_dataframe.shape[0]

# The marginal counts use the default frequency ordering; the joint
# label/group table is left unsorted by count (sort=False).
label_counts = winnowed_dataframe[["y_true"]].value_counts()
group_counts = winnowed_dataframe[["group"]].value_counts()
label_group_counts = winnowed_dataframe.value_counts(sort=False)

print("Regard Dataset Stats")
print(f"Number of total samples: {sample_count}")
for heading, counts in (
    ("Label Counts", label_counts),
    ("Group Counts", group_counts),
    ("Label-Group Pairs", label_group_counts),
):
    print(heading)
    print(counts)

Regard Dataset Stats
Number of total samples: 5555
Label Counts
y_true  
Negative    2420
Positive    1925
Neutral     1210
dtype: int64
Group Counts
group           
american_indian     1515
african_american    1010
hispanic            1010
pacific_islander    1010
asian                505
white                505
dtype: int64
Label-Group Pairs
y_true    group           
Negative  african_american    440
          american_indian     660
          asian               220
          hispanic            440
          pacific_islander    440
          white               220
Neutral   african_american    220
          american_indian     330
          asian               110
          hispanic            220
          pacific_islander    220
          white               110
Positive  african_american    350
          american_indian     525
          asian               175
          hispanic            350
          pacific_islander    350
          white               175
dtype: int64
