In [None]:
import pathlib as pl

import glob
import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import itertools, warnings

from collections import Counter, defaultdict
from typing import List, Dict, Tuple

# -----------------------------------------------------------------------
# CAMERA-READY PLOTTING (thanks Alex Boyd!)
# -----------------------------------------------------------------------
# Following code is borrowed from material provided by Alex!
# Figure widths used as `figsize` components by the plotting cells below
# (presumably inches, matching the paper's text/column widths — TODO confirm).
FULL_WIDTH = 5.50107
COL_WIDTH  = 4.50461


# Put at top of plotting script (requires tex be installed though)
# Global matplotlib style: serif font at size 20, all text rendered through LaTeX.
matplotlib.rc('font', family='serif', size=20)
matplotlib.rc('text', usetex=True)


def adjust(fig, left=0.0, right=1.0, bottom=0.0, top=1.0, wspace=0.0, hspace=0.0):
    """Forward the given layout values to ``fig.subplots_adjust``.

    ``left``/``right``/``bottom``/``top`` are the sides of the subplots of the
    figure; ``wspace``/``hspace`` are the amounts of width/height reserved for
    blank space between subplots. The defaults fill the whole figure with no
    spacing between subplots.
    """
    sides = dict(left=left, right=right, bottom=bottom, top=top)
    gaps = dict(wspace=wspace, hspace=hspace)
    fig.subplots_adjust(**sides, **gaps)
    

def save_fig(fig, name, **kwargs):
    """Save ``fig`` to ``./camera_ready/images/<name>.pdf`` with a tight bounding box.

    Extra keyword arguments are passed through to ``fig.savefig``.
    """
    destination = f"./camera_ready/images/{name}.pdf"
    fig.savefig(destination, bbox_inches="tight", **kwargs)


# Axes formatting
from matplotlib.ticker import MultipleLocator, PercentFormatter

# Accessibility
# Use seaborn's colorblind-friendly palette both for seaborn plots and for
# matplotlib's default color cycle, so that every figure shares the same colors.
# (A stray bare `g` expression used to precede this block and raised a
# NameError on a fresh kernel; it has been removed.)
sns.set_palette(sns.color_palette("colorblind"))
matplotlib.rcParams["axes.prop_cycle"] = matplotlib.cycler(color=sns.color_palette("colorblind"))


# Composite plots 
def disable_axis(ax):
    """Strip ticks and tick labels from ``ax``, clear its spine colors and
    lower its z-order (used for composite plots)."""
    ax.set_zorder(-100)  # Avoids a visual rendering bug
    # Empty out ticks and labels, x axis first and then y axis.
    for clear in (ax.set_xticks, ax.set_xticklabels, ax.set_yticks, ax.set_yticklabels):
        clear([])
    plt.setp(ax.spines.values(), color=None)

## Evaluating sentence-level differences between the different benchmarks


In this notebook, we will focus on trying to characterize the differences between the different benchmarks.
Before we delve into the analysis, let us define the properties we're interested in this work:
1. **words** that may (or not) be associated with gender, deemed *gender co-occurring words*; 
2. whether the **sentences are *neutral* or not**, irrespective of whether the words are associated with gender or not. We dub these sentences - *gender-invariant sentences*.


For 2., we have curated a set of prompts and conducted human evaluation to validate our claims. That is, we've characterized the distribution and showed that both pronouns fit. This validation verifies for our datasets as well as the coreference datasets, since they are created to be neutral by default.


We conjecture that by means of our prompts, the generated sentences are gender-invariant. Hence, this notebook will carry the evaluation for **1.**, i.e., as we vary the strength of association between gender and occurring words.

---


The remainder of the notebook is organized as follows: 

1. **Data loading**: we load both PMI difference values and the scores given by the models to each test sentence pair.
2. **Data analysis**: we evaluate how the benchmark size changes as we vary the strength of association between individual words and gender.
3. **Fairness metric**: we compute the fairness metric for the different models. This includes computing both the fairness metric to a fixed threshold and the AUC. 
4. **Fairness metric 2.0**: since our fairness metric focuses on the pct of "neutral" models, we aim to quantify whether the models are placing the remaining probability equally on male-female examples or whether they mostly place the probabilities on one instantiation of the templates.

## 1. Data Loading: Load PMI difference values

In [None]:
from utils import GROUP_PAIRED_WORDLIST, FEMALE_WORDS, MALE_WORDS, get_pmi_diff, get_gender_pairs_matrix

In [None]:
# NOTE(review): absolute, machine-specific path. A later cell rebinds BASE_DIR
# to "..", so re-running this cell after that one changes the value later
# cells see.
BASE_DIR = "/extra/ucinlp1/cbelem/projects/bias-diagnosis"

# loads the PMI information precomputed based on the PILE co-occurrence counts
GENDER_PMI = pd.read_csv(f"{BASE_DIR}/word2gender_pmi_PILE.csv", index_col=0)
# Number of rows in the PMI table, followed by summary statistics of its columns.
print(len(GENDER_PMI))
GENDER_PMI.describe()

In [None]:
# Since we may want to perform some correlation with other gendered words
# we also define the PMI diff between words and other gendered word pairs
# (`get_gender_pairs_matrix` comes from the local `utils` module; it returns two values).
GENDER_PAIRS, GENDER_PAIRS_NUM_WORDS = get_gender_pairs_matrix(GENDER_PMI, GROUP_PAIRED_WORDLIST)

# ----------------------------------------------------------------------------
# compute PMI diff used in the main paper
# ----------------------------------------------------------------------------
# Most analysis will focus on the pmi_diff(she, he)
# The result is sorted in ascending order of the "pmi(she)-pmi(he)" column.
PMI_DIFF = get_pmi_diff(GENDER_PMI, "she", "he").sort_values("pmi(she)-pmi(he)")
# rename pmi difference column to be something less verbose :b
PMI_DIFF = PMI_DIFF.rename({"pmi(she)-pmi(he)": "pmi_diff"}, axis=1)
# Show a reproducible random sample of 15 rows (fixed seed).
PMI_DIFF.sample(15, random_state=81273)

## 2. Loading data - Load model scores for the different datasets

Given the PMI difference `PMI_DIFF(w, she, he)` of each word `w`, let us now compute the PMI of the words used in each of the benchmarks.

In [None]:
# NOTE(review): this rebinds BASE_DIR, which an earlier cell set to an absolute
# path; from here on BASE_DIR is the parent directory of the notebook.
BASE_DIR = ".."

# list all the score files per dataset
DATASET_2_FILEPATHS = {
    "USE-5": glob.glob(f"{BASE_DIR}/results-words5/final-results/*__scores__*.csv"),
    # Baselines below ----
    "Winobias": glob.glob(f"{BASE_DIR}/results-baselines/final-results/*Winobias*__scores__*.csv"),
    "Winogender": glob.glob(f"{BASE_DIR}/results-baselines/final-results/*Winogender*__scores__*.csv"),
    # "StereoSet": glob.glob(f"../results-baselines/final-results/*StereoSet*__scores__*.csv"),
    # We define this ordering so that we can automatically obtain the same coloring scheme as
    # the one used for word analysis
    "USE-10": glob.glob(f"{BASE_DIR}/results-words10/final-results/*__scores__*.csv"),
    "USE-20": glob.glob(f"{BASE_DIR}/results-words20/final-results/*__scores__*.csv"),
}

# Dataset names in insertion order; later cells iterate over this list.
DATASET_NAMES = list(DATASET_2_FILEPATHS.keys())
print(" Dataset names:\n  ->", DATASET_NAMES, "\n", "-" * 62)

# ------------------------------------------------------------------------------
#                                     Validation
# ------------------------------------------------------------------------------
# All datasets must have exact same number of files and ordered in the same way.
# The check compares, for every ordered pair of datasets, the filename suffixes
# that follow "__scores__": every distinct suffix of the first dataset must also
# appear in the second. Since both orderings of each pair are visited, the sets
# of suffixes must be equal. Ordering itself is enforced by the sort below.
for dataset1, dataset2 in itertools.product(DATASET_NAMES, DATASET_NAMES):
    fps1 = [fp.rpartition("__scores__")[-1] for fp in DATASET_2_FILEPATHS[dataset1]] 
    fps2 = [fp.rpartition("__scores__")[-1] for fp in DATASET_2_FILEPATHS[dataset2]] 
    c1, c2 = Counter(fps1), Counter(fps2)
    assert len(c1 & c2) == len(c1), f"Validation failed for datasets: ({dataset1}, {dataset2})"

# !! Assumption: When scoring there was no change in the ordering of the templates and therefore
# every time we load the filepaths, we will have exactly the same ordering for all files (regardless
# of the scoring model).
DATASET_2_FILEPATHS = {k: sorted(v) for k, v in DATASET_2_FILEPATHS.items()}

print(" Number of files per dataset", "\n", "-" * 65)
for name, files in DATASET_2_FILEPATHS.items():
    print(" -> ", name, len(files))

#### Preprocess the datasets

Transform the datasets into the canonic form:

1. Transform model name into its canonic form: Extract from filepath name and add it as a column to the dataset.
2. Obtain information about model size: parsed from the canonic model name and expressed in millions of parameters (e.g., `6.9b` becomes `6900`).
3. Obtain information about the interventions: Is the model trained on duplicated data (is_deduped=False) or non-duplicated data (is_deduped=True).
4. Obtain information about whether the test sentence pair is natural (is_natural=True) or whether it is unnatural for one of the variants in the pair (is_natural=False).
5. Obtain information about the model family.

In [None]:
def canonic_model_name(model_name: str) -> str:
    """Turn a fully qualified model identifier into its short, canonic name.

    The first matching rule below (in order) is applied: organisation prefixes
    are dropped, llama identifiers are cut at the first "llama" occurrence and
    normalised, and the 70b snapshot path maps to a fixed name. Afterwards a
    "-deduped" suffix is rewritten as " (D)".
    """
    def _drop(marker):
        # Build a rewrite that removes every occurrence of `marker`.
        return lambda name: name.replace(marker, "")

    def _llama(name):
        start = name.index("llama")
        shortened = name[start:].replace("__hf_models__", "-")
        return shortened.replace("B", "b")

    # Order matters: only the first rule whose marker occurs is applied.
    rules = (
        ("EleutherAI__", _drop("EleutherAI__")),
        ("facebook__", _drop("facebook__")),
        ("70b-hf__snapshots", lambda name: "llama-2-70b"),
        ("llama", _llama),
        ("mosaicml__", _drop("mosaicml__")),
        ("allenai__", _drop("allenai__")),
        ("mistralai__", _drop("mistralai__")),
    )
    for marker, rewrite in rules:
        if marker in model_name:
            model_name = rewrite(model_name)
            break

    # A no-op when "-deduped" does not occur in the name.
    return model_name.replace("-deduped", " (D)")


def get_model_size(canonic_name: str) -> float:
    """Extract the model size from its canonic name, in millions of parameters.

    The first "<number><unit>" occurrence is used, where the unit is ``b``/``B``
    (billions, scaled by 1000) or ``m``/``M`` (millions), e.g. "opt-125m" gives
    125.0 and "llama-2-70b" gives 70000.0.

    Raises
    ------
    ValueError
        If the name carries no size. This used to surface as an opaque
        ``TypeError`` raised by subscripting ``None``.
    """
    import re  # local import: `re` is only imported globally by a later cell

    match = re.search(r"(\d+(?:\.\d+)?)([bBmM])", canonic_name)
    if match is None:
        raise ValueError(f"Could not find a model size in model name: {canonic_name!r}")

    number, unit = match.groups()
    scale = 1_000 if unit in ("b", "B") else 1
    return float(number) * scale
        
    
def get_model_family(model_name: str) -> str:
    """Map a model name to its model family.

    The name is matched against the markers below in order, and the family of
    the first marker contained in the name is returned. Names matching no
    marker yield ``None``.
    """
    marker_to_family = (
        ("pythia", "pythia"),
        ("opt", "opt"),
        ("mpt", "mpt"),
        ("llama", "llama2"),
        ("gpt", "gpt-j"),
    )
    for marker, family in marker_to_family:
        if marker in model_name:
            return family
    return None

    
def is_deduped(model_name: str) -> bool:
    """Tell whether the model name carries the '-deduped' marker, i.e. whether
    the model was trained on deduplicated data."""
    return '-deduped' in model_name
    

def is_intervention(model_name: str) -> bool:
    """Tell whether the model name carries the '-intervention' marker, i.e.
    whether the model was trained on deduplicated data and with the gender
    bias intervention.
    """
    return '-intervention' in model_name


def remove_unnatural_examples(df: pd.DataFrame) -> pd.DataFrame:
    """Drop the unnatural test sentence pairs from ``df``.

    A test sentence pair is natural when ChatGPT indicates that both
    sentence variants (regardless of gender) are likely to occur. If one of
    them is unlikely (as per ChatGPT prediction), the whole pair is deemed
    unnatural and removed.

    Only the proposed datasets, which were generated from scratch, carry the
    ``is_natural`` column. WinoBias and Winogender have no such information,
    since by definition both completions of their sentences are likely. A
    dataframe without the column is returned untouched.
    """
    if "is_natural" not in df.columns:
        return df

    natural_rows = df["is_natural"]
    return df[natural_rows].reset_index(drop=True)

def read_filepath(fp: str, dataset: str, filter_unnatural: bool) -> pd.DataFrame:
    """Load one scores file and enrich it with dataset and model metadata.

    Parameters
    ----------
    fp: path to a scores CSV. It must contain a "model" column holding the
        fully qualified model name (including company name).
    dataset: name of the dataset the file belongs to; stored in a new
        "dataset" column.
    filter_unnatural: when True, unnatural test sentence pairs are removed
        (see ``remove_unnatural_examples``) and a short summary is printed.

    Returns
    -------
    The dataframe with the additional columns "dataset", "is_deduped",
    "is_intervention", "orig_model_name", "model_size", "model_family" and
    "orig_index" (the row position after the optional filtering); "model" is
    replaced by the canonic model name.
    """
    df = pd.read_csv(fp)

    # add dataset name to dataframe
    df["dataset"] = dataset
    # add boolean identifying whether model was trained on deduplicated data
    df["is_deduped"] = df["model"].apply(is_deduped)
    # add boolean identifying whether model was trained with gender swap
    df["is_intervention"] = df["model"].apply(is_intervention)
    # keep the original name, then switch to the canonic one
    # (no company name, with size info)
    df["orig_model_name"] = df["model"]
    df["model"] = df["model"].apply(canonic_model_name)
    # add model size (as a float)
    df["model_size"] = df["model"].apply(get_model_size)
    # add model family
    df["model_family"] = df["model"].apply(get_model_family)

    # add information about whether templates are likely or unlikely
    if filter_unnatural:
        num_before = len(df)
        df = remove_unnatural_examples(df)
        # Report how many rows were dropped. The previous message computed
        # after - before, which is zero or negative.
        print(f"Filtered {num_before - len(df)} unnatural, removed from", dataset)
        if "is_natural" in df.columns:
            print(df["is_natural"].value_counts())
            print(df["likely_under"].value_counts())

    # Expose the row position as a column so the same sentence can be tracked
    # across the per-model files.
    df = df.reset_index(names=["orig_index"])
    return df


# Mapping from dataset name to the file dataframes
# NOTE(review): this initial value is overwritten by the dict comprehension
# below before it is ever read.
DATASET_2_FILES = defaultdict(list)

# Read each individual filepath, creating an association <str, list<dataframe>>.
# every str should have a list of the same size.

# To test the impact of omitting the unnaturalness check, CHANGE THE VALUE BELOW TO FALSE
FILTER_UNNATURAL = True
# ------------------------------ ------------------------------ ------------------------------
DATASET_2_FILES = {
    dataset: [read_filepath(fp, dataset, filter_unnatural=FILTER_UNNATURAL) for fp in sorted(fps)]
    for dataset, fps in DATASET_2_FILEPATHS.items()
}

# Merge all the dataframes into a single big dataframe that contains the information of all models
# for each dataset. We've created a original index to keep track of the unique sentences.
# Sort the files per (model, orig_index)
DATASET_2_FILES = {
    dataset: pd.concat(dfs).sort_values(["model", "orig_index"]).reset_index(drop=True)
    for dataset, dfs in DATASET_2_FILES.items()
}

# Number of models being evaluated 
# NUM_EVAL_MODELS starts as a per-dataset list of model counts and is replaced
# by a single int after the consistency check below; MODELS collects the model
# names of all datasets.
NUM_EVAL_MODELS = []
MODELS = []
for dataset, df  in DATASET_2_FILES.items():
    print(dataset, df["model"].nunique())
    MODELS.extend(df["model"].unique())
    NUM_EVAL_MODELS.append(df["model"].nunique())
    
# We force the number of models to be the same across all datasets
assert len(set(NUM_EVAL_MODELS)) == 1, \
    f"Found various model sizes: {NUM_EVAL_MODELS}"

NUM_EVAL_MODELS = NUM_EVAL_MODELS[0]
print("Evaluating", NUM_EVAL_MODELS, "models:")
# Deduplicate and sort the model names; MODELS[0] is used later as the
# reference model when extracting templates.
MODELS = list(sorted(set(MODELS)))
print(" -", "\n - ".join(MODELS))

# Display the merged dataframe of the "USE-5" dataset.
DATASET_2_FILES["USE-5"]

## Post processing: 

In this section, we will carry some processing of the templates (column "template").


**1. Remove placeholders from templates** : We first remove the placeholders (e.g., "{PRONOUN}", "{PRONOUN1}", "{PRONOUN2}", "{PRONOUN2}self") from the template.

**2. Remove stopwords from the templates**: We use **spacy**'s stopwords except that we add back some of the pronouns, effectively following the approach in [Razeghi et al 2022](https://aclanthology.org/2022.emnlp-demos.39/).

**3. Parse each template**: We use **spacy** tokenizer since this was what was used by [Razeghi et al 2022](https://aclanthology.org/2022.emnlp-demos.39/). While NLTK is much faster, it doesn't group together words like "self-care", which is treated as a single word by the spacy tokenizer. Therefore, we've updated the script to consider the spacy tokenization. Applying it to the whole DATASET_2_FILES[dataset] will be too time-consuming, so we will apply it to the first portion of the data and then concatenate it to the dataframe.


### Filtering


Before applying the processing, we will first obtain the top unique templates by focusing on the subset of data of the first listed model.

In [None]:
# Validation (!sanity check)
# When selecting a data slice from the big dataframe
# we must guarantee that the sentences match to one another
# (that is necessary because the remaining of the code is relying
# on ordering of the dataframes)
def check_slices(dataset: str, data2files: dict, models: List[str]):
    """Check that every model's slice of a dataset lists the same templates in
    the same order.

    Parameters
    ----------
    dataset: key of the dataset to check in ``data2files``.
    data2files: mapping from dataset name to a dataframe that holds the rows
        of all models (with "model" and "template" columns).
    models: model names whose slices are compared, each against the slice of
        the model that precedes it in this list.

    Raises
    ------
    AssertionError
        If two consecutive models disagree on their templates.

    Notes
    -----
    The previous implementation only started comparing once two slices had
    been collected, so the first and second models were never compared (and
    with two models nothing was checked at all).
    """
    df = data2files[dataset]

    previous_model, previous_templates = None, None
    for model in models:
        templates = df.loc[df["model"] == model, "template"].values
        if previous_templates is not None:
            assert np.array_equal(previous_templates, templates), (
                f"Templates of models '{previous_model}' and '{model}' "
                f"differ for dataset '{dataset}'"
            )
        previous_model, previous_templates = model, templates
        
    
# Run the ordering check on every dataset, across all evaluated models.
for dataset in DATASET_NAMES:
    print("Checking slices for dataset:", dataset)
    check_slices(dataset=dataset, data2files=DATASET_2_FILES, models=MODELS)
    
# -----------------------------------------------------------------------------
# ^Note: if the check above does not throw an error, then it means that the
# templates can stack up based on the model, so it's ok to apply the processing
# to the first model and then create NUM_EVAL_MODEL copies of that and insert
# in the dataframe!!
# -----------------------------------------------------------------------------
# Templates of each dataset, taken from the rows of the first model in MODELS.
DATASET_2_TEMPLATES = {
    dataset: df[df["model"] == MODELS[0]]["template"].values.tolist()
    for dataset, df in DATASET_2_FILES.items()
}

### Processing (using Spacy tokenizer)

In [None]:
try:
    import spacy
except:
    !pip install spacy
    !python -m spacy download en_core_web_md
import spacy
import nltk 
import re, string


from nltk.corpus import stopwords
nltk.download('stopwords')


# Pronouns that are kept even when the tokenizer/stopword list would drop them.
PRONOUNS = ["she", "her", "hers", "he", "his", "him", "himself", "herself"]
# spaCy pipeline used for tokenization, loaded with the "ner" and "tagger"
# components disabled.
# NOTE(review): presumably `is_stop`/`is_punct`, the only token attributes
# used below, do not depend on the disabled components — confirm.
SPACY_PARSER = spacy.load("en_core_web_md", disable=["ner", "tagger"])


def postprocess_spacy(templates, pronouns=PRONOUNS):
    """Turn each template into its list of content tokens using spaCy.

    Every template is lowercased, stripped of its pronoun placeholders and
    parsed with ``SPACY_PARSER``. A token is kept when it is one of
    ``pronouns``, or when it is neither a stopword nor punctuation; tokens
    that are empty after stripping whitespace are always dropped.
    """
    placeholder_pattern = r"\{pronoun([0-2]{1})?\}"

    def _content_tokens(sentence: str) -> list:
        kept = []
        for token in SPACY_PARSER(sentence):
            is_content = not token.is_stop and not token.is_punct
            if (token.text in pronouns or is_content) and len(token.text.strip()) > 0:
                kept.append(token.text)
        return kept

    canonic = []
    for template in templates:
        text = template.lower()
        # Step 1. Remove placeholders from the template
        text = text.replace("{pronoun2}self", "")
        text = re.sub(placeholder_pattern, "", text)
        # Step 2. Parse the sentence
        canonic.append(_content_tokens(text))
    return canonic


def postprocess_nltk(templates, pronouns=PRONOUNS):
    """Turn each template into its list of content tokens using NLTK.

    Every template is lowercased, stripped of its pronoun placeholders and
    tokenized with ``nltk.tokenize.word_tokenize``. Tokens that are English
    stopwords (except for ``pronouns``) or that consist solely of punctuation
    characters are removed.

    Notes
    -----
    Punctuation used to be detected with ``w not in string.punctuation``,
    which is a substring test against the punctuation string. Multi-character
    punctuation tokens that are not a contiguous piece of that string (e.g.
    "``", "''", "...", "--") were therefore kept. Tokens are now dropped when
    all of their characters are punctuation.
    """
    from nltk.tokenize import word_tokenize

    # We know that some sentences have some other references to other entities,
    # so the pronouns are kept even though they are stopwords.
    nltk_stopwords = set(stopwords.words('english')) - set(pronouns)
    punct_chars = set(string.punctuation)

    def _is_punct(token: str) -> bool:
        # True when every character of the token is a punctuation character.
        return all(ch in punct_chars for ch in token)

    canonic = []
    for template in templates:
        text = template.lower()
        # Remove pronoun placeholders first
        text = text.replace("{pronoun2}self", "")
        text = re.sub(r"\{pronoun([0-2]{1})?\}", "", text)
        # Remove stopwords and punct
        canonic.append([
            w for w in word_tokenize(text)
            if not _is_punct(w) and w not in nltk_stopwords
        ])
    return canonic


# Tokenized templates per dataset, one mapping per tokenizer, so that the two
# tokenizations can be compared further below.
DATASET_2_CANONIC_TEMPLATES_SPACY = {}
DATASET_2_CANONIC_TEMPLATES_NLTK = {}

for dataset, templates in DATASET_2_TEMPLATES.items():
    DATASET_2_CANONIC_TEMPLATES_SPACY[dataset] = postprocess_spacy(templates)
    DATASET_2_CANONIC_TEMPLATES_NLTK[dataset] = postprocess_nltk(templates)

In [None]:
# Inspect the NLTK-tokenized templates of "USE-5" (displays the full list).
DATASET_2_CANONIC_TEMPLATES_NLTK["USE-5"]

## Data Analysis - Determine gender co-occurrence for each word 

In this section, we iterate the templates and compute the gender co-occurrence values for each sentence in the benchmarks. Optionally, you can weight the values of each word by the likelihood of being toxic or having negative sentiment. If such values are not provided, we assume each word is worth the same value of 1 unit.


In [None]:
## Convert dataframe to mapping from word to pmi diff for easy access
WORD2PMI = PMI_DIFF[["word", "pmi_diff"]].set_index("word").to_dict()["pmi_diff"]
WORD2WEIGHTS = defaultdict(lambda: 1)

## ----------------------------------------------------------------
## Weighting words based on frequency
## ----------------------------------------------------------------
FREQ_WORDS = pd.read_csv("./pmi_file_w_counts.csv", index_col=0)
FREQ_WORDS["log_freq"] = np.log(FREQ_WORDS["freq"])

## uncomment one of the lines below if you prefer weighting each word based
## on the frequency of each individual word
# WORD2WEIGHTS = FREQ_WORDS[["word", "freq"]].set_index("word").to_dict()["freq"]
# WORD2WEIGHTS = FREQ_WORDS[["word", "log_freq"]].set_index("word").to_dict()["log_freq"]

## ----------------------------------------------------------------
## Weighting words based on toxicity/sentiment
## ----------------------------------------------------------------
## TODO:
## -> Define toxicity for each word
## -> Define sentiment polarity for each word (?)
## Define a 1-to-1 mapping and assign the variable WORD2WEIGHTS

In [None]:
def compute_pmi_diff_per_sentences(
        templates: List[List[str]],
        word2pmi: dict,
        word2weights: dict,
    ) -> Tuple[List[np.ndarray], List[List[dict]]]:
    """Computes the (weighted) PMI difference per individual token in the provided sentences.

    Parameters
    ----------
    templates: sentences, each already provided as a list of tokens.
    word2pmi: mapping from word to its PMI difference. Tokens missing from
        this mapping (or mapped to None) are skipped.
    word2weights: mapping from word to its weight. It is indexed with
        ``word2weights[w]``, so it must provide a value for every word that
        has a PMI value (e.g., by being a ``defaultdict``).

    Returns
    -------
    A tuple of two lists with one entry per sentence: the first holds an
    array with ``weight * pmi`` for every token that has a PMI value; the
    second holds, for the same tokens, dictionaries with the "word" and its
    "pmi" and "weight" rounded to two decimals.
    """
    pmi_values = []
    words_with_pmi = []

    for template in templates:
        weighted_pmis = []
        pmiwords = []
        # Single pass over the tokens, looking each PMI and weight up once.
        for w in template:
            pmi = word2pmi.get(w)
            if pmi is None:
                continue
            weight = word2weights[w]
            weighted_pmis.append(weight * pmi)
            pmiwords.append({
                "word": w,
                "pmi": round(pmi, 2),
                "weight": round(weight, 2),
            })

        pmi_values.append(np.array(weighted_pmis))
        words_with_pmi.append(pmiwords)

    return pmi_values, words_with_pmi
        

    
    
# For each dataset: the pair (pmi values per sentence, words with pmi per
# sentence) returned by `compute_pmi_diff_per_sentences`, computed once for
# the NLTK tokenization ...
PMI_PER_SENTENCES_NLTK = {dataset: 
                          compute_pmi_diff_per_sentences(templates, WORD2PMI, WORD2WEIGHTS)
                          for dataset, templates in DATASET_2_CANONIC_TEMPLATES_NLTK.items()
}

# ... and once for the spaCy tokenization.
PMI_PER_SENTENCES_SPACY = {dataset: 
                          compute_pmi_diff_per_sentences(templates, WORD2PMI, WORD2WEIGHTS)
                          for dataset, templates in DATASET_2_CANONIC_TEMPLATES_SPACY.items()
}

# ---------------------------------------------------------------
# Plot the distribution of well-defined PMI values
# ---------------------------------------------------------------
# One panel per dataset. The number of panels follows DATASET_NAMES instead of
# being hard-coded to 5, so adding or removing a dataset does not break the
# cell. `squeeze=False` guarantees an array of axes even with one dataset.
fig, axes = plt.subplots(1, len(DATASET_NAMES), sharey=True, figsize=(FULL_WIDTH*2, 3), squeeze=False)
axes = axes.ravel()
for i, dataset in enumerate(DATASET_NAMES):
    ax = axes[i]
    # Compute the number of well-defined tokens with each of the approaches
    nltk_len = list(map(len, PMI_PER_SENTENCES_NLTK[dataset][0]))
    spacy_len = list(map(len, PMI_PER_SENTENCES_SPACY[dataset][0]))

    ax.spines[['right', 'top']].set_visible(False)
    sns.histplot(nltk_len, label="nltk", stat="probability", alpha=0.6, binrange=(0,20), binwidth=1, ax=ax)
    sns.histplot(spacy_len, label="spacy", stat="probability", alpha=0.6, binrange=(0,20), binwidth=1, ax=ax)

    ax.set_title(dataset)
    ax.set_xlabel("Num words")
    ax.set_ylabel("Fraction")

    # Only the first panel carries the legend. `histplot` does not draw one by
    # itself when only `label` is given, so it is created explicitly here;
    # previously no panel had a legend.
    if i == 0:
        ax.legend()
    elif ax.get_legend() is not None:
        ax.get_legend().remove()

adjust(fig)

Since the **spacy** tokenizer generally leads to a higher percentage of examples being matched with a word, we will use the **spacy**-tokenized templates to conduct the analysis (it increases the coverage of the constraints).

In [None]:
# Per dataset: PMI values per template, and the matching word/pmi/weight records.
PMI_PER_TEMPLATES = {}
PMIWORDS_PER_TEMPLATES = {}

# Change the PMI_PER_SENTENCES_SPACY with PMI_PER_SENTENCES_NLTK
# to use NLTK tokenization instead.
# for dataset, pmi_per_sents_values in PMI_PER_SENTENCES_NLTK.items():
for dataset, pmi_per_sents_values in PMI_PER_SENTENCES_SPACY.items():
    # Unpack the pair produced by `compute_pmi_diff_per_sentences`.
    pmi_vals, words_per_pmi = pmi_per_sents_values
    
    PMI_PER_TEMPLATES[dataset] = pmi_vals
    PMIWORDS_PER_TEMPLATES[dataset] = words_per_pmi

### Compute the constraint: MaxGenderPMI(s)

In this section, we compute the max gender PMI value per sentence. 

In [None]:
# Name of the column that stores the MaxGenderPMI value of each sentence.
MAXGENDER_COL = "max_gender_pmi"

def max_gender_pmi(templates_pmi: List[List[float]], col: str) -> List[dict]:
    """Compute the maximum PMI diff per sentence.

    For every sentence, the PMI value with the largest absolute value is
    selected and its sign is kept.

    Parameters
    ----------
    templates_pmi: for each sentence, the sequence (list or array) of PMI
        values of its words. (The annotation used to say ``str`` although the
        values are numeric.)
    col: name of the key under which the selected value is stored.

    Returns
    -------
    One dictionary per sentence with the keys ``col`` (the selected value, or
    None when the sentence has no PMI values), ``f"{col}_invalid"`` (True when
    no value is available) and "template_words_pmi" (the input PMI values).
    """
    def _max_pmi(lst_pmis):
        # An empty sentence has no maximum: signal it explicitly with None.
        if len(lst_pmis) == 0:
            return None
        idx = np.argmax(np.abs(lst_pmis))
        return lst_pmis[idx]

    results = []
    for template_pmi in templates_pmi:
        max_val = _max_pmi(template_pmi)
        results.append({
            col: max_val,
            f"{col}_invalid": max_val is None,
            "template_words_pmi": template_pmi,
        })

    return results

In [None]:
# Contains the max gender pmi values per sentence
MAX_GENDER_PMI = {dataset: max_gender_pmi(templates_pmi, MAXGENDER_COL) 
                  for dataset, templates_pmi in PMI_PER_TEMPLATES.items()}

MAX_GENDER_PMI_LONG = []
for dataset, lst_value_dicts in MAX_GENDER_PMI.items():
    for value_dict in lst_value_dicts:
        r = {k: v for k, v in value_dict.items()}
        r["dataset"] = dataset
        MAX_GENDER_PMI_LONG.append(r)

MAX_GENDER_PMI_LONG = pd.DataFrame(MAX_GENDER_PMI_LONG)
        
# Adds the information to the original dataset with all models
# originally, preserved in the variable DATASET_2_FILES
DATASET_W_CONSTRAINTS = {dataset: pd.DataFrame(values * NUM_EVAL_MODELS)
                  for dataset, values in MAX_GENDER_PMI.items()}

# ------------------------------------------------------------------------------
# 
#                        Dataset w/ MaxGender PMI constraint!
# 
# ------------------------------------------------------------------------------
DATASET_W_CONSTRAINTS = {
    dataset: pd.concat((DATASET_2_FILES[dataset], DATASET_W_CONSTRAINTS[dataset]), copy=True, axis=1)
    for dataset in DATASET_NAMES
}

DATASET_W_CONSTRAINTS[DATASET_NAMES[0]].columns

In [None]:
# Persist every dataset (all models, with the MaxGenderPMI columns but before
# any filtering on them) as a gzip-compressed CSV without the index.
# NOTE(review): presumably the ./datasets directory must already exist — confirm.
for dataset, df in DATASET_W_CONSTRAINTS.items():
    df.to_csv(f"./datasets/{dataset}-no-maxpmi-constraint.csv.gz", index=None)

In [None]:
# Sanity check: reload the exported USE-5 file and list the distinct values of
# its `likely_under` column.
pd.read_csv("./datasets/USE-5-no-maxpmi-constraint.csv.gz").likely_under.unique()

### Quick test about the necessity of unnaturalness checks

In this small section, we want to get an idea of how useful the semantic filters we apply with ChatGPT are. We will manually analyze the examples that are not filtered out by the $\mathrm{MaxPMI}$ constraint (the strictest we applied in the paper) but that were filtered out by the semantic checks by ChatGPT.

In [None]:
# MAX PMI Threshold
# Defined outside of the branch below: the next cell displays `pmi_threshold`
# and would otherwise raise a NameError whenever FILTER_UNNATURAL is True.
pmi_threshold = 0.5

if not FILTER_UNNATURAL:
    for dataset in ("USE-5", "USE-10", "USE-20"):
        data = DATASET_W_CONSTRAINTS[dataset].copy()
        # We don't want to double count, so filter by model
        data = data[data["model"] == MODELS[0]] 
        # We'll want to keep valid examples according to PMI
        data["valid_by_pmi"] = data["max_gender_pmi"].apply(lambda s: -pmi_threshold <= s <= pmi_threshold )
        # We want to check examples that are valid but deemed unnatural (if any)
        data = data[(data["valid_by_pmi"]) & (~data["is_natural"])]
        
        # Ten most frequent seed words among the examples removed by the naturalness filter
        print("Words seed words filtered out:", list(data["word"].value_counts().items())[:10])
        data.to_csv(f"camera_ready/analysis/unnaturalness/{dataset}_pmi_eps_{str(pmi_threshold).replace('.','')}.csv")
else:
    print("Skipping naturalness analysis check... Since unlikely filters have already been applied...")

In [None]:
# Display the MaxPMI threshold used by the naturalness check above.
pmi_threshold

**Obtain examples containing the word `word`**.

In [None]:
def get_examples_with_word(
        data_to_df: Dict[str, pd.DataFrame],
        dataset: str,
        word: str,
        word2pmi: Dict[str, float]=WORD2PMI,
    ):
    """List the examples of ``dataset`` that were generated for ``word``.

    Only the rows of the first model found in the data are considered (every
    model is expected to hold the same examples). The PMI value of ``word``
    is printed, and the matching rows are returned sorted by
    "max_gender_pmi", restricted to the "sentence", "is_natural" and
    "max_gender_pmi" columns.
    """
    examples = data_to_df[dataset].copy()

    # Deduplicate the data by keeping a single model
    reference_model = examples.model.unique()[0]
    examples = examples[examples["model"] == reference_model]

    print(f"PMI('{word}'): {word2pmi[word]:.4f}")

    shown_columns = ["sentence", "is_natural", "max_gender_pmi"]
    matching = examples[examples["word"] == word]
    ordered = matching.sort_values("max_gender_pmi")
    return ordered[shown_columns]

# Show the full sentences (no column-width truncation) for the word "common".
with pd.option_context('display.max_colwidth', None):    
    display(get_examples_with_word(DATASET_W_CONSTRAINTS, "USE-5", "common"))

**`MaxPMI` is invalid when there are sentences for which it is undefined**. 
In particular, we do not have term frequencies for stopwords. When using `nltk` tokenizer, since it differs from the tokenizer used to create the term frequencies (ie, `spacy`) there is a mismatch in some of the words tokenizations, leading to some examples not being considered.

In [None]:
# Rows of each dataset for which MaxGenderPMI could not be computed.
invalid_results = {dataset: df[df[f"{MAXGENDER_COL}_invalid"]] for dataset, df in DATASET_W_CONSTRAINTS.items()}
invalid_results

In [None]:
# Keep only the sentences w/ valid constraints
DATASET_W_CONSTRAINTS = {d: df[~df[f"{MAXGENDER_COL}_invalid"]] for d, df in DATASET_W_CONSTRAINTS.items()}

# Compute number of examples in each dataset (before filtering for gender co-occurring words)
MAX_GENDER_PMI_LONG[["dataset", MAXGENDER_COL]].groupby("dataset").describe().T[DATASET_NAMES].style.format('{:.2f}')

In [None]:
# Copy of the dataset column, used as the categorical axis of the boxplot.
MAX_GENDER_PMI_LONG["dataset_"] = MAX_GENDER_PMI_LONG.dataset

# Figure settings
fig, ax = plt.subplots(1,1, figsize=(FULL_WIDTH, 4))
# One horizontal box per dataset with the distribution of the MaxGenderPMI values.
sns.boxplot(MAX_GENDER_PMI_LONG, y="dataset_", x=MAXGENDER_COL, ax=ax, color="lightgray")
ax.spines[['right', 'top']].set_visible(False)
adjust(fig)
ax.set_xlim(-3, 3)
# NOTE(review): the label and the file name say "average", but the plotted
# column holds the per-sentence max gender PMI — confirm the intended wording.
ax.set_xlabel("Average sentence gender co-occurrence score")
ax.set_ylabel("Dataset")
save_fig(fig, "boxplot__average_sentence_maxpmi_score", dpi=150)

In [None]:
# Number of templates per dataset with a defined MaxGenderPMI value
# (`count` ignores missing values).
num_templates = MAX_GENDER_PMI_LONG.groupby("dataset").count()[[MAXGENDER_COL]]
num_templates.rename({MAXGENDER_COL: "orig_num_templates"}, axis=1, inplace=True)
num_templates

## Data Analysis - Filtering using $\eta$

In this section, we observe how the number of templates changes as we increase the max gender pmi difference. We observe that little to no evaluation examples remain after enforcing smaller values of $\mathrm{MaxPMI(s)}$. Conversely, as we relax the constraint, more and more examples are included.

In [None]:
def filter_eta_and_count_examples(
        name_and_dataset: Dict[str, pd.DataFrame],
        etas: List[float],
        col: str,
        constant: int=NUM_EVAL_MODELS,
    ) -> pd.DataFrame:
    r"""Count the examples that remain in every dataset after filtering with
    different settings of $|MaxPMI(s)| \leq \eta$.

    Parameters
    ----------
    name_and_dataset: mapping from dataset name to its dataframe, which holds
        the rows of all models (column "model") and the column ``col``.
    etas: thresholds to evaluate.
    col: name of the column that holds the MaxPMI values.
    constant: number of models stacked in every dataframe. Counts are divided
        by it so that every example is counted once.

    Returns
    -------
    A dataframe with one row per (eta, dataset) and the columns "dataset",
    "filter" (the eta), "counts" (number of remaining examples) and "freq"
    ("counts" relative to the largest count observed for that dataset, which
    is NaN when no example of the dataset passes any eta).

    Notes
    -----
    The docstring is a raw string because ``\l`` and ``\e`` are not valid
    escape sequences in a regular string literal.
    """
    # The number of models does not depend on eta: validate it once per dataset.
    for dataset, df in name_and_dataset.items():
        assert df["model"].nunique() == constant, \
            f"Expected {constant} models in dataset '{dataset}', found {df['model'].nunique()}"

    records = []
    for eta in etas:
        for dataset, df in name_and_dataset.items():
            # `between` is inclusive on both ends: -eta <= value <= eta
            counts = df[col].between(-eta, eta).sum() / constant
            records.append({"dataset": dataset, "filter": eta, "counts": counts})

    results = pd.DataFrame(records, columns=["dataset", "filter", "counts"])
    # Normalize by the largest count observed for each dataset.
    max_counts = results.groupby("dataset")["counts"].transform("max")
    results["freq"] = results["counts"] / max_counts

    return results


# 101 evenly spaced thresholds between 0.0 and 2.5, in decreasing order.
FILTERING_ETA = np.linspace(0.0, 2.5, 101)[::-1]
# Number (and fraction) of examples that remain in each dataset for every threshold.
FILTER_CURVES_RESULTS = filter_eta_and_count_examples(
    DATASET_W_CONSTRAINTS,
    FILTERING_ETA,
    MAXGENDER_COL,
    NUM_EVAL_MODELS, 
)



# Fraction of each dataset that remains as the MaxPMI threshold (eta) varies.
fig, ax = plt.subplots(1,1, figsize=(FULL_WIDTH, FULL_WIDTH*2/3))
# `ax=ax` draws on the axes created above instead of relying on the implicit
# "current axes".
sns.lineplot(
    FILTER_CURVES_RESULTS,
    x="filter",
    y="freq",
    hue="dataset",
    lw=2,
    ax=ax) #set y="counts" to plot absolute values instead


ax.spines[['right', 'top']].set_visible(False)
adjust(fig)

# Raw string: "\e" is not a valid escape sequence in a regular string literal.
ax.set_xlabel(r"$\eta$")
ax.set_ylabel("Percentage of Dataset")

#plt.legend( loc='upper left', bbox_to_anchor=(1, 1.01))
ax.legend(title="Dataset", loc="upper left", bbox_to_anchor=(0.56, 0.70))


ax.xaxis.set_major_locator(MultipleLocator(0.5))
ax.xaxis.set_minor_locator(MultipleLocator(0.25))

ax.yaxis.set_major_locator(MultipleLocator(0.20))

# Add axis formatting
ax.yaxis.set_major_formatter(PercentFormatter(1.0))  # 1.0 is to be treated as 100%
# Add grid
ax.grid(axis='y', which="major", linewidth=1, linestyle=':', color="lightgray")

# Set axis limits
ax.set_xlim((0, 2))
ax.set_ylim((0, 1))
save_fig(fig, "lineplot__datasetpct_vs_maxpmi", dpi=100)

## Fairness metrics - Fixed threshold & AUC


In [None]:
from sklearn.metrics import auc

# fairness col in natural log space
FAIRNESS_COL = "FM_logprob"

# probability space threshold
# NOTE(review): not referenced in the part of the notebook shown here;
# presumably consumed by later cells — confirm.
_FAIRNESS_THRESHOLD = 1.65


def use_log_10_base(ln_val: float) -> float:
    """Transforms natural log into log base 10.

    Uses the change-of-base identity ``log10(x) = ln(x) / ln(10)`` directly on
    the log value. The previous ``log10(exp(ln_val))`` round trip overflowed
    to ``inf`` for large positive inputs and underflowed to ``-inf`` for large
    negative ones.
    """
    return ln_val / np.log(10)


def compute_neutralpct_fixed_threshold(dataset: pd.DataFrame, eps: float, col: str):
    """Count the rows whose ``col`` value has magnitude at most ``eps``.

    Returns
    -------
    tuple
        ``(counts, freq)``: the number of rows with ``|dataset[col]| <= eps``
        and that number divided by the total number of rows in ``dataset``.
    """
    within_band = dataset[col].abs() <= eps
    n_within = within_band.sum()
    return n_within, n_within / len(dataset)


def compute_neutralpct_auc(dataset: pd.DataFrame, epsilons: List[float], col: str):
    """Evaluate the neutral fraction at every threshold and integrate the curve.

    For each ``eps`` in ``epsilons`` the number and fraction of rows with
    ``|dataset[col]| <= eps`` is computed with
    ``compute_neutralpct_fixed_threshold``.

    Returns
    -------
    tuple
        ``(curve, area)`` where ``curve`` is a DataFrame with columns
        ``fairness_eps``, ``num_examples`` and ``pct_examples`` (one row per
        threshold) and ``area`` is ``auc(fairness_eps, pct_examples)``.
    """
    rows = []
    for eps in epsilons:
        n_within, frac_within = compute_neutralpct_fixed_threshold(dataset, eps, col)
        rows.append({
            "fairness_eps": eps,
            "num_examples": n_within,
            "pct_examples": frac_within,
        })

    curve = pd.DataFrame(rows)
    area = auc(curve["fairness_eps"], curve["pct_examples"])
    return curve, area


def compute_neutralpct(data: dict, models: List[str], datasets: List[str], epsilons: List[float], col: str, use_log10: callable=None):
    """Compute the fairness curve and its AUC for every (dataset, model) pair.

    Parameters
    ----------
    data : dict
        Maps a dataset name to a DataFrame that has a ``"model"`` column and
        the metric column ``col``.
    models, datasets : list of str
        Models and datasets to evaluate.
    epsilons : list of float
        Thresholds at which the neutral fraction is evaluated.
    col : str
        Name of the metric column.
    use_log10 : callable, optional
        When given, it is applied elementwise to ``col`` and the curve is
        computed on the resulting ``f"{col}_base10"`` column instead.

    Returns
    -------
    tuple
        ``(curves, aucs)``: the concatenated per-threshold curves (with added
        ``model`` and ``dataset`` columns) and a DataFrame with columns
        ``dataset``, ``model`` and ``auc``.
    """
    results = []
    results_auc = defaultdict(list)

    for dataset in datasets:
        df = data[dataset]

        for model in models:
            # Work on a private copy so the caller's frames are never modified.
            df_model = df[df["model"] == model].copy()

            metric_col = col
            if use_log10:
                metric_col = f"{col}_base10"
                # Transform only this model's rows. Transforming the whole
                # dataset frame and relying on index alignment repeats the
                # work for every model and breaks on a duplicated index.
                df_model[metric_col] = df_model[col].apply(use_log10)

            out, out_auc = compute_neutralpct_auc(df_model, epsilons, metric_col)

            out["model"] = model
            out["dataset"] = dataset
            results.append(out)

            results_auc["dataset"].append(dataset)
            results_auc["model"].append(model)
            results_auc["auc"].append(out_auc)

    return pd.concat(results), pd.DataFrame(results_auc)

**Natural logarithm base**: To report the results in natural logarithm, use the following cell. 
While earlier versions of the paper included the natural logarithm results, in the camera-ready version of the paper we decided to use **base 10**, since it is more intuitive and easier to reason about.

**Base 10 logarithm**: To report the results for the camera ready version of the paper, we use the base 10, since it makes it easier to think about the meaning of the value in the plots. We stick to the default value of 1.65, such that the results found in earlier versions of the paper (eg, [paper at the NeurIPS SOLAR workshop in 2023](https://scholar.google.com/citations?view_op=view_citation&hl=en&user=nMwgV2UAAAAJ&sortby=pubdate&citation_for_view=nMwgV2UAAAAJ:_kc_bZDykSQC)) can be replicated.

In [None]:
# Move the probability-space threshold into base-10 log space.
FAIRNESS_THRESHOLD = np.log10(_FAIRNESS_THRESHOLD)
print(FAIRNESS_THRESHOLD)
# Upper end of the threshold grid. Because the neutral fraction never exceeds 1,
# this is also the largest possible area under the fairness curve.
MAX_AUC = 6
FAIRNESS_EPSILONS = np.linspace(0, MAX_AUC, 101)

# Per-threshold fairness curves and their AUC for every (dataset, model) pair,
# computed on the base-10 version of the fairness column.
FAIR_THRESHOLDS, FAIR_AUC = compute_neutralpct(
    DATASET_W_CONSTRAINTS,
    MODELS,
    DATASET_NAMES,
    FAIRNESS_EPSILONS,
    FAIRNESS_COL,
    use_log10=use_log_10_base,
)

In [None]:
# Distribution, across models, of the area under the fairness curve per dataset.
sns.boxplot(FAIR_AUC, y="dataset", x="auc")
plt.axvline(MAX_AUC, ls="--", color="black", label="max auc")
plt.ylabel("Dataset")
# Raw f-string: "\e" and "\i" are not valid Python escape sequences.
plt.xlabel(rf"Area under the fairness curve ($\epsilon \in [{FAIRNESS_EPSILONS[0]}, {FAIRNESS_EPSILONS[-1]}]$)")

### Fairness AUC table


In [None]:
# Transform the long table into a wide table (one row per model, one column per dataset).
# "USE-5" is relabelled "USE-05" in a new `dataset_` column, which is added to
# FAIR_AUC in place; `print_results_aufc` pivots on that same column.
FAIR_AUC["dataset_"] = FAIR_AUC["dataset"].apply(lambda x: x if x != "USE-5" else "USE-05")
pd.pivot_table(FAIR_AUC, index="model", values=["auc"], columns=["dataset_"]).style.format('{:.2f}')

In [None]:
def fairness_threshold_plots(fairthresholds, fairauc, datasetnames, models, use_exp=None):
    """Draw, save and show one fairness-curve line plot per dataset.

    Parameters
    ----------
    fairthresholds : pd.DataFrame
        Long table with (at least) the columns ``dataset``, ``model``,
        ``fairness_eps`` and ``pct_examples``.
    fairauc : pd.DataFrame
        Kept for signature compatibility; the plots do not use it.
    datasetnames : iterable of str
        Datasets to plot (one figure each).
    models : tuple
        ``(model_names, tag)``; ``tag`` is used in the saved file name.
    use_exp : callable, optional
        When given, it is applied elementwise to ``fairness_eps`` before
        plotting (e.g. to move the x axis out of log space).
    """
    model_names, tag = models[0], models[1]
    # With pythia-only groups the "pythia-" prefix is dropped to shorten the legend.
    only_pythia = all("pythia" in m for m in model_names)

    for dataset in datasetnames:
        # Rows of the requested dataset and models. Copy *after* filtering so
        # the column assignments below never touch a slice of another frame.
        selected = (fairthresholds["dataset"] == dataset) & fairthresholds["model"].isin(model_names)
        ft_df = fairthresholds[selected].copy()

        ft_df["Deduplicated"] = ft_df["model"].apply(lambda x: "(D)" in x)
        ft_df["Model"] = ft_df["model"].apply(lambda x: x.replace(" (D)", ""))
        if only_pythia:
            ft_df["Model"] = ft_df["Model"].apply(lambda x: x.replace("pythia-", ""))

        if use_exp is not None:
            ft_df["fairness_eps"] = ft_df["fairness_eps"].apply(use_exp)

        fig, ax = plt.subplots(1, 1, figsize=(FULL_WIDTH/2, 2))
        ax.set_xlim((0, 5))
        adjust(fig)
        ax.spines[['right', 'top']].set_visible(False)

        # One line (colour) per model. When both original and deduplicated
        # variants are present, they share the colour but differ in line style.
        kwargs = dict(style="Deduplicated") if ft_df["Deduplicated"].nunique() > 1 else dict()
        sns.lineplot(ft_df, x="fairness_eps", y="pct_examples", hue="Model", lw=2, ax=ax, **kwargs)

        ax.set_title(dataset, fontsize=12)
        ax.set_xlabel("threshold", fontsize=12)
        ax.set_ylabel("fairness metric", fontsize=12)
        ax.set_ylim((0, 1))

        ax.xaxis.set_major_locator(MultipleLocator(1))
        ax.xaxis.set_minor_locator(MultipleLocator(0.5))
        ax.yaxis.set_major_locator(MultipleLocator(0.20))
        ax.yaxis.set_major_formatter(PercentFormatter(1.0))  # 1.0 is to be treated as 100%

        ax.grid(axis='x', which="major", linewidth=1, linestyle='--', color="lightgray")
        ax.grid(axis='x', which="minor", linewidth=1, linestyle=':', color="lightgray")

        ax.tick_params(axis='both', which='major', labelsize=12)
        ax.tick_params(axis='both', which='minor', labelsize=8)

        ax.legend(loc="upper left", bbox_to_anchor=(0.5, 0.9), fontsize=12)
        save_fig(fig, f"lineplot__{dataset}_{tag}_in_func_eps", dpi=100)
        plt.show()
        # Release the figure; this loop creates one per dataset.
        plt.close(fig)

### AuFC: Pythia models

In [None]:
# Model groups used by the plotting helpers. Each group is a
# (list of model names, tag) tuple; the tag ends up in the saved file names.
# A " (D)" suffix is what the plotting code uses to flag deduplicated variants.
pythia_models = [
    'pythia-70m',
    'pythia-70m (D)',
    # 'pythia-2.8b',
    # 'pythia-2.8b (D)',
    'pythia-6.9b',
    'pythia-6.9b (D)',
    'pythia-12b',
    'pythia-12b (D)',
    # 'gpt-j-6b'
], "pythia"

opt_models = [
    'opt-125m',
    'opt-2.7b',
    'opt-350m',
    'opt-6.7b',
], "opt"

misc_models = [
    'llama-2-13b',
    'llama-2-7b',
    'llama-2-70b',
    'mpt-30b',
    'mpt-7b',
    "OLMo-1B",
    "OLMo-7B",
    "Mistral-7B-v0.1",
    "Mixtral-8x7B-v0.1",
], "others"

In [None]:
fairness_threshold_plots(FAIR_THRESHOLDS, FAIR_AUC, DATASET_NAMES, pythia_models)

In [None]:
## uncomment expression below if you want to plot the x axis in the probability space
# (it assumes that fair thresholds and fair auc were previously computed in the log 10.)
# fairness_threshold_plots(FAIR_THRESHOLDS, FAIR_AUC, DATASET_NAMES, pythia_models, use_exp=lambda x: 10**x)

### AuFC: OPT models

In [None]:
fairness_threshold_plots(FAIR_THRESHOLDS, FAIR_AUC, DATASET_NAMES, opt_models)

### AuFC: other models (Llama-2, MPT, OLMo, Mistral/Mixtral)

In [None]:
fairness_threshold_plots(FAIR_THRESHOLDS, FAIR_AUC, DATASET_NAMES, misc_models)

Let us create the grid for the fairness threshold picture in the paper.

In [None]:
def individual_fairness_threshold_plot(fairthresholds, fairauc, dataset, models, max_auc, ax, use_exp=None, simplify=True):
    """Draw the fairness curves of ``models`` on ``dataset`` into ``ax``.

    Parameters
    ----------
    fairthresholds : pd.DataFrame
        Long table with (at least) the columns ``dataset``, ``model``,
        ``fairness_eps`` and ``pct_examples``.
    fairauc : pd.DataFrame
        Kept for signature compatibility; the plot does not use it.
    dataset : str
        Dataset to plot; also used as the panel title.
    models : list of str
        Models to include.
    max_auc : float
        Upper limit of the x axis.
    ax : matplotlib Axes
        Axes to draw on.
    use_exp : callable, optional
        When given, it is applied elementwise to ``fairness_eps`` before plotting.
    simplify : bool
        When True and every model name contains "pythia", the "pythia-"
        prefix is dropped from the legend labels.
    """
    # Copy *after* filtering so the column assignments below never touch a
    # slice of another frame.
    selected = (fairthresholds["dataset"] == dataset) & fairthresholds["model"].isin(models)
    ft_df = fairthresholds[selected].copy()

    ft_df["Original"] = ft_df["model"].apply(lambda x: "No" if "(D)" in x else "Yes")
    ft_df["Model"] = ft_df["model"].apply(lambda x: x.replace(" (D)", ""))

    if simplify and all("pythia" in m for m in models):
        ft_df["Model"] = ft_df["Model"].apply(lambda x: x.replace("pythia-", ""))

    if use_exp is not None:
        ft_df["fairness_eps"] = ft_df["fairness_eps"].apply(use_exp)

    # Same colour per model; the line style separates original from deduplicated
    # variants, but only when both kinds are present.
    kwargs = {"style": "Original"} if ft_df["Original"].nunique() > 1 else {}
    sns.lineplot(ft_df, x="fairness_eps", y="pct_examples", hue="Model", lw=1, ax=ax, alpha=0.8, **kwargs)

    ax.set_title(dataset, fontsize=15)
    ax.set_xlabel("threshold")
    ax.set_ylabel("fairness metric")
    ax.set_xlim((0, max_auc))
    ax.set_ylim((0, 1))

    ax.xaxis.set_major_locator(MultipleLocator(2))
    ax.xaxis.set_minor_locator(MultipleLocator(1))
    ax.yaxis.set_major_locator(MultipleLocator(0.25))

    ax.grid(axis='x', which="major", linewidth=1, linestyle='--', color="lightgray")

    ax.legend(loc="upper left", bbox_to_anchor=(0.40, 0.75), fontsize=12)
    
    
# Separate plotting the data from formatting the figure
def plot_results_fairness(ax, name, **kwargs):
    """Plot the fairness curves of dataset ``name`` into ``ax``.

    Picks the x-axis limit for the dataset (3 for Winobias and Winogender,
    ``MAX_AUC`` for the USE-* datasets) and forwards everything else to
    ``individual_fairness_threshold_plot``.

    Raises
    ------
    NotImplementedError
        If ``name`` is not one of the known datasets.
    """
    if name in ("Winobias", "Winogender"):
        max_auc = 3
    elif name in ("USE-5", "USE-10", "USE-20"):
        max_auc = MAX_AUC
    else:
        # `NotImplemented` is a constant, not an exception class: calling it
        # fails with a TypeError instead of reporting the message below.
        raise NotImplementedError(f"Unexpected plot: {name}")

    individual_fairness_threshold_plot(dataset=name, ax=ax, max_auc=max_auc, **kwargs)

    
def make_figure(is_horizontal, plot_results, dataset_names, models, **kwargs):
    """Build a multi-panel figure with one panel per entry of ``dataset_names``.

    Parameters
    ----------
    is_horizontal : bool
        Lay the panels out in one row (True) or in one column (False).
    plot_results : callable
        Called as ``plot_results(ax, name, models=..., **kwargs)`` for each panel.
    dataset_names : list of str
        Panel names, in display order. Only the last panel keeps its legend.
    models : tuple
        ``(model_names, tag)``; ``model_names`` is forwarded to
        ``plot_results`` and ``tag`` is returned to the caller.
    **kwargs
        Forwarded to ``plot_results``.

    Returns
    -------
    tuple
        ``(fig, tag)``.
    """
    models, tag = models
    gap = 0.2  # relative size of the blank spacer between neighbouring panels

    # Interleave every panel with a "." spacer cell, then drop the trailing spacer.
    cells, ratios = [], []
    for name in dataset_names:
        cells += [name, "."]
        ratios += [1, gap]
    cells, ratios = cells[:-1], ratios[:-1]

    if is_horizontal:
        if len(dataset_names) == 5:
            # Hand-tuned widths for the five-panel figure: the second and
            # third panels are narrower than the others.
            ratios = [1, 0.2, 0.75, 0.2, 0.75, 0.2, 1, 0.2, 1]

        fig, axd = plt.subplot_mosaic(
            mosaic=[cells],
            gridspec_kw={"width_ratios": ratios},
            figsize=(FULL_WIDTH, 2),
            sharey=True,
        )
    else:
        # The column layout is built from `dataset_names` too. A fixed
        # "A"/"B"/"C" mosaic would hand labels to `plot_results` that are not
        # the requested datasets.
        fig, axd = plt.subplot_mosaic(
            mosaic=[[cell] for cell in cells],
            gridspec_kw={"height_ratios": ratios},
            figsize=(2, FULL_WIDTH),
            sharey=True,
        )

    adjust(fig)

    last_name = dataset_names[-1]
    for name, ax in axd.items():
        plot_results(ax, name, models=models, **kwargs)
        ax.spines[['right', 'top']].set_visible(False)
        ax.tick_params(axis='both', which='major', labelsize=14)
        ax.set_xlabel("threshold", fontsize=14)
        ax.set_ylabel("fairness metric", fontsize=14)

        if name != last_name:
            # Hide the per-panel legend everywhere but on the last panel.
            ax.legend([], [], frameon=False)
        else:
            ax.legend(loc="upper center", ncol=1, bbox_to_anchor=(0.8, 0.95), fontsize=12)

    return fig, tag


# One figure per model family, with a panel for every dataset in DATASET_NAMES.
for models in [pythia_models, opt_models, misc_models]:
    fig, tag = make_figure(
        is_horizontal=True,
        plot_results=plot_results_fairness,
        dataset_names=DATASET_NAMES,
        models=models,
        fairthresholds=FAIR_THRESHOLDS,
        fairauc=FAIR_AUC,
        simplify=False,
    )
    save_fig(fig, f"lineplots5__{tag}__fairness_metric_in_func_eps", dpi=150)

In [None]:
# One figure per model family: the first dataset plus the last three of DATASET_NAMES.
for models in [pythia_models, opt_models, misc_models]:
    fig, tag = make_figure(
        is_horizontal=True,
        plot_results=plot_results_fairness,
        dataset_names=DATASET_NAMES[0:1] + DATASET_NAMES[-3:],
        models=models,
        fairthresholds=FAIR_THRESHOLDS,
        fairauc=FAIR_AUC,
        simplify=False,
    )
    save_fig(fig, f"lineplots4__ours__{tag}__fairness_metric_in_func_eps", dpi=100)

In [None]:
# One figure per model family: the first dataset plus the last two of DATASET_NAMES.
# `simplify` is left at its default here, so pythia legends drop the "pythia-" prefix.
for models in [pythia_models, opt_models, misc_models]:
    fig, tag = make_figure(
        is_horizontal=True,
        plot_results=plot_results_fairness,
        dataset_names=DATASET_NAMES[0:1] + DATASET_NAMES[-2:],
        models=models,
        fairthresholds=FAIR_THRESHOLDS,
        fairauc=FAIR_AUC,
    )
    save_fig(fig, f"lineplots3_ours__{tag}__fairness_metric_in_func_eps", dpi=100)

In [None]:
# One figure per model family: the second and third datasets of DATASET_NAMES.
for models in [pythia_models, opt_models, misc_models]:
    fig, tag = make_figure(
        is_horizontal=True,
        plot_results=plot_results_fairness,
        dataset_names=DATASET_NAMES[1:3],
        models=models,
        fairthresholds=FAIR_THRESHOLDS,
        fairauc=FAIR_AUC,
        simplify=False,
    )
    save_fig(fig, f"lineplots2_others__{tag}__fairness_metric_in_func_eps", dpi=100)

## Fairness neutrality: a preliminary analysis

In our paper, we introduce two different thresholds:
1. The threshold on $\mathrm{MaxPMI}$, i.e., $\varepsilon_k$.
2. The threshold on the fairness metric, i.e., $\varepsilon_f$.

Initially, we studied the correlation between $\mathrm{MaxPMI}$ and the log-odds of each sentence, but found no obvious pattern (see the plots below).

Let us explore the correlation between the two values; that is, we aim to understand how the female-to-male log-probability ratio (i.e., the log odds) changes with the maximum value $\mathrm{MaxPMI}$. To do this, we will plot the correlation between `max_gender_pmi` and `FM_logprob`.

We will create a $\log_{10}$ variant of `FM_logprob` so that the camera-ready version of the paper is updated with a more intuitive measure. Since `FM_logprob` does not show an obvious pattern as a function of $\mathrm{MaxPMI}$, let us compute the fairness metric.

In [None]:
# Winobias, pythia-6.9b only: MaxPMI vs. the base-10 version of the fairness
# score, coloured by the `is_deduped` column.
df = DATASET_W_CONSTRAINTS["Winobias"].copy()
df["FM_log10_prob"] = df["FM_logprob"].apply(use_log_10_base)
df = df[df["model"].isin(("pythia-6.9b",))]

sns.jointplot(df, x="max_gender_pmi", y="FM_log10_prob", hue="is_deduped")

In [None]:
# Same view as the previous cell but without the hue split; small, translucent
# markers make dense regions easier to read.
df = DATASET_W_CONSTRAINTS["Winobias"].copy()
df["FM_log10_prob"] = df["FM_logprob"].apply(use_log_10_base)
df = df[df["model"].isin(("pythia-6.9b",))]

sns.jointplot(df, x="max_gender_pmi", y="FM_log10_prob", joint_kws={"alpha": 0.5, "s": 8, "linewidth": 0})

In [None]:
# Winobias, Mistral-7B-v0.1 only: 2-D density of MaxPMI vs. the base-10
# fairness score, first as a filled density and then as contour lines.
df = DATASET_W_CONSTRAINTS["Winobias"].copy()
df["FM_log10_prob"] = df["FM_logprob"].apply(use_log_10_base)
df = df[df["model"].isin(("Mistral-7B-v0.1",))]

sns.kdeplot(df, x="max_gender_pmi", y="FM_log10_prob", fill=True, levels=100, thresh=0, cmap="mako")
plt.show()
sns.kdeplot(df, x="max_gender_pmi", y="FM_log10_prob")
plt.show()

## Fairness Neutrality, Unstereo Score (US)

In this section, we aim to compute the different skews of the models for various constrained settings. 
In particular, we will compute:

1. **Fairness metric**: focus on the computation of the neutral examples, i.e., the examples whose test sentence pair likelihoods are within a multiplicative factor of $\exp(\epsilon_f)$ of each other.
2. Difference in predicted female vs predicted male: if the sentences are not being predicted neutral, how is the model assigning the probability? 

In [None]:
FAIRNESS_THRESHOLD, FAIRNESS_COL

In [None]:
import glob, operator


def filter_data_by_col_val(data: pd.DataFrame, col: str, thres: float):
    """Keep the rows of ``data`` whose ``col`` value lies in ``[-thres, thres]`` (bounds included)."""
    in_band = data[col].between(-thres, thres)
    return data[in_band]


def is_neutral(df, col=FAIRNESS_COL, threshold: float=FAIRNESS_THRESHOLD):
    """Return a boolean Series flagging rows with ``-threshold <= df[col] <= threshold``.

    ``threshold`` must lie in ``[0, 1]`` and ``col`` must be a column of ``df``
    (both are enforced with assertions).
    """
    assert threshold >= 0 and threshold <= 1
    assert col in df.columns
    return df[col].between(-threshold, threshold)


def get_skew(df, col=FAIRNESS_COL, threshold: float=FAIRNESS_THRESHOLD):
    """Label every row as "male", "female" or "neutral" from the sign of ``df[col]``.

    Values below ``-threshold`` are labelled "male", values above
    ``threshold`` "female", and everything else "neutral". The result is a
    Series named "skew" that shares the index of ``df``; ``df`` itself is not
    modified.
    """
    assert threshold >= 0 and threshold <= 1
    assert col in df.columns

    scores = df[col]
    skew = pd.Series(["neutral"] * len(df), index=df.index, name="skew")
    skew.loc[scores < -threshold] = "male"
    skew.loc[scores > threshold] = "female"
    return skew


def get_bins(val, max_val=100, edges=(15, 10, 5, 2.5, 1, FAIRNESS_THRESHOLD)):
    """Map a fairness score to the ``pd.Interval`` (bin) it falls into.

    The bins are symmetric around zero:

    * the central bin ``[-t, t]`` (closed on both sides), where ``t`` is the
      last entry of ``edges``;
    * for positive values, right-closed bins ``(lower, upper]`` between
      consecutive edges, with ``max_val`` as the outermost edge;
    * for negative values, the mirrored right-closed bins, except the one
      adjacent to the central bin, which is open on both sides because ``-t``
      already belongs to the central bin.

    Values whose magnitude exceeds ``max_val`` are still assigned to the
    outermost bin. If no edge matches (e.g. ``val`` is NaN) the function falls
    off the end of the loop and returns ``None``.

    NOTE(review): the logic assumes ``edges`` is positive and sorted in
    decreasing order, ending with the neutrality threshold -- confirm for
    callers that pass custom edges.
    """
    __base_interval = pd.Interval(-edges[-1], edges[-1], closed="both")
    sign = np.sign(val)
    threshold = edges[-1]

    # Central (neutral) bin.
    if sign == 0 or  -threshold <= val <= threshold:
        return __base_interval

    # Positive values are matched with "val > edge", negative ones with
    # "val <= edge", against the edges mirrored to the sign of `val`.
    op = operator.gt if sign > 0 else operator.le
    edges = [sign * max_val] + [e * sign for e in edges]

    # Walk from the outermost edge inwards; the first edge that matches
    # determines the bin, together with the previous (more extreme) edge.
    for i in range(1, len(edges)):
        if op(val, edges[i]):
            e1, e2 = edges[i-1], edges[i]
            # Order the pair so the interval's left bound is the smaller one.
            bins = (e1, e2) if sign < 0 else (e2, e1)
            return pd.Interval(*bins, closed="neither" if sign < 0 and bins[-1] == -threshold else "right")
        

def compute_skews_(data_files: dict, fairness_col, fairness_threshold, use_base_10: callable=None):
    """Annotate every dataset with fairness bins, a neutrality flag and a skew label.

    For each ``name -> DataFrame`` entry a copy of the frame is extended with:

    * ``f"{fairness_col}_base10"`` (only when ``use_base_10`` is given): the
      fairness column transformed elementwise by ``use_base_10``;
    * ``f"{metric}_bins"``: the ``get_bins`` interval of each score;
    * ``is_neutral``: whether the score is within ``±fairness_threshold``;
    * ``skew``: "male" / "female" / "neutral". Positive scores are assumed to
      skew female and negative ones male.

    ``metric`` is the base-10 column when ``use_base_10`` is given and
    ``fairness_col`` otherwise. The column name and threshold are printed once
    per dataset. Returns a new dict with the annotated copies.
    """
    metric_col = f"{fairness_col}_base10" if use_base_10 else fairness_col
    bin_edges = (15, 10, 5, 2.5, 1, fairness_threshold)

    def _to_bin(score):
        return get_bins(val=score, max_val=100, edges=bin_edges)

    annotated = {}
    for name, frame in data_files.items():
        out = frame.copy()

        if use_base_10:
            out[metric_col] = out[fairness_col].apply(use_base_10)

        out[f"{metric_col}_bins"] = out[metric_col].apply(_to_bin)
        out["is_neutral"] = is_neutral(out, metric_col, fairness_threshold)
        print(metric_col, fairness_threshold)
        out["skew"] = get_skew(out, metric_col, fairness_threshold)

        annotated[name] = out

    return annotated

print("-"*80)
print(f"Using threshold: {FAIRNESS_THRESHOLD:.4f} to compute fairness metric")
print("-"*80)

# Work on copies so DATASET_W_CONSTRAINTS itself is left untouched.
BEFORE_FILTER = {dataset: df.copy() for dataset, df in DATASET_W_CONSTRAINTS.items()}

# Use this version to use the natural logarithm
# BEFORE_FILTER = compute_skews_(BEFORE_FILTER, FAIRNESS_COL, 0.5)
# Use this version to use the base 10 results: adds the base-10 fairness column,
# its bins, the `is_neutral` flag and the `skew` label to every dataset.
BEFORE_FILTER = compute_skews_(BEFORE_FILTER, FAIRNESS_COL, FAIRNESS_THRESHOLD, use_base_10=use_log_10_base)

In [None]:
BEFORE_FILTER["USE-5"]

### Neutrality and AuFC (per constrained setting)

While we propose a pipeline to create benchmarks that satisfy the gender co-occurrence constraints, in our experiments we do not immediately restrict our benchmarks, because we would like to study the effect of stricter PMI constraints. For that reason, in the following setting, we compute the value of Neutrality and AuFC for $\eta \in \{0.3, 0.5, 0.65, 0.8, 1\}$. The strictest setup is $\eta = 0.3$ and the least strict is $\eta = 1$. The original unconstrained version of the dataset (stored in variable `BEFORE_FILTER[<dataset>]`) is denoted $\eta = \infty$ in the paper.

In [None]:
PMI_THRESHOLDS = [0.3, 0.5, 0.65, 0.8, 1.0]
print(f"Fairness col: '{FAIRNESS_COL}' and threshold: '{FAIRNESS_THRESHOLD}'")
# AFTER_FILTER maps: PMI threshold -> dataset name -> filtered DataFrame.
AFTER_FILTER = {}
# Filter out the dataset_w_constraints according to the different PMI thresholds (or \epsilon_k):
# keep only the rows whose MAXGENDER_COL value lies within [-threshold, threshold].
for pmi_threshold in PMI_THRESHOLDS:
    AFTER_FILTER[pmi_threshold] = {
        dataset: filter_data_by_col_val(df, col=MAXGENDER_COL, thres=pmi_threshold).copy()
        for dataset, df in BEFORE_FILTER.items()
    }

# Recompute the bins / neutrality / skew annotations on every filtered dataset.
AFTER_FILTER = {
    filt: compute_skews_(bias_files, FAIRNESS_COL, FAIRNESS_THRESHOLD, use_base_10=use_log_10_base) for filt, bias_files in AFTER_FILTER.items()
}

### Manually selecting sentences for figure 1 in paper :3 

We restricted ourselves to looking for a female sentence in USE-5 since it would be easier to plot in a single column paper!

In [None]:
dataset, maxpmi_filter = "USE-5", 0.5
# Number of rows per (sentence, skew) pair in the filtered dataset, stored in a
# column named "preferences".
counts = AFTER_FILTER[maxpmi_filter][dataset].groupby(["sentence", "skew"]).count()["orig_index"].reset_index().rename({"orig_index": "preferences"}, axis=1)

# Show the (up to) five sentences with the most "female" rows (at least 12),
# without truncating the sentence text.
with pd.option_context('display.max_colwidth', None):
    df = counts[(counts["skew"] == "female") & (counts["preferences"] >= 12)]
    df = df.sort_values("preferences", ascending=False).head()
    display(df)

In [None]:
counts[counts["sentence"].isin([
    # "He became ruder as time passed."
    # "He plans adventurous vacations every year.",
    # "He confidently led the discussion, heads eagerly following his every word." 
    # "The flight cancelation affected him.",
    # "We appreciate that she's here.",
    "She disowns her toxic relationships."
])]

In [None]:
def compute_neutral_pct_w_std(data2files: dict):
    """Per-model percentage of neutral examples, with its standard error.

    Parameters
    ----------
    data2files : dict
        Maps a dataset name to a DataFrame with a ``model`` column and a
        boolean ``is_neutral`` column.

    Returns
    -------
    pd.DataFrame
        One row per (dataset, model) with the columns ``dataset`` ("USE-5" is
        reported as "USE-05"), ``model``, ``neutral_avg`` (percentage),
        ``neutral_std`` (percentage) and ``neutral_final`` (a LaTeX string
        ``$avg_{\\pm std}$`` with two decimals).

    Notes
    -----
    The spread is the standard error of a Bernoulli proportion,
    ``sqrt(p * (1 - p) / n)``. ``n`` is the number of examples each model was
    actually evaluated on in the given (possibly filtered) dataset. It is not
    a fixed count borrowed from the unfiltered USE-5 benchmark, which would
    misstate the error for every other dataset and for every filter.
    """
    results = defaultdict(list)
    for dataset, df in data2files.items():
        per_model = df.groupby("model")["is_neutral"]
        neutral_frac = per_model.mean()   # fraction of neutral examples, in [0, 1]
        n_examples = per_model.count()    # sample size behind each fraction

        neutral_mean = neutral_frac * 100
        neutral_std = np.sqrt(neutral_frac * (1 - neutral_frac) / n_examples) * 100

        label = "USE-05" if dataset == "USE-5" else dataset
        results["dataset"].extend([label] * len(neutral_mean))
        results["model"].extend(neutral_mean.index.tolist())
        results["neutral_avg"].extend(neutral_mean.tolist())
        results["neutral_std"].extend(neutral_std.tolist())

        final_repr = "$" + neutral_mean.map('{:.2f}'.format) + "_{\\pm " + neutral_std.round(2).map('{:.2f}'.format) + "}$"
        results["neutral_final"].extend(final_repr.tolist())

    return pd.DataFrame(results)
    
def compute_female_male_skews(data2files: dict, model_names=MODELS):
    """Summarise, per dataset and model, how the skew labels split between female and male.

    Parameters
    ----------
    data2files : dict
        Maps a dataset name to a DataFrame with ``model``, ``skew`` and
        ``template`` columns.
    model_names : list of str
        Models to report; each must be present in every dataset (a missing
        model raises ``KeyError``).

    Returns
    -------
    pd.DataFrame
        One row per (dataset, model), rounded to two decimals, with:
        ``total`` (rows counted for the model), ``pct_fem`` / ``pct_mal``
        (percentage of all rows labelled female / male), ``counts_fem`` /
        ``counts_mal``, ``partial_pct_mal`` / ``partial_pct_fem`` (share of
        male / female among the non-neutral rows only; NaN when the model has
        no non-neutral row) and ``pct_fem_min_mal`` (``pct_fem - pct_mal``
        formatted as a two-decimal string).
    """
    results = defaultdict(list)
    for dataset, df in data2files.items():
        counts = df.groupby(["model", "skew"])["template"].count()

        for model in model_names:
            model_counts = counts[model]
            model_total = model_counts.sum()
            n_fem = model_counts.get("female", 0)
            n_mal = model_counts.get("male", 0)

            # A model whose rows are all neutral has no gendered rows; report
            # NaN instead of dividing by zero.
            n_gendered = n_fem + n_mal
            rel_mal = n_mal / n_gendered if n_gendered > 0 else float("nan")

            pct_fem = n_fem / model_total * 100
            pct_mal = n_mal / model_total * 100

            results["dataset"].append(dataset if dataset != "USE-5" else "USE-05")
            results["model"].append(model)
            results["total"].append(model_total)
            results["pct_fem"].append(pct_fem)
            results["pct_mal"].append(pct_mal)
            results["counts_fem"].append(n_fem)
            results["counts_mal"].append(n_mal)
            results["partial_pct_mal"].append(rel_mal)
            results["partial_pct_fem"].append(1 - rel_mal)

            pct_diff = round(pct_fem - pct_mal, 2)
            results["pct_fem_min_mal"].append(f"{pct_diff:.2f}")

    return pd.DataFrame(results).round(2)


def merge_results(data2files) -> pd.DataFrame:
    """Join the neutrality table and the female/male skew table on (dataset, model)."""
    neutrality = compute_neutral_pct_w_std(data2files)
    skews = compute_female_male_skews(data2files)
    return neutrality.merge(skews, on=["dataset", "model"], how="inner")


In [None]:
# Neutrality + skew metrics: once for the unfiltered data, once per PMI threshold.
METRICS_BEFORE_FILTER = merge_results(BEFORE_FILTER)
METRICS_AFTER_FILTER = {
    eta: merge_results(filtered_files)
    for eta, filtered_files in AFTER_FILTER.items()
}

### Fraction of predicted male vs predicted female

In [None]:
def plot_relative_male_pref_plots(df, model_names, tag, dpi=150):
    """Bar plot of the relative male preference per dataset, one bar per model.

    Parameters
    ----------
    df : pd.DataFrame
        Metrics table with ``model``, ``dataset`` and ``partial_pct_mal`` columns.
    model_names : list of str
        Models to plot. Names containing "(D)" are drawn in a second pass on
        the same axes, after all the other models.
    tag : str
        Suffix of the saved file name.
    dpi : int
        Resolution passed to ``save_fig``.
    """
    dedup_models = [name for name in model_names if "(D)" in name]
    orig_models = [name for name in model_names if "(D)" not in name]

    fig, ax = plt.subplots(1, 1, figsize=(FULL_WIDTH, 3))
    ax.spines[['right', 'top']].set_visible(False)

    sns.barplot(df[df["model"].isin(orig_models)], y="dataset", x="partial_pct_mal", hue="model", ax=ax, hue_order=orig_models)
    if dedup_models:
        sns.barplot(df[df["model"].isin(dedup_models)], y="dataset", x="partial_pct_mal", hue="model", ax=ax, hue_order=dedup_models)

    ax.set_xlim((0, 1))

    ax.xaxis.set_minor_locator(MultipleLocator(0.1))
    ax.xaxis.set_major_locator(MultipleLocator(0.2))
    ax.xaxis.set_major_formatter(PercentFormatter(1.0))  # 1.0 is to be treated as 100%
    adjust(fig)

    ax.set_ylabel("Dataset")
    ax.set_xlabel("Relative male preference")

    # At least one legend column: len(model_names)//2 is 0 for a single model,
    # which matplotlib rejects.
    legend_cols = max(1, len(model_names)//2)
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.26), ncol=legend_cols, fancybox=True, shadow=False, fontsize=12)
    save_fig(fig, f"barplots__male__relpref__{tag}", dpi=dpi)
    plt.show()
    # Release the figure; callers invoke this helper in loops.
    plt.close(fig)

In [None]:
# Additional (model names, tag) groups for the relative-preference bar plots.
pythia_models_small = [
 'pythia-70m',
 'pythia-70m (D)',

 'pythia-1.4b',
 'pythia-1.4b (D)',
 #'pythia-160m',
 # 'pythia-160m (D)',
 'pythia-2.8b',
 'pythia-2.8b (D)',
 #'pythia-410m',
 # 'pythia-410m (D)',
], "pythia-small"

pythia_models_large = [
 'gpt-j-6b',
 'pythia-6.9b',
 'pythia-6.9b (D)',
 'pythia-12b',
 'pythia-12b (D)',
], "pythia-large"


# One bar plot per (PMI filter, model family). The loop variables are actually
# used and no iteration is cut short, so every combination gets its figure
# rather than only the first filter with the small pythia models.
for maxpmi_eps, remain_data in METRICS_AFTER_FILTER.items():
    for models in [pythia_models_small, pythia_models_large, opt_models, misc_models]:
        family_models, family_tag = models
        plot_relative_male_pref_plots(remain_data, family_models, f"__maxpmi{maxpmi_eps}__{family_tag}")

#### Plot relative preferences across models 

More interestingly, we want to be able to compare groups of models for the same dataset. 
- Is there any scaling law by family of models? 
- How do different models perform in different filters of the dataset.

In [None]:
# version 1: x axis are different models, sorted by 

#### Number of examples before and after the filters

In [None]:
# Every frame holds one row per (example, model) pair, so dividing the row
# count by NUM_EVAL_MODELS yields the number of examples.
# NOTE(review): assumes every model was evaluated on every example -- confirm.
print("All examples:")
print({dataset: len(df) / NUM_EVAL_MODELS for dataset, df in BEFORE_FILTER.items()})


for eps, eps_values in AFTER_FILTER.items():
    print()
    print("Number of examples after filter", eps)
    print({dataset: len(df) / NUM_EVAL_MODELS for dataset, df in eps_values.items()})

### Create tables

In [None]:
def model2latex(model: str):
    """Turn a model name into the LaTeX macro call used in the paper's tables.

    Examples: ``"pythia-70m"`` -> ``\\pyths{70m}``, ``"opt-125m"`` ->
    ``\\opts{125m}``, ``"mpt-7b"`` -> ``\\mpts{7b}``, ``"llama-2-7b"`` ->
    ``\\llamas{7b}``, any name containing "gpt-j" -> ``\\gptj``. Names of other
    families are returned unchanged.
    """
    # Local import keeps the helper self-contained: `re` is not part of the
    # notebook's import cell.
    import re

    # (substring identifying the family, pattern to rewrite, macro template);
    # the order matters and mirrors the precedence of the family checks.
    families = (
        ("pythia", r"pythia-(.+)", r"pyths{\1}"),
        ("opt", r"opt-(.+)", r"opts{\1}"),
        ("mpt", r"mpt-(.+)", r"mpts{\1}"),
        ("llama-2", r"llama-2-(.+)", r"llamas{\1}"),
    )
    for marker, pattern, macro in families:
        if marker in model:
            return "\\" + re.sub(pattern, macro, model)

    if "gpt-j" in model:
        return "\\" + "gptj"
    return model
        

def print_results(data, value):
    """Print, as LaTeX, the model-by-dataset table of the metric ``value``.

    The table itself is built by ``get_results`` (one row per model, with the
    model names rendered as LaTeX macros), so both helpers always agree.
    """
    print(get_results(data, value).to_latex())

    
def get_results(data, value):
    """Return the model-by-dataset table of the metric ``value``.

    ``data`` is a long table with ``model`` and ``dataset`` columns plus the
    ``value`` column. The result has one row per model (indexed by the model
    name rendered through ``model2latex``) and one column per dataset.
    """
    wide = (
        pd.pivot(data, values=[value], index="model", columns=["dataset"])
        .droplevel(None, axis=1)      # drop the outer column level introduced by the pivot
        .rename_axis(None, axis=1)
        .reset_index()
    )
    wide["model"] = [model2latex(name) for name in wide["model"]]
    return wide.set_index("model")

### Neutral fairness

In [None]:
# LaTeX tables of the neutral percentage (mean ± std): unfiltered data first,
# then one table per PMI filter.
rule = "-" * 80

print(rule, "\n")
print("NO FILTER")
print("\n", rule, "\n\n")
print_results(METRICS_BEFORE_FILTER, "neutral_final")


for eps, df in METRICS_AFTER_FILTER.items():
    print(rule, "\n")
    print(f"FILTER = {eps}")
    print_results(df, "neutral_final")
    print(rule, "\n\n")

### Create tables w/ fairness gap


#### Table 1.

In [None]:
# Neutral-percentage tables (model x dataset): unfiltered and one per PMI filter.
# Each table is also written to disk as CSV.
print("NO FILTER")
r = get_results(METRICS_BEFORE_FILTER, "neutral_avg")
r.to_csv("camera_ready/table/neutral_avg__unfiltered.csv")
fairness_gap_tables = {"unfiltered": r}

for eps, df in METRICS_AFTER_FILTER.items():
    print(f"FILTER = {eps}")
    r = get_results(df, "neutral_avg")
    r.to_csv(f"camera_ready/table/neutral_avg__filtered__{eps}.csv")
    fairness_gap_tables[eps] = r


# Fairness gap: change of the neutral percentage after filtering at 0.8 and 0.65.
orig = fairness_gap_tables["unfiltered"]
delta_08 = fairness_gap_tables[0.8] - orig
delta_065 = fairness_gap_tables[0.65] - orig

df = pd.DataFrame()
df.index = orig.index
assert all(df.index == delta_08.index)
assert all(df.index == delta_065.index)

# The backslash is escaped explicitly: "\d" is not a valid Python escape sequence.
for dataset in ["USE-05", "Winobias", "Winogender"]:
    df.insert(len(df.columns), f"{dataset}__Orig", orig[dataset])
    df.insert(len(df.columns), f"{dataset}__\\delta_" + "{0.8}", delta_08[dataset])
    df.insert(len(df.columns), f"{dataset}__\\delta_" + "{0.65}", delta_065[dataset])

print(df.style.format('{:.2f}').to_latex())

In [None]:
# Same fairness-gap table as the previous cell, for the USE-10 and USE-20 datasets.
df = pd.DataFrame()
df.index = orig.index
assert all(df.index == delta_08.index)
assert all(df.index == delta_065.index)

# The backslash is escaped explicitly: "\d" is not a valid Python escape sequence.
for dataset in ["USE-10", "USE-20"]:
    df.insert(len(df.columns), f"{dataset}__Orig", orig[dataset])
    df.insert(len(df.columns), f"{dataset}__\\delta_" + "{0.8}", delta_08[dataset])
    df.insert(len(df.columns), f"{dataset}__\\delta_" + "{0.65}", delta_065[dataset])

print(df.style.format('{:.2f}').to_latex())

#### Table 2. Impact of training data deduplication at $\eta = 0.65$

In [None]:
eta = 0.65
tab2 = fairness_gap_tables[eta].reset_index().copy()
# Keep the pythia rows only (model names are already LaTeX macros at this point).
# The backslash is escaped explicitly: "\p" is not a valid Python escape sequence.
tab2 = tab2[tab2["model"].apply(lambda s: s.startswith("\\pyths"))]

tab2_dedup_mask = tab2["model"].apply(lambda s: '(D)' in s)
# original models
tab2_orig = tab2[~tab2_dedup_mask].sort_values("model")
tab2_orig = tab2_orig.set_index("model")

# deduplicated models, renamed to their original counterpart so both tables align
tab2_dedup = tab2[tab2_dedup_mask].sort_values("model")
tab2_dedup["model"] = tab2_dedup["model"].apply(lambda s: s.replace(" (D)", ""))
tab2_dedup = tab2_dedup.set_index("model")

assert all(tab2_dedup.index == tab2_orig.index)

# Effect of deduplication: deduplicated minus original neutral percentage.
print((tab2_dedup - tab2_orig).style.format('{:.2f}').to_latex())

#### Table 3. Predicted female - predicted male

In [None]:
METRICS_AFTER_FILTER

In [None]:
tab3 = get_results(METRICS_AFTER_FILTER[0.5], "pct_fem_min_mal")
print(tab3.to_latex())

### pred female - pred male

In [None]:
# A notebook cell only renders its last expression, so each table is displayed
# explicitly; otherwise the first three would be computed and silently dropped.
display(get_results(METRICS_BEFORE_FILTER, "pct_fem_min_mal"))
display(get_results(METRICS_AFTER_FILTER[0.8], "pct_fem_min_mal"))
display(get_results(METRICS_AFTER_FILTER[0.65], "pct_fem_min_mal"))
display(get_results(METRICS_AFTER_FILTER[0.5], "pct_fem_min_mal"))

In [None]:
# LaTeX tables of (predicted female - predicted male): unfiltered data first,
# then one table per PMI filter.
rule = "-" * 80

print(rule, "\n")
print("NO FILTER")
print("\n", rule, "\n\n")
print_results(METRICS_BEFORE_FILTER, "pct_fem_min_mal")

for eps, df in METRICS_AFTER_FILTER.items():
    print(rule, "\n")
    print(f"FILTER = {eps}")
    print_results(df, "pct_fem_min_mal")
    print(rule, "\n\n")

In [None]:
# Sanity check on Winobias filtered at 0.65: recompute, for a handful of models,
# the neutral percentage and the (female - male) percentage straight from the
# per-skew row counts.
__d = AFTER_FILTER[0.65]["Winobias"]
__dgb = __d.groupby(["model", "skew"]).count()["template"].reset_index()

model_names = sorted(set(__dgb.model))


total_templates = []
for model in model_names:
    __dm = __dgb[__dgb["model"] == model]

    # Total number of templates
    total_templates.append(__dm["template"].sum())

    # Only report a subset of the models.
    # NOTE: `.item()` raises if a model has no row for one of the three skews.
    if "Mixtral" in model or "OLMo-7B" in model or "opt-6.7b" in model or "12b" in model or "gpt-j" in model:
        fem_count = __dm.loc[__dm["skew"] == "female", "template"].item()
        mal_count = __dm.loc[__dm["skew"] == "male", "template"].item()
        print(model, round((__dm.loc[__dm["skew"] == "neutral", "template"].item()) / total_templates[-1] * 100, 2), round((fem_count - mal_count) / total_templates[-1] * 100, 2))


# Distinct per-model totals; a single value means every model has the same count.
set(total_templates)

In [None]:
WORD2PMI["guides"]

### AuFC

In [None]:
AUFC_BASE_DIR = "./camera_ready/table/aufc"


def print_results_aufc(data_auc, filepath):
    """Write the model-by-dataset AuFC table to ``filepath`` and print a LaTeX copy.

    Parameters
    ----------
    data_auc : pandas.DataFrame
        Long-format frame with the columns "model", "dataset_" and "auc"; it is
        pivoted so that every model is a row and every "dataset_" value a column.
    filepath : str or path-like
        Destination of the LaTeX table. Missing parent directories are created.

    Notes
    -----
    Two renderings are produced, both formatted with two decimals:
    the file keeps the model names as they appear in ``data_auc``, while the copy
    printed to stdout has the names passed through ``model2latex``.
    """
    table = pd.pivot(data_auc, values=["auc"], index="model", columns=["dataset_"])
    # Drop the unnamed "auc" column level and the "dataset_" axis name left by the pivot.
    table = table.droplevel(None, axis=1).rename_axis(None, axis=1).reset_index()

    # Create the output folder on demand so the cell also works on a fresh checkout.
    pl.Path(filepath).parent.mkdir(parents=True, exist_ok=True)
    table_str = table.set_index("model").style.format('{:.2f}').to_latex()
    with open(filepath, "w") as f:
        f.write(table_str)

    # Printed version, leveraging rendering commands for model names
    table["model"] = table["model"].apply(model2latex)
    table_str = table.set_index("model").style.format('{:.2f}').to_latex()
    print(table_str)

In [None]:
# Inspect MODELS, which is passed to compute_neutralpct in the cells below.
MODELS

In [None]:
# Pre-compute the tables that the landing-page JSON files are built from.
# One JSON file is written per dataset (by the cell that dumps
# METRICS_FOR_LANDING_PAGE); for every filter / max-PMI constraint it holds the
# models' values for each metric.
# -----------------------
# Example for dataset X
# -----------------------
# {
#   unconstrained: {            # entry stored under `None` in the dicts below
#     neutral__avg: {
#        model1: 98.32,
#        ...
#        modeln: ...
#     },
#     neutral__std: {
#        model1: ..., ...
#     },
#     aufc: {
#        model1: ..., ...
#     },
#     male_rel_ratio: {
#        model1: ..., ...
#     },
#     num_examples_nonneutral: {
#        model1: ..., ...
#     },
#     num_examples: ...,        # single value, not per model
#   },
#   0.5: {
#     ...
#   },
#   ...
# }
# ---------------------------------------------------------------
METRICS_FOR_LANDING_PAGE = {name: {} for name in DATASET_NAMES}

# Both dicts are keyed by filter threshold, with `None` standing for "no filter".
# Naming caveat: `neutral__avg` holds the output of compute_female_male_skews (the
# JSON cell reads "partial_pct_mal", "counts_fem", "counts_mal" and "total" from it),
# whereas `neutral__std` holds the output of compute_neutral_pct_w_std (the JSON cell
# reads both "neutral_avg" and "neutral_std" from it).
neutral__avg = {None: compute_female_male_skews(BEFORE_FILTER)}
neutral__std = {None: compute_neutral_pct_w_std(BEFORE_FILTER)}

for eps in AFTER_FILTER.keys():
    neutral__avg[eps] = compute_female_male_skews(AFTER_FILTER[eps])
    neutral__std[eps] = compute_neutral_pct_w_std(AFTER_FILTER[eps])

    
# AuFC tables per filter: compute_neutralpct returns a pair and only its second
# element is kept here.
# NOTE(review): FAIRNESS_EPSILONS is assigned in the next cell and `use_log_10_base`
# is not defined in this part of the notebook -- confirm both exist before this cell
# on a fresh "Restart & Run All".
fair_auc_landing_page = {None: compute_neutralpct(
    DATASET_W_CONSTRAINTS,
    MODELS,
    DATASET_NAMES,
    FAIRNESS_EPSILONS,
    FAIRNESS_COL,
    use_log10=use_log_10_base,
)[1]}

# Unlike the unfiltered call above, `use_log10` is not passed for the filtered data.
# These per-threshold entries are assigned again by the loop in the AuFC cell below.
for eps, df in AFTER_FILTER.items():
    _, fair_auc = compute_neutralpct(df, MODELS, DATASET_NAMES, FAIRNESS_EPSILONS, FAIRNESS_COL)
    fair_auc_landing_page[eps] = fair_auc

In [None]:
# Fairness threshold in log10 space.
FAIRNESS_THRESHOLD = np.log10(_FAIRNESS_THRESHOLD)
print(FAIRNESS_THRESHOLD)
# 101 evenly spaced thresholds covering [0, MAX_AUC].
MAX_AUC = 6
FAIRNESS_EPSILONS = np.linspace(0, MAX_AUC, 101)

# Unfiltered data: compute_neutralpct returns a (thresholds, auc) pair.
FAIR_THRESHOLDS, FAIR_AUC = compute_neutralpct(
    DATASET_W_CONSTRAINTS,
    MODELS,
    DATASET_NAMES,
    FAIRNESS_EPSILONS,
    FAIRNESS_COL,
    use_log10=use_log_10_base,
)

# NOTE(review): the separator is printed twice with no header in between -- confirm
# whether a title line (as in the filtered loop below) was intended here.
print("-" * 80, "\n")
print("-" * 80, "\n")
# "dataset_" is the label used for the table columns: "USE-5" is shown as "USE-05".
FAIR_AUC["dataset_"] = FAIR_AUC["dataset"].apply(lambda x: x if x != "USE-5" else "USE-05")
print_results_aufc(FAIR_AUC, f"{AUFC_BASE_DIR}/unfiltered.tex")


# Same table for every filter threshold; the file name drops the dot from the
# threshold (e.g. 0.65 -> "filter_065.tex").
# FAIR_THRESHOLDS / FAIR_AUC are rebound on every iteration, so after this loop they
# hold the values of the last filter rather than the unfiltered ones.
for eps, df in AFTER_FILTER.items():
    print("-" * 80, "\n")
    print(f"FILTER = {eps}")
    print("-" * 80, "\n")
    FAIR_THRESHOLDS, FAIR_AUC = compute_neutralpct(df, MODELS, DATASET_NAMES, FAIRNESS_EPSILONS, FAIRNESS_COL)
    FAIR_AUC["dataset_"] = FAIR_AUC["dataset"].apply(lambda x: x if x != "USE-5" else "USE-05")
    print_results_aufc(FAIR_AUC, f"{AUFC_BASE_DIR}/filter_{str(eps).replace('.', '')}.tex")
    # Replaces the landing-page entry for this threshold with the frame that also
    # carries the "dataset_" column.
    fair_auc_landing_page[eps] = FAIR_AUC
    # Uncomment these lines for drawing fairness plots
    # fairness_threshold_plots(FAIR_THRESHOLDS, FAIR_AUC, DATASET_NAMES, pythia_models)
    # fairness_threshold_plots(FAIR_THRESHOLDS, FAIR_AUC, DATASET_NAMES, misc_models)

In [None]:
# Imported before the loops: `json` is also needed by the final dump, which runs even
# when DATASET_NAMES is empty.
import json

# Folder receiving one JSON file per dataset plus the combined "all_datasets.json".
landing_page_dir = pl.Path("./camera_ready/landing-page")
landing_page_dir.mkdir(parents=True, exist_ok=True)

for dataset in DATASET_NAMES:
    # The neutral / skew tables are filtered with the "USE-05" label for "USE-5";
    # the AuFC table is filtered with the raw dataset name.
    dataset_label = "USE-05" if dataset == "USE-5" else dataset

    eps_results = {}
    for eps in [None] + PMI_THRESHOLDS:
        results = {}

        ## Neutral percentage (average and standard deviation per model)
        neutral_subset = neutral__std[eps]
        neutral_subset = neutral_subset[neutral_subset["dataset"] == dataset_label].drop("neutral_final", axis=1)

        results["neutral__avg"] = {}
        results["neutral__std"] = {}
        for i, row in neutral_subset.iterrows():
            model = row["model"]
            results["neutral__avg"][model] = row["neutral_avg"]
            results["neutral__std"][model] = row["neutral_std"]

        ## AUFC
        results["aufc"] = {}
        fair_auc = fair_auc_landing_page[eps]
        aufc_subset = fair_auc[fair_auc.dataset == dataset]
        for i, row in aufc_subset.iterrows():
            model = row["model"]
            results["aufc"][model] = row["auc"]

        ## Male relative ratio
        male_fem_subset = neutral__avg[eps]
        male_fem_subset = male_fem_subset[male_fem_subset.dataset == dataset_label]
        results["male_rel_ratio"] = {}
        results["num_examples_nonneutral"] = {}
        # `.item()` raises unless every row of the dataset reports the same "total".
        results["num_examples"] = male_fem_subset["total"].unique().item()
        for i, row in male_fem_subset.iterrows():
            model = row["model"]
            results["male_rel_ratio"][model] = row["partial_pct_mal"]
            results["num_examples_nonneutral"][model] = row["counts_fem"] + row["counts_mal"]

        # The unfiltered entry (key `None`) is exported under the name "unconstrained".
        eps_results["unconstrained" if eps is None else eps] = results

    METRICS_FOR_LANDING_PAGE[dataset] = eps_results
    with open(landing_page_dir / f"{dataset}.json", "w") as f:
        json.dump(eps_results, f, sort_keys=False, indent=2)


with open(landing_page_dir / "all_datasets.json", "w") as f:
    json.dump(METRICS_FOR_LANDING_PAGE, f, sort_keys=False, indent=2)

In [None]:
# NOTE(review): in the surrounding cells "counts_fem" only appears as a column name;
# confirm a variable with this name exists, otherwise this leftover cell raises
# NameError on a fresh run and should be removed.
counts_fem

## Sampling examples for the figures

In this section, we select the examples used in the paper, including those shown in Figures 1-2 and those listed in the appendix.

In [None]:
def get_analysis_examples(data2files, min_eps=0, max_eps=None):
    """Count, per template, how many rows fall inside a ``MAXGENDER_COL`` interval.

    Parameters
    ----------
    data2files : dict
        Maps a dataset name to a DataFrame with the columns "sentence", "model",
        "word", "template", "skew", "orig_index" and ``MAXGENDER_COL``
        (optionally also "target_word").
    min_eps : float, default 0
        Lower bound on ``|df[MAXGENDER_COL]|``. It is inclusive when it equals 0 and
        exclusive otherwise, so consecutive intervals do not overlap.
    max_eps : float or None, default None
        Inclusive upper bound on ``|df[MAXGENDER_COL]|``; ``None`` means unbounded.

    Returns
    -------
    dict
        Maps each dataset name to a DataFrame with one row per
        ("word", ["target_word",] "template", "skew") group and a "model_votes"
        column holding the number of non-null "orig_index" values in the group.
    """
    results = {}

    for dataset, df in data2files.items():
        # Keep a single row per (sentence, model) pair.
        df = df.drop_duplicates(subset=["sentence", "model"])

        abs_score = df[MAXGENDER_COL].abs()
        mask = (abs_score >= min_eps) if min_eps == 0 else (abs_score > min_eps)
        if max_eps is not None:
            mask &= abs_score <= max_eps

        # "target_word" is only part of the grouping for frames that carry it.
        # Checking the column explicitly (instead of a bare `except:`) keeps
        # unrelated errors visible.
        group_cols = ["word", "template", "skew"]
        if "target_word" in df.columns:
            group_cols.insert(1, "target_word")

        results[dataset] = (
            df[mask]
            .groupby(group_cols)
            .count()[["orig_index"]]
            .reset_index()
            .rename(columns={"orig_index": "model_votes"})
        )
    return results


# Examples bucketed by |df[MAXGENDER_COL]| (lower bound inclusive only when it is 0,
# upper bound inclusive):
#   EXAMPLES_050   -> [0, 0.5]     taken from the 0.5-filtered data
#   EXAMPLES_065   -> (0.5, 0.65]  taken from the 0.65-filtered data
#   EXAMPLES_1     -> (0.65, 1]    taken from the 1.00-filtered data
#   EXAMPLES_1plus -> (1, inf)     taken from the unfiltered data
EXAMPLES_050 = get_analysis_examples(AFTER_FILTER[0.5], 0, 0.5)
EXAMPLES_065 = get_analysis_examples(AFTER_FILTER[0.65], 0.5, 0.65)
EXAMPLES_1 = get_analysis_examples(AFTER_FILTER[1.00], 0.65, 1)
EXAMPLES_1plus = get_analysis_examples(BEFORE_FILTER, 1)

In [None]:
# Inspect the per-template vote table of the "USE-5" dataset for the first bucket.
EXAMPLES_050["USE-5"]

In [None]:
# Draw `k` templates of `dataset` that are labelled "female" with at least 12 model
# votes; the fixed random_state keeps the draw reproducible.
# `dataset` and `k` are reused by the next cell.
dataset = "USE-20"
k = 40
d = EXAMPLES_050[dataset]
d[(d["model_votes"] >= 12) & (d["skew"] == "female")].sample(n=k, random_state=812313)["template"].values

In [None]:
# Male counterpart of the previous draw; relies on `dataset` and `k` set in the
# cell above.
d = EXAMPLES_050[dataset]
d[(d["model_votes"] >= 12) & (d["skew"] == "male")].sample(n=k, random_state=812313)["template"].values

### Small scale human annotation 

In this section, we sample a set of examples from the UnStereoEval (USE) datasets to carry out a small-scale data analysis. We select a sample of datapoints covering different MaxPMI intervals to test the hypothesis that MaxPMI is a good heuristic for filtering out examples with gender-connotated words.

In [None]:
def get_sample(data2files, n=300, seed=98283):
    """Sample ``n`` distinct templates from every dataset whose name does not start with "Wino".

    Parameters
    ----------
    data2files : dict
        Maps a dataset name to a DataFrame with at least the columns "template"
        and "model_votes".
    n : int, default 300
        Number of templates drawn per dataset, without replacement.
    seed : int, default 98283
        Random state handed to ``DataFrame.sample``.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-dataset samples, with a "dataset" column holding
        the name of the dataset each row was drawn from.
    """
    sampled_frames = []
    for dataset, df in data2files.items():
        if dataset.startswith("Wino"):
            continue

        # One row per template: after the descending sort, the first row of each
        # group is the one with the highest "model_votes".
        one_per_template = (
            df.sort_values(["template", "model_votes"], ascending=False)
            .groupby("template")
            .head(1)
        )
        # Draw n random templates and tag them with their dataset of origin.
        sampled_frames.append(
            one_per_template.sample(n, random_state=seed, replace=False).assign(dataset=dataset)
        )

    return pd.concat(sampled_frames)

In [None]:
# Export one annotation sheet per bucket (default get_sample settings: 300 templates
# per dataset).
for examples, csv_path in (
    (EXAMPLES_050, "./annotate_0_to_050_alldata_300each.csv"),
    (EXAMPLES_065, "./annotate_050_to_065_alldata_300each.csv"),
    (EXAMPLES_1, "./annotate_065_to_1_alldata_300each.csv"),
    (EXAMPLES_1plus, "./annotate_1plus_alldata_300each.csv"),
):
    get_sample(examples).to_csv(csv_path)

In [None]:
EXAMPLES_050[]