# Social Bias Computation

In this notebook we evaluate the biases of various models on three datasets: CrowS-Pairs, StereoSet and Winobias. We use the metrics from here in Case Study 4.

## Downloading data

In [None]:
!wget https://raw.githubusercontent.com/nyu-mll/crows-pairs/master/data/crows_pairs_anonymized.csv
!wget https://raw.githubusercontent.com/moinnadeem/StereoSet/master/data/dev.json
!wget https://raw.githubusercontent.com/rudinger/winogender-schemas/master/data/all_sentences.tsv

--2022-04-06 09:47:28--  https://raw.githubusercontent.com/nyu-mll/crows-pairs/master/data/crows_pairs_anonymized.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 437764 (428K) [text/plain]
Saving to: ‘crows_pairs_anonymized.csv.1’


2022-04-06 09:47:28 (12.3 MB/s) - ‘crows_pairs_anonymized.csv.1’ saved [437764/437764]

--2022-04-06 09:47:28--  https://raw.githubusercontent.com/moinnadeem/StereoSet/master/data/dev.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12502245 (12M) [text/plain]
Saving to: ‘dev.json.1’


2022-04-

In [None]:
import wget

splits = ["dev", "test"]
sides = ["pro", "anti"]
types = [1, 2]

for split in splits:
    for side in sides:
        for type in types:
            wget.download(f"https://raw.githubusercontent.com/uclanlp/corefBias/master/WinoBias/wino/data/{side}_stereotyped_type{type}.txt.{split}")

## Transforming data

In [1]:
from tqdm.notebook import tqdm, trange
from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch
import numpy as np
import sys

sys.path.append("..")

from votenrank.fairness_computation import *

### CrowS-Pairs

In [None]:
from tqdm.notebook import tqdm, trange
import pandas as pd

crows = pd.read_csv("crows_pairs_anonymized.csv").drop(columns=["Unnamed: 0"])

crows["good_sentence"] = ""
crows["bad_sentence"] = ""

for i, row in tqdm(crows.iterrows(), total=crows.shape[0]):
    if row["stereo_antistereo"] == "stereo":
        bad_sent, good_sent = row["sent_more"], row["sent_less"]
    elif row["stereo_antistereo"] == "antistereo":
        bad_sent, good_sent = row["sent_less"], row["sent_more"]
    else:
        raise ValueError
    crows.loc[i, "good_sentence"] = good_sent
    crows.loc[i, "bad_sentence"] = bad_sent

  0%|          | 0/1508 [00:00<?, ?it/s]

### StereoSet

In [None]:
import json

with open("dev.json") as f:
    stereo = json.load(f)

intra = {
    "anti-stereotype": [],
    "stereotype": [],
    "unrelated": []
}

for obj in stereo["data"]["intrasentence"]:
    for sentence in obj["sentences"]:
        intra[sentence["gold_label"]].append(sentence["sentence"])

inter = {
    "anti-stereotype": [],
    "stereotype": [],
    "unrelated": []
}

for obj in stereo["data"]["intrasentence"]:
    for sentence in obj["sentences"]:
        inter[sentence["gold_label"]].append(obj["context"] + " " + sentence["sentence"])

### Winobias

In [None]:
import re

def template(text, pronoun, ans):
    ans = fix_insertion(ans)
    pronoun = [fix_insertion(el) for el in pronoun]
    if len(pronoun) == 1:
        return f"{text} '{pronoun[0]}' refers to {ans}."
    else:
        return f"{text} '{pronoun[0]}' and '{pronoun[1]}' refer to {ans}."

def fix_insertion(text):
    return text.lower()[1:-1]

with open("pro_stereotyped_type1.txt.dev") as f:
    lines = f.readlines()

bias_data = {
    "pro": {"good": [], "bad": []},
    "anti": {"good": [], "bad": []},
}

for split in splits:
    for side in sides:
        for type in types:
            with open(f"{side}_stereotyped_type{type}.txt.{split}") as f:
                lines = f.readlines()

            for i in range(0, len(lines), 2):
                line1 = lines[i].strip()[lines[i].index(" ") + 1:]
                line2 = lines[i + 1].strip()[lines[i + 1].index(" ") + 1:]

                r1 = re.findall('\[[^\]]*\]', line1)
                p1 = r1[0]
                g1 = r1[1:]
                r2 = re.findall('\[[^\]]*\]', line2)
                p2 = r2[0]
                g2 = r2[1:]

                line1 = line1.replace("[", "").replace("]", "")
                line2 = line2.replace("[", "").replace("]", "")

                bias_data[side]["good"].append(template(line1, g1, p1))
                bias_data[side]["bad"].append(template(line1, g1, p2))

                bias_data[side]["good"].append(template(line2, g2, p2))
                bias_data[side]["bad"].append(template(line2, g2, p1))

## Evaluation

In [3]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, T5ForConditionalGeneration, \
    GPT2LMHeadModel

models = [
    "distilbert-base-uncased", "distilroberta-base", "bert-base-uncased", "roberta-base", "albert-base-v2", 
    "microsoft/deberta-base", "t5-base", "gpt2"
]

name_to_model = {}

for name in models:
    if "gpt2" in name:
        model = GPT2LMHeadModel
    elif "t5" in name:
        model = T5ForConditionalGeneration
    else:
        model = AutoModelForMaskedLM
        
    name_to_model[name] = {"model": model, "tokenizer": AutoTokenizer}

### Next cell runs the computations.

In [None]:
columns = [
    "crows_score",
    "intra_icat",
    "intra_lms",
    "intra_ss",
    "inter_icat",
    "inter_lms",
    "inter_ss",
    "female_acc",
    "male_acc",
    "neutral_acc",
    "pro_acc",
    "anti_acc"
]
results = pd.DataFrame(index=list(name_to_model.keys()), columns=columns)

for name, tools in name_to_model.items():
    model = tools["model"].from_pretrained(name).cuda()
    tokenizer = tools["tokenizer"].from_pretrained(name)

    if name == "gpt2":
        scorer = naive_gpt2_score
    elif name == "t5-base":
        scorer = naive_t5_score
    else:
        scorer = naive_masking_score
    
    crows_score = crows_pipeline(model, tokenizer, crows["good_sentence"], crows["bad_sentence"], scorer)
    results.loc[name, "crows_score"] = crows_score

    stereo_res = {
        "intra": stereo_pipeline(model, tokenizer, scorer, intra["anti-stereotype"], intra["stereotype"], intra["unrelated"]),
        "inter": stereo_pipeline(model, tokenizer, scorer, inter["anti-stereotype"], inter["stereotype"], inter["unrelated"])
    }
    for upper in ["intra", "inter"]:
        for lower in ["icat", "lms", "ss"]:
            results.loc[name, upper + "_" + lower] = stereo_res[upper][lower]

    wino_res = winogender_pipeline(model, tokenizer, wino_data, scorer)

    for gender in ["female", "male", "neutral"]:
        results.loc[name, gender + "_acc"] = wino_res[gender]

    bias_res = winobias_pipeline(model, tokenizer, bias_data, scorer)
    for side in ["pro", "anti"]:
        results.loc[name, side + "_acc"] = bias_res[side]

    print(name)

results

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

  0%|          | 0/1508 [00:00<?, ?it/s]

  0%|          | 0/1508 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/1584 [00:00<?, ?it/s]

  0%|          | 0/1584 [00:00<?, ?it/s]

  0%|          | 0/1584 [00:00<?, ?it/s]

  0%|          | 0/1584 [00:00<?, ?it/s]

distilbert-base-uncased


Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

  0%|          | 0/1508 [00:00<?, ?it/s]

  0%|          | 0/1508 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/1584 [00:00<?, ?it/s]

  0%|          | 0/1584 [00:00<?, ?it/s]

  0%|          | 0/1584 [00:00<?, ?it/s]

  0%|          | 0/1584 [00:00<?, ?it/s]

bert-base-uncased


Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

  0%|          | 0/1508 [00:00<?, ?it/s]

  0%|          | 0/1508 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/1584 [00:00<?, ?it/s]

  0%|          | 0/1584 [00:00<?, ?it/s]

  0%|          | 0/1584 [00:00<?, ?it/s]

  0%|          | 0/1584 [00:00<?, ?it/s]

roberta-base


Downloading:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/45.2M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/742k [00:00<?, ?B/s]

  0%|          | 0/1508 [00:00<?, ?it/s]

  0%|          | 0/1508 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/1584 [00:00<?, ?it/s]

  0%|          | 0/1584 [00:00<?, ?it/s]

  0%|          | 0/1584 [00:00<?, ?it/s]

  0%|          | 0/1584 [00:00<?, ?it/s]

albert-base-v2


Downloading:   0%|          | 0.00/474 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/533M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForMaskedLM: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'deberta.embeddings.position_embeddings.weight']
- This IS expected if you are initializing DebertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForMaskedLM were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['cls.predictions.transform.LayerNorm.

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

  0%|          | 0/1508 [00:00<?, ?it/s]

  0%|          | 0/1508 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/1584 [00:00<?, ?it/s]

  0%|          | 0/1584 [00:00<?, ?it/s]

  0%|          | 0/1584 [00:00<?, ?it/s]

  0%|          | 0/1584 [00:00<?, ?it/s]

microsoft/deberta-base


Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

  0%|          | 0/1508 [00:00<?, ?it/s]

  0%|          | 0/1508 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/1584 [00:00<?, ?it/s]

  0%|          | 0/1584 [00:00<?, ?it/s]

  0%|          | 0/1584 [00:00<?, ?it/s]

  0%|          | 0/1584 [00:00<?, ?it/s]

t5-base


Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

  0%|          | 0/1508 [00:00<?, ?it/s]

  0%|          | 0/1508 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/2106 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/1584 [00:00<?, ?it/s]

  0%|          | 0/1584 [00:00<?, ?it/s]

  0%|          | 0/1584 [00:00<?, ?it/s]

  0%|          | 0/1584 [00:00<?, ?it/s]

gpt2


Unnamed: 0,crows_score,intra_icat,intra_lms,intra_ss,inter_icat,inter_lms,inter_ss,female_acc,male_acc,neutral_acc,pro_acc,anti_acc
distilbert-base-uncased,0.590186,0.699782,0.898623,0.610636,0.75284,0.803181,0.531339,0.495833,0.4875,0.4875,0.563763,0.565657
bert-base-uncased,0.586207,0.72788,0.894349,0.593067,0.669958,0.698481,0.520418,0.4875,0.5,0.495833,0.592172,0.547348
roberta-base,0.622679,0.674952,0.910019,0.629155,0.399541,0.403371,0.495252,0.520833,0.491667,0.566667,0.527778,0.566919
albert-base-v2,0.559019,0.690074,0.923314,0.626306,0.777039,0.777778,0.499525,0.508333,0.504167,0.504167,0.522096,0.529672
microsoft/deberta-base,0.484085,0.424806,0.452754,0.469136,0.403541,0.447293,0.451092,0.504167,0.495833,0.479167,0.494949,0.508207
t5-base,0.624668,0.703445,0.87868,0.599715,0.578929,0.612061,0.527066,0.504167,0.5375,0.520833,0.539773,0.523359
gpt2,0.568302,0.697509,0.912393,0.617759,0.64897,0.8585,0.622032,0.5,0.495833,0.495833,0.566288,0.491793


## Creating dataframe with results

In [6]:
import pandas as pd

ethics = results
ethics.rename(columns={"Unnamed: 0": "model_name"}, inplace=True)
ethics["model_name"].replace("microsoft/deberta-base", "deberta-base", inplace=True)
ethics = ethics.set_index("model_name", drop=True)
ethics = pd.DataFrame({
    "anti_crows_score": 1 - ethics["crows_score"],
    "stereoset_intra_icat": ethics["intra_icat"],
    "stereoset_inter_icat": ethics["inter_icat"],
    "winobias_acc_difference": 1 - (ethics["pro_acc"] - ethics["anti_acc"]).abs()
})

ethics.to_csv("../tables/case_study_4/social_bias_data.csv", index_label="model_name")
ethics

Unnamed: 0_level_0,anti_crows_score,stereoset_intra_icat,stereoset_inter_icat,winogender_acc_difference,winobias_acc_difference
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
distilbert-base-uncased,0.409814,0.699782,0.75284,0.991667,0.998106
bert-base-uncased,0.413793,0.72788,0.669958,0.9875,0.955177
roberta-base,0.377321,0.674952,0.399541,0.970833,0.960859
albert-base-v2,0.440981,0.690074,0.777039,0.995833,0.992424
deberta-base,0.515915,0.424806,0.403541,0.991667,0.986742
t5-base,0.375332,0.703445,0.578929,0.966667,0.983586
gpt2,0.431698,0.697509,0.64897,0.995833,0.925505
