In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under /kaggle/input.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
!pip install torch transformers datasets peft pandas



In [2]:
!git clone https://github.com/McGill-NLP/bias-bench.git
!git clone https://github.com/manon-reusens/multilingual_bias.git

Cloning into 'bias-bench'...

Cloning into 'multilingual_bias'...



In [9]:
!git clone https://github.com/slds-lmu/stereotypes-multi

Cloning into 'stereotypes-multi'...



In [9]:
!pip install openpyxl gspread pydrive

Collecting pydrive
  Downloading PyDrive-1.3.1.tar.gz (987 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m987.4/987.4 kB[0m [31m38.7 MB/s[0m  [33m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting google-api-python-client>=1.2 (from pydrive)
  Downloading google_api_python_client-2.183.0-py3-none-any.whl.metadata (7.0 kB)
Collecting oauth2client>=4.0.0 (from pydrive)
  Downloading oauth2client-4.1.3-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting httplib2<1.0.0,>=0.19.0 (from google-api-python-client>=1.2->pydrive)
  Downloading httplib2-0.31.0-py3-none-any.whl.metadata (2.2 kB)
Collecting google-auth-httplib2<1.0.0,>=0.2.0 (from google-api-python-client>=1.2->pydrive)
  Downloading google_auth_httplib2-0.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0,>=1.31.5 (from google-api-python-client>=1.2->pydrive)
  Downloading google_api_core-2.25.1-py3-none-any.whl.metadata (3.0 kB

In [6]:
import sys
import os
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset, concatenate_datasets
from collections import defaultdict
import pickle
import importlib.util

# Make the cloned repositories importable. The clones live in the current
# working directory (the loaders below use paths relative to it), so resolve
# the sys.path entries against it rather than against the filesystem root.
sys.path.append(os.path.abspath("stereotypes-multi/code"))
sys.path.append(os.path.abspath("bias-bench/bias_bench/benchmark"))


def _load_module_from_path(module_name, file_path):
    """Import the file at ``file_path`` as a top-level module ``module_name``.

    The module object is registered in ``sys.modules`` before its code is
    executed, so a plain ``import module_name`` elsewhere resolves to it.
    If executing the file fails, the half-initialised entry is removed again
    so a later retry starts from a clean state.

    Args:
        module_name: Name under which the module is registered.
        file_path: Path to the ``.py`` file to execute.

    Returns:
        The loaded module object (its spec is available as ``__spec__``).

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ImportError: If no import spec/loader can be built for the file.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(
            f"Cannot load module {module_name!r}: {file_path!r} does not exist "
            "(was the repository cloned into the working directory?)"
        )
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot build an import spec for {file_path!r}")
    module = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a broken module behind for later imports to pick up.
        sys.modules.pop(module_name, None)
        raise
    return module


# Load order is kept exactly as before. NOTE(review): presumably data_loader
# has to be registered before intrasentence_inference imports it - confirm.
crows = _load_module_from_path("crows", "bias-bench/bias_bench/benchmark/crows/crows.py")
data_loader = _load_module_from_path("data_loader", "stereotypes-multi/code/data_loader.py")
intrasentence_inference = _load_module_from_path(
    "intrasentence_inference", "stereotypes-multi/code/intrasentence_inference.py"
)
evaluation = _load_module_from_path("evaluation", "stereotypes-multi/code/evaluation.py")

from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import AutoPeftModelForCausalLM, PeftModel
from crows import CrowSPairsRunner
from intrasentence_inference import generative_inference
from evaluation import ScoreStorage, add_example_calculate_score


In [7]:
def get_joint_sent_prob(model, tokenizer, sent, device, initial_token_probabilities):
    """Return the length-normalised probability of ``sent`` under ``model``.

    The sentence is encoded with ``tokenizer.encode`` and run through the
    model once. The probability of the first token is read from
    ``initial_token_probabilities[0, 0, first_token]``; the probability of
    every later token is read from the softmax of the model output at the
    preceding position. The result is the geometric mean of those per-token
    probabilities, computed as ``2 ** mean(log2(p_i))``.

    Args:
        model: Callable returning a sequence whose first element holds the
            per-position vocabulary scores, indexed ``[0, position, token]``.
        tokenizer: Object exposing ``encode(str) -> list[int]``.
        sent: Sentence to score.
        device: Torch device the token tensor is moved to.
        initial_token_probabilities: Tensor indexed ``[0, 0, token]`` giving
            the probability of each token at the first position.

    Returns:
        The geometric-mean token probability as a NumPy float.
    """
    tokens = tokenizer.encode(sent)
    tokens_tensor = torch.tensor(tokens).to(device).unsqueeze(0)
    # Pure scoring: disable autograd so the forward pass keeps no graph alive.
    with torch.no_grad():
        output = model(tokens_tensor)[0].softmax(dim=-1)
    token_probs = [initial_token_probabilities[0, 0, tokens[0]].item()]
    # Position idx - 1 holds the distribution used to predict token idx.
    token_probs.extend(
        output[0, idx - 1, tokens[idx]].item() for idx in range(1, len(tokens))
    )
    # Average in log2 space, then map back: the geometric mean of the probabilities.
    score = np.sum([np.log2(p) for p in token_probs])
    score /= len(token_probs)
    score = np.power(2, score)
    return score

def dict_func():
    """Return a new, empty dictionary on every call."""
    return dict()


In [15]:
import argparse

# Stand-in for command-line parsing: options live as attributes on a plain class.
class Args:
    """Run options, read as ``args.<name>`` the way parsed CLI arguments would be."""

    # Model identifier handed to ``from_pretrained``, e.g. "google/gemma-2-9b"
    model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
    # Benchmark selector: "crowspairs" or "stereoset"
    dataset = "crowspairs"


args = Args()
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [9]:
import os
from getpass import getpass

from huggingface_hub import login

# Never store an access token in the notebook: it is saved with the file and
# leaks to everyone who can read it. Read it from the HF_TOKEN environment
# variable instead, and fall back to a hidden interactive prompt.
# NOTE: any token that was ever committed in this notebook must be revoked
# in the Hugging Face account settings and replaced with a new one.
HF_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(HF_TOKEN)

In [16]:
from transformers import BitsAndBytesConfig

# 4-bit NF4 quantisation with double quantisation and bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the checkpoint named by args.model with the quantisation config above.
model = AutoModelForCausalLM.from_pretrained(
    args.model,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
# NOTE(review): the tokenizer id is hard-coded instead of following args.model;
# presumably deliberate so fine-tuned checkpoints reuse the base tokenizer - confirm.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
# Compare against None: a tokenizer whose real pad id is 0 must not have it
# overwritten by the EOS id, which a truthiness test would do.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Switch to inference mode; as the last expression this also displays the model.
model.eval()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((409

In [31]:
!git clone https://gitlab.inria.fr/corpus4ethics/multilingualcrowspairs.git

Cloning into 'multilingualcrowspairs'...



In [11]:
# Run CrowS-Pairs on the French file for every bias category and pickle the
# per-category results. `model` and `tokenizer` come from an earlier cell.
# NOTE(review): args.model only names the output file and the progress line;
# make sure it matches the checkpoint that `model` actually holds.
if args.dataset == "crowspairs":
    for input_file_name in [
        "fr_FR.csv"
    ]:
        results = {}
        language = input_file_name.split("_")[0]
        # Build the output path once so the cache check and the write cannot drift apart.
        results_path = f"{args.dataset}/{args.model.split('/')[-1]}_{language}_cspresults.pkl"
        # Results already on disk: skip the (multi-minute) evaluation.
        if os.path.isfile(results_path):
            continue
        # Create the output directory before evaluating, so a missing folder
        # cannot discard the results at the very end of the run.
        os.makedirs(os.path.dirname(results_path), exist_ok=True)
        for bias_type in [
            "race-color", "socioeconomic", "gender", "disability",
            "nationality", "sexual-orientation", "physical-appearance",
            "religion", "age"
        ]:
            runner = CrowSPairsRunner(
                model=model,
                tokenizer=tokenizer,
                input_file="crows_data/" + input_file_name,
                bias_type=bias_type,
                is_generative=True
            )
            results[bias_type] = runner()
            print(args.model, language)
        with open(results_path, "wb") as outfile:
            pickle.dump(results, outfile)


Evaluating race-color examples.


100%|██████████| 457/457 [02:21<00:00,  3.22it/s]


Total examples: 457
Metric score: 47.05
Stereotype score: 46.08
Anti-stereotype score: 58.33
Num. neutral: 0.0

sft_panda fr
Evaluating socioeconomic examples.


100%|██████████| 189/189 [00:58<00:00,  3.23it/s]


Total examples: 189
Metric score: 71.43
Stereotype score: 72.83
Anti-stereotype score: 56.25
Num. neutral: 0.0

sft_panda fr
Evaluating gender examples.


100%|██████████| 294/294 [01:30<00:00,  3.24it/s]


Total examples: 294
Metric score: 52.04
Stereotype score: 57.0
Anti-stereotype score: 41.49
Num. neutral: 0.0

sft_panda fr
Evaluating disability examples.


100%|██████████| 59/59 [00:18<00:00,  3.21it/s]


Total examples: 59
Metric score: 76.27
Stereotype score: 76.79
Anti-stereotype score: 66.67
Num. neutral: 0.0

sft_panda fr
Evaluating nationality examples.


100%|██████████| 250/250 [01:18<00:00,  3.20it/s]


Total examples: 250
Metric score: 61.2
Stereotype score: 61.76
Anti-stereotype score: 50.0
Num. neutral: 0.0

sft_panda fr
Evaluating sexual-orientation examples.


100%|██████████| 89/89 [00:27<00:00,  3.20it/s]


Total examples: 89
Metric score: 22.47
Stereotype score: 17.28
Anti-stereotype score: 75.0
Num. neutral: 0.0

sft_panda fr
Evaluating physical-appearance examples.


100%|██████████| 71/71 [00:22<00:00,  3.20it/s]


Total examples: 71
Metric score: 50.7
Stereotype score: 47.54
Anti-stereotype score: 70.0
Num. neutral: 0.0

sft_panda fr
Evaluating religion examples.


100%|██████████| 114/114 [00:35<00:00,  3.21it/s]


Total examples: 114
Metric score: 45.61
Stereotype score: 43.93
Anti-stereotype score: 71.43
Num. neutral: 0.0

sft_panda fr
Evaluating age examples.


100%|██████████| 90/90 [00:27<00:00,  3.23it/s]

Total examples: 90
Metric score: 56.67
Stereotype score: 55.13
Anti-stereotype score: 66.67
Num. neutral: 0.0

sft_panda fr





In [14]:
# Run CrowS-Pairs on the French file for every bias category and pickle the
# per-category results under the `_biasdpo` suffix. `model` and `tokenizer`
# come from an earlier cell.
# NOTE(review): args.model only names the output file and the progress line;
# make sure it matches the checkpoint that `model` actually holds.
if args.dataset == "crowspairs":
    for input_file_name in [
        "fr_FR.csv"
    ]:
        results = {}
        language = input_file_name.split("_")[0]
        # Build the output path once so the cache check and the write cannot drift apart.
        results_path = f"{args.dataset}/{args.model.split('/')[-1]}_{language}_cspresults_biasdpo.pkl"
        # Results already on disk: skip the (multi-minute) evaluation.
        if os.path.isfile(results_path):
            continue
        # Create the output directory before evaluating, so a missing folder
        # cannot discard the results at the very end of the run.
        os.makedirs(os.path.dirname(results_path), exist_ok=True)
        for bias_type in [
            "race-color", "socioeconomic", "gender", "disability",
            "nationality", "sexual-orientation", "physical-appearance",
            "religion", "age"
        ]:
            runner = CrowSPairsRunner(
                model=model,
                tokenizer=tokenizer,
                input_file="crows_data/" + input_file_name,
                bias_type=bias_type,
                is_generative=True
            )
            results[bias_type] = runner()
            print(args.model, language)
        with open(results_path, "wb") as outfile:
            pickle.dump(results, outfile)


Evaluating race-color examples.


100%|██████████| 457/457 [01:44<00:00,  4.38it/s]


Total examples: 457
Metric score: 55.14
Stereotype score: 53.92
Anti-stereotype score: 69.44
Num. neutral: 0.0

biasdpo_lora_model fr
Evaluating socioeconomic examples.


100%|██████████| 189/189 [00:42<00:00,  4.40it/s]


Total examples: 189
Metric score: 75.13
Stereotype score: 75.72
Anti-stereotype score: 68.75
Num. neutral: 0.0

biasdpo_lora_model fr
Evaluating gender examples.


100%|██████████| 294/294 [01:06<00:00,  4.41it/s]


Total examples: 294
Metric score: 53.06
Stereotype score: 55.5
Anti-stereotype score: 47.87
Num. neutral: 0.0

biasdpo_lora_model fr
Evaluating disability examples.


100%|██████████| 59/59 [00:13<00:00,  4.41it/s]


Total examples: 59
Metric score: 71.19
Stereotype score: 71.43
Anti-stereotype score: 66.67
Num. neutral: 0.0

biasdpo_lora_model fr
Evaluating nationality examples.


100%|██████████| 250/250 [00:56<00:00,  4.43it/s]


Total examples: 250
Metric score: 61.6
Stereotype score: 62.61
Anti-stereotype score: 41.67
Num. neutral: 0.0

biasdpo_lora_model fr
Evaluating sexual-orientation examples.


100%|██████████| 89/89 [00:20<00:00,  4.43it/s]


Total examples: 89
Metric score: 32.58
Stereotype score: 28.4
Anti-stereotype score: 75.0
Num. neutral: 0.0

biasdpo_lora_model fr
Evaluating physical-appearance examples.


100%|██████████| 71/71 [00:15<00:00,  4.44it/s]


Total examples: 71
Metric score: 56.34
Stereotype score: 54.1
Anti-stereotype score: 70.0
Num. neutral: 0.0

biasdpo_lora_model fr
Evaluating religion examples.


100%|██████████| 114/114 [00:25<00:00,  4.39it/s]


Total examples: 114
Metric score: 46.49
Stereotype score: 44.86
Anti-stereotype score: 71.43
Num. neutral: 0.0

biasdpo_lora_model fr
Evaluating age examples.


100%|██████████| 90/90 [00:20<00:00,  4.42it/s]

Total examples: 90
Metric score: 64.44
Stereotype score: 65.38
Anti-stereotype score: 58.33
Num. neutral: 0.0

biasdpo_lora_model fr





In [17]:
# Run CrowS-Pairs on the English file for every bias category and pickle the
# per-category results under the `_sftpanda` suffix. `model` and `tokenizer`
# come from an earlier cell.
# NOTE(review): the recorded output of this cell prints the base model id while
# the file suffix says `sftpanda` - confirm which checkpoint `model` holds here.
if args.dataset == "crowspairs":
    for input_file_name in [
        "en_US.csv"
    ]:
        results = {}
        language = input_file_name.split("_")[0]
        # Build the output path once so the cache check and the write cannot drift apart.
        results_path = f"{args.dataset}/{args.model.split('/')[-1]}_{language}_cspresults_sftpanda.pkl"
        # Results already on disk: skip the (multi-minute) evaluation.
        if os.path.isfile(results_path):
            continue
        # Create the output directory before evaluating, so a missing folder
        # cannot discard the results at the very end of the run.
        os.makedirs(os.path.dirname(results_path), exist_ok=True)
        for bias_type in [
            "race-color", "socioeconomic", "gender", "disability",
            "nationality", "sexual-orientation", "physical-appearance",
            "religion", "age"
        ]:
            runner = CrowSPairsRunner(
                model=model,
                tokenizer=tokenizer,
                input_file="crows_data/" + input_file_name,
                bias_type=bias_type,
                is_generative=True
            )
            results[bias_type] = runner()
            print(args.model, language)
        with open(results_path, "wb") as outfile:
            pickle.dump(results, outfile)


Evaluating race-color examples.


100%|██████████| 516/516 [01:41<00:00,  5.07it/s]


Total examples: 516
Metric score: 60.66
Stereotype score: 61.31
Anti-stereotype score: 53.49
Num. neutral: 0.0

meta-llama/Meta-Llama-3.1-8B-Instruct en
Evaluating socioeconomic examples.


100%|██████████| 172/172 [00:33<00:00,  5.10it/s]


Total examples: 172
Metric score: 72.67
Stereotype score: 73.89
Anti-stereotype score: 60.0
Num. neutral: 0.0

meta-llama/Meta-Llama-3.1-8B-Instruct en
Evaluating gender examples.


100%|██████████| 262/262 [00:51<00:00,  5.07it/s]


Total examples: 262
Metric score: 62.21
Stereotype score: 70.44
Anti-stereotype score: 49.51
Num. neutral: 0.0

meta-llama/Meta-Llama-3.1-8B-Instruct en
Evaluating disability examples.


100%|██████████| 60/60 [00:11<00:00,  5.07it/s]


Total examples: 60
Metric score: 70.0
Stereotype score: 71.93
Anti-stereotype score: 33.33
Num. neutral: 0.0

meta-llama/Meta-Llama-3.1-8B-Instruct en
Evaluating nationality examples.


100%|██████████| 159/159 [00:31<00:00,  5.09it/s]


Total examples: 159
Metric score: 61.64
Stereotype score: 61.49
Anti-stereotype score: 63.64
Num. neutral: 0.0

meta-llama/Meta-Llama-3.1-8B-Instruct en
Evaluating sexual-orientation examples.


100%|██████████| 84/84 [00:16<00:00,  5.07it/s]


Total examples: 84
Metric score: 78.57
Stereotype score: 81.94
Anti-stereotype score: 58.33
Num. neutral: 0.0

meta-llama/Meta-Llama-3.1-8B-Instruct en
Evaluating physical-appearance examples.


100%|██████████| 63/63 [00:12<00:00,  5.08it/s]


Total examples: 63
Metric score: 63.49
Stereotype score: 69.23
Anti-stereotype score: 36.36
Num. neutral: 0.0

meta-llama/Meta-Llama-3.1-8B-Instruct en
Evaluating religion examples.


100%|██████████| 105/105 [00:20<00:00,  5.02it/s]


Total examples: 105
Metric score: 70.48
Stereotype score: 71.72
Anti-stereotype score: 50.0
Num. neutral: 0.0

meta-llama/Meta-Llama-3.1-8B-Instruct en
Evaluating age examples.


100%|██████████| 87/87 [00:17<00:00,  5.03it/s]

Total examples: 87
Metric score: 75.86
Stereotype score: 78.08
Anti-stereotype score: 64.29
Num. neutral: 0.0

meta-llama/Meta-Llama-3.1-8B-Instruct en



