In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
"""
STEP 1: Data Preparation

This script loads the 'Bias in Bios' dataset, filters it for biographies
containing gendered words, and then creates a counterfactual version for
each one by swapping the gendered terms (e.g., he -> she).

This resulting dataset is the "study material" used to debias the model.
"""
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
# This import is not strictly needed for the code but can be helpful for understanding
from transformers import AutoTokenizer

# --- NLTK Setup (run once) ---
# FIX: Use LookupError instead of the deprecated nltk.downloader.DownloadError
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')
try:
    nltk.data.find('taggers/averaged_perceptron_tagger_eng')
except LookupError:
    nltk.download('averaged_perceptron_tagger_eng')


# --- Helper Functions ---
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts."""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def create_counterfactual(tokens, tokenizer_for_decoding, gender_swap_map, lemmatizer):
    """Swaps gendered tokens in a token list to create a counterfactual sentence."""
    new_tokens = tokens[:]
    swapped = False
    for i in range(len(new_tokens)):
        token = new_tokens[i]
        # Clean token for lookup (e.g., '##ing' -> 'ing')
        clean_token = token.replace("##", "")
        if clean_token.isalpha():
            lemma = lemmatizer.lemmatize(clean_token, get_wordnet_pos(clean_token))
            if lemma in gender_swap_map:
                swap_word = gender_swap_map[lemma]
                # Preserve capitalization
                if token[0].isupper():
                    swap_word = swap_word.capitalize()
                new_tokens[i] = swap_word
                swapped = True
    if not swapped:
        return None
    return tokenizer_for_decoding.convert_tokens_to_string(new_tokens)

# --- Main Script ---
def main():
    print("--- STEP 1: Creating the Counterfactual Debiasing Dataset ---")

    # Gender lexicon for swapping
    GENDER_PAIRS = {
        "he": "she", "him": "her", "his": "her", "himself": "herself",
        "man": "woman", "boy": "girl", "male": "female", "father": "mother",
        "son": "daughter", "brother": "sister", "husband": "wife",
        "uncle": "aunt", "mr": "mrs", "sir": "madam", "king": "queen", "prince": "princess"
    }
    full_gender_swap_map = GENDER_PAIRS.copy()
    full_gender_swap_map.update({v: k for k, v in GENDER_PAIRS.items()})
    GENDER_LEMMAS_SET = set(full_gender_swap_map.keys())

    lemmatizer = WordNetLemmatizer()

    # Setup model and tokenizer (needed for tokenizing/detokenizing)
    model_name = "bert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print(f"Tokenizer '{model_name}' loaded.")

    # --- Load Bias in Bios dataset ---
    print("Loading Bias in Bios dataset...")
    # Using a larger subset for more effective training
    dataset = load_dataset("LabHC/bias_in_bios", split="train[:10000]")

    # --- Filter for gendered bios ---
    def prepare_examples(example):
        text = str(example['hard_text']).lower()
        words = nltk.word_tokenize(text)
        lemmas = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in words]
        example['contains_gender'] = any(lemma in GENDER_LEMMAS_SET for lemma in lemmas)
        return example

    dataset = dataset.map(prepare_examples, num_proc=2)
    filtered_dataset = dataset.filter(lambda example: example['contains_gender'])
    print(f"Filtered dataset contains {len(filtered_dataset)} bios with gender words.")

    # --- Create counterfactual dataset ---
    cda_pairs = []
    for example in tqdm(filtered_dataset, desc="Creating CDA Pairs"):
        text = str(example['hard_text'])
        label = str(example['profession'])
        tokens = tokenizer.tokenize(text)

        counterfactual_sentence = create_counterfactual(tokens, tokenizer, full_gender_swap_map, lemmatizer)

        # Add original sentence
        cda_pairs.append({'text': text, 'label': label})

        # Add counterfactual if one was successfully created
        if counterfactual_sentence:
            cda_pairs.append({'text': counterfactual_sentence, 'label': label})

    # --- Save CDA dataset to CSV ---
    cda_df = pd.DataFrame(cda_pairs)
    output_path = "cda_debiasing_dataset_BIAS_IN_BIOS.csv"
    cda_df.to_csv(output_path, index=False)
    print(f"\n✅ CDA dataset created with {len(cda_df)} examples. Saved to {output_path}")

if __name__ == '__main__':
    main()



[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


--- STEP 1: Creating the Counterfactual Debiasing Dataset ---


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokenizer 'bert-base-uncased' loaded.
Loading Bias in Bios dataset...


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001-0ab65b32c47407(…):   0%|          | 0.00/64.9M [00:00<?, ?B/s]

data/test-00000-of-00001-5598c840ce8de1e(…):   0%|          | 0.00/24.9M [00:00<?, ?B/s]

data/dev-00000-of-00001-e6551072fff26949(…):   0%|          | 0.00/9.95M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/257478 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/99069 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/39642 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/10000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

Filtered dataset contains 9503 bios with gender words.


Creating CDA Pairs:  42%|████▏     | 4018/9503 [00:16<00:20, 274.16it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (593 > 512). Running this sequence through the model will result in indexing errors
Creating CDA Pairs: 100%|██████████| 9503/9503 [00:35<00:00, 266.25it/s]



✅ CDA dataset created with 19006 examples. Saved to cda_debiasing_dataset_BIAS_IN_BIOS.csv


In [3]:
"""
STEP 2: Fine-Tuning (The Debiasing Process)

This script loads the counterfactual dataset created in Step 1 and uses it
to fine-tune the BERT model. The task is profession classification.

By training on a dataset where gender is not predictive of the profession,
the model learns to reduce its reliance on gendered words for this task,
thus mitigating its bias.
"""
import pandas as pd
import torch
from datasets import load_dataset, Features, ClassLabel, Value
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

def main():
    print("--- STEP 2: Fine-Tuning the Model on the CDA Dataset ---")
    
    # --- SETUP ---
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model_name = "bert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # --- LOAD THE PREPARED CDA DATASET ---
    cda_data_file = "cda_debiasing_dataset_BIAS_IN_BIOS.csv"
    print(f"Loading CDA debiasing dataset from '{cda_data_file}'...")

    df = pd.read_csv(cda_data_file)
    df['label'] = df['label'].astype(int)
    unique_labels = sorted(df['label'].unique())
    num_labels = len(unique_labels)
    print(f"Found {num_labels} unique profession labels.")

    cleaned_file = "cda_bias_in_bios_cleaned.csv"
    df.to_csv(cleaned_file, index=False)

    # Define features for the HuggingFace dataset loader
    cda_features = Features({
        'text': Value('string'),
        'label': ClassLabel(num_classes=num_labels, names=[str(x) for x in unique_labels])
    })

    cda_dataset = load_dataset('csv', data_files=cleaned_file, features=cda_features, split='train')

    # --- TOKENIZE DATASET ---
    def tokenize_function(examples):
        return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=256)

    tokenized_dataset = cda_dataset.map(tokenize_function, batched=True)

    # --- MODEL SETUP ---
    # We use SequenceClassification here because that is the fine-tuning task.
    model_to_finetune = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels
    )
    model_to_finetune.to(device)

    # --- TRAINING ARGUMENTS ---
    training_args = TrainingArguments(
        output_dir="./results_bias_in_bios",
        num_train_epochs=2, # Keep epochs low to avoid catastrophic forgetting
        per_device_train_batch_size=8,
        learning_rate=2e-5, # A smaller learning rate is crucial for fine-tuning
        logging_dir='./logs_bias_in_bios',
        logging_steps=100,
        save_strategy="epoch",
        report_to="none"
    )

    trainer = Trainer(
        model=model_to_finetune,
        args=training_args,
        train_dataset=tokenized_dataset,
    )

    print("\n🚀 Starting fine-tuning...")
    trainer.train()
    print("✅ Fine-tuning complete.")

    # --- SAVE THE DEBIASED MODEL ---
    debiased_model_path = "./debiased_bert_model_bias_in_bios"
    trainer.save_model(debiased_model_path)
    tokenizer.save_pretrained(debiased_model_path)
    print(f"\nDebiased model saved to '{debiased_model_path}'")

if __name__ == '__main__':
    main()


2025-09-12 20:53:15.573414: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757710395.799950      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757710395.863420      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


--- STEP 2: Fine-Tuning the Model on the CDA Dataset ---
Loading CDA debiasing dataset from 'cda_debiasing_dataset_BIAS_IN_BIOS.csv'...
Found 28 unique profession labels.


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/19006 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🚀 Starting fine-tuning...




Step,Training Loss
100,2.4781
200,1.5574
300,1.1829
400,0.9553
500,0.9216
600,0.8504
700,0.7896
800,0.7542
900,0.7478
1000,0.6266




✅ Fine-tuning complete.

Debiased model saved to './debiased_bert_model_bias_in_bios'


In [4]:
"""
STEP 3 (FINAL DISSERTATION VERSION):
Robust evaluation of the BASELINE model using multiple templates and dual metrics.

This script measures bias by:
1.  Using a diverse set of 5 templates per stereotype category to prove generalizability.
2.  Calculating two metrics: simple Probability Difference and the Log-Odds Difference.
"""
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from datasets import load_dataset
from tqdm import tqdm
import numpy as np
import math

def calculate_mlm_probabilities(template: str, target_words: list, model, tokenizer, device='cpu'):
    """Calculates MLM probabilities for a list of target words in a template."""
    model.to(device)
    inputs = tokenizer(template, return_tensors="pt").to(device)
    try:
        mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1][0]
    except IndexError:
        return None # No mask token found

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    mask_token_logits = logits[0, mask_token_index, :]
    probabilities = torch.softmax(mask_token_logits, dim=-1).squeeze()
    
    try:
        target_ids = tokenizer.convert_tokens_to_ids(target_words)
        if tokenizer.unk_token_id in target_ids:
             return None
    except Exception:
        return None

    results = {}
    for i, word in enumerate(target_words):
        results[word] = probabilities[target_ids[i]].item()
    return results

def get_final_bias_scores(context, stereotype_word, anti_stereotype_word, model, tokenizer, device):
    """
    Tests a stereotype concept against multiple templates and calculates two bias metrics.
    """
    templates = []
    target_words = [stereotype_word, anti_stereotype_word]
    
    NOUN_TARGETS = ['man', 'woman', 'boy', 'girl', 'male', 'female']
    PRONOUN_TARGETS = ['he', 'she', 'his', 'her']

    # --- Expanded, diverse templates ---
    if stereotype_word.lower() in NOUN_TARGETS:
        templates = [
            f"The {context} was a [MASK].",
            f"People described the {context} as a [MASK].",
            f"In the story, the {context} was portrayed as a [MASK].",
            f"Everyone knew the {context} was a [MASK].",
            f"The film's main character, the {context}, was a [MASK]."
        ]
    elif stereotype_word.lower() in PRONOUN_TARGETS:
        templates = [
            f"The {context} said [MASK] would be late for the meeting.",
            f"I saw the {context} with [MASK] family yesterday.",
            f"As a {context}, [MASK] was well-respected in the community.",
            f"The report about the {context} mentioned that [MASK] was successful.",
            f"The car belonging to the {context} was [MASK]."
        ]
    else:
        return None

    prob_diff_scores = []
    log_odds_scores = []

    for template in templates:
        probs = calculate_mlm_probabilities(template, target_words, model, tokenizer, device)
        if probs and probs[stereotype_word] > 0 and probs[anti_stereotype_word] > 0:
            # Metric 1: Simple Probability Difference
            prob_diff = probs[stereotype_word] - probs[anti_stereotype_word]
            prob_diff_scores.append(prob_diff)
            
            # Metric 2: Log-Odds Difference
            try:
                log_odds = math.log(probs[stereotype_word]) - math.log(probs[anti_stereotype_word])
                log_odds_scores.append(log_odds)
            except ValueError:
                continue # Skip if log(0) error

    if not prob_diff_scores:
        return None

    return {
        'avg_prob_diff': np.mean(prob_diff_scores),
        'avg_log_odds_diff': np.mean(log_odds_scores)
    }

def find_differing_word(sent1_tokens, sent2_tokens):
    """Finds the first differing token between two token lists."""
    for t1, t2 in zip(sent1_tokens, sent2_tokens):
        if t1 != t2:
            return t1
    return None

def main():
    print("--- STEP 3 (FINAL): Evaluating Bias in the BASELINE Model ---")
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model_name = "bert-base-uncased"
    
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForMaskedLM.from_pretrained(model_name)
    model.to(device)

    stereoset_dataset = load_dataset("stereoset", "intrasentence", split="validation")
    
    results = []
    for example in tqdm(stereoset_dataset, desc="Evaluating Baseline Model"):
        # ... [The loop logic is the same as the robust version, just calling the new function] ...
        try:
            context = example['context']
            sentences = example['sentences']['sentence']
            labels = [label['label'][0] for label in example['sentences']['labels']]

            stereotype_sent = sentences[labels.index(0)]
            anti_stereotype_sent = sentences[labels.index(1)]

            stereotype_tokens = tokenizer.tokenize(stereotype_sent)
            anti_stereotype_tokens = tokenizer.tokenize(anti_stereotype_sent)
            
            stereotype_word = find_differing_word(stereotype_tokens, anti_stereotype_tokens)
            anti_stereotype_word = find_differing_word(anti_stereotype_tokens, stereotype_tokens)

            if not stereotype_word or not anti_stereotype_word:
                continue

            bias_scores = get_final_bias_scores(context, stereotype_word, anti_stereotype_word, model, tokenizer, device)

            if bias_scores is not None:
                results.append({
                    'context': context,
                    'stereotype_word': stereotype_word,
                    'anti_stereotype_word': anti_stereotype_word,
                    'avg_prob_diff': bias_scores['avg_prob_diff'],
                    'avg_log_odds_diff': bias_scores['avg_log_odds_diff']
                })
        except (ValueError, IndexError):
            continue

    df = pd.DataFrame(results)
    output_path = "baseline_bias_results_FINAL.csv"
    df.to_csv(output_path, index=False)
    print(f"\nFinal baseline evaluation complete. Results saved to {output_path}")

    # --- Final Summary ---
    avg_abs_prob_diff = df['avg_prob_diff'].abs().mean()
    avg_abs_log_odds = df['avg_log_odds_diff'].abs().mean()

    print("\n" + "="*60)
    print("           FINAL BASELINE MODEL BIAS SCORES")
    print("="*60)
    print(f"Metric 1: Avg. Absolute Probability Difference: {avg_abs_prob_diff:.4f}")
    print(f"Metric 2: Avg. Absolute Log-Odds Difference:   {avg_abs_log_odds:.4f}")
    print("="*60)

if __name__ == '__main__':
    main()



--- STEP 3 (FINAL): Evaluating Bias in the BASELINE Model ---


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


README.md: 0.00B [00:00, ?B/s]

intrasentence/validation-00000-of-00001.(…):   0%|          | 0.00/599k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/2106 [00:00<?, ? examples/s]

Evaluating Baseline Model: 100%|██████████| 2106/2106 [00:03<00:00, 570.14it/s]


Final baseline evaluation complete. Results saved to baseline_bias_results_FINAL.csv

           FINAL BASELINE MODEL BIAS SCORES
Metric 1: Avg. Absolute Probability Difference: 0.0522
Metric 2: Avg. Absolute Log-Odds Difference:   1.4432





In [5]:
"""
STEP 4 (FINAL DISSERTATION VERSION):
Robust evaluation of YOUR DEBIASED model using multiple templates and dual metrics.

This script applies the exact same rigorous evaluation methodology from Step 3
to your fine-tuned model, allowing for a direct and powerful comparison.
"""
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from datasets import load_dataset
from tqdm import tqdm
import numpy as np
import math

# --- Helper functions are identical to the baseline script ---
def calculate_mlm_probabilities(template: str, target_words: list, model, tokenizer, device='cpu'):
    model.to(device)
    inputs = tokenizer(template, return_tensors="pt").to(device)
    try:
        mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1][0]
    except IndexError:
        return None
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
    mask_token_logits = logits[0, mask_token_index, :]
    probabilities = torch.softmax(mask_token_logits, dim=-1).squeeze()
    try:
        target_ids = tokenizer.convert_tokens_to_ids(target_words)
        if tokenizer.unk_token_id in target_ids: return None
    except Exception: return None
    results = {}
    for i, word in enumerate(target_words):
        results[word] = probabilities[target_ids[i]].item()
    return results

def get_final_bias_scores(context, stereotype_word, anti_stereotype_word, model, tokenizer, device):
    templates = []
    target_words = [stereotype_word, anti_stereotype_word]
    NOUN_TARGETS = ['man', 'woman', 'boy', 'girl', 'male', 'female']
    PRONOUN_TARGETS = ['he', 'she', 'his', 'her']
    if stereotype_word.lower() in NOUN_TARGETS:
        templates = [
            f"The {context} was a [MASK].", f"People described the {context} as a [MASK].",
            f"In the story, the {context} was portrayed as a [MASK].", f"Everyone knew the {context} was a [MASK].",
            f"The film's main character, the {context}, was a [MASK]."
        ]
    elif stereotype_word.lower() in PRONOUN_TARGETS:
        templates = [
            f"The {context} said [MASK] would be late for the meeting.", f"I saw the {context} with [MASK] family yesterday.",
            f"As a {context}, [MASK] was well-respected in the community.",
            f"The report about the {context} mentioned that [MASK] was successful.", f"The car belonging to the {context} was [MASK]."
        ]
    else: return None
    prob_diff_scores, log_odds_scores = [], []
    for template in templates:
        probs = calculate_mlm_probabilities(template, target_words, model, tokenizer, device)
        if probs and probs[stereotype_word] > 0 and probs[anti_stereotype_word] > 0:
            prob_diff = probs[stereotype_word] - probs[anti_stereotype_word]
            prob_diff_scores.append(prob_diff)
            try:
                log_odds = math.log(probs[stereotype_word]) - math.log(probs[anti_stereotype_word])
                log_odds_scores.append(log_odds)
            except ValueError: continue
    if not prob_diff_scores: return None
    return {'avg_prob_diff': np.mean(prob_diff_scores), 'avg_log_odds_diff': np.mean(log_odds_scores)}

def find_differing_word(sent1_tokens, sent2_tokens):
    for t1, t2 in zip(sent1_tokens, sent2_tokens):
        if t1 != t2: return t1
    return None

def main():
    print("--- STEP 4 (FINAL): Evaluating Bias in the DEBIASED Model ---")
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    debiased_model_path = "./debiased_bert_model_bias_in_bios"
    
    tokenizer = AutoTokenizer.from_pretrained(debiased_model_path)
    model = AutoModelForMaskedLM.from_pretrained(debiased_model_path)
    model.to(device)

    stereoset_dataset = load_dataset("stereoset", "intrasentence", split="validation")
    
    results = []
    for example in tqdm(stereoset_dataset, desc="Evaluating Debiased Model"):
        # ... [The loop logic is identical to the baseline script] ...
        try:
            context = example['context']
            sentences = example['sentences']['sentence']
            labels = [label['label'][0] for label in example['sentences']['labels']]
            stereotype_sent = sentences[labels.index(0)]
            anti_stereotype_sent = sentences[labels.index(1)]
            stereotype_tokens = tokenizer.tokenize(stereotype_sent)
            anti_stereotype_tokens = tokenizer.tokenize(anti_stereotype_sent)
            stereotype_word = find_differing_word(stereotype_tokens, anti_stereotype_tokens)
            anti_stereotype_word = find_differing_word(anti_stereotype_tokens, stereotype_tokens)
            if not stereotype_word or not anti_stereotype_word: continue
            bias_scores = get_final_bias_scores(context, stereotype_word, anti_stereotype_word, model, tokenizer, device)
            if bias_scores is not None:
                results.append({
                    'context': context, 'stereotype_word': stereotype_word,
                    'anti_stereotype_word': anti_stereotype_word, 'avg_prob_diff': bias_scores['avg_prob_diff'],
                    'avg_log_odds_diff': bias_scores['avg_log_odds_diff']
                })
        except (ValueError, IndexError): continue

    df = pd.DataFrame(results)
    output_path = "debiased_bias_results_FINAL.csv"
    df.to_csv(output_path, index=False)
    print(f"\nFinal debiased evaluation complete. Results saved to {output_path}")

    avg_abs_prob_diff = df['avg_prob_diff'].abs().mean()
    avg_abs_log_odds = df['avg_log_odds_diff'].abs().mean()

    print("\n" + "="*60)
    print("           FINAL DEBIASED MODEL BIAS SCORES")
    print("="*60)
    print(f"Metric 1: Avg. Absolute Probability Difference: {avg_abs_prob_diff:.4f}")
    print(f"Metric 2: Avg. Absolute Log-Odds Difference:   {avg_abs_log_odds:.4f}")
    print("="*60)

if __name__ == '__main__':
    main()


Some weights of BertForMaskedLM were not initialized from the model checkpoint at ./debiased_bert_model_bias_in_bios and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


--- STEP 4 (FINAL): Evaluating Bias in the DEBIASED Model ---


Evaluating Debiased Model: 100%|██████████| 2106/2106 [00:03<00:00, 578.03it/s]


Final debiased evaluation complete. Results saved to debiased_bias_results_FINAL.csv

           FINAL DEBIASED MODEL BIAS SCORES
Metric 1: Avg. Absolute Probability Difference: 0.0000
Metric 2: Avg. Absolute Log-Odds Difference:   0.7099





In [6]:
"""
STEP 5 (FINAL DISSERTATION VERSION):
Statistical significance testing for BOTH bias metrics.

This script provides the final scientific validation. It loads the results
from the final baseline (Step 3) and debiased (Step 4) evaluations and
performs a paired t-test on both the Probability Difference and the
Log-Odds Difference metrics.
"""
import pandas as pd
from scipy import stats

def main():
    print("--- STEP 5 (FINAL): Performing Statistical Significance Tests ---")

    baseline_results_file = "baseline_bias_results_FINAL.csv"
    debiased_results_file = "debiased_bias_results_FINAL.csv"

    try:
        df_baseline = pd.read_csv(baseline_results_file)
        df_debiased = pd.read_csv(debiased_results_file)
    except FileNotFoundError as e:
        print(f"❌ ERROR: Could not find a results file: {e.filename}")
        return

    comparison_df = pd.merge(
        df_baseline, df_debiased,
        on=['context', 'stereotype_word', 'anti_stereotype_word'],
        suffixes=('_baseline', '_debiased')
    )
    print(f"Found {len(comparison_df)} paired examples to compare.")

    # --- Test 1: Probability Difference Metric ---
    baseline_prob_scores = comparison_df['avg_prob_diff_baseline'].abs()
    debiased_prob_scores = comparison_df['avg_prob_diff_debiased'].abs()
    t_stat_prob, p_val_prob = stats.ttest_rel(baseline_prob_scores, debiased_prob_scores)

    # --- Test 2: Log-Odds Difference Metric ---
    baseline_log_scores = comparison_df['avg_log_odds_diff_baseline'].abs()
    debiased_log_scores = comparison_df['avg_log_odds_diff_debiased'].abs()
    t_stat_log, p_val_log = stats.ttest_rel(baseline_log_scores, debiased_log_scores)

    # --- Print Results ---
    print("\n" + "="*70)
    print("        FINAL STATISTICAL SIGNIFICANCE TEST RESULTS")
    print("="*70)

    print("\n--- Metric 1: Probability Difference ---")
    print(f"Baseline Avg. Absolute Score: {baseline_prob_scores.mean():.4f}")
    print(f"Debiased Avg. Absolute Score: {debiased_prob_scores.mean():.4f}")
    print(f"T-statistic: {t_stat_prob:.4f}, P-value: {p_val_prob:.4g}")
    if p_val_prob < 0.05:
        print("✅ Result is STATISTICALLY SIGNIFICANT.")
    else:
        print("⚠️ Result is NOT statistically significant.")
    
    print("\n--- Metric 2: Log-Odds Difference ---")
    print(f"Baseline Avg. Absolute Score: {baseline_log_scores.mean():.4f}")
    print(f"Debiased Avg. Absolute Score: {debiased_log_scores.mean():.4f}")
    print(f"T-statistic: {t_stat_log:.4f}, P-value: {p_val_log:.4g}")
    if p_val_log < 0.05:
        print("✅ Result is STATISTICALLY SIGNIFICANT.")
    else:
        print("⚠️ Result is NOT statistically significant.")
    
    print("\n" + "="*70)

if __name__ == '__main__':
    main()


--- STEP 5 (FINAL): Performing Statistical Significance Tests ---
Found 56 paired examples to compare.

        FINAL STATISTICAL SIGNIFICANCE TEST RESULTS

--- Metric 1: Probability Difference ---
Baseline Avg. Absolute Score: 0.0522
Debiased Avg. Absolute Score: 0.0000
T-statistic: 3.2324, P-value: 0.002076
✅ Result is STATISTICALLY SIGNIFICANT.

--- Metric 2: Log-Odds Difference ---
Baseline Avg. Absolute Score: 1.4432
Debiased Avg. Absolute Score: 0.7099
T-statistic: 3.5250, P-value: 0.0008616
✅ Result is STATISTICALLY SIGNIFICANT.



In [7]:
"""
STEP 6 (REVISED): Train a Size-Matched Biased Control Model

This script implements the crucial step of training the "Biased Control" model
on a dataset that is perfectly size-matched to the counterfactual dataset.

This ensures a true "apples-to-apples" comparison in the final evaluation,
isolating the debiasing technique as the only variable.

THE PROCESS:
1.  It first loads your counterfactual dataset (`cda_debiasing_dataset...csv`)
    to determine its exact size (N).
2.  It then loads the original, biased `Bias in Bios` dataset, but takes only
    the first N examples.
3.  It fine-tunes a fresh BERT model on this size-matched, biased dataset.
4.  The resulting model is the perfect control for the final evaluations.
"""
import pandas as pd
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset, Features, ClassLabel, Value, Dataset, load_dataset_builder
import os

def main():
    # --- 1. Determine the size of the counterfactual dataset ---
    CDA_DATA_FILE = "cda_debiasing_dataset_BIAS_IN_BIOS.csv"
    if not os.path.exists(CDA_DATA_FILE):
        print(f"❌ ERROR: Counterfactual dataset not found at '{CDA_DATA_FILE}'.")
        print("Please run script '1_create_cda_dataset.py' first.")
        return
        
    cda_df = pd.read_csv(CDA_DATA_FILE)
    target_dataset_size = len(cda_df)
    print(f"✅ Found counterfactual dataset with {target_dataset_size} examples.")
    print("The Biased Control model will be trained on this many examples.")

    # --- 2. Setup Tokenizer ---
    # We can use a fresh tokenizer or one from the debiased model path.
    # Using a fresh one is cleanest for this control model.
    MODEL_NAME = "bert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # --- 3. Load and slice the original Bias in Bios dataset ---
    print(f"\nLoading the first {target_dataset_size} examples from the original 'Bias in Bios' dataset...")
    original_dataset = load_dataset("LabHC/bias_in_bios", split=f"train[:{target_dataset_size}]")

    df = original_dataset.to_pandas()
    df['text'] = df['hard_text'].astype(str)
    df['label'] = df['profession'].astype(int)
    
    unique_labels = sorted(df['label'].unique())
    num_labels = len(unique_labels)
    
    # Get the official label names for the full dataset to create a consistent config
    # The datasets library is having trouble inferring the ClassLabel type.
    # To fix this, we will hard-code the known list of 28 professions.
    print("Using a hard-coded list of professions to ensure robustness...")
    all_profession_names = [
        'accountant', 'architect', 'attorney', 'chiropractor', 'comedian', 
        'composer', 'dentist', 'dietitian', 'dj', 'filmmaker', 
        'interior_designer', 'journalist', 'model', 'nurse', 'painter', 
        'paralegal', 'pastor', 'personal_trainer', 'photographer', 'physician', 
        'poet', 'professor', 'psychologist', 'rapper', 'software_engineer', 
        'surgeon', 'teacher', 'yoga_instructor'
    ]
    
    # The model needs to know the full set of possible labels, even if not all are in this subset
    full_num_labels = len(all_profession_names)
    
    # Create the label mapping for the model's configuration
    id2label = {i: name for i, name in enumerate(all_profession_names)}
    label2id = {name: i for i, name in enumerate(all_profession_names)}
    
    print(f"Found {num_labels} unique professions in this subset of {target_dataset_size} examples.")
    print(f"Model will be configured for all {full_num_labels} possible professions.")

    # --- 4. Prepare dataset for Hugging Face Trainer ---
    features = Features({
        'text': Value('string'),
        'label': ClassLabel(num_classes=full_num_labels, names=all_profession_names)
    })
    
    prepared_df = df[['text', 'label']]
    biased_hf_dataset = Dataset.from_pandas(prepared_df, features=features)
    
    def tokenize_function(examples):
        return tokenizer(examples['text'], padding="max_length", truncation=True)
    
    tokenized_dataset = biased_hf_dataset.map(tokenize_function, batched=True)

    # --- 5. Train the Biased Control Model ---
    model_to_finetune = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, 
        num_labels=full_num_labels,
        id2label=id2label,
        label2id=label2id
    )

    BIASED_MODEL_PATH = "./biased_control_bert_model"
    
    training_args = TrainingArguments(
        output_dir="./results_biased_control_sized",
        num_train_epochs=2,
        per_device_train_batch_size=8,
        logging_steps=100,
        save_strategy="epoch",
        report_to="none"
    )
    trainer = Trainer(model=model_to_finetune, args=training_args, train_dataset=tokenized_dataset)

    print("\n🚀 Starting fine-tuning on SIZE-MATCHED BIASED dataset...")
    trainer.train()
    print("✅ Fine-tuning of biased control model complete.")
    
    trainer.save_model(BIASED_MODEL_PATH)
    tokenizer.save_pretrained(BIASED_MODEL_PATH)
    print(f"Size-matched biased control model saved to {BIASED_MODEL_PATH}")

if __name__ == '__main__':
    main()



✅ Found counterfactual dataset with 19006 examples.
The Biased Control model will be trained on this many examples.

Loading the first 19006 examples from the original 'Bias in Bios' dataset...
Using a hard-coded list of professions to ensure robustness...
Found 28 unique professions in this subset of 19006 examples.
Model will be configured for all 28 possible professions.


Map:   0%|          | 0/19006 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🚀 Starting fine-tuning on SIZE-MATCHED BIASED dataset...




Step,Training Loss
100,2.0675
200,1.0873
300,0.941
400,0.863
500,0.7731
600,0.7371
700,0.761
800,0.7196
900,0.7322
1000,0.748




✅ Fine-tuning of biased control model complete.
Size-matched biased control model saved to ./biased_control_bert_model


In [10]:
"""
STEP 7 (FINAL DISSERTATION ANALYSIS): Causal Mediation Analysis

This script estimates the *causal effect* of gendered language on model predictions.

It goes beyond correlation to measure how much biased words (e.g., "his", "her",
"man", "woman") *cause* the model to change its output probability.

THE EXPERIMENT:
1.  Define pairs of templates: a neutral "base" sentence and a gendered "treatment".
    Example:
      Base: "The person's bio says they are a doctor."
      Treatment: "His bio says he is a doctor."
2.  For each profession label:
      a) Total Effect = model probability with biased sentence.
      b) Direct Effect = model probability with neutral sentence.
      c) Natural Indirect Effect (NIE) = difference (a - b).
         → This captures the *causal impact* of the biased word.
3.  Compare the average NIE for:
      - Biased Control Model (trained on original data).
      - Debiased Model (trained on counterfactual data).
"""
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm
import os

# ==============================================================================
# 1. CORE CAUSAL MEDIATION ANALYSIS FUNCTION
# ==============================================================================
def perform_causal_mediation_analysis(model, tokenizer, base_template, treatment_template, profession_label_id, device):
    """
    Performs a single Causal Mediation Analysis test for one profession.
    """
    model.to(device)
    model.eval()

    with torch.no_grad():
        # --- a) Total Effect: probability under biased treatment sentence ---
        treatment_inputs = tokenizer(treatment_template, return_tensors="pt", truncation=True, max_length=128).to(device)
        treatment_outputs = model(**treatment_inputs)
        treatment_probs = torch.softmax(treatment_outputs.logits, dim=-1).squeeze()
        total_effect_prob = treatment_probs[profession_label_id].item()

        # --- b) Direct Effect: probability under neutral base sentence ---
        base_inputs = tokenizer(base_template, return_tensors="pt", truncation=True, max_length=128).to(device)
        base_outputs = model(**base_inputs)
        base_probs = torch.softmax(base_outputs.logits, dim=-1).squeeze()
        direct_effect_prob = base_probs[profession_label_id].item()

        # --- c) Natural Indirect Effect (NIE) ---
        nie = total_effect_prob - direct_effect_prob

    return {
        'total_effect_prob': total_effect_prob,
        'direct_effect_prob': direct_effect_prob,
        'nie': nie
    }

# ==============================================================================
# 2. MAIN SCRIPT EXECUTION
# ==============================================================================
def main():
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # --- Paths to saved fine-tuned models ---
    BIASED_CONTROL_MODEL_PATH = "./biased_control_bert_model"
    DEBIASED_MODEL_PATH = "./debiased_bert_model_bias_in_bios"
    
    if not os.path.exists(BIASED_CONTROL_MODEL_PATH) or not os.path.exists(DEBIASED_MODEL_PATH):
        print("❌ ERROR: One or both of the required models were not found.")
        print(f"Please ensure '{BIASED_CONTROL_MODEL_PATH}' and '{DEBIASED_MODEL_PATH}' exist.")
        return

    print("--- Loading tokenizer and models for Causal Mediation Analysis ---")
    tokenizer = AutoTokenizer.from_pretrained(DEBIASED_MODEL_PATH)
    biased_model = AutoModelForSequenceClassification.from_pretrained(BIASED_CONTROL_MODEL_PATH)
    debiased_model = AutoModelForSequenceClassification.from_pretrained(DEBIASED_MODEL_PATH)

    # --- Step 1: Extract profession mapping directly from model config ---
    # The config's id2label map is the most reliable source of truth.
    # e.g., config.id2label = { 0: "nurse", 1: "engineer", ... }
    model_id2label = {int(k): v for k, v in debiased_model.config.id2label.items()}
    testable_professions = {name: idx for idx, name in model_id2label.items()}
    
    if not testable_professions:
        print("\n❌ ERROR: No profession labels found in the model's config.json.")
        return
        
    print(f"\nFound {len(testable_professions)} testable professions. Proceeding with analysis.\n")

    # --- Step 2: Define Template Pairs ---
    TEST_CASES = [
        # Pronoun-based stereotypes
        {"base": "The person's bio says they are a {}.", "treatment": "His bio says he is a {}."},
        {"base": "The person's bio says they are a {}.", "treatment": "Her bio says she is a {}."},
        {"base": "Regarding the project, the {} said they would handle it.", "treatment": "Regarding the project, the {} said he would handle it."},
        {"base": "Regarding the project, the {} said they would handle it.", "treatment": "Regarding the project, the {} said she would handle it."},
        # Noun-based stereotypes
        {"base": "The new hire for the {} role was a talented person.", "treatment": "The new hire for the {} role was a talented man."},
        {"base": "The new hire for the {} role was a talented woman.", "treatment": "The new hire for the {} role was a talented person."}, # anti-stereotype flip
    ]
    
    results = []
    
    # --- Step 3: Run Analysis ---
    print("--- Running Causal Mediation Analysis ---")
    for profession_name, profession_id in tqdm(testable_professions.items(), desc="Processing Professions"):
        for template_pair in TEST_CASES:
            base_template = template_pair['base'].format(profession_name)
            treatment_template = template_pair['treatment'].format(profession_name)

            # Biased Control Model
            biased_result = perform_causal_mediation_analysis(
                biased_model, tokenizer, base_template, treatment_template, profession_id, device
            )
            results.append({
                'model': 'Biased Control',
                'profession': profession_name,
                'nie': biased_result['nie']
            })

            # Debiased Model
            debiased_result = perform_causal_mediation_analysis(
                debiased_model, tokenizer, base_template, treatment_template, profession_id, device
            )
            results.append({
                'model': 'Debiased',
                'profession': profession_name,
                'nie': debiased_result['nie']
            })

    # --- Step 4: Summarize Results ---
    df = pd.DataFrame(results)
    df['abs_nie'] = df['nie'].abs()
    
    summary = df.groupby('model')['abs_nie'].mean().reset_index()
    summary = summary.rename(columns={'abs_nie': 'Average Causal Effect (Abs. NIE)'})

    print("\n" + "="*70)
    print("        FINAL CAUSAL MEDIATION ANALYSIS RESULTS")
    print("="*70)
    print("The 'Average Causal Effect' measures how much a biased word *causes*")
    print("the model's final prediction to change. A lower score is better.")
    print("-"*70)
    
    print(summary.to_string(index=False))

    print("\n" + "-"*70)
    print("                          SUMMARY")
    print("-"*70)
    
    try:
        biased_nie = summary[summary['model'] == 'Biased Control']['Average Causal Effect (Abs. NIE)'].iloc[0]
        debiased_nie = summary[summary['model'] == 'Debiased']['Average Causal Effect (Abs. NIE)'].iloc[0]
        
        if biased_nie > 0:
            reduction = (biased_nie - debiased_nie) / biased_nie * 100
            print(f"✅ Your debiasing technique reduced the average causal effect of")
            print(f"   biased language on model predictions by {reduction:.2f}%.")
        else:
            print("ℹ️ Could not calculate percentage reduction (baseline effect was zero).")
    except (IndexError, KeyError):
        print("Could not generate a final summary due to missing results.")
        
    print("="*70)

if __name__ == '__main__':
    main()



--- Loading tokenizer and models for Causal Mediation Analysis ---

Found 28 testable professions. Proceeding with analysis.

--- Running Causal Mediation Analysis ---


Processing Professions: 100%|██████████| 28/28 [00:06<00:00,  4.32it/s]


        FINAL CAUSAL MEDIATION ANALYSIS RESULTS
The 'Average Causal Effect' measures how much a biased word *causes*
the model's final prediction to change. A lower score is better.
----------------------------------------------------------------------
         model  Average Causal Effect (Abs. NIE)
Biased Control                          0.024180
      Debiased                          0.020225

----------------------------------------------------------------------
                          SUMMARY
----------------------------------------------------------------------
✅ Your debiasing technique reduced the average causal effect of
   biased language on model predictions by 16.36%.



