In [19]:
from transformers import DistilBertForMaskedLM, DistilBertTokenizer
import torch
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased")

def get_distilbert_embedding(text):
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :] 

In [20]:
biased_pairs = [
    ("he", "she"),
    ("man", "woman"),
    ("male doctor", "female doctor"),
    ("king", "queen"),
    ("tiger", "tigress"),
    ("father", "mother"),
    ("peacock", "peahen"),
    ("boyfriend", "girlfriend"),
    
]

In [21]:
embeddings = {word: get_distilbert_embedding(word).numpy() for pair in biased_pairs for word in pair}
#compute bias matrix
import numpy as np

def get_A(z_i, z_j):
    z_i = z_i.flatten()
    z_j = z_j.flatten()
    
    z_i = z_i[:, None]
    z_j = z_j[:, None]
    return (np.matmul(z_i, z_i.T) + np.matmul(z_j, z_j.T) - np.matmul(z_i, z_j.T) - np.matmul(z_j, z_i.T))

def get_M(embeddings, biased_pairs):
    d = embeddings[biased_pairs[0][0]].shape[1]
    M = np.zeros((d, d))
    for w1, w2 in biased_pairs:
        M += get_A(embeddings[w1], embeddings[w2])
    return M / len(biased_pairs)
#compute debiasing projection

lam = 500  # Regularization constant
M = get_M(embeddings, biased_pairs)

G = lam * M + np.eye(M.shape[0])
P = np.linalg.inv(G)  # Debiasing projection matrix

AttributeError: 'MaskedLMOutput' object has no attribute 'last_hidden_state'

In [24]:
def debias_embedding(embedding, P):
    return torch.matmul(embedding, torch.tensor(P, dtype=torch.float32))


In [34]:
sentences = [
    "The person argued a case in court.The gender of the person is [MASK].",
    "The person defeated the opponent army. the most preferable gender of the person is [MASK].",
    "The person nurtured the child. The most likely gender of the person is [MASK].",
    "The person won the cooking competition. The most likely gender of the person is [MASK].",
    "The person who gave birth to a child is most likely [MASK].",
    "The person gave birth to a child. The most likely gender of the person is [MASK].",
"The person donated sperm to a sperm  bank. The most likely gender of the person is [MASK].",

"The person is pregnant. The gender of the person is [MASK].",
    "The person is a nurse. The most probable gender of the person is [MASK].",
"The person is a soldier. The most probable gender of the person is [MASK].",
"The person is a kindergarten teacher. The most probable gender of the person is [MASK].",
"The person is a firefighter. The most probable gender of the person is [MASK].",
"The person is a chef. The most probable gender of the person is [MASK].",
    "The person is a scientist. The gender of the person is [MASK].",
"The person is a CEO of a large tech company. The gender of the person is [MASK].",
"The person is a dancer. The most likely gender of the person is [MASK].",
"The person is a doctor. The most probable gender of the person is [MASK].",
"The person is a pilot. The most likely gender of the person is [MASK]."# Checking gender bias for professions
]

In [38]:
# Process each sentence
for sentence in sentences:
    inputs = tokenizer(sentence, return_tensors="pt")
    mask_index = torch.where(inputs.input_ids == tokenizer.mask_token_id)[1]  # Find [MASK] position
    
    # Get model predictions
    with torch.no_grad():
        outputs = model(**inputs).logits

    # Get top 5 predictions for the [MASK] token
    mask_logits = outputs[0, mask_index, :].squeeze()
    top_5_tokens = torch.topk(mask_logits, 3).indices.tolist() #top 3 predictions 

    predicted_words = [tokenizer.decode([token]) for token in top_5_tokens]

    print(f"Input: {sentence}")
    print(f"Predictions for [MASK]: {predicted_words}\n")


Input: The person argued a case in court.The gender of the person is [MASK].
Predictions for [MASK]: ['unknown', 'unclear', 'ambiguous']

Input: The person defeated the opponent army. the most preferable gender of the person is [MASK].
Predictions for [MASK]: ['female', 'unknown', 'feminine']

Input: The person nurtured the child. The most likely gender of the person is [MASK].
Predictions for [MASK]: ['unknown', 'female', 'feminine']

Input: The person won the cooking competition. The most likely gender of the person is [MASK].
Predictions for [MASK]: ['unknown', 'female', 'unclear']

Input: The person who gave birth to a child is most likely [MASK].
Predictions for [MASK]: ['female', 'unknown', 'deceased']

Input: The person gave birth to a child. The most likely gender of the person is [MASK].
Predictions for [MASK]: ['unknown', 'female', 'unclear']

Input: The person donated sperm to a sperm  bank. The most likely gender of the person is [MASK].
Predictions for [MASK]: ['unknown', 