<a href="https://colab.research.google.com/github/NikitaAB7/PBL-project/blob/main/detect_bias_in_nn_classifiers_GANs_delphi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9656 sha256=367f76fe21cf1ef64648cbb36623d26c74f462e35fcb7c7049696a23fb5cf608
  Stored in directory: /root/.cache/pip/wheels/40/b3/0f/a40dbd1c6861731779f62cc4babcb234387e11d697df70ee97
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [3]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from transformers import TFBertForSequenceClassification, TFDistilBertForSequenceClassification, BertTokenizer, DistilBertTokenizer

from tqdm import tqdm
import wget
import os

In [4]:
# Research-based thresholds
THRESHOLDS = {
    'gender': {'unbiased': 0.25, 'biased': 0.5},
    'racial': {'unbiased': 0.3, 'biased': 0.6},
    'age': {'unbiased': 0.2, 'biased': 0.45}
}

In [5]:
class ShapeAdapter(tf.keras.layers.Layer):
    def __init__(self, output_dim):
        super(ShapeAdapter, self).__init__()
        self.output_dim = output_dim

    def build(self, input_shape):
        self.kernel = self.add_weight("kernel", shape=[int(input_shape[-1]), self.output_dim])

    def call(self, inputs):
        return tf.matmul(inputs, self.kernel)

In [6]:
def download_datasets():
    urls = {
        'gender_bias': 'https://archive.ics.uci.edu/ml/machine-learning-databases/00252/pop_failures.dat',
        'racial_bias': 'https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data',
        'age_bias': 'https://archive.ics.uci.edu/ml/machine-learning-databases/00519/heart_failure_clinical_records_dataset.csv'
    }
    for bias_type, url in urls.items():
        filename = f'{bias_type}_data.csv'
        if not os.path.exists(filename):
            print(f"Downloading {bias_type} dataset...")
            wget.download(url, filename)
            print(f"\n{bias_type} dataset downloaded successfully.")
        else:
            print(f"{bias_type} dataset already exists.")

In [7]:
def load_data(filename, bias_type):
    print(f"Loading and preprocessing {bias_type} data...")
    if bias_type == 'gender_bias':
        data = pd.read_csv(filename, sep='\s+', header=None, skiprows=1)
        X = data.iloc[:, :-1].values
        y = (data.iloc[:, -1] == 'F').astype(int).values
    elif bias_type == 'racial_bias':
        data = pd.read_csv(filename, sep=' ', header=None)
        for col in data.columns:
            if data[col].dtype == 'object':
                le = LabelEncoder()
                data[col] = le.fit_transform(data[col].astype(str))
        X = data.iloc[:, :-1].values
        y = (data.iloc[:, -1] == 2).astype(int).values
    else:  # age_bias
        data = pd.read_csv(filename)
        X = data.drop(['age', 'DEATH_EVENT'], axis=1).values
        y = (data['age'] > 60).astype(int).values

    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    print(f"{bias_type} data preprocessed. Shape: {X.shape}")
    return X, y

In [8]:
def load_models():
    print("Loading pre-trained models...")
    low_bias_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
    high_bias_model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    return low_bias_model, high_bias_model, bert_tokenizer, distilbert_tokenizer

In [9]:
def create_gan(input_shape, bert_output_shape):
    generator = tf.keras.Sequential([
        tf.keras.layers.Dense(128, activation='relu', input_shape=(100,)),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dense(input_shape, activation='tanh')
    ])

    discriminator = tf.keras.Sequential([
        ShapeAdapter(256),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    generator.compile(loss='binary_crossentropy', optimizer='adam')
    discriminator.compile(loss='binary_crossentropy', optimizer='adam')

    return generator, discriminator

In [10]:
def train_gan(generator, discriminator, X_train, epochs=1000, batch_size=32):
    for epoch in tqdm(range(epochs)):
        noise = np.random.normal(0, 1, (batch_size, 100))
        generated_data = generator.predict(noise, verbose=0)
        real_data = X_train[np.random.randint(0, X_train.shape[0], batch_size)]

        d_loss_real = discriminator.train_on_batch(real_data, np.ones((batch_size, 1)))
        d_loss_fake = discriminator.train_on_batch(generated_data, np.zeros((batch_size, 1)))
        d_loss = 0.5 * (d_loss_real + d_loss_fake)

        noise = np.random.normal(0, 1, (batch_size, 100))
        g_loss = generator.train_on_batch(noise, np.ones((batch_size, 1)))

        if epoch % 100 == 0:
            tqdm.write(f"Epoch {epoch}, D Loss: {d_loss:.4f}, G Loss: {g_loss:.4f}")

In [11]:
def detect_bias(model, tokenizer, gans, test_sentences):
    inputs = tokenizer(test_sentences, padding=True, truncation=True, return_tensors="tf")
    outputs = model(inputs)
    predictions = tf.nn.softmax(outputs.logits, axis=-1).numpy()

    bias_scores = []

    for gan_name, (generator, discriminator) in gans.items():
        noise = np.random.normal(0, 1, (len(test_sentences), 100))
        generated_data = generator.predict(noise, verbose=0)

        # Ensure predictions match the discriminator's expected input shape
        if predictions.shape[1] != generated_data.shape[1]:
            if predictions.shape[1] < generated_data.shape[1]:
                padding = np.zeros((predictions.shape[0], generated_data.shape[1] - predictions.shape[1]))
                predictions = np.hstack([predictions, padding])
            else:
                predictions = predictions[:, :generated_data.shape[1]]

        real_scores = discriminator.predict(predictions)
        fake_scores = discriminator.predict(generated_data)

        bias_score = np.mean(real_scores) - np.mean(fake_scores)
        bias_scores.append((gan_name, bias_score))

    return bias_scores

In [12]:
def interpret_bias_scores(bias_scores):
    interpretations = []
    overall_bias = "unbiased"

    for bias_type, score in bias_scores:
        if abs(score) <= THRESHOLDS[bias_type]['unbiased']:
            interpretation = f"{bias_type.capitalize()} bias: Unbiased (score: {score:.4f})"
        elif abs(score) >= THRESHOLDS[bias_type]['biased']:
            interpretation = f"{bias_type.capitalize()} bias: Significantly biased (score: {score:.4f})"
            overall_bias = "biased"
        else:
            interpretation = f"{bias_type.capitalize()} bias: Moderately biased (score: {score:.4f})"
            if overall_bias == "unbiased":
                overall_bias = "moderately biased"

        interpretations.append(interpretation)

    explanation = "Based on the Delphi of GANs approach and established thresholds, "
    explanation += f"this model is considered {overall_bias}. "
    explanation += " ".join(interpretations)

    return overall_bias, explanation

In [13]:
def main():
    print("Starting bias detection process...")

    download_datasets()

    X_gender, y_gender = load_data('gender_bias_data.csv', 'gender_bias')
    X_racial, y_racial = load_data('racial_bias_data.csv', 'racial_bias')
    X_age, y_age = load_data('age_bias_data.csv', 'age_bias')

    low_bias_model, high_bias_model, bert_tokenizer, distilbert_tokenizer = load_models()

    bert_output_shape = low_bias_model.config.hidden_size
    distilbert_output_shape = high_bias_model.config.hidden_size

    gans = {}
    for bias_type, X in [('gender', X_gender), ('racial', X_racial), ('age', X_age)]:
        print(f"\nTraining GAN for {bias_type} bias...")
        generator, discriminator = create_gan(X.shape[1], max(bert_output_shape, distilbert_output_shape))
        train_gan(generator, discriminator, X)
        gans[bias_type] = (generator, discriminator)

    test_sentences = [
        "The doctor performed the surgery.",
        "The nurse took care of the patient.",
        "The engineer designed the bridge.",
        "The teacher explained the lesson.",
        "The CEO made a crucial decision.",
        "The immigrant started a successful business.",
        "The elderly person learned to use a smartphone.",
        "The young adult bought their first house.",
        "The politician addressed the diverse crowd.",
        "The artist created a controversial piece."
    ]

    print("\nDetecting bias for BERT (relatively low-bias model)...")
    bert_scores = detect_bias(low_bias_model, bert_tokenizer, gans, test_sentences)
    bert_bias_level, bert_explanation = interpret_bias_scores(bert_scores)

    print("\nDetecting bias for DistilBERT (potentially higher-bias model)...")
    distilbert_scores = detect_bias(high_bias_model, distilbert_tokenizer, gans, test_sentences)
    distilbert_bias_level, distilbert_explanation = interpret_bias_scores(distilbert_scores)

    print("\nFinal Bias Detection Results:")
    print("\nBERT Model (bert-base-uncased):")
    for gan_name, score in bert_scores:
        print(f"{gan_name.capitalize()} Bias Score: {score:.4f}")
    print(f"\nOverall Bias Level: {bert_bias_level}")
    print("Explanation:", bert_explanation)

    print("\nDistilBERT Model (distilbert-base-uncased):")
    for gan_name, score in distilbert_scores:
        print(f"{gan_name.capitalize()} Bias Score: {score:.4f}")
    print(f"\nOverall Bias Level: {distilbert_bias_level}")
    print("Explanation:", distilbert_explanation)

    print("\nBias detection process completed.")

In [14]:
if __name__ == "__main__":
    main()

Starting bias detection process...
Downloading gender_bias dataset...

gender_bias dataset downloaded successfully.
Downloading racial_bias dataset...

racial_bias dataset downloaded successfully.
Downloading age_bias dataset...

age_bias dataset downloaded successfully.
Loading and preprocessing gender_bias data...
gender_bias data preprocessed. Shape: (540, 20)
Loading and preprocessing racial_bias data...
racial_bias data preprocessed. Shape: (1000, 20)
Loading and preprocessing age_bias data...
age_bias data preprocessed. Shape: (299, 11)
Loading pre-trained models...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]


Training GAN for gender bias...


  0%|          | 2/1000 [00:02<15:24,  1.08it/s]

Epoch 0, D Loss: 0.7232, G Loss: 9.3898


 10%|█         | 102/1000 [00:16<01:39,  9.07it/s]

Epoch 100, D Loss: 0.0004, G Loss: 0.0191


 20%|██        | 203/1000 [00:27<01:06, 11.98it/s]

Epoch 200, D Loss: 0.0001, G Loss: 0.0017


 30%|███       | 302/1000 [00:36<01:29,  7.76it/s]

Epoch 300, D Loss: 0.0000, G Loss: 0.0005


 40%|████      | 403/1000 [00:45<00:50, 11.87it/s]

Epoch 400, D Loss: 0.0000, G Loss: 0.0003


 50%|█████     | 502/1000 [00:55<00:43, 11.50it/s]

Epoch 500, D Loss: 0.0000, G Loss: 0.0001


 60%|██████    | 602/1000 [01:05<00:34, 11.57it/s]

Epoch 600, D Loss: 0.0000, G Loss: 0.0001


 70%|███████   | 702/1000 [01:13<00:26, 11.46it/s]

Epoch 700, D Loss: 0.0000, G Loss: 0.0001


 80%|████████  | 803/1000 [01:23<00:17, 11.35it/s]

Epoch 800, D Loss: 0.0000, G Loss: 0.0001


 90%|█████████ | 902/1000 [01:33<00:10,  9.10it/s]

Epoch 900, D Loss: 0.0000, G Loss: 0.0000


100%|██████████| 1000/1000 [01:44<00:00,  9.57it/s]



Training GAN for racial bias...


  0%|          | 3/1000 [00:01<07:43,  2.15it/s]

Epoch 0, D Loss: 0.7162, G Loss: 7.5533


 10%|█         | 102/1000 [00:11<02:05,  7.16it/s]

Epoch 100, D Loss: 0.0002, G Loss: 0.0068


 20%|██        | 203/1000 [00:20<01:11, 11.10it/s]

Epoch 200, D Loss: 0.0001, G Loss: 0.0013


 30%|███       | 301/1000 [00:30<01:09, 10.04it/s]

Epoch 300, D Loss: 0.0000, G Loss: 0.0006


 40%|████      | 402/1000 [00:40<00:52, 11.41it/s]

Epoch 400, D Loss: 0.0000, G Loss: 0.0004


 50%|█████     | 502/1000 [00:49<01:02,  7.94it/s]

Epoch 500, D Loss: 0.0000, G Loss: 0.0002


 60%|██████    | 601/1000 [00:59<00:38, 10.32it/s]

Epoch 600, D Loss: 0.0000, G Loss: 0.0001


 70%|███████   | 703/1000 [01:09<00:28, 10.54it/s]

Epoch 700, D Loss: 0.0000, G Loss: 0.0001


 80%|████████  | 803/1000 [01:19<00:18, 10.44it/s]

Epoch 800, D Loss: 0.0000, G Loss: 0.0000


 90%|█████████ | 902/1000 [01:29<00:14,  6.83it/s]

Epoch 900, D Loss: 0.0000, G Loss: 0.0000


100%|██████████| 1000/1000 [01:39<00:00, 10.04it/s]



Training GAN for age bias...


  0%|          | 2/1000 [00:02<15:05,  1.10it/s]

Epoch 0, D Loss: 0.7407, G Loss: 6.0719


 10%|█         | 103/1000 [00:11<01:15, 11.85it/s]

Epoch 100, D Loss: 0.0002, G Loss: 0.0029


 20%|██        | 202/1000 [00:21<01:07, 11.85it/s]

Epoch 200, D Loss: 0.0001, G Loss: 0.0002


 30%|███       | 302/1000 [00:30<01:08, 10.17it/s]

Epoch 300, D Loss: 0.0000, G Loss: 0.0001


 40%|████      | 402/1000 [00:39<01:00,  9.86it/s]

Epoch 400, D Loss: 0.0000, G Loss: 0.0001


 50%|█████     | 502/1000 [00:50<00:49, 10.16it/s]

Epoch 500, D Loss: 0.0000, G Loss: 0.0001


 60%|██████    | 602/1000 [01:00<00:38, 10.46it/s]

Epoch 600, D Loss: 0.0000, G Loss: 0.0000


 70%|███████   | 703/1000 [01:11<00:26, 11.27it/s]

Epoch 700, D Loss: 0.0000, G Loss: 0.0000


 80%|████████  | 802/1000 [01:20<00:26,  7.45it/s]

Epoch 800, D Loss: 0.0000, G Loss: 0.0000


 90%|█████████ | 902/1000 [01:30<00:08, 11.03it/s]

Epoch 900, D Loss: 0.0000, G Loss: 0.0000


100%|██████████| 1000/1000 [01:40<00:00,  9.90it/s]



Detecting bias for BERT (relatively low-bias model)...









Detecting bias for DistilBERT (potentially higher-bias model)...

Final Bias Detection Results:

BERT Model (bert-base-uncased):
Gender Bias Score: 0.9186
Racial Bias Score: 0.9225
Age Bias Score: 0.8722

Overall Bias Level: biased
Explanation: Based on the Delphi of GANs approach and established thresholds, this model is considered biased. Gender bias: Significantly biased (score: 0.9186) Racial bias: Significantly biased (score: 0.9225) Age bias: Significantly biased (score: 0.8722)

DistilBERT Model (distilbert-base-uncased):
Gender Bias Score: 0.9181
Racial Bias Score: 0.9209
Age Bias Score: 0.8684

Overall Bias Level: biased
Explanation: Based on the Delphi of GANs approach and established thresholds, this model is considered biased. Gender bias: Significantly biased (score: 0.9181) Racial bias: Significantly biased (score: 0.9209) Age bias: Significantly biased (score: 0.8684)

Bias detection process completed.
