In [None]:
# This is an updated version of the report attched in the repo
# It investigates three bias dimensionsns race, religion and gender (male and female)
# with only one metric: False Positive Rate

In [1]:
# import necessary libraries

import pandas as pd
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
from torch import nn
from torch.optim import Adam
from tqdm import tqdm

In [20]:
religion_data = pd.read_csv("data/path/")

In [21]:
religion_data[religion_data['target'] == 1]['comment_text'].count()


6944

In [22]:
religion_data[religion_data['target'] == 0]['comment_text'].count()

47389

In [23]:

# Separate the dataset into two groups
group_0 = religion_data[religion_data['target'] == 0]
group_1 = religion_data[religion_data['target'] == 1]

# Count the number of instances in each group
count_group_0 = group_0.shape[0]
count_group_1 = group_1.shape[0]

# Resample the larger group
if count_group_0 > count_group_1:
    group_0 = group_0.sample(count_group_1)  # Downsample group 0
else:
    group_1 = group_1.sample(count_group_0)  # Downsample group 1

# Concatenate the resampled groups
balanced_religion_data = pd.concat([group_0, group_1])

# Shuffle the balanced dataset
balanced_religion_data = balanced_religion_data.sample(frac=1).reset_index(drop=True)


In [24]:
balanced_religion_data['target'].value_counts()

0    6944
1    6944
Name: target, dtype: int64

In [25]:
balanced_religion_data['religion'].value_counts()

0    7888
1    6000
Name: religion, dtype: int64

In [2]:
train_data = pd.read_csv("path/to/tran/data")
val_data = pd.read_csv("path/to/val/data")

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [5]:
class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):

        self.labels = df['hate_speech'].values
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['comment']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [6]:
df_train = train_data
df_val = val_data


print(len(df_train),len(df_val))

30246 1000


In [7]:
class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear1 = nn.Linear(768, 768)
        self.linear2 = nn.Linear(768, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear1(dropout_output)
        linear_output = self.linear2(linear_output)
        final_layer = self.sigmoid(linear_output)

        return final_layer

In [30]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
#     print('Labels: ', y)
    #round predictions to the closest integer
    rounded_preds = torch.round(preds)
#     print('Rounded preds',rounded_preds)
    correct = (rounded_preds == y).float() #convert into float for division
#     print('Correct: ',correct)
    acc = correct.sum() / len(correct)
#     print('Length: ',len(correct))
#     print('Sum of correct: ', correct.sum())
#     print('Accuracy: ', acc.item())
    return acc

In [9]:
def train(model, train_data, val_data, learning_rate, epochs):

    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=24, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=24)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.BCEWithLogitsLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)
    val_loss_extreme = 100000

    if use_cuda:
            model= nn.DataParallel(model)
            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0
            correct_train_sum = 0

            for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.unsqueeze(1).to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                
                
#                 print('Output Shape: ', output.shape)
#                 print('train label shape: ', train_label.shape)
                
#                 print(type(output))
#                 print(type(train_label))
                train_label = train_label.to(torch.float32)
                
                batch_loss = criterion(output, train_label)
                total_loss_train += batch_loss.item()
                
                ## acc calc
                
                
                rounded_preds = torch.round(output)
                correct_train = (rounded_preds == train_label).float()
                correct_train_sum += correct_train.sum()
#                 print(correct_train_sum)
                
                
                ##
                
#                 acc = (output.argmax(dim=1) == train_label).sum().item()
#                 acc = binary_accuracy(output, train_label)
#                 total_acc_train += acc.item()

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
#             print((correct_train_sum/len(train_data)))
            train_acc = (correct_train_sum/len(train_data)).item()
            
            total_acc_val = 0
            total_loss_val = 0
            correct_val_sum = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.unsqueeze(1).to(device)
                    val_label = val_label.to(torch.float32)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label)
                    total_loss_val += batch_loss.item()
                    
                    rounded_preds = torch.round(output)
                    correct_val = (rounded_preds == val_label).float()
                    correct_val_sum += correct_val.sum()
                    
#                     acc = binary_accuracy(output, val_label)
#                     total_acc_val += acc.item()
            
            
            val_acc = (correct_val_sum/len(val_data)).item()
            if total_loss_val < val_loss_extreme:
                val_loss_extreme = total_loss_val
                torch.save(model.state_dict(), '/nfs/hpc/share/sumons/bias-bert-model_mlg_combined.pt')
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} | Train Accuracy: {train_acc: .3f} | Val Loss: {total_loss_val / len(val_data): .3f} | Val Accuracy: {val_acc: .3f}')
                  

In [64]:
EPOCHS = 15
model = BertClassifier()
LR = 1e-6
              
train(model, df_train, df_val, LR, EPOCHS)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1261/1261 [11:09<00:00,  1.88it/s]


Epochs: 1 | Train Loss:  0.027 | Train Accuracy:  0.659 | Val Loss:  0.025 | Val Accuracy:  0.796


100%|██████████| 1261/1261 [11:19<00:00,  1.86it/s]


Epochs: 2 | Train Loss:  0.025 | Train Accuracy:  0.802 | Val Loss:  0.025 | Val Accuracy:  0.803


100%|██████████| 1261/1261 [11:18<00:00,  1.86it/s]


Epochs: 3 | Train Loss:  0.024 | Train Accuracy:  0.826 | Val Loss:  0.025 | Val Accuracy:  0.814


100%|██████████| 1261/1261 [11:19<00:00,  1.86it/s]


Epochs: 4 | Train Loss:  0.024 | Train Accuracy:  0.847 | Val Loss:  0.025 | Val Accuracy:  0.820


100%|██████████| 1261/1261 [11:17<00:00,  1.86it/s]


Epochs: 5 | Train Loss:  0.024 | Train Accuracy:  0.860 | Val Loss:  0.025 | Val Accuracy:  0.816


100%|██████████| 1261/1261 [11:18<00:00,  1.86it/s]


Epochs: 6 | Train Loss:  0.024 | Train Accuracy:  0.869 | Val Loss:  0.025 | Val Accuracy:  0.814


100%|██████████| 1261/1261 [11:19<00:00,  1.86it/s]


Epochs: 7 | Train Loss:  0.023 | Train Accuracy:  0.874 | Val Loss:  0.025 | Val Accuracy:  0.815


100%|██████████| 1261/1261 [11:18<00:00,  1.86it/s]


Epochs: 8 | Train Loss:  0.023 | Train Accuracy:  0.880 | Val Loss:  0.025 | Val Accuracy:  0.815


100%|██████████| 1261/1261 [11:20<00:00,  1.85it/s]


Epochs: 9 | Train Loss:  0.023 | Train Accuracy:  0.884 | Val Loss:  0.025 | Val Accuracy:  0.810


100%|██████████| 1261/1261 [11:17<00:00,  1.86it/s]


Epochs: 10 | Train Loss:  0.023 | Train Accuracy:  0.888 | Val Loss:  0.025 | Val Accuracy:  0.827


100%|██████████| 1261/1261 [11:20<00:00,  1.85it/s]


Epochs: 11 | Train Loss:  0.023 | Train Accuracy:  0.890 | Val Loss:  0.025 | Val Accuracy:  0.813


100%|██████████| 1261/1261 [11:20<00:00,  1.85it/s]


Epochs: 12 | Train Loss:  0.023 | Train Accuracy:  0.893 | Val Loss:  0.025 | Val Accuracy:  0.815


100%|██████████| 1261/1261 [11:18<00:00,  1.86it/s]


Epochs: 13 | Train Loss:  0.023 | Train Accuracy:  0.895 | Val Loss:  0.025 | Val Accuracy:  0.818


100%|██████████| 1261/1261 [11:20<00:00,  1.85it/s]


Epochs: 14 | Train Loss:  0.023 | Train Accuracy:  0.896 | Val Loss:  0.025 | Val Accuracy:  0.824


100%|██████████| 1261/1261 [11:21<00:00,  1.85it/s]


Epochs: 15 | Train Loss:  0.023 | Train Accuracy:  0.896 | Val Loss:  0.025 | Val Accuracy:  0.814


In [13]:
def evaluate(model, test_data):

    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=12)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    

    total_acc_test = 0
    correct_test_sum = 0
    predictions = []
    
    with torch.no_grad():

        for test_input, test_label in test_dataloader:
            test_label = test_label.unsqueeze(1).to(device)
            test_label = test_label.to(torch.float32)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)
            output = model(input_id, mask)
            
            rounded_preds = torch.round(output)
            for pred in rounded_preds:
                predictions.append(int(pred.item()))
                
            correct_test = (rounded_preds == test_label).float()
            correct_test_sum += correct_test.sum()
    

    test_acc = (correct_test_sum/len(test_data)).item()
    print(f'Test Accuracy: {test_acc: .3f}')
    
    return predictions

In [27]:
# loading the hold-out test data for each of the bias dimension

test_gender = pd.read_csv("/test/gender/data")
test_religion = pd.read_csv("/test/religion/data")
test_race = pd.read_csv("/test/race/data")

In [28]:
model = BertClassifier()
model= nn.DataParallel(model)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
model = model.to(device)
model.load_state_dict(torch.load('/bias-bert-model_mlg_combined.pt'))

predictions_gender = evaluate(model, test_gender)
predictions_religion = evaluate(model, test_religion)
predictions_race = evaluate(model, test_race)


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Test Accuracy:  0.804
Test Accuracy:  0.774
Test Accuracy:  0.727


In [37]:
## Test for bias by seeing the false negatives

### Case: Religion

In [29]:
test_gender['predictions'] = predictions_gender
test_religion['predictions'] = predictions_religion
test_race['predictions'] = predictions_race

In [18]:
df_test_muslim = test_religion[test_religion['muslim'] == 1]

In [19]:
df_test_muslim

Unnamed: 0,hate_speech,comment,muslim,christian,religion,predictions
0,0,"Just more misinformed right wingers, got their...",1,0,1,1
4,0,When the arguments were being made at the lowe...,1,0,1,0
7,0,"Wake the hell up, ISIS is at war with us. The...",1,0,1,1
9,0,2 major terrorist attacks today -- the fear do...,1,0,1,0
10,0,Get some muslims in there for their starting r...,1,0,1,1
...,...,...,...,...,...,...
494,1,Another Muslim attacking Americans. When will ...,1,0,1,1
495,1,"Dear authors of this liberal-biased drivel, th...",1,0,1,1
496,1,"Save your time and money, Erdogan. The whole w...",1,0,1,1
497,1,no actually we gave women suffrage to be thems...,1,0,1,1


In [20]:
df_test_christian = test_religion[test_religion['christian'] == 1]
df_test_christian

Unnamed: 0,hate_speech,comment,muslim,christian,religion,predictions
1,0,"Ah, so you are a Cafeteria Catholic.",0,1,0,0
2,0,contact Catholic Ann Barnhardt,0,1,0,0
3,0,The reason for focus on chapter eight is that ...,0,1,0,0
5,0,"""...to find good married couples, and draw pri...",0,1,0,0
6,0,"No. ""Conservatives,"" as you call orthodox Cath...",0,1,0,0
...,...,...,...,...,...,...
485,1,I don’t understand how Pope F. could talk abou...,0,1,0,1
486,1,The one thing DEMS had in common is they didn'...,0,1,0,1
487,1,Timothy McVeigh was a Roman Catholic and he ki...,0,1,0,0
490,1,"""Eight years ago, when Barack Obama was electe...",0,1,0,1


In [47]:
# Calculate false positives for male
false_positives_muslim = ((df_test_muslim['hate_speech'] == 0) & (df_test_muslim['predictions'] == 1)).sum()

# Calculate total actual negatives
total_actual_negatives_muslim = (df_test_muslim['hate_speech'] == 0).sum()

# Calculate false positive rate
false_positive_rate_muslim = false_positives_muslim / total_actual_negatives_muslim

# Print the false positive rate
print(f"False Positive Rate: {false_positive_rate_muslim}")


False Positive Rate: 0.3181818181818182


In [48]:
# Calculate false positives for female
false_positives_christian = ((df_test_christian['hate_speech'] == 0) & (df_test_christian['predictions'] == 1)).sum()

# Calculate total actual negatives
total_actual_negatives_christian = (df_test_christian['hate_speech'] == 0).sum()

# Calculate false positive rate
false_positive_rate_christian = false_positives_christian / total_actual_negatives_christian

# Print the false positive rate
print(f"False Positive Rate: {false_positive_rate_christian}")

False Positive Rate: 0.08695652173913043


### Case: Gender

In [24]:
df_test_female = test_gender[test_gender['female'] == 1]
df_test_male = test_gender[test_gender['male'] == 1]

In [40]:
df_test_male['hate_speech'].count()

215

In [42]:
df_test_female = df_test_female[:216]

In [46]:
df_test_female['hate_speech'].count()

216

In [44]:
# Calculate false positives for male
false_positives_female = ((df_test_female['hate_speech'] == 0) & (df_test_female['predictions'] == 1)).sum()

# Calculate total actual negatives
total_actual_negatives_female = (df_test_female['hate_speech'] == 0).sum()

# Calculate false positive rate
false_positive_rate_female = false_positives_female / total_actual_negatives_female

# Print the false positive rate
print(f"False Positive Rate: {false_positive_rate_female}")

False Positive Rate: 0.14965986394557823


In [45]:
# Calculate false positives for male
false_positives_male = ((df_test_male['hate_speech'] == 0) & (df_test_male['predictions'] == 1)).sum()

# Calculate total actual negatives
total_actual_negatives_male = (df_test_male['hate_speech'] == 0).sum()

# Calculate false positive rate
false_positive_rate_male = false_positives_male / total_actual_negatives_male

# Print the false positive rate
print(f"False Positive Rate: {false_positive_rate_male}")

False Positive Rate: 0.17475728155339806


### Case: Race

In [30]:
df_test_white = test_race[test_race['white'] == 1]
df_test_black = test_race[test_race['black'] == 1]

In [60]:
df_test_white = df_test_white[:250]

In [61]:
df_test_black = df_test_black[:250]

In [62]:
# Calculate false positives for male
false_positives_white = ((df_test_white['hate_speech'] == 0) & (df_test_white['predictions'] == 1)).sum()

# Calculate total actual negatives
total_actual_negatives_white = (df_test_white['hate_speech'] == 0).sum()

# Calculate false positive rate
false_positive_rate_white = false_positives_white / total_actual_negatives_white

# Print the false positive rate
print(f"False Positive Rate: {false_positive_rate_white}")

False Positive Rate: 0.2935323383084577


In [63]:
# Calculate false positives for male
false_positives_black = ((df_test_black['hate_speech'] == 0) & (df_test_black['predictions'] == 1)).sum()

# Calculate total actual negatives
total_actual_negatives_black = (df_test_black['hate_speech'] == 0).sum()

# Calculate false positive rate
false_positive_rate_black = false_positives_black / total_actual_negatives_black

# Print the false positive rate
print(f"False Positive Rate: {false_positive_rate_black}")

False Positive Rate: 0.15428571428571428
