In [1]:
import torch
import torch.nn as nn

import transformers

from transformers import pipeline, AutoModelForMaskedLM, AutoTokenizer, DataCollatorForLanguageModeling, default_data_collator, TrainingArguments, Trainer

from datasets import load_dataset
import evaluate

import random

import pandas as pd
import numpy as np
import collections


from metrics.crows_pairs import *
from metrics.stereoset.eval_discriminative_models import *
from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Today's date formatted as YYYY-MM-DD (the same stamp is appended to the save directories below).
f"{datetime.now():%Y-%m-%d}"

In [None]:
# Enable IPython's autoreload extension in mode 2, so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# --- Run configuration ---
# Hugging Face checkpoint that is evaluated and fine-tuned in the cells below.
model_checkpoint = "bert-base-cased"

# Root of the data tree; the paths in this cell are derived from it.
data_root = "/home/bhatt/ishan/TUM_Thesis/data"
# Take the date stamp once so both save directories always carry the same suffix
# (two separate datetime.now() calls can disagree around midnight).
run_date = datetime.now().strftime("%Y-%m-%d")

# CrowS-Pairs input CSV consumed by get_results.
cp_input_file = f"{data_root}/metrics_ds/crows-pairs/data/crows_pairs_anonymized.csv"
# Date-stamped directories for the base model and the IMDb fine-tuned model.
model_save_dir = f"{data_root}/models/{model_checkpoint}_{run_date}"
ft_model_save_dir = f"{data_root}/models/{model_checkpoint}-finetuned-imdb_{run_date}"

### Load pre-trained model for training

In [None]:
# Tokenizer and masked-LM model are both loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

### Calculate metric on the CrowS-Pairs dataset

In [None]:
# Score the freshly loaded (not yet fine-tuned) model on CrowS-Pairs.
# NOTE(review): get_results presumably writes its per-example scores to
# output_file — confirm against metrics/crows_pairs.
output_file = '/home/bhatt/ishan/TUM_Thesis/data/results/cp_results.csv'
get_results(cp_input_file,output_file,model,tokenizer)

### Calculate metric for StereoSet

In [None]:
# Run the StereoSet evaluation (dev split) with the model loaded above as the
# intrasentence model.
getStereoSet(
    pretrained_class=model_checkpoint,
    tokenizer=tokenizer,
    intrasentence_model=model,
    input_file='/home/bhatt/ishan/TUM_Thesis/data/metrics_ds/stereoset/dev.json',
    output_dir='/home/bhatt/ishan/TUM_Thesis/data/results',
    output_file='stereoset_results.txt',
)

### Fine Tune Model

In [None]:
# Model size, expressed in millions of parameters.
model.num_parameters() / 1e6

In [None]:
# Load the "imdb" dataset via the `datasets` library; it is the fine-tuning corpus used below.
imdb_dataset = load_dataset("imdb")
# Bare expression so the notebook displays the loaded object.
imdb_dataset

In [None]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` column of a batch.

    Args:
        examples: Batch mapping that contains a ``"text"`` entry.

    Returns:
        The tokenizer output. When the tokenizer is a fast tokenizer, a
        ``"word_ids"`` entry is added holding ``word_ids(i)`` for every row
        of the batch.
    """
    encoded = tokenizer(examples["text"])
    if not tokenizer.is_fast:
        return encoded
    row_count = len(encoded["input_ids"])
    encoded["word_ids"] = [encoded.word_ids(row) for row in range(row_count)]
    return encoded


# Apply tokenize_function to the dataset in batches and drop the raw "text" and
# "label" columns from the result.
# Use batched=True to activate fast multithreading!
tokenized_datasets = imdb_dataset.map(
    tokenize_function, batched=True, remove_columns=["text", "label"]
)
tokenized_datasets

In [None]:
chunk_size = 128
def group_texts(examples):
    """Concatenate a batch column-wise and re-split it into fixed-size chunks.

    Every column of ``examples`` (a mapping from column name to a list of
    per-row lists) is flattened into one long list and cut into consecutive
    pieces of ``chunk_size`` items. A trailing remainder shorter than
    ``chunk_size`` is dropped.

    Args:
        examples: Batch mapping; must contain an ``"input_ids"`` column, and
            all columns are expected to have the same total length.

    Returns:
        dict with the same columns, each a list of ``chunk_size``-long lists,
        plus a ``"labels"`` column holding an independent copy of the
        ``"input_ids"`` chunks.
    """
    from itertools import chain

    # Flatten each column in linear time (sum(list_of_lists, []) is quadratic).
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # Length of the concatenated text, taken from the first column.
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of chunk_size
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Copy each chunk so that masking input_ids in place later can never
    # silently overwrite the labels (a plain .copy() shares the inner lists).
    result["labels"] = [chunk[:] for chunk in result["input_ids"]]
    return result

In [None]:
# Re-chunk the tokenized rows into fixed-size blocks. batched=True matters here:
# group_texts concatenates across the rows of the batch it receives.
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

In [None]:
# Language-modeling collator built with mlm_probability=0.15; this is the collator
# passed to the Trainer further down.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    """Collate features after masking whole words instead of single tokens.

    For every feature, the tokens are grouped by the word they belong to
    (using the feature's ``"word_ids"``). Each word is then selected
    independently with probability ``wwm_probability``; all tokens of a
    selected word are replaced by the tokenizer's mask token id and keep their
    original id as label, while every other label position is set to -100.

    The input ``features`` are not modified: each feature dict and its
    ``input_ids`` list are copied first, so the collator can be called
    repeatedly on the same samples.

    Args:
        features: Sequence of dicts with ``"word_ids"``, ``"input_ids"`` and
            ``"labels"`` entries (assumed to be plain Python lists, as produced
            by indexing the chunked dataset — confirm for other callers).

    Returns:
        Whatever ``default_data_collator`` returns for the masked features.
    """
    masked_features = []
    for feature in features:
        # Shallow copy so popping "word_ids" does not alter the caller's dict.
        feature = dict(feature)
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is None:
                # Token without a word id: forget the current word so the next
                # word always opens a new group, even if its id repeats.
                current_word = None
                continue
            if word_id != current_word:
                current_word = word_id
                current_word_index += 1
            mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        # Copy before writing mask ids so the source sample stays intact.
        input_ids = list(feature["input_ids"])
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_index in np.where(mask)[0]:
            for idx in mapping[word_index.item()]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["input_ids"] = input_ids
        feature["labels"] = new_labels
        masked_features.append(feature)

    return default_data_collator(masked_features)

In [None]:
# Sanity check: run the whole-word-masking collator on the first two training
# chunks and print the decoded, masked inputs.
samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

In [None]:
# Down-sample the training split: 10,000 chunks for training and 10% of that
# number for evaluation. The fixed seed is passed to train_test_split.
train_size = 10_000
test_size = int(0.1 * train_size)

downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset

In [None]:
# Per-device batch size, used for both training and evaluation.
batch_size = 64
# Show the training loss with every epoch
# (number of full batches in the training split; with several devices a step
# covers more than one batch — TODO confirm the intended setup).
logging_steps = len(downsampled_dataset["train"]) // batch_size
# NOTE(review): model_name is not referenced by any other cell visible in this notebook.
model_name = model_checkpoint.split("/")[-1]

# Checkpoints go to ft_model_save_dir; an existing directory is overwritten.
# NOTE(review): fp16=True presumably requires a CUDA device — confirm for CPU-only runs.
training_args = TrainingArguments(
    output_dir=ft_model_save_dir,
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    num_train_epochs = 10,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=False,
    fp16=True,
    logging_steps=logging_steps,
)

In [None]:
# Build the Trainer on the down-sampled splits. It is given `data_collator`
# (the DataCollatorForLanguageModeling instance), not
# whole_word_masking_data_collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
# Ask PyTorch to release its cached CUDA memory before training starts.
# del trainer
torch.cuda.empty_cache()

In [None]:
# Run fine-tuning. The Trainer was built with model=model, so cells below that
# pass `model` to the metrics evaluate the fine-tuned weights.
trainer.train()

In [None]:
# CrowS-Pairs evaluation of `model` after trainer.train(); a separate output path
# is used so the earlier cp_results.csv is not replaced.
output_file = '/home/bhatt/ishan/TUM_Thesis/data/results/cp_results_fine_tuned.csv'
input_file = '/home/bhatt/ishan/TUM_Thesis/data/metrics_ds/crows-pairs/data/crows_pairs_anonymized.csv'
get_results(input_file, output_file, model, tokenizer)

In [5]:
from models import FineTuner

In [2]:
# --- Configuration for the FineTuner-based runs ---
# Checkpoint name handed to FineTuner and to the StereoSet evaluation below.
model_checkpoint = "bert-base-uncased"

# Root of the data tree; the paths in this cell are derived from it.
data_root = "/home/bhatt/ishan/TUM_Thesis/data"
# Take the date stamp once so both save directories always carry the same suffix
# (two separate datetime.now() calls can disagree around midnight).
run_date = datetime.now().strftime("%Y-%m-%d")

# CrowS-Pairs input CSV consumed by get_results.
cp_input_file = f"{data_root}/metrics_ds/crows-pairs/data/crows_pairs_anonymized.csv"
# Date-stamped save directories for the base and the fine-tuned model.
model_save_dir = f"{data_root}/models/{model_checkpoint}_{run_date}"
ft_model_save_dir = f"{data_root}/models/{model_checkpoint}-finetuned_{run_date}"

In [3]:
# Name of the fine-tuning corpus; it becomes part of the save-directory name.
dataset_name = 'imdb'
# Rebuild the save directory with the dataset name included. (The stray leading
# `ft_model_save_dir` expression was dropped: it displayed nothing and only made
# the cell depend on an earlier definition.)
ft_model_save_dir = f"/home/bhatt/ishan/TUM_Thesis/data/models/{model_checkpoint}-finetuned-{dataset_name}_"+(datetime.now()).strftime("%Y-%m-%d")
ft_model_save_dir

'/home/bhatt/ishan/TUM_Thesis/data/models/bert-base-uncased-finetuned-imdb_2023-12-07'

In [15]:
# Wrap a locally stored fine-tuned checkpoint in a FineTuner.
# NOTE(review): model_checkpoint is "bert-base-uncased" in the config cell above,
# while local_model_path points at a bert-base-cased checkpoint — confirm that
# this mismatch is intended.
ft = FineTuner(
    model_name=model_checkpoint,
    from_local=True,
    model_save_dir=model_save_dir,
    local_model_path='/home/bhatt/ishan/TUM_Thesis/data/models/bert-base-cased-finetuned-imdb_2023-12-07/checkpoint-1500',
)

Initializing Model
Model Initialized!


In [7]:
# ft.finetune_model(dataset_name = 'imdb')

In [9]:
# Display the model held by `ft` to check which architecture and vocabulary size were loaded.
ft.getModel()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [10]:
# Display the tokenizer held by `ft` (its name_or_path shows where it was loaded from).
ft.getTokenizer()

BertTokenizerFast(name_or_path='/home/bhatt/ishan/TUM_Thesis/data/models/bert-base-cased-finetuned-imdb_2023-12-07/checkpoint-1500', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [11]:
## Fine-tuned Model
# Use a dedicated output file: the "No Fine-tuning" cell further down also targets
# cp_results.csv, so sharing the name lets one run overwrite the other.
output_file = '/home/bhatt/ishan/TUM_Thesis/data/results/cp_results_fine_tuned.csv'
get_results(cp_input_file,output_file,ft.getModel(),ft.getTokenizer())

  _C._set_default_tensor_type(t)
  df_score = pd.concat([df_score,pd.DataFrame({'sent_more': [sent_more],
100%|██████████| 1508/1508 [03:51<00:00,  6.53it/s]


Total examples: 1508
Metric score: 54.58
Stereotype score: 55.97
Anti-stereotype score: 46.33
Num. neutral: 0 0.0



In [16]:
# StereoSet evaluation of the fine-tuned model held by `ft`.
# The output file gets its own name because the other StereoSet cells in this
# notebook all target 'stereoset_results.txt' in the same directory.
# NOTE(review): pretrained_class receives model_checkpoint ("bert-base-uncased")
# while `ft` wraps a bert-base-cased checkpoint — confirm getStereoSet does not
# rely on that name for casing or tokenizer choices.
getStereoSet(pretrained_class =  model_checkpoint, tokenizer = ft.getTokenizer(),
             intrasentence_model =  ft.getModel(),
             input_file = '/home/bhatt/ishan/TUM_Thesis/data/metrics_ds/stereoset/dev.json',
             output_dir = '/home/bhatt/ishan/TUM_Thesis/data/results',
             output_file = 'stereoset_results_fine_tuned.txt' )

{'pretrained_class': 'bert-base-uncased', 'tokenizer': BertTokenizerFast(name_or_path='/home/bhatt/ishan/TUM_Thesis/data/models/bert-base-cased-finetuned-imdb_2023-12-07/checkpoint-1500', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True), 'intrasentence_model': BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
 

100%|██████████| 8939/8939 [00:52<00:00, 168.87it/s]


intrasentence
	gender
		Count: 765.0
		LM Score: 85.4276129667434
		SS Score: 62.79373492851754
		ICAT Score: 63.568848249293346
	profession
		Count: 2430.0
		LM Score: 83.5448268821856
		SS Score: 59.77696247931412
		ICAT Score: 67.20853412682716
	race
		Count: 2886.0
		LM Score: 85.08667190668933
		SS Score: 57.01440153891281
		ICAT Score: 73.15003025942431
	religion
		Count: 237.0
		LM Score: 85.1264367816092
		SS Score: 52.643678160919535
		ICAT Score: 80.62549874488045
	overall
		Count: 2106.0
		LM Score: 84.54582829264109
		SS Score: 58.62905966516732
		ICAT Score: 69.95480835707727
overall
	Count: 2106.0
	LM Score: 84.54582829264109
	SS Score: 58.62905966516732
	ICAT Score: 69.95480835707727


In [13]:
## No Fine-tuning
# Load the untouched pre-trained checkpoint explicitly. `ft` is constructed from
# the fine-tuned local checkpoint, so calling ft.getModel() here would score the
# fine-tuned model a second time when the notebook is run top to bottom.
# Loading mirrors the first evaluation cell (from_pretrained on model_checkpoint).
baseline_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
baseline_model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
output_file = '/home/bhatt/ishan/TUM_Thesis/data/results/cp_results.csv'
get_results(cp_input_file,output_file,baseline_model,baseline_tokenizer)

  df_score = pd.concat([df_score,pd.DataFrame({'sent_more': [sent_more],
100%|██████████| 1508/1508 [03:47<00:00,  6.62it/s]


Total examples: 1508
Metric score: 60.48
Stereotype score: 61.09
Anti-stereotype score: 56.88
Num. neutral: 0 0.0



In [14]:
# StereoSet evaluation of the pre-trained (not fine-tuned) checkpoint.
# The baseline is loaded explicitly rather than taken from `ft`, which is built
# from the fine-tuned local checkpoint; otherwise a top-to-bottom run would
# evaluate the fine-tuned model again under the "No Fine-tuning" heading.
baseline_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
baseline_model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
getStereoSet(pretrained_class =  model_checkpoint, tokenizer = baseline_tokenizer,
             intrasentence_model =  baseline_model,
             input_file = '/home/bhatt/ishan/TUM_Thesis/data/metrics_ds/stereoset/dev.json',
             output_dir = '/home/bhatt/ishan/TUM_Thesis/data/results',
              output_file = 'stereoset_results.txt' )

{'pretrained_class': 'bert-base-uncased', 'tokenizer': BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True), 'intrasentence_model': BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
      

100%|██████████| 8048/8048 [00:46<00:00, 171.77it/s]


intrasentence
	gender
		Count: 765.0
		LM Score: 85.96879714488409
		SS Score: 63.93016259103215
		ICAT Score: 62.01761070521017
	profession
		Count: 2430.0
		LM Score: 82.65493314194387
		SS Score: 61.44272071035787
		ICAT Score: 63.73898683641255
	race
		Count: 2886.0
		LM Score: 85.66785675023388
		SS Score: 57.445486685420484
		ICAT Score: 72.91107901418637
	religion
		Count: 237.0
		LM Score: 88.45977011494251
		SS Score: 56.45977011494253
		ICAT Score: 77.03117452767867
	overall
		Count: 2106.0
		LM Score: 84.66782429190386
		SS Score: 59.746836433430666
		ICAT Score: 68.16295560095116
overall
	Count: 2106.0
	LM Score: 84.66782429190386
	SS Score: 59.746836433430666
	ICAT Score: 68.16295560095116
