In [20]:
import torch
import torch.nn as nn

import transformers

from transformers import pipeline, AutoModelForMaskedLM, AutoTokenizer, DataCollatorForLanguageModeling, default_data_collator, TrainingArguments, Trainer

from datasets import load_dataset
import evaluate

import random

import pandas as pd
import numpy as np
import collections


from metrics.crows_pairs import *
from metrics.stereoset.eval_discriminative_models import *
from datetime import datetime

ModuleNotFoundError: No module named 'dataloader'

In [2]:
# Preview the YYYY-MM-DD date stamp format that is appended to the model save directories.
(datetime.now()).strftime("%Y-%m-%d")

'2023-12-01'

In [3]:
# Enable IPython's autoreload extension (mode 2) so that edits to imported local modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [36]:
# Hugging Face checkpoint that is evaluated and fine-tuned in this notebook.
model_checkpoint = "bert-base-cased"

# Root of the data tree; change this single line when running on another machine.
DATA_DIR = "/home/bhatt/ishan/TUM_Thesis/data"
# Take the date stamp once so that both save directories always carry the same
# date, even if the cell happens to run across midnight.
RUN_DATE = datetime.now().strftime("%Y-%m-%d")

# CrowS-Pairs sentence pairs used as input for the bias metric.
cp_input_file = f"{DATA_DIR}/metrics_ds/crows-pairs/data/crows_pairs_anonymized.csv"
# Date-stamped directories for the base model and for the IMDb fine-tuned model.
model_save_dir = f"{DATA_DIR}/models/{model_checkpoint}_{RUN_DATE}"
ft_model_save_dir = f"{DATA_DIR}/models/{model_checkpoint}-finetuned-imdb_{RUN_DATE}"

### Load pre-trained model for training

In [37]:
# Load the masked-language-modelling head model and the tokenizer from the same checkpoint
# so that vocabulary and weights match.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'bert.pooler.dense.bias', 'cls.seq_relationship.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Calculate metric on the CrowS-Pairs dataset

In [38]:
# Score the freshly loaded (not yet fine-tuned) model on CrowS-Pairs.
# get_results is presumably provided by the wildcard import of metrics.crows_pairs and
# presumably writes its per-pair results to output_file — TODO confirm against that module.
output_file = '/home/bhatt/ishan/TUM_Thesis/data/results/cp_results.csv'
get_results(cp_input_file,output_file,model,tokenizer)

  0%|          | 0/1508 [00:00<?, ?it/s]

  df_score = pd.concat([df_score,pd.DataFrame({'sent_more': [sent_more],
100%|██████████| 1508/1508 [04:02<00:00,  6.22it/s]


Total examples: 1508
Metric score: 55.11
Stereotype score: 55.89
Anti-stereotype score: 50.46
Num. neutral: 0 0.0



### Calculate metric for StereoSet

In [29]:
from metrics.stereoset.eval_discriminative_models import *

In [49]:
# Run the StereoSet evaluation for the current model on the dev split and
# store the report under the results directory.
getStereoSet(
    pretrained_class=model_checkpoint,
    tokenizer=tokenizer,
    intrasentence_model=model,
    input_file='/home/bhatt/ishan/TUM_Thesis/data/metrics_ds/stereoset/dev.json',
    output_dir='/home/bhatt/ishan/TUM_Thesis/data/results',
    output_file='stereoset_results.txt',
)

{'pretrained_class': 'bert-base-cased', 'tokenizer': BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True), 'intrasentence_model': BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
          

100%|██████████| 8939/8939 [00:53<00:00, 167.54it/s]


intrasentence
	gender
		Count: 765.0
		LM Score: 82.50328729241772
		SS Score: 61.48204661682922
		ICAT Score: 63.55715547775384
	profession
		Count: 2430.0
		LM Score: 82.31092099986019
		SS Score: 60.8476591974996
		ICAT Score: 64.45330461508425
	race
		Count: 2886.0
		LM Score: 83.82409779040428
		SS Score: 56.29627559199869
		ICAT Score: 73.26850537162359
	religion
		Count: 237.0
		LM Score: 82.16091954022988
		SS Score: 56.27586206896552
		ICAT Score: 71.84830757035274
	overall
		Count: 2106.0
		LM Score: 83.01912382272438
		SS Score: 58.68030062800166
		ICAT Score: 68.60650476963355
overall
	Count: 2106.0
	LM Score: 83.01912382272438
	SS Score: 58.68030062800166
	ICAT Score: 68.60650476963355


### Fine Tune Model

In [7]:
# Model size, expressed in millions of parameters.
model.num_parameters()/1_000_000

66.98553

In [8]:
# Load the IMDb dataset used as the fine-tuning corpus and display its splits.
imdb_dataset = load_dataset("imdb")
imdb_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [9]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` column of a batch of examples.

    Args:
        examples: Batch mapping that contains a ``"text"`` entry.

    Returns:
        The tokenizer output for the batch. When the global ``tokenizer`` is a
        fast tokenizer, a ``"word_ids"`` entry is added holding, per example,
        the result of ``word_ids(i)``.
    """
    encoding = tokenizer(examples["text"])
    if not tokenizer.is_fast:
        # Slow tokenizers cannot provide word ids; return the plain encoding.
        return encoding
    n_examples = len(encoding["input_ids"])
    encoding["word_ids"] = list(map(encoding.word_ids, range(n_examples)))
    return encoding


# Tokenize every split; the raw "text" and "label" columns are removed so that only
# the columns produced by tokenize_function remain.
# Use batched=True to activate fast multithreading!
tokenized_datasets = imdb_dataset.map(
    tokenize_function, batched=True, remove_columns=["text", "label"]
)
tokenized_datasets

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (519 > 512). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 50000/50000 [00:06<00:00, 7843.60 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 50000
    })
})

In [10]:
# Number of tokens per training chunk.
chunk_size = 128
def group_texts(examples):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size chunks.

    Args:
        examples: Mapping from column name to a list of per-example sequences
            (the batch handed over when mapping with ``batched=True``). It must
            contain an ``"input_ids"`` column.

    Returns:
        Dict with the same columns, each a list of chunks holding exactly
        ``chunk_size`` items, plus a ``"labels"`` column that is a copy of the
        chunked ``"input_ids"`` list. A trailing remainder shorter than
        ``chunk_size`` is dropped.
    """
    # Flatten every column in a single pass. (sum(lists, []) would rebuild the
    # growing list on every addition, which is quadratic in the batch length.)
    concatenated_examples = {
        k: [item for sequence in examples[k] for item in sequence]
        for k in examples.keys()
    }
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of chunk_size
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column; masking later decides which positions are scored
    result["labels"] = result["input_ids"].copy()
    return result

In [11]:
# Re-chunk every split into fixed-length blocks with group_texts and display the result.
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Map: 100%|██████████| 50000/50000 [00:34<00:00, 1457.36 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 61291
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 59904
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 122957
    })
})

In [12]:
# Language-modelling collator configured with a 15% masking probability.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [13]:
# Probability with which each whole word is selected for masking.
wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    """Collate a batch after masking randomly selected whole words.

    A word is a run of consecutive tokens that share the same ``word_ids``
    entry. Each word is selected independently with probability
    ``wwm_probability``. Every token of a selected word is replaced by
    ``tokenizer.mask_token_id`` in ``input_ids`` and keeps its original id in
    ``labels``; all remaining label positions are set to -100.

    The caller's feature dicts and their ``input_ids`` are left untouched: the
    function works on copies, so it can be applied to the same samples more
    than once.

    Args:
        features: List of mappings, each with ``"input_ids"``, ``"labels"`` and
            ``"word_ids"`` entries of equal length.

    Returns:
        The result of ``default_data_collator`` applied to the masked copies
        (which no longer contain ``"word_ids"``).
    """
    masked_features = []
    for original in features:
        # Shallow copy: popping "word_ids" must not alter the caller's dict.
        feature = dict(original)
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        # Copy the ids before writing mask tokens so the source sequence survives
        # (tensors offer clone(), lists and arrays offer copy()).
        source_ids = feature["input_ids"]
        input_ids = source_ids.clone() if hasattr(source_ids, "clone") else source_ids.copy()
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["input_ids"] = input_ids
        feature["labels"] = new_labels
        masked_features.append(feature)

    return default_data_collator(masked_features)

In [14]:
# Sanity check: collate the first two training chunks with whole-word masking and
# print the decoded token ids to inspect where mask tokens were inserted.
samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] i rented i am curious - [MASK] from my video store because of all [MASK] [MASK] that surrounded it when it was first [MASK] in [MASK]. i also [MASK] that at first it was seized by u [MASK] s. customs if it ever tried to enter this country, therefore being a fan of films considered " controversial " i [MASK] had to see [MASK] for myself. < [MASK] / > < [MASK] / > [MASK] plot [MASK] centered around a young swedish [MASK] student [MASK] lena who wants [MASK] learn everything she can [MASK] [MASK]. in [MASK] she wants to focus her attentions to making some [MASK] of documentary on what the average [MASK] [MASK] thought about certain political issues such'

'>>> as the vietnam war and [MASK] [MASK] in the united states. in between asking politicians and ordinary denizens of stockholm about their opinions on politics, she has [MASK] with her drama teacher, classmates, and [MASK] men. < [MASK] [MASK] > < br / > [MASK] kills me about [MASK] am curious - [MASK] is that 40 years ago,

In [15]:
# Down-sample the chunked training split to 10,000 training chunks plus a test set
# one tenth of that size.
train_size = 10_000
test_size = int(0.1 * train_size)

# A fixed seed is passed so the split is the same on every run.
downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1000
    })
})

In [16]:
# Per-device batch size, used for both training and evaluation below.
batch_size = 64
# Show the training loss with every epoch
# (one epoch is len(train) // batch_size optimisation steps on a single device)
logging_steps = len(downsampled_dataset["train"]) // batch_size
# Keep only the part after the last "/" in case the checkpoint id is namespaced ("org/name").
model_name = model_checkpoint.split("/")[-1]

# NOTE(review): output_dir is a path relative to the working directory, while the
# ft_model_save_dir defined in the configuration cell is not used in the visible cells —
# confirm which location is intended.
# NOTE(review): fp16=True presumably requires a CUDA device — confirm for the target machine.
training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-imdb",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    num_train_epochs = 10,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=False,
    fp16=True,
    logging_steps=logging_steps,
)

In [17]:
# Build the Trainer on the down-sampled splits.
# Note that the token-level `data_collator` is passed here, not
# `whole_word_masking_data_collator`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [19]:
# Run the fine-tuning.
# NOTE(review): the recorded execution of this cell stopped with
# "RuntimeError: Expected a 'cuda' device type for generator but found 'cpu'".
# Presumably a global default tensor type/device was switched to CUDA somewhere
# (possibly inside one of the wildcard-imported metrics modules) — confirm and fix before re-running.
trainer.train()

RuntimeError: Expected a 'cuda' device type for generator but found 'cpu'

In [None]:
# Re-run the CrowS-Pairs evaluation on `model` after the training step so the scores
# can be compared with the pre-trained results; a separate output file is used so the
# earlier results are not overwritten.
# Reuse the path from the configuration cell instead of repeating the literal, so both
# evaluations are guaranteed to read the same input file.
input_file = cp_input_file
output_file = '/home/bhatt/ishan/TUM_Thesis/data/results/cp_results_fine_tuned.csv'
get_results(input_file,output_file,model,tokenizer)