In [139]:
import torch
import torch.nn as nn

import transformers

from transformers import pipeline, AutoModelForMaskedLM, AutoTokenizer, DataCollatorForLanguageModeling, default_data_collator, TrainingArguments

from datasets import load_dataset
import evaluate

import random

import pandas as pd
import numpy as np
import collections


from metrics.crows_pairs import *

In [140]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load pre-trained model for training

In [141]:
model_checkpoint = "distilbert-base-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

config.json: 100%|██████████| 483/483 [00:00<00:00, 3.74MB/s]
model.safetensors: 100%|██████████| 268M/268M [00:02<00:00, 112MB/s]  
tokenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 258kB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.90MB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 27.1MB/s]


### Calculate metric on Bias Measurement dataset

In [142]:
input_file = '/home/bhatt/ishan/TUM_Thesis/data/metrics_ds/crows-pairs/data/crows_pairs_anonymized.csv'
output_file = '/home/bhatt/ishan/TUM_Thesis/data/results/cp_results.csv'
get_results(input_file,output_file,model,tokenizer)

  df_score = pd.concat([df_score,pd.DataFrame({'sent_more': [sent_more],
100%|██████████| 1508/1508 [02:31<00:00,  9.95it/s]

Total examples: 1508
Metric score: 56.83
Stereotype score: 57.33
Anti-stereotype score: 54.13
Num. neutral: 1 0.07






### Fine Tune Model

In [138]:
model.num_parameters()/1_000_000

108.340804

In [143]:
imdb_dataset = load_dataset("imdb")
imdb_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [144]:
def tokenize_function(examples):
    result = tokenizer(examples["text"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_datasets = imdb_dataset.map(
    tokenize_function, batched=True, remove_columns=["text", "label"]
)
tokenized_datasets

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (720 > 512). Running this sequence through the model will result in indexing errors
Map:   4%|▍         | 1000/25000 [00:00<00:03, 6931.57 examples/s]

Map: 100%|██████████| 25000/25000 [00:03<00:00, 7640.17 examples/s]
Map: 100%|██████████| 25000/25000 [00:03<00:00, 7331.19 examples/s]
Map: 100%|██████████| 50000/50000 [00:06<00:00, 7202.80 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 50000
    })
})

In [145]:
chunk_size = 128
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [132]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

In [146]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [147]:
wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels

    return default_data_collator(features)

In [148]:
samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] i [MASK] i am curious - yellow [MASK] my video store [MASK] [MASK] all the controversy that surrounded it when it was first released in 1967. i also heard that at first [MASK] [MASK] [MASK] by u. s. customs if [MASK] ever tried to enter this country [MASK] therefore [MASK] [MASK] fan [MASK] films considered " controversial " i [MASK] had to see this [MASK] myself [MASK] < br / [MASK] < br / > the plot [MASK] centered around a young swedish drama student [MASK] lena who wants to learn everything she can about life. in particular she wants [MASK] focus her attentions to making some sort [MASK] [MASK] on [MASK] [MASK] average [MASK] [MASK] thought [MASK] [MASK] political issues such'

'>>> as [MASK] [MASK] war and race issues in the [MASK] states. in between [MASK] [MASK] and ordinary denizens of stockholm about their opinions on [MASK], she has [MASK] with her drama teacher, classmates, and married men [MASK] < br / > < br / > what kills me about [MASK] am curious - [MASK] [M

In [149]:
train_size = 10_000
test_size = int(0.1 * train_size)

downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1000
    })
})

In [150]:
batch_size = 64
# Show the training loss with every epoch
logging_steps = len(downsampled_dataset["train"]) // batch_size
model_name = model_checkpoint.split("/")[-1]

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-imdb",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=False,
    fp16=True,
    logging_steps=logging_steps,
)

In [151]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [152]:
trainer.train()

RuntimeError: Expected a 'cuda' device type for generator but found 'cpu'