In [None]:
from transformers import BertTokenizerFast, BertForSequenceClassification

# Checkpoint identifier; the tokenizer below is loaded from it.
model_path = "bert-base-uncased"

# Fast BERT tokenizer created from the checkpoint named above.
tokenizer = BertTokenizerFast.from_pretrained(model_path)

In [None]:
from datasets import load_dataset

# Small IMDB subsets: each one joins a slice from the start of a split with a
# slice from its end. The test and validation subsets are both cut from the
# "test" split, using different index ranges.
imdb_train = load_dataset("imdb", split="train[:2000]+train[-2000:]")
imdb_test = load_dataset("imdb", split="test[:500]+test[-500:]")
imdb_val = load_dataset("imdb", split="test[500:1000]+test[-1000:-500]")

# Cell output: the shape of every subset, in train / test / val order.
tuple(subset.shape for subset in (imdb_train, imdb_test, imdb_val))

OUTPUT: ((4000, 2), (1000, 2), (1000, 2))


def tokenize_it(e):
    """Tokenize the ``"text"`` entry of ``e`` with the notebook-level tokenizer.

    Args:
        e: mapping-like object exposing a ``"text"`` key.

    Returns:
        The result of calling ``tokenizer`` on ``e["text"]`` with padding and
        truncation switched on.
    """
    texts = e["text"]
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded


# Apply tokenize_it to every subset in chunks of 1000 examples. The generator
# is consumed in train / test / val order by the tuple unpacking.
enc_train, enc_test, enc_val = (
    subset.map(tokenize_it, batched=True, batch_size=1000)
    for subset in (imdb_train, imdb_test, imdb_val)
)

In [None]:
# Complete IMDB "train" split, used as raw text for the adaptation step.
dataset_for_adaptation = load_dataset("imdb", split="train")

# Only the review texts are kept.
imdb_sentences = dataset_for_adaptation["text"]

# First 20000 texts for training, everything after that for development.
# NOTE(review): if the split is ordered by label, the dev slice may contain a
# single class only -- confirm whether shuffling is wanted before slicing.
train_sentences, dev_sentences = imdb_sentences[:20000], imdb_sentences[20000:]

In [None]:
from transformers import AutoModelForMaskedLM, AutoTokenizer

# Masked-language-model variant of the checkpoint stored in model_path.
model = AutoModelForMaskedLM.from_pretrained(
    model_path,
)

In [None]:
# Settings for the masked-language-model adaptation run.
batch_size = 16        # later passed as per_device_train_batch_size
num_train_epochs = 15  # later passed to TrainingArguments
max_length = 100       # later passed to TokenizedSentencesDataset
mlm_prob = 0.25        # later passed as mlm_probability to the data collator

In [None]:
class TokenizedSentencesDataset:
    """Sized, index-addressable wrapper that tokenizes sentences on access.

    Args:
        sentences: indexable container of texts; ``len()`` of it is the
            dataset length.
        tokenizer: callable invoked with one entry of ``sentences`` plus the
            keyword arguments ``add_special_tokens``, ``truncation``,
            ``max_length`` and ``return_special_tokens_mask``.
        max_length: value forwarded to the tokenizer as ``max_length``.
        cache_tokenization: when true, the first access to a string entry
            replaces that entry inside ``sentences`` with its tokenized form,
            so later accesses return the stored result. Note that this writes
            into the container that was passed in.
    """

    def __init__(self, sentences, tokenizer, max_length, cache_tokenization=False):
        self.sentences = sentences
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.cache_tokenization = cache_tokenization

    def __len__(self):
        """Return the number of stored sentences."""
        return len(self.sentences)

    def _encode(self, text):
        """Run the tokenizer on ``text`` with this dataset's fixed options."""
        return self.tokenizer(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_length,
            return_special_tokens_mask=True,
        )

    def __getitem__(self, item):
        """Return the tokenized form of the entry at ``item``."""
        entry = self.sentences[item]

        if self.cache_tokenization:
            # Entries that are still plain strings have not been tokenized
            # yet: tokenize once and store the result in place of the text.
            if isinstance(entry, str):
                entry = self._encode(entry)
                self.sentences[item] = entry
            return entry

        # Without caching, every access tokenizes again.
        return self._encode(entry)

In [None]:
# Wrap both text collections so that items are tokenized when indexed,
# truncated to max_length tokens.
train_dataset = TokenizedSentencesDataset(
    sentences=train_sentences,
    tokenizer=tokenizer,
    max_length=max_length,
)

dev_dataset = TokenizedSentencesDataset(
    sentences=dev_sentences,
    tokenizer=tokenizer,
    max_length=max_length,
)

In [None]:
# These three names are used below but are not imported by any other cell,
# so they are imported here to keep the cell runnable on a fresh kernel.
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Collator configured for masked language modelling with masking
# probability mlm_prob.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=mlm_prob
)

training_args = TrainingArguments(
    # Given explicitly because some transformers releases require this
    # argument. NOTE(review): the value is meant to mirror the library default
    # of releases where it is optional -- confirm or pick a project path.
    output_dir="trainer_output",
    num_train_epochs=num_train_epochs,
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    prediction_loss_only=True,
    # NOTE(review): mixed precision usually needs a GPU -- confirm the target
    # hardware before running.
    fp16=True,
)

# Trainer for the masked-LM model: trains on train_dataset and evaluates on
# dev_dataset, batching both through data_collator.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
)

In [None]:
# Start training with the Trainer configured in the previous cell. The call is
# the cell's only expression, so its return value is what the notebook shows.
trainer.train()

In [None]:
# Directory that receives the model and the tokenizer.
adapted_model_path = "adapted-bert"

# Both artifacts go to the same directory so it can be loaded as one checkpoint.
model.save_pretrained(save_directory=adapted_model_path)
tokenizer.save_pretrained(save_directory=adapted_model_path)

In [None]:
# Checkpoint used for the classifier -- option 1: the adapted model directory.
model_path = "adapted-bert"
# Option 2: vanilla BERT (swap the assignment above for this one).
# model_path = "bert-base-uncased"

# Sequence-classification model loaded from model_path with explicit
# NEG/POS label mappings in both directions.
model = BertForSequenceClassification.from_pretrained(
    model_path,
    id2label={
        0: "NEG",
        1: "POS",
    },
    label2id=dict(NEG=0, POS=1),
)
