# Encoder-Only Transformer

In [9]:
import torch
from transformers import BertTokenizer

bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

encoded = bert_tokenizer(
    "A man is playing guitar.",
    "A person is making music.",
    return_tensors="pt"
)

print(encoded["input_ids"])
print(encoded["token_type_ids"])  # <-- segment embeddings (0s and 1s)

tensor([[ 101, 1037, 2158, 2003, 2652, 2858, 1012,  102, 1037, 2711, 2003, 2437,
         2189, 1012,  102]])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]])


In [2]:
from transformers import BertConfig, BertForMaskedLM

config = BertConfig(
    vocab_size = bert_tokenizer.vocab_size,
    hidden_size=128,
    num_hidden_layers=2,
    num_attention_heads=4,
    intermediate_size=512,
    max_position_embeddings=128
)
bert = BertForMaskedLM(config)

In [3]:
from datasets import load_dataset

mlm_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
mlm_dataset


README.md: 0.00B [00:00, ?B/s]

wikitext-2-raw-v1/test-00000-of-00001.pa(…):   0%|          | 0.00/733k [00:00<?, ?B/s]

wikitext-2-raw-v1/train-00000-of-00001.p(…):   0%|          | 0.00/6.36M [00:00<?, ?B/s]

wikitext-2-raw-v1/validation-00000-of-00(…):   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Dataset({
    features: ['text'],
    num_rows: 36718
})

In [4]:
def tokenize(example, tokenizer=bert_tokenizer):
  return tokenizer(example['text'], truncation=True, max_length = 128, padding="max_length")
mlm_dataset = mlm_dataset.map(tokenize, batched=True)

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

In [5]:
mlm_dataset


Dataset({
    features: ['text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 36718
})

In [6]:
from transformers import DataCollatorForLanguageModeling

mlm_collator = DataCollatorForLanguageModeling(bert_tokenizer, mlm=True, mlm_probability=0.15)

In [11]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    output_dir='./my_bert',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    report_to=[]
)

trainer = Trainer(
    model=bert,
    args=args,
    train_dataset=mlm_dataset,
    data_collator=mlm_collator
)
trainer_output = trainer.train()

Step,Training Loss
500,6.8238
1000,6.9292
1500,6.9687
2000,6.9361
2500,6.9225
3000,6.9021
3500,6.9205
4000,6.8742
4500,6.8963
5000,6.841


In [13]:
from transformers import pipeline

torch.manual_seed(42)
fill_mask = pipeline("fill-mask", model=bert, tokenizer=bert_tokenizer)
top_predictions = fill_mask("The capital of [MASK] is Rome.")
top_predictions[0]

Device set to use cuda:0


{'score': 0.07140766829252243,
 'token': 1996,
 'token_str': 'the',
 'sequence': 'the capital of the is rome.'}

In [14]:
from transformers import BertTokenizer, BertForQuestionAnswering

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")

question = "Where is Rome?"
context = "Rome is the capital of Italy and a famous tourist destination."

inputs = tokenizer(question, context, return_tensors="pt")
outputs = model(**inputs)

start_scores = outputs.start_logits
end_scores = outputs.end_logits

# Pick predicted answer tokens
start_index = start_scores.argmax()
end_index = end_scores.argmax()
answer = tokenizer.convert_tokens_to_string(
    tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][start_index:end_index+1])
)
print(answer)  # Italy


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[SEP] rome is
