In [24]:
# Finetuning BERT using MLM (Masked language Model)

In [25]:
from transformers import BertTokenizer, BertForMaskedLM, pipeline
import torch
import pprint

In [26]:
# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [27]:
dataset = [
    "In the heart of a bustling city, technology shapes the way we live, work, and connect with one another.",
    "Advancements in artificial intelligence are transforming industries and revolutionizing the way we approach problem-solving.",
    "As we navigate the complexities of the digital age, cybersecurity becomes a paramount concern for safeguarding sensitive information.",
    "The fusion of creativity and technology gives rise to innovative solutions that push the boundaries of what is possible.",
    "In the ever-evolving landscape of science and technology, lifelong learning becomes essential for staying relevant and adapting to change."
]

In [28]:
# Tokenizing dataset
tokenized_input = tokenizer(dataset, padding=True, truncation=True, return_tensors="pt")

In [29]:
# Mask a percentage of tokens in the dataset
# For instance, mask 15% of tokens with '[MASK]' token
masked_input = tokenized_input.input_ids.clone()
mask_indices = torch.bernoulli(torch.full(masked_input.shape, 0.15)).bool() & (masked_input != tokenizer.pad_token_id)
masked_input[mask_indices] = tokenizer.mask_token_id

In [30]:
# Train the model
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

In [31]:
# Fine-tune the model on the masked language modeling task
for epoch in range(10):  
    optimizer.zero_grad()
    outputs = model(masked_input, labels=masked_input)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch + 1}/3 - Loss: {loss.item()}")


model.save_pretrained('model/mlm')

Epoch 1/3 - Loss: 4.921903610229492
Epoch 2/3 - Loss: 4.333024024963379
Epoch 3/3 - Loss: 4.024721622467041
Epoch 4/3 - Loss: 3.8098886013031006
Epoch 5/3 - Loss: 3.6359221935272217
Epoch 6/3 - Loss: 3.490929365158081
Epoch 7/3 - Loss: 3.365839719772339
Epoch 8/3 - Loss: 3.2589199542999268
Epoch 9/3 - Loss: 3.1668190956115723
Epoch 10/3 - Loss: 3.084275722503662


In [35]:
# Testing the model
model = BertForMaskedLM.from_pretrained('model/mlm')
fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)

In [36]:
question = "In the age of [MASK], technology shapes the way we live"
answers = fill_mask(question)
print(answers)

[{'score': 0.295559287071228, 'token': 2974, 'token_str': 'technology', 'sequence': 'in the age of technology, technology shapes the way we live'}, {'score': 0.05940323323011398, 'token': 11028, 'token_str': 'invention', 'sequence': 'in the age of invention, technology shapes the way we live'}, {'score': 0.048010826110839844, 'token': 7588, 'token_str': 'computers', 'sequence': 'in the age of computers, technology shapes the way we live'}, {'score': 0.03821643814444542, 'token': 8144, 'token_str': 'innovation', 'sequence': 'in the age of innovation, technology shapes the way we live'}, {'score': 0.01624961383640766, 'token': 6627, 'token_str': 'tech', 'sequence': 'in the age of tech, technology shapes the way we live'}]


In [37]:
question = "[MASK] gives rise to innovative solutions"
answers = fill_mask(question)
print(answers)

[{'score': 0.22264055907726288, 'token': 2023, 'token_str': 'this', 'sequence': 'this gives rise to innovative solutions'}, {'score': 0.19961273670196533, 'token': 2009, 'token_str': 'it', 'sequence': 'it gives rise to innovative solutions'}, {'score': 0.0417848564684391, 'token': 2008, 'token_str': 'that', 'sequence': 'that gives rise to innovative solutions'}, {'score': 0.0360650010406971, 'token': 2974, 'token_str': 'technology', 'sequence': 'technology gives rise to innovative solutions'}, {'score': 0.029146505519747734, 'token': 8144, 'token_str': 'innovation', 'sequence': 'innovation gives rise to innovative solutions'}]
