In [None]:
# FInetuning BERT using MLM (Masked language Model)

In [22]:
from transformers import BertTokenizer, BertForMaskedLM, pipeline
import torch

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
# Prepare your dataset (list of sentences)
dataset = [
    "The temperature in New York is expected to rise tomorrow.",
    "Heavy rain and thunderstorms are forecasted in the evening.",
    "A cold front will move through the region by the weekend.",
    "Winds gusting up to 30 miles per hour are anticipated.",
    "The humidity levels will drop significantly next week.",
    "Lata Ragha is HOD of college.",
    "The weather conditions might affect outdoor events."
]


In [13]:
# Tokenize your dataset
tokenized_input = tokenizer(dataset, padding=True, truncation=True, return_tensors="pt")

# Mask a percentage of tokens in the dataset
# For instance, mask 15% of tokens with '[MASK]' token
masked_input = tokenized_input.input_ids.clone()
mask_indices = torch.bernoulli(torch.full(masked_input.shape, 0.15)).bool() & (masked_input != tokenizer.pad_token_id)
masked_input[mask_indices] = tokenizer.mask_token_id

# Train the model
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

In [14]:
# Fine-tune the model on the masked language modeling task
for epoch in range(3):  # Assuming 3 epochs for demonstration
    optimizer.zero_grad()
    outputs = model(masked_input, labels=masked_input)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch + 1}/3 - Loss: {loss.item()}")

# Save the fine-tuned model
model.save_pretrained('path/to/save')

Epoch 1/3 - Loss: 5.781792163848877
Epoch 2/3 - Loss: 5.332117080688477
Epoch 3/3 - Loss: 4.949427604675293


In [18]:
# Load the fine-tuned model
model = BertForMaskedLM.from_pretrained('path/to/save')

# Use the model for question answering
fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)

# Example question
question = "[MASK] is HOD of college ?"


# Get answers using the fine-tuned model
answers = fill_mask(question)
print(answers)

[{'score': 0.43411219120025635, 'token': 2073, 'token_str': 'where', 'sequence': 'where is hod of college?'}, {'score': 0.2793853282928467, 'token': 2040, 'token_str': 'who', 'sequence': 'who is hod of college?'}, {'score': 0.09936191886663437, 'token': 2054, 'token_str': 'what', 'sequence': 'what is hod of college?'}, {'score': 0.0849163681268692, 'token': 2129, 'token_str': 'how', 'sequence': 'how is hod of college?'}, {'score': 0.01218127179890871, 'token': 2002, 'token_str': 'he', 'sequence': 'he is hod of college?'}]


In [21]:
best_answer = answers[0]['token_str']
print(best_answer)

where
