In [1]:
from transliterator.transliteration import Transliterator
from transformers import AutoTokenizer, AutoModelForMaskedLM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the tokenizer and model from the local directory
model_directory = "models"
tokenizer = AutoTokenizer.from_pretrained(model_directory)
model = AutoModelForMaskedLM.from_pretrained(model_directory)

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [3]:
DICTIONARY_PATH = "data/dictionary.txt"

# Initialize Transliterator
transliterator = Transliterator(
    dictionary_path=DICTIONARY_PATH, tokenizer=tokenizer, model=model
)

In [4]:
# Transliterate input
singlish_sentence = input("Enter Singlish Sentence: ").strip()
print("Singlish Input:", singlish_sentence)
sinhala_sentence = transliterator.generate_sinhala(singlish_sentence)
print("Sinhala Output:", sinhala_sentence)

Singlish Input: kly rn ha smny
Sinhala Output: කාලය රන් හා සමානය


In [5]:
# Transliterate input
singlish_sentence = input("Enter Singlish Sentence: ").strip()
print("Singlish Input:", singlish_sentence)
sinhala_sentence = transliterator.generate_sinhala(singlish_sentence)
print("Sinhala Output:", sinhala_sentence)

Singlish Input: sadun ad ynw
Sinhala Output: සදුන් අද යනවා


In [6]:
# Transliterate input
singlish_sentence = input("Enter Singlish Sentence: ").strip()
print("Singlish Input:", singlish_sentence)
sinhala_sentence = transliterator.generate_sinhala(singlish_sentence)
print("Sinhala Output:", sinhala_sentence)

Singlish Input: hkoj
Sinhala Output: ['හ්කොජ්']


In [9]:
import re

In [12]:
%%time

def classify_line(line):
    """Classify the line as Romanized, Sinhala, or blank."""
    line = line.strip()

    # Check if the line is blank
    if not line:
        return "Blank"

    # Check if the line contains Sinhala characters (Unicode range 0D80–0DFF)
    if any('\u0D80' <= char <= '\u0DFF' for char in line):
        return "Sinhala"

    # If it's not blank and doesn't contain Sinhala characters, it's Romanized
    return "Romanized"

# Open the input file and read the content
with open("C:/Users/ASUS/Downloads/Sinhala Test set 1 (4).txt", 'r', encoding='utf-8') as file:
    # Read the lines from the file
    lines = file.readlines()


# Variables to hold Romanized and Sinhala pairs
romanized_sentence = None
sinhala_sentence = None

# Loop through the lines and classify them
for i, line in enumerate(lines, start=1):
    classification = classify_line(line)

    if classification == "Romanized":
        romanized_sentence = line.strip()  # Store the Romanized line
    elif classification == "Sinhala":
        sinhala_sentence = line.strip()  # Store the Sinhala line

    # If both Romanized and Sinhala sentences are identified
    if romanized_sentence and sinhala_sentence:
        # Count the number of words in the Romanized sentence
        word_count = len(romanized_sentence.split())

        output = transliterator.generate_sinhala(re.sub(r'\s+', ' ', romanized_sentence).strip())


        print(f"Romanized Input: {romanized_sentence}")
        print(f"Expected Sinhala: {sinhala_sentence}")
        print(f"Generated Sinhala: {output}\n")


        # Reset the variables after processing the pair
        romanized_sentence = None
        sinhala_sentence = None


Romanized Input: awankawama mata eya mathaka ethi akaraya eyayi namuth eya wikarayaki mama obata ashwaya kerehi wedi elmak nodakwana namuth eya mage wilasithawa nowe
Expected Sinhala: අවංකවම මට එය මතක ඇති ආකාරය එයයි නමුත් එය විකාරයකි මම ඔබට අශ්වයා කෙරෙහි වැඩි ඇල්මක් නොදක්වන නමුත් එය මගේ විලාසිතාව නොවේ
Generated Sinhala: අවංකවම මට එය මතක ඇති ආකාරය එයයි නමුත් එය විකාරයකි මම ඔබට අශ්වයා කෙරෙහි වැඩි ඇල්මක් නොදක්වන නමුත් එය මගේ විලාසිතාව නොවේ

Romanized Input: oba mage aneka yeyi adahas karanne kese ho oba ema wilasithawata andinne mandeyi mama asami
Expected Sinhala: ඔබ මගේ අනෙකා යැයි අදහස් කරන්නේ කෙසේ හෝ ඔබ එම විලාසිතාවට අඳින්නේ මන්දැයි මම අසමි
Generated Sinhala: ඔබ මගේ අනෙක යැයි අදහස් කරන්නේ කෙසේ හෝ  ඔබ එම විලාසිතාවට අඳින්නේ මන්දැයි මම අසමි

Romanized Input: mama kiwa yuthuyi oba wedipura penenne obe mahalu athmayayi oba adahas karanne mage aneka bawayi
Expected Sinhala: මම කිව යුතුයි ඔබ වැඩිපුර පෙනෙන්නේ ඔබේ මහලු ආත්මයයි ඔබ අදහස් කරන්නේ මගේ අනෙකා බවයි
Generated Sinhala: මම කිව යුතුයි ඔබ ව

KeyboardInterrupt: 

In [2]:
sinhala_words =  ['අක්‍රිය']
valid_words = [word for word in sinhala_words if word in tokenizer.vocab]
valid_words

[]

In [1]:
from transformers import AutoTokenizer

# Load the tokenizer and model from the local directory
model_directory = "models"
tokenizer = AutoTokenizer.from_pretrained(model_directory)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
aa = '[MASK] අද එනවද'
input = tokenizer.encode(aa, return_tensors="pt")
input

tensor([[    2,     4,  5753, 22646,     3]])

In [3]:
inputs = tokenizer(
            aa, return_tensors="pt", padding=True, truncation=True
        )
inputs

{'input_ids': tensor([[    2,     4,  5753, 22646,     3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}

In [14]:
# Example long text (more than 512 tokens)
long_text = "ඔබ අද එනවද" * 20  # Repeated to exceed 512 tokens

# Tokenizing without truncation
tokens = tokenizer(long_text, return_tensors="pt")

print("Number of tokens:", len(tokens["input_ids"][0]))  # Prints number of tokens


Number of tokens: 81


In [15]:
tokens["input_ids"][0]

tensor([    2,  5773,  5753, 22646,  3809,  3746,  5753, 22646,  3809,  3746,
         5753, 22646,  3809,  3746,  5753, 22646,  3809,  3746,  5753, 22646,
         3809,  3746,  5753, 22646,  3809,  3746,  5753, 22646,  3809,  3746,
         5753, 22646,  3809,  3746,  5753, 22646,  3809,  3746,  5753, 22646,
         3809,  3746,  5753, 22646,  3809,  3746,  5753, 22646,  3809,  3746,
         5753, 22646,  3809,  3746,  5753, 22646,  3809,  3746,  5753, 22646,
         3809,  3746,  5753, 22646,  3809,  3746,  5753, 22646,  3809,  3746,
         5753, 22646,  3809,  3746,  5753, 22646,  3809,  3746,  5753, 22646,
            3])

In [12]:
# Tokenizing with truncation enabled
tokens = tokenizer(long_text, return_tensors="pt", truncation=True)

print("Number of tokens:", len(tokens["input_ids"][0]))  # Now within BERT's limit
tokens["input_ids"][0]

Number of tokens: 512


tensor([    2,  5773,  5753, 22646,  3809,  3746,  5753, 22646,  3809,  3746,
         5753, 22646,  3809,  3746,  5753, 22646,  3809,  3746,  5753, 22646,
         3809,  3746,  5753, 22646,  3809,  3746,  5753, 22646,  3809,  3746,
         5753, 22646,  3809,  3746,  5753, 22646,  3809,  3746,  5753, 22646,
         3809,  3746,  5753, 22646,  3809,  3746,  5753, 22646,  3809,  3746,
         5753, 22646,  3809,  3746,  5753, 22646,  3809,  3746,  5753, 22646,
         3809,  3746,  5753, 22646,  3809,  3746,  5753, 22646,  3809,  3746,
         5753, 22646,  3809,  3746,  5753, 22646,  3809,  3746,  5753, 22646,
         3809,  3746,  5753, 22646,  3809,  3746,  5753, 22646,  3809,  3746,
         5753, 22646,  3809,  3746,  5753, 22646,  3809,  3746,  5753, 22646,
         3809,  3746,  5753, 22646,  3809,  3746,  5753, 22646,  3809,  3746,
         5753, 22646,  3809,  3746,  5753, 22646,  3809,  3746,  5753, 22646,
         3809,  3746,  5753, 22646,  3809,  3746,  5753, 22646, 

In [20]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)  # Binary classification

# Sample input text
text = "This is an example sentence to classify."
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
print(len(inputs["input_ids"][0]))  # Number of tokens
print(inputs["input_ids"][0])


# Run model inference
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# Convert logits to probabilities
probs = torch.nn.functional.softmax(logits, dim=-1)

# Get predicted class
predicted_class = torch.argmax(probs, dim=-1).item()

print(f"Predicted class: {predicted_class}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


10
tensor([  101,  2023,  2003,  2019,  2742,  6251,  2000, 26268,  1012,   102])
Predicted class: 0


In [21]:
from transformers import BertTokenizer

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Example texts of different lengths
texts = [
    "Short sentence.",
    "This is a slightly longer sentence for testing padding.",
    "Here is an even longer sentence to check how the tokenizer applies padding and truncation when necessary."
]

# Tokenizing with padding enabled
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=20)

# Print tokenized outputs
print("Input IDs:\n", inputs["input_ids"])
print("\nAttention Mask:\n", inputs["attention_mask"])


Input IDs:
 tensor([[  101,  2460,  6251,  1012,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  2023,  2003,  1037,  3621,  2936,  6251,  2005,  5604, 11687,
          4667,  1012,   102,     0,     0,     0,     0,     0,     0,     0],
        [  101,  2182,  2003,  2019,  2130,  2936,  6251,  2000,  4638,  2129,
          1996, 19204, 17629, 12033, 11687,  4667,  1998, 19817,  4609,   102]])

Attention Mask:
 tensor([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


In [26]:
from transformers import BertTokenizer

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# A long input text
long_text = "This is a very long sentence that contains multiple words and exceeds the maximum length allowed for BERT tokenization."*100

# Tokenizing with truncation enabled and a small max_length
inputs = tokenizer(long_text, return_tensors="pt", truncation=True)
print(len(inputs["input_ids"][0]))  # Number of tokens

# Print tokenized outputs
print("Input IDs:\n", inputs["input_ids"])
print("\nDecoded Tokens:\n", tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]))
print(len(inputs["input_ids"][0]))  # Number of tokens


512
Input IDs:
 tensor([[  101,  2023,  2003,  1037,  2200,  2146,  6251,  2008,  3397,  3674,
          2616,  1998, 23651,  1996,  4555,  3091,  3039,  2005, 14324, 19204,
          3989,  1012,  2023,  2003,  1037,  2200,  2146,  6251,  2008,  3397,
          3674,  2616,  1998, 23651,  1996,  4555,  3091,  3039,  2005, 14324,
         19204,  3989,  1012,  2023,  2003,  1037,  2200,  2146,  6251,  2008,
          3397,  3674,  2616,  1998, 23651,  1996,  4555,  3091,  3039,  2005,
         14324, 19204,  3989,  1012,  2023,  2003,  1037,  2200,  2146,  6251,
          2008,  3397,  3674,  2616,  1998, 23651,  1996,  4555,  3091,  3039,
          2005, 14324, 19204,  3989,  1012,  2023,  2003,  1037,  2200,  2146,
          6251,  2008,  3397,  3674,  2616,  1998, 23651,  1996,  4555,  3091,
          3039,  2005, 14324, 19204,  3989,  1012,  2023,  2003,  1037,  2200,
          2146,  6251,  2008,  3397,  3674,  2616,  1998, 23651,  1996,  4555,
          3091,  3039,  2005, 14324,

In [29]:
from transformers import BertTokenizer

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Create a very long input text (more than 512 words)
long_text = "This is a long sentence. "   # Repeats to exceed 512 tokens

# Tokenizing without truncation
inputs = tokenizer(long_text, return_tensors="pt", truncation=False)

# Print total token count
token_count = len(inputs["input_ids"][0])
print(f"Total Tokens: {token_count}")

# Checking if it exceeds 512 tokens
if token_count > 512:
    print("⚠️ Warning: The input exceeds 512 tokens and will cause an error in BERT!")


Total Tokens: 8


In [30]:
from transformers import BertForSequenceClassification
import torch

# Load BERT model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Attempt to pass the long input into BERT
outputs = model(**inputs)  # ❌ This will cause an error!


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
from transformers import BertTokenizer, BertForMaskedLM
import torch
import torch.nn.functional as F

# Load BERT tokenizer and model for Masked Language Modeling
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForMaskedLM.from_pretrained(model_name)

# Define a sentence with a masked token
text = "This is a [MASK] example."

# Tokenize without padding
inputs_no_pad = tokenizer(text, return_tensors="pt", padding=False, truncation=True)

# Tokenize with padding (max_length=10)
inputs_with_pad = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=30)

# Get the index of the masked token
mask_index_no_pad = torch.where(inputs_no_pad["input_ids"] == tokenizer.mask_token_id)[1].item()
mask_index_with_pad = torch.where(inputs_with_pad["input_ids"] == tokenizer.mask_token_id)[1].item()

# Run model inference
with torch.no_grad():
    logits_no_pad = model(**inputs_no_pad).logits
    logits_with_pad = model(**inputs_with_pad).logits

# Get softmax probabilities for the masked token position
probs_no_pad = F.softmax(logits_no_pad[0, mask_index_no_pad], dim=-1)
probs_with_pad = F.softmax(logits_with_pad[0, mask_index_with_pad], dim=-1)

print("Probs no pad:", probs_no_pad)
print("Probs with pad:", probs_with_pad)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Probs no pad: tensor([1.1037e-07, 1.1283e-07, 1.4856e-07,  ..., 1.5335e-07, 2.7046e-07,
        9.8091e-08])
Probs with pad: tensor([1.1037e-07, 1.1283e-07, 1.4856e-07,  ..., 1.5335e-07, 2.7046e-07,
        9.8091e-08])


In [1]:
import torch
from transformers import BertTokenizer, BertForMaskedLM
from concurrent.futures import ThreadPoolExecutor

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")



  from .autonotebook import tqdm as notebook_tqdm
BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a mod

In [None]:
def get_predictions(model, tokenizer, sentence, candidates):
    inputs = tokenizer(sentence, return_tensors="pt")
    mask_index = torch.where(inputs.input_ids == tokenizer.mask_token_id)[1]
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    logits = outputs.logits
    mask_word_logits = logits[0, mask_index, :]
    
    probabilities = torch.nn.functional.softmax(mask_word_logits, dim=-1)
    
    candidate_probs = {}
    for word in candidates:
        word_id = tokenizer.convert_tokens_to_ids(word)
        if word_id is not None:
            candidate_probs[word] = probabilities[0, word_id].item()
        else:
            candidate_probs[word] = 0.0  # If the word is not in vocab, assign 0 probability
    
    return sentence, candidate_probs

def process_inputs(input_dict):    
    results = {}
    with ThreadPoolExecutor() as executor:
        future_to_sentence = {
            executor.submit(get_predictions, model, tokenizer, sentence, candidates): sentence
            for sentence, candidates in input_dict.items()
        }
        
        for future in future_to_sentence:
            sentence, probs = future.result()
            results[sentence] = probs
    
    return results

# Example usage:
input_dict = {
    "[MASK] will come": ["i", "car"], 
    "[MASK] can go": ["run", "we"]
}

output = process_inputs(input_dict)
print(output)


In [2]:
import torch
import torch.nn.functional as F
from transformers import BertTokenizer, BertForMaskedLM

class MaskedLMModel:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def generate_probs(self, sentences_with_blank, candidate_dict):
        # Tokenize all sentences in batch
        inputs = self.tokenizer(
            sentences_with_blank, return_tensors="pt", padding=True, truncation=True
        )

        # Identify mask positions in batch
        mask_token_indices = (inputs.input_ids == self.tokenizer.mask_token_id).nonzero(
            as_tuple=True
        )

        # Perform forward pass in parallel
        with torch.no_grad():
            logits = self.model(
                **inputs
            ).logits  # Shape: (batch_size, seq_len, vocab_size)

        word_probabilities = {}
        for i, sentence in enumerate(sentences_with_blank):
            mask_pos = mask_token_indices[1][
                i
            ].item()  # Get mask index for this sentence
            mask_logits = logits[i, mask_pos, :]  # Extract logits for mask position

            candidates = candidate_dict[sentence]
            word_ids = self.tokenizer.convert_tokens_to_ids(candidates)
            word_probs = F.softmax(mask_logits, dim=-1)[word_ids].tolist()

            # Store probabilities for each word
            for j, word in enumerate(candidates):
                word_probabilities[(sentence, word)] = word_probs[j]

        return word_probabilities


In [None]:

# Example usage:
tokenizer = BertTokenizer.from_pretrained("models")
model = BertForMaskedLM.from_pretrained("models")


In [9]:
masked_lm_model = MaskedLMModel(model, tokenizer)

input_dict = {
    # "[MASK] අද එනවද": ["ඔබ", "ඔබා"],
    "ඔබ [MASK] එනවද": ["අද", "ආදී",]
}

sentences = list(input_dict.keys())
output = masked_lm_model.generate_probs(sentences, input_dict)

In [10]:
output

{('ඔබ [MASK] එනවද', 'අද'): 0.039673857390880585,
 ('ඔබ [MASK] එනවද', 'ආදී'): 3.4286035770492163e-06}

In [17]:

# Example usage:
tokenizer2 = BertTokenizer.from_pretrained("bert-base-uncased")
model2 = BertForMaskedLM.from_pretrained("bert-base-uncased")
model2.eval()


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [27]:
masked_lm_model2 = MaskedLMModel(model2, tokenizer2)

input_dict2 = {
    "[MASK] will come": ["i", "car"],
    "[MASK] can go": ["run", "we"]
}


sentences2 = list(input_dict2.keys())
output2 = masked_lm_model.generate_probs(sentences2, input_dict2)
output2

{('[MASK] will come', 'i'): 0.00018806032312568277,
 ('[MASK] will come', 'car'): 6.636828766204417e-05,
 ('[MASK] can go', 'run'): 1.5131897271203343e-05,
 ('[MASK] can go', 'we'): 1.5131897271203343e-05}

In [28]:
output2[('[MASK] will come', 'car')]

6.636828766204417e-05

In [29]:
print(f"Probability1 of 'car' in '[MASK] will come': {output2[('[MASK] will come', 'car')]}")

Probability1 of 'car' in '[MASK] will come': 6.636828766204417e-05


In [30]:
input_dict2 = {
    "[MASK] will come": ["i", "car"],
    # "[MASK] can go": ["run", "we"]
}


sentences2 = list(input_dict2.keys())
output2 = masked_lm_model.generate_probs(sentences2, input_dict2)
output2

{('[MASK] will come', 'i'): 0.00018806042498908937,
 ('[MASK] will come', 'car'): 6.63683531456627e-05}

In [31]:
print(f"Probability2 of 'car' in '[MASK] will come': {output2[('[MASK] will come', 'car')]}")

Probability2 of 'car' in '[MASK] will come': 6.63683531456627e-05


In [20]:
input_dict3 = {
    "[MASK] will come": ["i", "car"],
    "[MASK] can go": ["run", "we"],
    "[MASK] eat cat": ["it", "rat"]
}


sentences3 = list(input_dict3.keys())
output3 = masked_lm_model.generate_probs(sentences3, input_dict3)
output3

{('[MASK] will come', 'i'): 0.00018806032312568277,
 ('[MASK] will come', 'car'): 6.636828766204417e-05,
 ('[MASK] can go', 'run'): 1.5131897271203343e-05,
 ('[MASK] can go', 'we'): 1.5131897271203343e-05,
 ('[MASK] eat cat', 'it'): 6.345955625874922e-06,
 ('[MASK] eat cat', 'rat'): 6.345955625874922e-06}

In [21]:
 ('[MASK] will come', 'car'): 6.636828766204417e-05,
 ('[MASK] will come', 'car'): 6.636828766204417e-05,

SyntaxError: only single target (not tuple) can be annotated (3696649686.py, line 1)

In [3]:
import torch
import torch.nn.functional as F
from transformers import BertTokenizer, BertForMaskedLM

class MaskedLMModel:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def generate_probs(self, sentences_with_blank, candidate_dict):
        # Consistent tokenization
        inputs = self.tokenizer(
            sentences_with_blank, 
            return_tensors="pt", 
            padding='max_length', 
            truncation=True,
            max_length=10  # Adjust based on your expected sentence length
        )

        mask_token_indices = (inputs.input_ids == self.tokenizer.mask_token_id).nonzero(as_tuple=True)

        with torch.no_grad():
            logits = self.model(**inputs).logits

        word_probabilities = {}
        for i, sentence in enumerate(sentences_with_blank):
            mask_pos = mask_token_indices[1][i].item()
            mask_logits = logits[i, mask_pos, :]

            candidates = candidate_dict[sentence]
            word_ids = self.tokenizer.convert_tokens_to_ids(candidates)
            word_probs = F.softmax(mask_logits, dim=-1)[word_ids].tolist()

            for j, word in enumerate(candidates):
                word_probabilities[(sentence, word)] = word_probs[j]

        return word_probabilities


In [4]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") 
model = BertForMaskedLM.from_pretrained("bert-base-uncased") 
model.eval()

masked_lm_model = MaskedLMModel(model, tokenizer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
input_dict1 = { 
    "[MASK] will come": ["i", "car"],
    "[MASK] can go": ["run", "we"]
}

input_dict2 = {
    "[MASK] will come": ["i", "car"],
}

sentences1 = list(input_dict1.keys()) 
sentences2 = list(input_dict2.keys()) 

output1 = masked_lm_model.generate_probs(sentences1, input_dict1) 
output2 = masked_lm_model.generate_probs(sentences2, input_dict2)

In [6]:
print(f"Probability1 of 'car' in '[MASK] will come': {output1[('[MASK] will come', 'car')]}")
print(f"Probability2 of 'car' in '[MASK] will come': {output2[('[MASK] will come', 'car')]}")

Probability1 of 'car' in '[MASK] will come': 8.749316293688025e-06
Probability2 of 'car' in '[MASK] will come': 8.749316293688025e-06


In [18]:
import torch
import torch.nn.functional as F
from transformers import BertTokenizer, BertForMaskedLM

class MaskedLMModel:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def generate_probs(self, sentences_with_blank, candidate_dict):
        # Tokenize all sentences in batch
        inputs = self.tokenizer(
            sentences_with_blank, return_tensors="pt", padding=True, truncation=True
        )

        print(f"Input : {inputs}")

        # Identify mask positions in batch
        mask_token_indices = (inputs.input_ids == self.tokenizer.mask_token_id).nonzero(
            as_tuple=True
        )

        # Perform forward pass in parallel
        with torch.no_grad():
            logits = self.model(
                **inputs
            ).logits  # Shape: (batch_size, seq_len, vocab_size)

        word_probabilities = {}
        for i, sentence in enumerate(sentences_with_blank):
            mask_pos = mask_token_indices[1][
                i
            ].item()  # Get mask index for this sentence
            mask_logits = logits[i, mask_pos, :]  # Extract logits for mask position

            candidates = candidate_dict[sentence]
            word_ids = self.tokenizer.convert_tokens_to_ids(candidates)
            word_probs = F.softmax(mask_logits, dim=-1)[word_ids].tolist()

            # Store probabilities for each word
            for j, word in enumerate(candidates):
                word_probabilities[(sentence, word)] = word_probs[j]

        return word_probabilities

tokenizer = BertTokenizer.from_pretrained("models") 
model = BertForMaskedLM.from_pretrained("models") 
model.eval()

masked_lm_model = MaskedLMModel(model, tokenizer) 

input_dict1 = {
    "[MASK] අද එනවද": ["ඔබ", "ඔබා"],
    "ඔබ [MASK] එනවද": ["අද", "ආදී",]
}

input_dict2 = {
    # "[MASK] අද එනවද": ["ඔබ", "ඔබා"],
    "ඔබ [MASK] එනවද": ["අද", "ආදී",]
}

sentences1 = list(input_dict1.keys()) 
sentences2 = list(input_dict2.keys()) 

output1 = masked_lm_model.generate_probs(sentences1, input_dict1) 
output2 = masked_lm_model.generate_probs(sentences2, input_dict2)

print(f"Probability1 of 'car' in '[MASK] will come': {output1[('ඔබ [MASK] එනවද', 'අද')]}")
print(f"Probability2 of 'car' in '[MASK] will come': {output2[('ඔබ [MASK] එනවද', 'අද')]}")

Input : {'input_ids': tensor([[    2,     4,  5753, 22646,     3],
        [    2,  5773,     4, 22646,     3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1]])}
Input : {'input_ids': tensor([[    2,  5773,     4, 22646,     3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}
Probability1 of 'car' in '[MASK] will come': 0.03967390954494476
Probability2 of 'car' in '[MASK] will come': 0.039673857390880585


In [None]:
import torch
import torch.nn.functional as F
from transformers import BertTokenizer, BertForMaskedLM

class MaskedLMModel:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def generate_probs(self, sentences_with_blank, candidate_dict):
        # Tokenize all sentences in batch
        # inputs = self.tokenizer(
        #     sentences_with_blank, 
        #     return_tensors="pt", 
        #     padding=True, 
        #     truncation=True
        # )

        inputs = self.tokenizer(
            sentences_with_blank, 
            return_tensors="pt", 
            padding='max_length', 
            truncation=True,
            max_length=10  # set max_length explicitly to ensure consistency
        )

        print(f"Input : {inputs}")


        # Identify mask positions in batch
        mask_token_indices = (inputs.input_ids == self.tokenizer.mask_token_id).nonzero(
            as_tuple=True
        )

        # Perform forward pass in parallel
        with torch.no_grad():
            logits = self.model(
                **inputs
            ).logits  # Shape: (batch_size, seq_len, vocab_size)

        word_probabilities = {}
        for i, sentence in enumerate(sentences_with_blank):
            mask_pos = mask_token_indices[1][
                i
            ].item()  # Get mask index for this sentence
            mask_logits = logits[i, mask_pos, :]  # Extract logits for mask position

            candidates = candidate_dict[sentence]
            word_ids = self.tokenizer.convert_tokens_to_ids(candidates)
            word_probs = F.softmax(mask_logits, dim=-1)[word_ids].tolist()

            # Store probabilities for each word
            for j, word in enumerate(candidates):
                word_probabilities[(sentence, word)] = word_probs[j]

        return word_probabilities

tokenizer = BertTokenizer.from_pretrained("models") 
model = BertForMaskedLM.from_pretrained("models") 
model.eval()

masked_lm_model = MaskedLMModel(model, tokenizer) 

input_dict1 = {
    "[MASK] අද එනවද": ["ඔබ", "ඔබා"],
    "ඔබ [MASK] එනවද": ["අද", "ආදී",]
}

input_dict2 = {
    # "[MASK] අද එනවද": ["ඔබ", "ඔබා"],
    "ඔබ [MASK] එනවද": ["අද", "ආදී",]
}

sentences1 = list(input_dict1.keys()) 
sentences2 = list(input_dict2.keys()) 

output1 = masked_lm_model.generate_probs(sentences1, input_dict1) 
output2 = masked_lm_model.generate_probs(sentences2, input_dict2)

print(f"Probability1 of 'car' in '[MASK] will come': {output1[('ඔබ [MASK] එනවද', 'අද')]}")
print(f"Probability2 of 'car' in '[MASK] will come': {output2[('ඔබ [MASK] එනවද', 'අද')]}")

Input : {'input_ids': tensor([[   2,    4, 5753,  ...,    0,    0,    0],
        [   2, 5773,    4,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


RuntimeError: The size of tensor a (512) must match the size of tensor b (256) at non-singleton dimension 1

In [2]:
from transformers import BertTokenizer, BertForMaskedLM
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") 
model = BertForMaskedLM.from_pretrained("bert-base-uncased") 
model.eval()

  from .autonotebook import tqdm as notebook_tqdm
BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a mod

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [23]:
batch_size = 16  # start from a reasonable guess

while True:
    try:
        inputs = tokenizer(["[MASK] will come"] * batch_size, return_tensors="pt", padding='max_length', truncation=True, max_length=20)
        logits = model(**inputs).logits
        print(f"Batch size {batch_size} is okay!")
        batch_size *= 2  # double it
    except RuntimeError as e:
        print(f"Reached memory limit at batch size {batch_size}: {e}")
        break


Batch size 16 is okay!
Batch size 32 is okay!
Batch size 64 is okay!
Batch size 128 is okay!
Batch size 256 is okay!
Batch size 512 is okay!
Batch size 1024 is okay!
Batch size 2048 is okay!


KeyboardInterrupt: 

In [3]:
# Example empirical test to find a suitable batch size
max_length = 128
batch_size = 16

while batch_size > 0:
    try:
        inputs = tokenizer(
            ["[MASK] will come"] * batch_size,
            return_tensors="pt",
            padding='max_length',
            truncation=True,
            max_length=max_length
        )
        logits = model(**inputs).logits
        print(f"Batch size {batch_size} OK")
        break
    except RuntimeError as e:
        print(f"Batch size {batch_size} failed: {e}")
        batch_size //= 2  # reduce batch size by half


Batch size 16 OK


In [67]:

import torch
import torch.nn.functional as F
from transformers import BertTokenizer, BertForMaskedLM

class MaskedLMModel:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def generate_probs(self, sentences_with_blank, candidate_dict):
        # Consistent tokenization
        inputs = self.tokenizer(
            sentences_with_blank, 
            return_tensors="pt", 
            padding=True, 
            truncation=True,
        )

        print(f"types: {type(inputs)}")
        print(f"length: {len(inputs['input_ids'])}")
        print(f"token length 1: {len(inputs['input_ids'][0])}")
        print(f"token length 2: {len(inputs['input_ids'][1])}")
        print(f"Input : {inputs}")

        mask_token_indices = (inputs.input_ids == self.tokenizer.mask_token_id).nonzero(as_tuple=True)
        print(f"mask_token_indices: {mask_token_indices}")

        with torch.no_grad():
            logits = self.model(**inputs).logits

        word_probabilities = {}
        for i, sentence in enumerate(sentences_with_blank):
            mask_pos = mask_token_indices[1][i].item()
            mask_logits = logits[i, mask_pos, :]

            candidates = candidate_dict[sentence]
            word_ids = self.tokenizer.convert_tokens_to_ids(candidates)
            word_probs = F.softmax(mask_logits, dim=-1)[word_ids].tolist()

            for j, word in enumerate(candidates):
                word_probabilities[(sentence, word)] = word_probs[j]

        return word_probabilities


In [68]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") 
model = BertForMaskedLM.from_pretrained("bert-base-uncased") 
model.eval()

masked_lm_model = MaskedLMModel(model, tokenizer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [80]:
inp = ("come "*508) + "tomorrow [MASK]"
inp2 = ("run "*505) + "today [MASK]"
inp3 = ("walk "*505) + "today [MASK]"
inp4 = ("jump "*505) + "today [MASK]"
inp5 = ("sit "*505) + "today [MASK]"
inp6 = ("stand "*505) + "today [MASK]"
inp7 = ("sleep "*505) + "today [MASK]"
inp8 = ("eat "*505) + "today [MASK]"
inp9 = ("drink "*505) + "today [MASK]"
inp10 = ("play "*505) + "today [MASK]"
inp11 = ("study "*505) + "today [MASK]"
inp12 = ("work "*505) + "today [MASK]"
inp13 = ("write "*505) + "today [MASK]"
inp14 = ("read "*505) + "today [MASK]"
inp15 = ("sing "*505) + "today [MASK]"
inp16 = ("dance "*505) + "today [MASK]"
inp17 = ("cook "*505) + "today [MASK]"
inp18 = ("clean "*505) + "today [MASK]"
inp19 = ("wash "*505) + "today [MASK]"
inp20 = ("paint "*505) + "today [MASK]"

In [81]:
input_dict = {
    inp: ["i", "you"],
    inp2: ["we", "they"],
    inp3: ["he", "she"],
    inp4: ["it", "they"],
    inp5: ["we", "they"],
    inp6: ["he", "she"],
    inp7: ["it", "they"],
    inp8: ["we", "they"],
    inp9: ["he", "she"],
    inp10: ["it", "they"],
    inp11: ["we", "they"],
    inp12: ["he", "she"],
    inp13: ["it", "they"],
    inp14: ["we", "they"],
    inp15: ["he", "she"],
    inp16: ["it", "they"],
    inp17: ["we", "they"],
    inp18: ["he", "she"],
    inp19: ["it", "they"],
    inp20: ["we", "they"],
    
}

In [82]:
# input_dict = {  
#     inp: ["i", "car"],
#     inp2: ["i", "car"]

# } 
sentences = list(input_dict.keys())   

output = masked_lm_model.generate_probs(sentences, input_dict)

types: <class 'transformers.tokenization_utils_base.BatchEncoding'>
length: 20
token length 1: 512
token length 2: 512
Input : {'input_ids': tensor([[ 101, 2272, 2272,  ..., 4826,  103,  102],
        [ 101, 2448, 2448,  ...,    0,    0,    0],
        [ 101, 3328, 3328,  ...,    0,    0,    0],
        ...,
        [ 101, 4550, 4550,  ...,    0,    0,    0],
        [ 101, 9378, 9378,  ...,    0,    0,    0],
        [ 101, 6773, 6773,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
mask_token_indices: (tensor([ 0,  1,  2,  3,  4,  5,  6,  

In [64]:
input_dict = {  
    inp: ["i", "car"],
    inp2: ["i", "car"]

} 
sentences = list(input_dict.keys())   

output = masked_lm_model.generate_probs(sentences, input_dict)

types: <class 'transformers.tokenization_utils_base.BatchEncoding'>
length: 1
token length: 512
Input : {'input_ids': tensor([[ 101, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272,
         2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272,
         2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272,
         2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272,
         2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272,
         2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272,
         2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272,
         2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272,
         2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272,
         2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272,
         2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 2272, 227