In [7]:
#Basics
import os
import io
import re
import string
from tqdm import tqdm
import numpy as np

#torch libraries
import torch
from torch import cuda

#Time related libraries
import datetime
from timeit import default_timer as timer

#General libraries
import collections
import math
import random
import matplotlib as mpl
import matplotlib.pyplot as plt

#Dealing with files, data and general stuff
import zipfile
import pandas as pd

#May be used but very unlikely.
from subprocess import check_output
from wordcloud import WordCloud, STOPWORDS
from numba import jit, cuda #Former way of GPU acceleration in my project. NOTE: this rebinds the name "cuda", shadowing the "cuda" imported from torch above.

#Transformers Library. Taken from "Hugging Face".
from transformers import BertConfig, BertModel, BertTokenizer, BertForMaskedLM, BertTokenizerFast, AutoTokenizer, pipeline
from tokenizers import BertWordPieceTokenizer
from tokenizers.processors import BertProcessing

#Optimizers taken from the "Transformers" Library that I have mentioned above as well.
from transformers import AdamW

In [8]:
#Load the tokenizer and the masked-language model from the same Hugging Face Hub checkpoint.
#The checkpoint name is kept in one place so the tokenizer and the model cannot drift apart.
CHECKPOINT = "Seraphiive/bert-personalized-PreAlpha-uncased"
Tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
Model = BertForMaskedLM.from_pretrained(CHECKPOINT)

In [9]:
#Build a fill-mask pipeline around the already-loaded model and tokenizer.
#It is used further down to list several candidate words together with their scores.
pipe = pipeline(
    task="fill-mask",
    model=Model,
    tokenizer=Tokenizer,
)

In [10]:
#Sentence used for testing. Edit it freely, but keep a [MASK] token where the word to predict should go.
ExampleSentence = "How [MASK] you doing?"

In [11]:
#Encode the test sentence into PyTorch tensors for the model.
inputs = Tokenizer(ExampleSentence, return_tensors="pt")

#Forward pass with gradient tracking disabled. The logits are the "raw prediction vector",
#i.e. the scores that would be passed to the softmax function when predicting the missing word.
with torch.no_grad():
    logits = Model(**inputs).logits

#Position(s) of the [MASK] token inside the first sequence of the batch.
is_mask = inputs.input_ids[0] == Tokenizer.mask_token_id
mask_token_index = torch.where(is_mask)[0]

#Pick the highest-scoring vocabulary id at each masked position and decode it back to text.
predicted_token_id = torch.argmax(logits[0, mask_token_index], dim=-1)
PredWord = Tokenizer.decode(predicted_token_id)

print(PredWord) #Prints the missing word that was predicted by the model.

are


In [12]:
#List the closest candidate words (with their scores) for the masked position.
#The input is derived from "ExampleSentence", so the sentence only has to be edited in one place;
#[MASK] is swapped for the pipeline tokenizer's own mask token in case the two differ.
pipe(ExampleSentence.replace("[MASK]", pipe.tokenizer.mask_token))

[{'score': 0.21366684138774872,
  'token': 309,
  'token_str': 'are',
  'sequence': 'how are you doing?'},
 {'score': 0.14240136742591858,
  'token': 215,
  'token_str': 'is',
  'sequence': 'how is you doing?'},
 {'score': 0.09299653768539429,
  'token': 287,
  'token_str': 'about',
  'sequence': 'how about you doing?'},
 {'score': 0.07042302936315536,
  'token': 179,
  'token_str': 'do',
  'sequence': 'how do you doing?'},
 {'score': 0.04870709031820297,
  'token': 142,
  'token_str': 'was',
  'sequence': 'how was you doing?'}]