## Prepare training data

In [1]:
import datasets
import pandas as pd

wikidata = datasets.load_dataset("wikipedia", "20220301.en", split=['train[:10%]'])
df_wiki = wikidata[0].to_pandas().sample(frac=1, random_state=42).reset_index(drop=True)[:100000]
df_wiki.head()

Found cached dataset wikipedia (C:/Users/psdda/.cache/huggingface/datasets/wikipedia/20220301.en/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,id,url,title,text
0,1658014,https://en.wikipedia.org/wiki/Adrienne%20Mayor,Adrienne Mayor,Adrienne Mayor (born 1946) is a historian of a...
1,415109,https://en.wikipedia.org/wiki/Jo%20Stafford,Jo Stafford,"Jo Elizabeth Stafford (November 12, 1917July 1..."
2,733308,https://en.wikipedia.org/wiki/Milan%20Rapai%C4%87,Milan Rapaić,"Milan ""Miki"" Rapaić (born 16 August 1973) is a..."
3,2597099,https://en.wikipedia.org/wiki/Windsor%20North%...,Windsor North School,Windsor North School is a primary school in In...
4,690250,https://en.wikipedia.org/wiki/List%20of%20rive...,List of rivers of Missouri,List of rivers in Missouri (U.S. state).\n\nBy...


In [2]:
import re

def text_cleaner(text):
    clean_text = text.lower() # change to lower case
    # regex to match most of url links
    url = r'''(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}
    |www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}
    |www\.[a-zA-Z0-9]+\.[^\s]{2,})'''
    clean_text = re.sub(url, '', clean_text) # remove url links
    clean_text = re.sub(r'\([^)]*\)', '', clean_text) # remove text inside ()
    clean_text = re.sub(r'\[|\]', '', clean_text) # remove []
    clean_text = re.sub(r'\“|\”','', clean_text) # remove ""
    clean_text = re.sub(r'\"','', clean_text) # remove "
    clean_text = re.sub(r"(?<=\d),(?=\d)", "", clean_text) # remove , inside digit
    clean_text = re.sub(r"[\-\—]", " ", clean_text) # remove - and -- 
    clean_text = re.sub(r"\d+", "number", clean_text) # replace digit to number token
    clean_text = re.sub(r"'s", "", clean_text) # remove 's
    clean_text = re.sub(r"[^a-zA-Z0-9 \,\.\!\?]", "", clean_text) # remove text that are not english characters
    clean_text = re.sub(r"\s+", " ", clean_text) # remove extra white space
    return clean_text


df_wiki["text"] = df_wiki["text"].apply(lambda x: text_cleaner(x))
df_wiki

Unnamed: 0,id,url,title,text
0,1658014,https://en.wikipedia.org/wiki/Adrienne%20Mayor,Adrienne Mayor,adrienne mayor is a historian of ancient scien...
1,415109,https://en.wikipedia.org/wiki/Jo%20Stafford,Jo Stafford,jo elizabeth stafford was an american traditio...
2,733308,https://en.wikipedia.org/wiki/Milan%20Rapai%C4%87,Milan Rapaić,milan miki rapai is a croatian former professi...
3,2597099,https://en.wikipedia.org/wiki/Windsor%20North%...,Windsor North School,windsor north school is a primary school in in...
4,690250,https://en.wikipedia.org/wiki/List%20of%20rive...,List of rivers of Missouri,list of rivers in missouri .by drainage basint...
...,...,...,...,...
99995,670737,https://en.wikipedia.org/wiki/Standesamt%20Ade...,Standesamt Adelnau,standesamt adelnau was one of the civil regist...
99996,580501,https://en.wikipedia.org/wiki/Siaka%20Stevens,Siaka Stevens,siaka probyn stevens was the leader of sierra ...
99997,2214203,https://en.wikipedia.org/wiki/Malitbog,Malitbog,malitbog is the name of several places in the ...
99998,2177759,https://en.wikipedia.org/wiki/Calyx%20%28music...,Calyx (musician),"calyx is a british drum and bass act, speciali..."


In [3]:
# split documents into sentences
sentence_pharser = "\. |\? |\! "
df_sentence = pd.concat([pd.DataFrame({'sentence': doc}, index=[0])
           for _, row in df_wiki.loc[:20000].iterrows()
           for doc in re.split("\.|\?|\! ", row["text"]) if doc != ''])
df_sentence

Unnamed: 0,sentence
0,adrienne mayor is a historian of ancient scien...
0,mayor specializes in ancient history and the s...
0,her work in pre scientific fossil discoveries...
0,"mayor book, greek fire, poison arrows, the sc..."
0,"lifefrom number to number, she worked as a cop..."
...,...
0,external linksa mills revival
0,by s
0,aronowitzcontemporary analysis of c
0,wright millson intellectual craftsmanship fro...


In [4]:
# remove trailing and ending whitespaces
df_sentence["sentence"] = df_sentence["sentence"].apply(lambda x: x.strip())
df_sentence

Unnamed: 0,sentence
0,adrienne mayor is a historian of ancient scien...
0,mayor specializes in ancient history and the s...
0,her work in pre scientific fossil discoveries ...
0,"mayor book, greek fire, poison arrows, the sco..."
0,"lifefrom number to number, she worked as a cop..."
...,...
0,external linksa mills revival
0,by s
0,aronowitzcontemporary analysis of c
0,wright millson intellectual craftsmanship from...


In [5]:
df_sentence = df_sentence.dropna() # drop missing sentence
df_sentence.index = [i for i in range(0, len(df_sentence))] # create index
#df_sentence.to_csv("wiki_sentence.csv", index=False) # save data
df_sentence

Unnamed: 0,sentence
0,adrienne mayor is a historian of ancient scien...
1,mayor specializes in ancient history and the s...
2,her work in pre scientific fossil discoveries ...
3,"mayor book, greek fire, poison arrows, the sco..."
4,"lifefrom number to number, she worked as a cop..."
...,...
1176891,external linksa mills revival
1176892,by s
1176893,aronowitzcontemporary analysis of c
1176894,wright millson intellectual craftsmanship from...


In [20]:
# check missing values in dataframe
df_sentence.isnull().sum()

sentence    0
dtype: int64

## Build autocomplete system

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
import re
from tqdm import tqdm

class AutocompleteByEmbedding:

    def __init__(self, data, column, tokenizer, model):
        self.data = data
        self.col = column
        self.tokenizer = tokenizer
        self.model = model
  
    def match_prefix(self, sentence, prefix):
        """Match prefix in sentence."""
        matcher = r"\b" + prefix
        prefix_match = re.search(matcher, sentence)
        if prefix_match:
            return True
        else:
            return False
  
    def word_with_prefix(self, encoded, prefix):
        """Find all words containing prefix from Bert tokens."""
        words = []
        for token in encoded['input_ids'][0]:
            words.append(self.tokenizer.decode([token]))
        index = []   
        for i in range(len(words)):
            if re.match(r'^' + prefix, words[i]):
                index.append(i)
        return index

    def autocomplete(self, sentence, document_size=200, top_n=5, similarity=True):
        words = sentence.split()
        prefix = words[-1] # prefix is the last word in sentence
        # filter all sentences containing prefix
        sentence_with_prefix = self.data[self.col].apply(lambda x: self.match_prefix(x, prefix))
        df_prefix = self.data[sentence_with_prefix]
        # tokenize input sentence
        encoded_input = self.tokenizer(text, return_tensors="pt")
        output = self.model(**encoded_input)
        # input sentence word embedding
        input_embedding = output.last_hidden_state[-1].detach().numpy()
        word_similarity = {}
        # tokenize training sentence
        for document in tqdm(df_prefix[self.col][:document_size]):
            # tokenize training sentence
            encoded_document = self.tokenizer(document, max_length=128, truncation=True, padding=True, return_tensors="pt")
            document_output = self.model(**encoded_document)
            # training sentence word embedding
            Y = document_output.last_hidden_state[-1].detach().numpy()
            # find indices of words containign prefix in tokens
            encoded_index = self.word_with_prefix(encoded_document, prefix)
            # compute cosine similarity between input sentence and training sentence
            cos_distance = cosine_similarity(input_embedding, Y)
            for index in encoded_index:
                # decode tokens to get real word
                word = self.tokenizer.decode([encoded_document["input_ids"][0][index]])
                # get similarity between word and prefix
                distance = cos_distance[len(input_embedding)-1-1, index]
                # add word to result
                if word not in word_similarity:
                    word_similarity[word] = distance
                elif word_similarity[word] < distance:
                    word_similarity[word] = distance
                else:
                    continue
        # sort by word consine similarity
        top_words = sorted(word_similarity, key=lambda x: -word_similarity[x])[:top_n]
        result = []
        for word in top_words:
            result.append((word, word_similarity[word]))
        # if result has length 0
        # use words in training sentences as result
        if len(result) == 0:
            size = min(document_size, len(df_prefix))
            train_words = {}
            # count occurrence for words containg prefix
            for document in df_prefix["sentence"][:size]:
                pattern = r"\b" + prefix + "[a-z]*"
                search = re.search(pattern, document)
                train_words[search[0]] = train_words.get(search[0], 0) + 1
            sorted_words = sorted(train_words, key=lambda x: -train_words[x])[:top_n]
            # use word frequency as probability
            for word in sorted_words:
                result.append((word, train_words[word] / size))
        if len(result) == 0:
            print("Cannot complete word " + prefix)
        if similarity:
            return result
        else:
            predicted_words = []
            predicted_words = [pair[0] for pair in result]
            return predicted_words

In [9]:
from transformers import DistilBertTokenizer, DistilBertModel

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained("distilbert-base-uncased")

bert_autocomplete = AutocompleteByEmbedding(data=df_sentence, column="sentence", tokenizer=tokenizer, model=model)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Test autocomplete feature

In [10]:
text = "That which does not kill us make us str"
bert_autocomplete.autocomplete(text)

100%|██████████| 200/200 [00:41<00:00,  4.83it/s]


[('strangely', 0.5754252),
 ('strong', 0.52829254),
 ('street', 0.5196318),
 ('strategy', 0.4686629),
 ('structure', 0.44376096)]

In [11]:
text = "Natural language processing is a subfield of linguistics, computer sci"
bert_autocomplete.autocomplete(text)

100%|██████████| 200/200 [00:42<00:00,  4.68it/s]


[('science', 0.6582532),
 ('sciences', 0.62737536),
 ('scientist', 0.53728306),
 ('scissors', 0.5202313),
 ('scientists', 0.512413)]

In [12]:
bert_autocomplete.autocomplete("t")

100%|██████████| 200/200 [00:39<00:00,  5.04it/s]


[('technology', 0.5973148),
 ('thought', 0.51470196),
 ('tactics', 0.5123917),
 ('terminology', 0.45969445),
 ('together', 0.45541796)]

In [13]:
bert_autocomplete.autocomplete("p")

100%|██████████| 200/200 [00:40<00:00,  4.97it/s]


[('pre', 0.4764853),
 ('peers', 0.46667928),
 ('points', 0.46472567),
 ('performance', 0.46284923),
 ('philology', 0.45699662)]

In [14]:
# correct word: vrindavana
text = "kameshvara temple, in kamyavan, one of the twelve forests of vrind"
bert_autocomplete.autocomplete(text)

100%|██████████| 10/10 [00:02<00:00,  4.75it/s]


[('vrindavan', 0.4),
 ('vrindavana', 0.3),
 ('vrindavanam', 0.1),
 ('vrindabanin', 0.1),
 ('vrindaban', 0.1)]

In [15]:
# correct word: subjunctive
text = "some important ones are declarative, affirmative, negative, emphatic, conditional, imperative, interrogative and subju"
bert_autocomplete.autocomplete(text)

100%|██████████| 88/88 [00:18<00:00,  4.68it/s]


[('subjugated', 0.4090909090909091),
 ('subjugation', 0.23863636363636365),
 ('subjugate', 0.1590909090909091),
 ('subjunctive', 0.09090909090909091),
 ('subjugating', 0.056818181818181816)]

In [16]:
text = "i have attached the file for your ref"
bert_autocomplete.autocomplete(text)

100%|██████████| 200/200 [00:38<00:00,  5.26it/s]


[('references', 0.5986206),
 ('refurbished', 0.5270731),
 ('reflections', 0.5153293),
 ('ref', 0.5110934),
 ('reference', 0.51016015)]

In [17]:
text = "jumping fox chasing c"
bert_autocomplete.autocomplete(text)

100%|██████████| 200/200 [00:38<00:00,  5.20it/s]


[('c', 0.8225794),
 ('company', 0.522678),
 ('club', 0.52197415),
 ('claims', 0.5059437),
 ('china', 0.50570166)]