## Process Data

In [1]:
import datasets
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import multiprocessing
from gensim.utils import simple_preprocess
import re
import pandas as pd

In [2]:
# read data
wikidata = datasets.load_dataset("wikipedia", "20220301.en", split=['train[:10%]'])
df_wiki = wikidata[0].to_pandas().sample(frac=1, random_state=42).reset_index(drop=True)[:100000]
df_wiki

Found cached dataset wikipedia (C:/Users/psdda/.cache/huggingface/datasets/wikipedia/20220301.en/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,id,url,title,text
0,1658014,https://en.wikipedia.org/wiki/Adrienne%20Mayor,Adrienne Mayor,Adrienne Mayor (born 1946) is a historian of a...
1,415109,https://en.wikipedia.org/wiki/Jo%20Stafford,Jo Stafford,"Jo Elizabeth Stafford (November 12, 1917July 1..."
2,733308,https://en.wikipedia.org/wiki/Milan%20Rapai%C4%87,Milan Rapaić,"Milan ""Miki"" Rapaić (born 16 August 1973) is a..."
3,2597099,https://en.wikipedia.org/wiki/Windsor%20North%...,Windsor North School,Windsor North School is a primary school in In...
4,690250,https://en.wikipedia.org/wiki/List%20of%20rive...,List of rivers of Missouri,List of rivers in Missouri (U.S. state).\n\nBy...
...,...,...,...,...
99995,670737,https://en.wikipedia.org/wiki/Standesamt%20Ade...,Standesamt Adelnau,Standesamt Adelnau was one of the civil regist...
99996,580501,https://en.wikipedia.org/wiki/Siaka%20Stevens,Siaka Stevens,Siaka Probyn Stevens (24 August 1905 – 29 May ...
99997,2214203,https://en.wikipedia.org/wiki/Malitbog,Malitbog,Malitbog is the name of several places in the ...
99998,2177759,https://en.wikipedia.org/wiki/Calyx%20%28music...,Calyx (musician),"Calyx is a British drum and bass act, speciali..."


In [3]:
#  fucntion to clean text
def text_cleaner(text):
    clean_text = text.lower() # change to lower case
    url = r'''(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})'''
    clean_text = re.sub(url, '', clean_text) # remove url links
    clean_text = re.sub(r'\([^)]*\)', '', clean_text) # remove text inside ()
    clean_text = re.sub(r'\[|\]', '', clean_text) # remove []
    clean_text = re.sub(r'\“|\”','', clean_text) # remove ""
    clean_text = re.sub(r'\"','', clean_text) # remove "
    clean_text = re.sub(r"(?<=\d),(?=\d)", "", clean_text) # remove , inside digit
    clean_text = re.sub(r"[\-\—]", " ", clean_text) # remove - and -- 
    clean_text = re.sub(r"\d+", "number", clean_text) # replace digit to number token
    clean_text = re.sub(r"'s", "", clean_text) # remove 's
    clean_text = re.sub(r"[^a-zA-Z0-9 \,\.\!\?]", "", clean_text) # remove text that are not english characters
    clean_text = re.sub(r"\s+", " ", clean_text) # remove extra white space
    return clean_text

In [4]:
class CorpusGenerator:
    def __init__(self, data):
        self.data = data

    def __iter__(self):
        """Python generator to feed data."""
        for i in range(len(self.data)):
            document = self.data["text"][i] # get text content
            document_clean = text_cleaner(document) # clean text
            words = simple_preprocess(document_clean) # split sentence to words
            yield TaggedDocument(words=words, tags=[i]) # tag document

## Train Doc2Vec model

In [5]:
# create Doc2Vec model
model = Doc2Vec(vector_size=32, window=10, min_count=2, sample=1e-4, negative=5, workers=multiprocessing.cpu_count(), dm=1)
wiki_corpus = CorpusGenerator(df_wiki)
model.build_vocab(wiki_corpus)

In [6]:
# train model
model.train(wiki_corpus, total_examples=model.corpus_count, epochs=5)

In [7]:
model.save('./doc2vec_model/doc2vecmodel.model')

In [2]:
load_model = Doc2Vec.load('./doc2vec_model/doc2vecmodel.model')

In [3]:
def find_similar_documents(text, top_n):
    """Find similar documents with the input document."""
    doc_vector = load_model.infer_vector(text)
    sims = load_model.dv.most_similar([doc_vector], topn=top_n)
    return sims

In [4]:
# Trie class
# reference: https://www.geeksforgeeks.org/auto-complete-feature-using-trie/
class TrieNode():
    def __init__(self):
        # Initialising one node for trie
        self.children = {}
        self.last = False
 
 
class Trie():
    def __init__(self):
 
        # Initialising the trie structure.
        self.root = TrieNode()
 
    def formTrie(self, keys):
 
        # Forms a trie structure with the given set of strings
        # if it does not exists already else it merges the key
        # into it by extending the structure as required
        for key in keys:
            self.insert(key)  # inserting one key to the trie.
 
    def insert(self, key):
 
        # Inserts a key into trie if it does not exist already.
        # And if the key is a prefix of the trie node, just
        # marks it as leaf node.
        node = self.root
 
        for a in key:
            if not node.children.get(a):
                node.children[a] = TrieNode()
 
            node = node.children[a]
 
        node.last = True
 
    def suggestionsRec(self, node, word):
 
        # Method to recursively traverse the trie
        # and return a whole word.
        if node.last:
            print(word)
 
        for a, n in node.children.items():
            self.suggestionsRec(n, word + a)
 
    def printAutoSuggestions(self, key):
 
        # Returns all the words in the trie whose common
        # prefix is the given key thus listing out all
        # the suggestions for autocomplete.
        node = self.root
 
        for a in key:
            # no string in the Trie has this prefix
            if not node.children.get(a):
                return 0
            node = node.children[a]
 
        # If prefix is present as a word, but
        # there is no subtree below the last
        # matching node.
        if not node.children:
            return -1
 
        self.suggestionsRec(node, key)
        return 1

In [5]:
def autocomplete(text, document_size=100):
    text_word = text.split()
    last_word = text_word.pop() # get prefix
    similar_index = find_similar_documents(text_word, document_size) # find top similar documents
    # create a list of words
    keys = []
    for pair in similar_index:
        index = pair[0]
        keys.extend(simple_preprocess(df_wiki["text"][index]))
        
    # creating trie object
    t = Trie()
 
    # creating the trie structure with the
    # given set of strings.
    t.formTrie(keys)
    # make suggestion based on prefix
    comp = t.printAutoSuggestions(last_word)
    
    if comp == -1:
        print(f"No other words found with prefix {last_word}")
    elif comp == 0:
        print(f"No words found with prefix {last_word}")

## Test autocomplete

In [8]:
autocomplete("Anarchism is a political philosophy and mov", 200)

movement
movements
moving


In [9]:
autocomplete("April is the fourth month of the year in the Julian and Gregorian cal", 200)

cal
caleb
california
caliph
caliphate
call
called
calleja
calling
caldera
calderón
calhoun


In [14]:
autocomplete("Natural language processing is a subfield of linguistics, compu")

compute
computer
computers
computerworld
computerized
computed
computes
computation
computational
computationally
computations
computability
computable
computing
compuflo
