In [1]:
!pip install nltk




In [2]:
import heapq
import re
from collections import defaultdict, Counter

import nltk
from nltk.util import ngrams

# Download the 'punkt' tokenizer package into the local nltk_data directory.
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

Training Corpus (Toy Example Sentences)

In [3]:
# Toy training text: five short sentences, one per line, without punctuation.
corpus = """
I want to go home
I want to eat food
I want to go there
I like to eat pizza
I like to go outside
"""


Text Preprocessing

In [5]:
# Download the 'punkt_tab' tokenizer package as well.
# NOTE(review): presumably needed by nltk.word_tokenize on newer NLTK releases — confirm.
nltk.download('punkt_tab')

def preprocess(text):
    """Normalize raw text and split it into word tokens.

    The text is lowercased, every character other than the ASCII letters
    a-z and whitespace is removed (digits and punctuation vanish without
    leaving a space behind), and the remainder is handed to
    ``nltk.word_tokenize``.

    Args:
        text: Raw input string.

    Returns:
        The tokens produced by ``nltk.word_tokenize`` for the cleaned text.
    """
    letters_and_spaces = re.sub(r"[^a-z\s]", "", text.lower())
    return nltk.word_tokenize(letters_and_spaces)

# Tokenize the whole corpus in one call, giving a single flat token sequence.
tokens = preprocess(corpus)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Build N-gram Models

In [6]:
# Count n-grams one corpus line at a time so that no bigram/trigram spans two
# separate sentences (e.g. the end of "... go home" followed by "i want ...").
sentence_tokens = [
    preprocess(line) for line in corpus.splitlines() if line.strip()
]

unigrams = Counter(tok for sent in sentence_tokens for tok in sent)
bigrams = Counter(gram for sent in sentence_tokens for gram in ngrams(sent, 2))
trigrams = Counter(gram for sent in sentence_tokens for gram in ngrams(sent, 3))

# Number of distinct word types: the V term used by Laplace smoothing.
vocab_size = len(unigrams)


Trigram Probability with Laplace Smoothing

In [7]:
def trigram_probability(trigram):
    """Add-one (Laplace) smoothed estimate of P(w3 | w1, w2).

    Args:
        trigram: Tuple ``(w1, w2, w3)`` looked up in the module-level
            ``trigrams`` counter; its first two items are looked up in
            ``bigrams``.

    Returns:
        ``(count(w1, w2, w3) + 1) / (count(w1, w2) + vocab_size)``, where
        n-grams missing from the counters contribute a count of zero.
    """
    context = trigram[:2]
    numerator = trigrams[trigram] + 1
    denominator = bigrams[context] + vocab_size
    return numerator / denominator


Auto-Complete Prediction Function

In [8]:
def autocomplete(sentence, top_k=3):
    """Suggest the most likely next words for ``sentence``.

    The last two tokens of the preprocessed sentence form the context, and
    every vocabulary word is scored with ``trigram_probability``.

    Args:
        sentence: Text typed so far; it is passed through ``preprocess``.
        top_k: Maximum number of suggestions to return (default 3).
            ``None`` returns the full ranking.

    Returns:
        A list of ``(word, probability)`` tuples, highest probability first;
        ties keep the iteration order of ``unigrams``. The list is empty when
        the sentence has fewer than two tokens or ``top_k`` is not positive.
    """
    words = preprocess(sentence)

    # A trigram model needs two words of context.
    if len(words) < 2:
        return []

    # A non-positive top_k asks for nothing; used as a slice bound, a negative
    # value would instead drop items from the tail of the ranking.
    if top_k is not None and top_k <= 0:
        return []

    context = tuple(words[-2:])
    scored = (
        (word, trigram_probability(context + (word,))) for word in unigrams
    )

    if top_k is None:
        return sorted(scored, key=lambda pair: pair[1], reverse=True)

    # nlargest is equivalent to sorted(..., reverse=True)[:top_k] but avoids
    # sorting the entire vocabulary when only a few items are wanted.
    return heapq.nlargest(top_k, scored, key=lambda pair: pair[1])


Test the Auto-Complete System

In [9]:
# Default top_k=3; the context is the last two tokens, ("want", "to").
autocomplete("I want to")


[('go', 0.21428571428571427),
 ('eat', 0.14285714285714285),
 ('i', 0.07142857142857142)]

In [10]:
# Default top_k=3; the context is the last two tokens, ("like", "to").
autocomplete("I like to")


[('go', 0.15384615384615385),
 ('eat', 0.15384615384615385),
 ('i', 0.07692307692307693)]