# Autocompletion

## N-Gram Language Model

In [12]:
import math
import random

import nltk
import numpy
import pandas

# Add the current directory to the list of locations NLTK searches for its
# data packages (presumably so locally stored tokenizer data is found -- confirm).
nltk.data.path.append(".")

In [13]:
# Read the whole corpus into memory as a single string.
# The encoding is given explicitly: without it open() uses the platform's
# locale encoding, which makes the read fail or garble non-ASCII characters on
# systems whose default is not UTF-8 (the corpus is assumed to be UTF-8).
with open("en_US.twitter.txt", "r", encoding="utf-8") as f:
    data = f.read()
print("Data type:", type(data))
print("Number of letters:", len(data))
print("First 300 letters of the data")
print("-------")
# Bare last expression: the notebook displays the repr of the first 300 characters.
data[0:300]

Data type: <class 'str'>
Number of letters: 3335477
First 300 letters of the data
-------


"How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long.\nWhen you meet someone special... you'll know. Your heart will beat more rapidly and you'll smile for no reason.\nthey've decided its more fun if I don't.\nSo Tired D; Played Lazer Tag & Ran A "

### Preprocessing

In [14]:
import os


def split_data(data):
    """Lazily yield the non-empty lines of ``data`` with surrounding whitespace removed.

    Lines are split on ``"\\n"`` rather than ``os.linesep``: text read through
    ``open()`` in text mode already has its line endings normalised to ``"\\n"``,
    so splitting on ``os.linesep`` (``"\\r\\n"`` on Windows) would leave the whole
    corpus as a single "sentence" there. A stray ``"\\r"`` left over from
    un-normalised input is removed by ``strip()``.

    Args:
        data: The raw corpus as one string, one sentence per line.

    Returns:
        A generator of stripped, non-empty line strings in their original order.
    """
    stripped_lines = (line.strip() for line in data.split("\n"))
    # Strip once, then drop lines that were empty or whitespace-only.
    return (line for line in stripped_lines if line)


def tokenize_sentences(sentences):
    """Lazily tokenize each sentence after lower-casing it.

    Args:
        sentences: An iterable of sentence strings.

    Returns:
        A generator producing, for every input sentence, the result of
        ``nltk.word_tokenize`` applied to the lower-cased sentence.
    """

    def lowercase_tokens(text):
        # Lower-case first so tokens differing only in case come out identical.
        return nltk.word_tokenize(text.lower())

    return (lowercase_tokens(text) for text in sentences)


def tokenize_data(data):
    """Split raw text into sentences and tokenize each one.

    Args:
        data: The raw corpus as one string.

    Returns:
        A list with one entry per sentence produced by ``split_data``; each
        entry is whatever ``tokenize_sentences`` yields for that sentence.
    """
    sentences = split_data(data)
    tokenized = tokenize_sentences(sentences)
    # Both stages are lazy; materialise the result so it can be sliced and reused.
    return list(tokenized)

In [15]:
# Tokenize the whole corpus once; later cells slice and reuse this list.
tokenized_data = tokenize_data(data)
# Show the first two tokenized sentences as a sanity check.
tokenized_data[:2]

[['how',
  'are',
  'you',
  '?',
  'btw',
  'thanks',
  'for',
  'the',
  'rt',
  '.',
  'you',
  'gon',
  'na',
  'be',
  'in',
  'dc',
  'anytime',
  'soon',
  '?',
  'love',
  'to',
  'see',
  'you',
  '.',
  'been',
  'way',
  ',',
  'way',
  'too',
  'long',
  '.'],
 ['when',
  'you',
  'meet',
  'someone',
  'special',
  '...',
  'you',
  "'ll",
  'know',
  '.',
  'your',
  'heart',
  'will',
  'beat',
  'more',
  'rapidly',
  'and',
  'you',
  "'ll",
  'smile',
  'for',
  'no',
  'reason',
  '.']]

### Splitting Dataset

In [16]:
# Fraction of the sentences that goes into the training set.
train_set_share = 0.8
# Derive the split point from train_set_share (previously the literal 0.8 was
# repeated here, so changing the share above had no effect).
train_set_size = int(len(tokenized_data) * train_set_share)

# NOTE(review): the split is sequential -- the data is not shuffled first.
train_set = tokenized_data[:train_set_size]
test_set = tokenized_data[train_set_size:]

print(
    os.linesep.join(
        [f"train set size={len(train_set)}", f"test set size={len(test_set)}", f"total={len(tokenized_data)}"]
    )
)

train set size=38368
test set size=9593
total=47961


### Using Unknown Words

In [17]:
from collections import defaultdict


def get_vocabularies(tokenized_sentences, threshold=2):
    """Count word frequencies and keep the words seen more than ``threshold`` times.

    Args:
        tokenized_sentences: An iterable of sentences, each an iterable of words.
        threshold: A word is kept only if its count is strictly greater than
            this value.

    Returns:
        A dict mapping every kept word to its total count, in order of first
        appearance.
    """
    frequencies = {}
    for tokens in tokenized_sentences:
        for token in tokens:
            frequencies[token] = frequencies.get(token, 0) + 1

    frequent_words = {}
    for token, occurrences in frequencies.items():
        # Strict comparison: a word seen exactly `threshold` times is dropped.
        if occurrences > threshold:
            frequent_words[token] = occurrences
    return frequent_words


def replace_rare_words(tokenized_sentences, vocabularies, unknown_word_token="<UNK>"):
    """Return copies of the sentences with out-of-vocabulary words masked.

    Args:
        tokenized_sentences: An iterable of sentences, each an iterable of words.
        vocabularies: A container of known words (membership is tested with ``in``).
        unknown_word_token: The replacement for every word not in ``vocabularies``.

    Returns:
        A new list of new lists; the input sentences are not modified.
    """
    return [
        [token if token in vocabularies else unknown_word_token for token in tokens]
        for tokens in tokenized_sentences
    ]

In [18]:
# Build the vocabulary from the training set only (default threshold).
vocabularies = get_vocabularies(train_set)
# Training sentences with every out-of-vocabulary word replaced by the unknown token.
train_set_2 = replace_rare_words(train_set, vocabularies)

In [19]:
# Inspect two processed sentences to see the unknown-word replacement in action.
train_set_2[10:12]

[['i',
  'always',
  'wonder',
  'how',
  'the',
  'guys',
  'on',
  'the',
  'auctions',
  'shows',
  'learned',
  'to',
  'talk',
  'so',
  'fast',
  '!',
  '?',
  'all',
  'i',
  'hear',
  'is',
  '<UNK>',
  '.'],
 ['<UNK>', 'what', 'a', 'catch']]

### Counting N-Grams

In [20]:
def count_n_grams(tokenized_sentences, n, starting_token="<S>", ending_token="<E>"):
    """Count every n-gram in the padded sentences.

    Each sentence is padded with ``n`` copies of ``starting_token`` in front and
    a single ``ending_token`` at the end before its n-grams are extracted.

    Args:
        tokenized_sentences: An iterable of sentences, each a list of words.
        n: The length of the n-grams to count.
        starting_token: The token used for the front padding.
        ending_token: The token appended after the last word.

    Returns:
        A ``defaultdict`` mapping n-gram tuples to their counts; missing keys
        default to 0.
    """
    counts = defaultdict(int)
    front_padding = [starting_token] * n
    back_padding = [ending_token]
    for tokens in tokenized_sentences:
        padded = tuple(front_padding + tokens + back_padding)
        # Slide a window of width n over every valid start position.
        for start in range(len(padded) - n + 1):
            counts[padded[start : start + n]] += 1
    return counts

In [None]:
# n-gram counts over the training set with rare words already masked.
bigram_counts = count_n_grams(train_set_2, 2)
trigram_counts = count_n_grams(train_set_2, 3)
# Preview a few entries only; displaying the whole table would flood the
# notebook with one output line per distinct trigram.
list(trigram_counts.items())[:5]

### Estimate Probabilities

In [43]:
def estimate_probability(word, previous_n_gram, n_gram_counts, n_plus_1_gram_counts, vocabularies_size, k=1.0):
    """Estimate the add-k smoothed probability of ``word`` following ``previous_n_gram``.

    Computes ``(count(previous_n_gram + word) + k) / (count(previous_n_gram) + k * vocabularies_size)``.

    Args:
        word: The candidate next word.
        previous_n_gram: The preceding words, as any iterable (converted to a tuple).
        n_gram_counts: Mapping from n-gram tuples to counts.
        n_plus_1_gram_counts: Mapping from (n+1)-gram tuples to counts.
        vocabularies_size: The vocabulary size used in the smoothing denominator.
        k: The smoothing constant.

    Returns:
        The smoothed probability estimate as a float.
    """
    context = tuple(previous_n_gram)
    # Unseen n-grams count as zero; .get() avoids inserting keys into a defaultdict.
    numerator = n_plus_1_gram_counts.get(context + (word,), 0) + k
    denominator = n_gram_counts.get(context, 0) + k * vocabularies_size
    return numerator / denominator


def estimate_probabilities(
    previous_n_gram,
    n_gram_counts,
    n_plus_1_gram_counts,
    vocabularies,
    ending_token="<E>",
    unknown_word_token="<UNK>",
    k=1.0,
):
    """Estimate the smoothed probability of every candidate next word.

    The candidates are the vocabulary words plus ``ending_token`` and
    ``unknown_word_token``. The vocabulary size passed to the smoothing
    formula is the number of candidates, not ``len(vocabularies)``: add-k
    smoothing adds ``k`` to each candidate's count, so the denominator must
    add ``k`` once per candidate. Using the smaller size makes the estimates
    sum to more than one.

    Args:
        previous_n_gram: The preceding words, as an iterable.
        n_gram_counts: Mapping from n-gram tuples to counts.
        n_plus_1_gram_counts: Mapping from (n+1)-gram tuples to counts.
        vocabularies: The known words; a dict keyed by word or any iterable of words.
        ending_token: The end-of-sentence token, included as a candidate.
        unknown_word_token: The out-of-vocabulary token, included as a candidate.
        k: The smoothing constant.

    Returns:
        A dict mapping each candidate word to its estimated probability, with
        vocabulary words first, followed by the ending and unknown tokens.
    """
    # dict.fromkeys removes duplicates while keeping order, so a special token
    # that is already in the vocabulary is not counted twice.
    words = list(dict.fromkeys([*vocabularies, ending_token, unknown_word_token]))
    vocabulary_size = len(words)
    return {
        word: estimate_probability(word, previous_n_gram, n_gram_counts, n_plus_1_gram_counts, vocabulary_size, k)
        for word in words
    }

In [50]:
# Smoothed probability of every candidate word following the bigram "how are".
probs = estimate_probabilities(("how", "are"), bigram_counts, trigram_counts, vocabularies)

from collections import Counter

# most_common(1) returns a one-element list of (word, probability); keep the word.
most_likely = Counter(probs).most_common(1)[0][0]

print("how are (?)")
print(f"most likely: how are {most_likely}")

how are (?)
most likely: how are you
