In [1]:
%pip install python-dotenv==1.0.1 expects==0.9.0 datasets==2.19.0 --quiet

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
import os
import random

from expects import contain_exactly, equal, expect

import nltk

In [4]:
import pandas as pd
from datasets import load_dataset

dataset = load_dataset("vicgalle/alpaca-gpt4")

dataset["train"]

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 52002
})

In [5]:
df = pd.DataFrame(dataset["train"])

df.head()

df.tail()

Unnamed: 0,instruction,input,output,text
0,Give three tips for staying healthy.,,1. Eat a balanced and nutritious diet: Make su...,Below is an instruction that describes a task....
1,What are the three primary colors?,,"The three primary colors are red, blue, and ye...",Below is an instruction that describes a task....
2,Describe the structure of an atom.,,An atom is the basic building block of all mat...,Below is an instruction that describes a task....
3,How can we reduce air pollution?,,There are several ways to reduce air pollution...,Below is an instruction that describes a task....
4,Describe a time when you had to make a difficu...,,"As an AI assistant, I do not have my own perso...",Below is an instruction that describes a task....


Unnamed: 0,instruction,input,output,text
51997,Generate an example of what a resume should li...,,**John Doe**\n\n**Contact Information:**\n\n12...,Below is an instruction that describes a task....
51998,Arrange the items given below in the order to ...,"cake, me, eating",Me eating cake.,"Below is an instruction that describes a task,..."
51999,Write an introductory paragraph about a famous...,Michelle Obama,"Michelle Obama, born January 17, 1964, in Chic...","Below is an instruction that describes a task,..."
52000,Generate a list of five things one should keep...,,1. Evaluate your reasons: It's important to ta...,Below is an instruction that describes a task....
52001,Analyze the given legal document and explain t...,The following is an excerpt from a contract be...,The contract between Company A and Company B o...,"Below is an instruction that describes a task,..."


# Pre-processing 1

In [6]:
df["instruction_output"] = df[["instruction", "output"]].apply(lambda x: ' '.join(x), axis=1)

df["instruction_output"].head()

0    Give three tips for staying healthy. 1. Eat a ...
1    What are the three primary colors? The three p...
2    Describe the structure of an atom. An atom is ...
3    How can we reduce air pollution? There are sev...
4    Describe a time when you had to make a difficu...
Name: instruction_output, dtype: object

In [7]:
df["instruction_output"].loc[1]

'What are the three primary colors? The three primary colors are red, blue, and yellow. These colors are called primary because they cannot be created by mixing other colors and all other colors can be made by combining them in various proportions. In the additive color system, used for light, the primary colors are red, green, and blue (RGB).'

In [8]:
%pip install --user -U nltk==3.8.1
import nltk
nltk.download("punkt")

[31mERROR: Can not perform a '--user' install. User site-packages are not visible in this virtualenv.[0m[31m
[0m

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vaibhavsethia07/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Split to sentences

In [9]:
from nltk.tokenize.punkt import PunktSentenceTokenizer

def tokenize_text_to_sentences(data: pd.Series):
  """
  Split data by line break "\n", sentence completion tokens into sentences
  Args:
    data: pandas.Series
  Returns:
    sentences: List[str]
  """
  pst = PunktSentenceTokenizer()
  sentences = list()
  for text in data.values:

    # Tokenize text into sentences
    text_sentences = pst.tokenize(text=text)
    sentences.extend(text_sentences)

  return sentences

### Tests

In [10]:
test_data = pd.DataFrame({"test": ["This is a random text.  This is another random text! 1. Again, you know. Nah I',m just kidding!!",
        "Sky is blue. Leaves are green. Roses are red."]})

expected = ["This is a random text.",
            "This is another random text!",
            "1.",
            "Again, you know.",
            "Nah I',m just kidding!",
            "!",
            "Sky is blue.",
            "Leaves are green.",
            "Roses are red."]


actual = tokenize_text_to_sentences(test_data["test"])
expect(actual).to(contain_exactly(*expected))

## Tokenize Sentences

In [11]:
import string
from typing import List
from nltk.tokenize import word_tokenize

def clean_words(words: List[str]):
  """
  Clean the words by removing punctuations, and numeric tokens
  Args:
    words List[str]: Raw words
  Returns:
    cleaned_words List[str] Words without punctuations, and numeric tokens
  """

  cleaned_words = list()
  for word in words:
    # Skip punctuations
    if word in string.punctuation:
      continue

    # Skip numbers
    if word.isnumeric():
      continue

    cleaned_words.append(word.lower())
  return cleaned_words


def tokenize_sentences(sentences: List[str]):
  """
  Tokenize sentences into tokens (words)
  Args:
    sentences List[str]: List of sentences
  Returns:
    tokenized_sentences List[List[str]]: List of tokenized sentences
  """
  tokenized_sentences = list()

  for sentence in sentences:
    # Tokenize sentence into words
    words = word_tokenize(text=sentence)
    cleaned_words = clean_words(words=words)
    if len(cleaned_words) == 0:
      continue
    tokenized_sentences.append(cleaned_words)

  return tokenized_sentences

### Tests

In [12]:
test_data = ["This is a random text.",
        "This is another random text!",
        "1.",
        "Again, you know.",
        "Nah I',m just kidding!",
        "!",
        "Sky is blue.",
        "Leaves are green.",
        "Roses are red."]

expecteds = [["this", "is", "a", "random", "text"],
            ["this", "is", "another", "random", "text"],
            ["again", "you", "know"],
            ["nah", "i", "m", "just", "kidding"],
            ["sky", "is", "blue"],
            ["leaves", "are", "green"],
            ["roses", "are", "red"]
            ]

actuals = tokenize_sentences(test_data)
for actual, expected in zip(actuals, expecteds):
  expect(actual).to(contain_exactly(*expected))

## Combine Sentence and Word Tokenization

In [13]:
def get_tokenized_data(data: pd.Series):
  """
  Make a list of tokenized sentences
  Args:
    data pandas.Series: Raw text
  Returns:
    tokens List[List[str]]: List of tokenized sentences
  """
  sentences = tokenize_text_to_sentences(data=data)
  tokens = tokenize_sentences(sentences=sentences)

  return tokens

### Tests

In [14]:
test_data = pd.DataFrame({"test": ["This is a random text.  This is another random text! 1. Again, you know. Nah I',m just kidding!!",
        "Sky is blue. Leaves are green. Roses are red."]})

expecteds = [["this", "is", "a", "random", "text"],
            ["this", "is", "another", "random", "text"],
            ["again", "you", "know"],
            ["nah", "i", "m", "just", "kidding"],
            ["sky", "is", "blue"],
            ["leaves", "are", "green"],
            ["roses", "are", "red"]
            ]

actuals = get_tokenized_data(test_data["test"])
for actual, expected in zip(actuals, expecteds):
  expect(actual).to(contain_exactly(*expected))

## Split Train and Test Sets

In [15]:
import random

tokenized_data = get_tokenized_data(df["instruction_output"])
random.seed(7)
random.shuffle(tokenized_data)

train_frac = 0.8 # @param {type:"slider", min:0, max:1, step:0.05}
train_size = int(len(tokenized_data)*train_frac)
train_data = tokenized_data[:train_size]
test_data = tokenized_data[train_size:]

### Tests

In [16]:
actual_data, expected_data = len(tokenized_data), 347576
actual_training, expected_training = len(train_data), 278060
actual_testing, expected_testing = len(test_data), 69516

print(f"{actual_data:,} are split into {actual_training:,} training entries"
      f" and {actual_testing:,} test set entries.")

347,576 are split into 278,060 training entries and 69,516 test set entries.


# Pre-processing 2

## Count words

In [17]:
from typing import List
from collections import defaultdict

def create_vocabulary(tokenized_sentences: List[List[str]]):
  """
  Count the number of word appearances in tokenized sentences
  Args:
    tokenized_sentences List[List[str]]: List of word tokenized sentences
  Returns:
    vocabulary Dict[str, int]: Dictionary that maps word(str) to its frequency(int)
  """

  vocabulary = defaultdict(int)
  for sentence in tokenized_sentences:
    for word in sentence:
      vocabulary[word]+=1

  return dict(vocabulary)

### Tests

In [18]:
from expects import have_keys

tokenized_sentences = [['sky', 'is', 'blue'],
                       ['leaves', 'are', 'green'],
                       ['roses', 'are', 'red']]

expected = {
    "sky":1,
    "is": 1,
    "blue": 1,
    "leaves": 1,
    "are": 2,
    "green": 1,
    "roses": 1,
    "red": 1
}

actual = create_vocabulary(tokenized_sentences)

expect(actual).to(have_keys(**expected))

## Out-of-Vocabulary Words

In [19]:
from typing import Dict, FrozenSet

def create_closed_vocabulary(vocabulary: Dict[str, int], threshold:int) -> FrozenSet[str]:
  """
  Find the words that appear more than the threshold frequency
  Args:
    vocabulary Dict[str, int]: Dictionary of word(str), frequency(int) in corpus
    threshold int: Minimum number of occurences for a word to be in closed vocabulary
  Returns:
    closed_vocabulary set: Set of words that appear `threshold` or more times
  """
  closed_vocabulary = set()
  for word, freq in vocabulary.items():
    if freq >= threshold:
      closed_vocabulary.add(word)
  return closed_vocabulary

### Tests

In [20]:
vocabulary = {
    "sky":1,
    "is": 1,
    "blue": 1,
    "leaves": 1,
    "are": 2,
    "green": 1,
    "roses": 1,
    "red": 1
}

expected = frozenset(["are"])
actual = create_closed_vocabulary(vocabulary, threshold=2)

expect(actual).to(contain_exactly(*expected))

## Parts Unknown

In [21]:
def replace_oov_words_by_unk(tokenized_sentences:List[List[str]], closed_vocabulary:FrozenSet[str], unknown_token="<unk>"):
  """
  Replace words not in the given vocabulary with '<unk>' token.
  Args:
    tokenized_sentences List[List[str]]: List of word tokenized sentences
    vocabulary Set[str]: Set of words that are most frequent
    unknown_token str: A string representing unknown (out-of-vocabulary) words
  Returns:
    replaced_tokenized_sentences List[List[str]]:  List of word tokenized sentences with out-of-vocabulary words converted to `unknown_token`
  """
  replaced_tokenized_sentences = list()

  for sentence in tokenized_sentences:
    replaced_sentence = list()
    for word in sentence:
      if word not in closed_vocabulary:
        replaced_sentence.append(unknown_token)
      else:
        replaced_sentence.append(word)

    replaced_tokenized_sentences.append(replaced_sentence)

  return replaced_tokenized_sentences

### Tests

In [22]:
tokenized_sentences = [["dogs", "run"], ["cats", "and", "dogs", "run"]]
closed_vocabulary = ["dogs", "run"]

expecteds = [["dogs", "run"], ["<unk>", "<unk>", "dogs", "run"]]

actuals = replace_oov_words_by_unk(tokenized_sentences, closed_vocabulary)

print("Original sentences:")
print(tokenized_sentences)

for actual, expected in zip(actuals, expecteds):
  expect(actual).to(contain_exactly(*expected))

print("Tokenized sentences with out-of-vocabulary words converted to '<unk>'")
print(actuals)

Original sentences:
[['dogs', 'run'], ['cats', 'and', 'dogs', 'run']]
Tokenized sentences with out-of-vocabulary words converted to '<unk>'
[['dogs', 'run'], ['<unk>', '<unk>', 'dogs', 'run']]


## Combine Closed Vocabulary and Replace Out-Of-Vocabulary words

Note that words and tokens are used interchangably.

In [23]:
def preprocess_data(train_data:List[List[str]], test_data:List[List[str]], threshold: int):
  """
  Preproces data i.e.
    - Create a vocabulary of training data
    - Create a closed vocabulary by of tokens that appear at least `threshold` times in the training data.
    - Replace the tokens that appear less than `threshold` times by "<unk>" for training and test data.
    Args:
      train_data List[List[str]]: List of word-tokenized sentences for training
      test_data List[List[str]]: List of word-tokenized sentences for testing
      threshold int:  Minimum number of occurences for a word to be in closed vocabulary
    Returns:
      (replaced_train_data, replaced_test_data, vocabulary) Tuple[List[List[str]], List[List[str]], Dict[str, int]]: Tuple of
      - training data with out-of-vocabulary words replaced by unknown token (<unk>)
      - testing data with out-of-vocabulary words replaced by unknown token (<unk>)
      - vocabulary
  """
  vocabulary = create_vocabulary(train_data)
  closed_vocabulary = create_closed_vocabulary(vocabulary, threshold)

  replaced_train_data = replace_oov_words_by_unk(train_data, closed_vocabulary)
  replaced_test_data = replace_oov_words_by_unk(test_data, closed_vocabulary)

  return replaced_train_data, replaced_test_data, vocabulary

### Tests

In [24]:
tmp_train = [["dogs", "run"], ["cats", "and", "dogs", "run"]]
tmp_test = [["dogs", "run", "after", "men", "cats", "and", "dogs"]]

repl_tmp_train_data, repl_tmp_test_data, tmp_vocabulary = preprocess_data(tmp_train, tmp_test, threshold=2)

print("repl_tmp_train_data")
print(repl_tmp_train_data)
expecteds = [["dogs", "run"], ["<unk>", "<unk>", "dogs", "run"]]
for actual, expected in zip(repl_tmp_train_data, expecteds):
  expect(actual).to(contain_exactly(*expected))

print("repl_tmp_test_data")
print(repl_tmp_test_data)
expecteds = [["dogs", "run", "<unk>", "<unk>", "<unk>", "<unk>", "dogs"]]
for actual, expected in zip(repl_tmp_test_data, expecteds):
  expect(actual).to(contain_exactly(*expected))

expected = {
    "dogs": 2,
    "run": 2,
    "cats": 1,
    "and": 1
}

print("tmp_vocabulary")
print(tmp_vocabulary)
expect(tmp_vocabulary).to(have_keys(**expected))

repl_tmp_train_data
[['dogs', 'run'], ['<unk>', '<unk>', 'dogs', 'run']]
repl_tmp_test_data
[['dogs', 'run', '<unk>', '<unk>', '<unk>', '<unk>', 'dogs']]
tmp_vocabulary
{'dogs': 2, 'run': 2, 'cats': 1, 'and': 1}


# Develop an N-Gram Based Language Model

In [25]:
%pip install tabulate --quiet

In [26]:
from functools import partial
from tabulate import tabulate

TABLE = partial(tabulate, tablefmt="orgtbl", headers="keys")

## Count N-Grams

In [27]:
from typing import Tuple, List, DefaultDict
from collections import  defaultdict

def compute_ngram_dictionary(sentences: List[List[str]], n:int, start_token: str="<s>", end_token:str="<e>") ->DefaultDict[Tuple[str], int]:
    """
    Create a dictionary of `n`-grams of `sentences`
        Args:
            sentences List[List[str]]: List of word-tokenized sentences
            n int: Number of words in sequence
            start_token str: Indicate the beginning of the sentence
            end_token str: Indicate the ending of the sentence
        Returns:
            n_gram_counts Dict[Tuple[str], int]: A dictionary that maps a tuple of `n`-words to its frequency
    """

    n_gram_counts = defaultdict(int)

    for sentence in sentences:

        # Prepend start token n times, and append end token one time
        sentence = [start_token]*n + sentence + [end_token]
        print(f"sentence = {sentence}")

        # Convert list to tuple to use sequence of words as a key in the dictionary
        sentence = tuple(sentence)
        print(f"tuple sentence = {sentence}")
        
        sentence_length = len(sentence)
        for i in range(0, sentence_length-n+1):
            n_gram = sentence[i:i+n]
            n_gram_counts[n_gram]+=1

    return n_gram_counts

### Tests

In [28]:
from expects import expect, have_keys

sentences = [["i","like", "a", "cat"],
             ["this", "dog", "is", "like", "a", "cat"]]

print("Uni-gram")
expected = {
    ("<s>",):2,
    ("i",): 1,
    ("like",): 2,
    ("a",): 2,
    ("cat",): 2,
    ("this",): 1,
    ("dog",): 1,
    ("is",): 1, 
}

actual = compute_ngram_dictionary(sentences, 1)
print(actual)
expect(actual).to(have_keys(expected))

print("Bi-gram")
expected = {
    ("<s>", "<s>"): 2,
    ("<s>", "i"): 1,
    ("i", "like"): 1,
    ("like", "a"): 2,
    ("a", "cat"): 2,
    ("<s>", "this"): 1,
    ("this", "dog"): 1,
    ("dog", "is"): 1,
    ("is", "like"): 1,
}

actual = compute_ngram_dictionary(sentences, 2)
print(actual)
expect(actual).to(have_keys(expected))

Uni-gram
sentence = ['<s>', 'i', 'like', 'a', 'cat', '<e>']
tuple sentence = ('<s>', 'i', 'like', 'a', 'cat', '<e>')
sentence = ['<s>', 'this', 'dog', 'is', 'like', 'a', 'cat', '<e>']
tuple sentence = ('<s>', 'this', 'dog', 'is', 'like', 'a', 'cat', '<e>')
defaultdict(<class 'int'>, {('<s>',): 2, ('i',): 1, ('like',): 2, ('a',): 2, ('cat',): 2, ('<e>',): 2, ('this',): 1, ('dog',): 1, ('is',): 1})
Bi-gram
sentence = ['<s>', '<s>', 'i', 'like', 'a', 'cat', '<e>']
tuple sentence = ('<s>', '<s>', 'i', 'like', 'a', 'cat', '<e>')
sentence = ['<s>', '<s>', 'this', 'dog', 'is', 'like', 'a', 'cat', '<e>']
tuple sentence = ('<s>', '<s>', 'this', 'dog', 'is', 'like', 'a', 'cat', '<e>')
defaultdict(<class 'int'>, {('<s>', '<s>'): 2, ('<s>', 'i'): 1, ('i', 'like'): 1, ('like', 'a'): 2, ('a', 'cat'): 2, ('cat', '<e>'): 2, ('<s>', 'this'): 1, ('this', 'dog'): 1, ('dog', 'is'): 1, ('is', 'like'): 1})


## Probability Estimates

In [29]:
from typing import DefaultDict

def estimate_probability(word: str,
                        previous_n_gram: List[str],
                        n_gram_counts: DefaultDict[Tuple[str], int],
                        n_plus1_gram_counts: DefaultDict[Tuple[str], int],
                        vocabulary_size: int,
                        k: float=1.0) -> float:
    """
    Estimate the probabilities of a next word using the n-gram counts with k-smoothing
        Args:
            word str: Next word
            previous_n_gram Tuple[str]: A sequence of words of length n
            n_gram_counts Dict[Tuple[str], int]: A dictionary that maps a tuple of `n`-words to its frequency
            n_plus1_gram_counts Dict[Tuple[str], int]: Dictionary of counts of (n+1)-grams
            vocabulary_size int: Number of words in the vocabulary
            k float: Smoothing parameter. Positive constant
        Returns:
            probability float: Probability of next word using the n-gram counts with k-smoothing
    """
    previous_n_gram = tuple(previous_n_gram)
    print(f"previous_n_gram = {previous_n_gram}")

    # Get the count of the previous n-gram  if exists in the dictionary of n-gram counts else set the count to zero
    previous_n_gram_count = n_gram_counts.get(previous_n_gram, 0)

    # Calculate the denominator using the count of the previous n gram and apply k-smoothing
    denominator = previous_n_gram_count + k*vocabulary_size

    # Define n plus 1 gram as the previous n-gram plus the current word
    n_plus1_gram = previous_n_gram + (word,)
    print(f"n_plus1_gram = {n_plus1_gram}")

    n_plus1_gram_count = n_plus1_gram_counts.get(n_plus1_gram, 0)

    # Calculate the numerator using the count of the n-gram plus current word
    numerator = n_plus1_gram_count + k

    probabiltiy = numerator/denominator

    return probabiltiy


### Tests

In [30]:
import math
from expects import expect, be_true

sentences = [["i","like", "a", "cat"],
             ["this", "dog", "is", "like", "a", "cat"]]

vocabulary = frozenset(sentences[0]).union(frozenset(sentences[1]))

unigram_dictionary = compute_ngram_dictionary(sentences, 1)

bigram_dictionary = compute_ngram_dictionary(sentences, 2)

actual = estimate_probability("cat", ["a"], unigram_dictionary, bigram_dictionary, len(vocabulary))

expected = 0.3333

print(f"The estimated probability of word 'cat' given the previous n-gram 'a' is: {actual:.4f}")
expect(math.isclose(actual, expected, abs_tol=1e-4)).to(be_true)

sentence = ['<s>', 'i', 'like', 'a', 'cat', '<e>']
tuple sentence = ('<s>', 'i', 'like', 'a', 'cat', '<e>')
sentence = ['<s>', 'this', 'dog', 'is', 'like', 'a', 'cat', '<e>']
tuple sentence = ('<s>', 'this', 'dog', 'is', 'like', 'a', 'cat', '<e>')
sentence = ['<s>', '<s>', 'i', 'like', 'a', 'cat', '<e>']
tuple sentence = ('<s>', '<s>', 'i', 'like', 'a', 'cat', '<e>')
sentence = ['<s>', '<s>', 'this', 'dog', 'is', 'like', 'a', 'cat', '<e>']
tuple sentence = ('<s>', '<s>', 'this', 'dog', 'is', 'like', 'a', 'cat', '<e>')
previous_n_gram = ('a',)
n_plus1_gram = ('a', 'cat')
The estimated probability of word 'cat' given the previous n-gram 'a' is: 0.3333


## Estimate probabilities for all words

In [31]:
from typing import FrozenSet

def estimate_probabilities(previous_n_gram: List[str], 
                            n_gram_counts: DefaultDict[Tuple[str], int],
                            n_plus1_gram_counts: DefaultDict[Tuple[str],int],
                            vocabulary: FrozenSet[str],
                            k:float=1.0):
    """
    Estimate the probabilities of next word using the n-gram counts with k-smoothing
        Args:
            previous_n_gram Tuple[str]: Sequence of words of length
            n_gram_counts DefaultDict[Tuple[str], int]: Dictionary of counts of n-grams
            n_plus1_gram_counts DefaultDict[Tuple[str], int]: Dictionary of counts of (n+1)-grams
            vocabulary List[str]: List of words
            k float: Smoothing parameter. Positive constant
        Returns:
            probabilities Dict[str, int]: A dictionary mapping from next words to the probability
    """

    # Add end_token and unknown_token to the vocabulary
    # start_token is not needed since it should not appear as next word
    vocabulary_list = list(vocabulary)+ ["<e>", "<unk>"]
    vocabulary_size = len(vocabulary_list)

    probabilities = dict()
    for word in vocabulary_list:
        probability = estimate_probability(word,  previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k)
        probabilities[word] = probability
    
    return probabilities

### Tests

In [32]:
from expects import expect, have_keys

sentences = [["i","like", "a", "cat"],
             ["this", "dog", "is", "like", "a", "cat"]]

vocabulary = frozenset(sentences[0]).union(frozenset(sentences[1]))
unigram_counts = compute_ngram_dictionary(sentences, 1)
bigram_counts = compute_ngram_dictionary(sentences, 2)

actual = estimate_probabilities("a", unigram_counts, bigram_counts, vocabulary, k=1)
print(actual)
expected = {'a': 0.09090909090909091,
            'is': 0.09090909090909091,
            'like': 0.09090909090909091,
            'i': 0.09090909090909091,
            'this': 0.09090909090909091,
            'dog': 0.09090909090909091,
            'cat': 0.2727272727272727,
            '<e>': 0.09090909090909091,
            '<unk>': 0.09090909090909091}
expect(actual).to(have_keys(**expected))

sentence = ['<s>', 'i', 'like', 'a', 'cat', '<e>']
tuple sentence = ('<s>', 'i', 'like', 'a', 'cat', '<e>')
sentence = ['<s>', 'this', 'dog', 'is', 'like', 'a', 'cat', '<e>']
tuple sentence = ('<s>', 'this', 'dog', 'is', 'like', 'a', 'cat', '<e>')
sentence = ['<s>', '<s>', 'i', 'like', 'a', 'cat', '<e>']
tuple sentence = ('<s>', '<s>', 'i', 'like', 'a', 'cat', '<e>')
sentence = ['<s>', '<s>', 'this', 'dog', 'is', 'like', 'a', 'cat', '<e>']
tuple sentence = ('<s>', '<s>', 'this', 'dog', 'is', 'like', 'a', 'cat', '<e>')
previous_n_gram = ('a',)
n_plus1_gram = ('a', 'this')
previous_n_gram = ('a',)
n_plus1_gram = ('a', 'cat')
previous_n_gram = ('a',)
n_plus1_gram = ('a', 'is')
previous_n_gram = ('a',)
n_plus1_gram = ('a', 'dog')
previous_n_gram = ('a',)
n_plus1_gram = ('a', 'a')
previous_n_gram = ('a',)
n_plus1_gram = ('a', 'i')
previous_n_gram = ('a',)
n_plus1_gram = ('a', 'like')
previous_n_gram = ('a',)
n_plus1_gram = ('a', '<e>')
previous_n_gram = ('a',)
n_plus1_gram = ('a', '<unk>')


In [33]:
trigram_counts = compute_ngram_dictionary(sentences, 3)
actual = estimate_probabilities(["<s>", "<s>"], bigram_counts, trigram_counts, vocabulary, k=1)
print(actual)

expected = {'a': 0.09090909090909091,
            'is': 0.09090909090909091,
            'like': 0.09090909090909091,
            'i': 0.18181818181818182, 
            'this': 0.18181818181818182, 
            'dog': 0.09090909090909091, 
            'cat': 0.09090909090909091, 
            '<e>': 0.09090909090909091, 
            '<unk>': 0.09090909090909091}
expect(actual).to(have_keys(**expected))

sentence = ['<s>', '<s>', '<s>', 'i', 'like', 'a', 'cat', '<e>']
tuple sentence = ('<s>', '<s>', '<s>', 'i', 'like', 'a', 'cat', '<e>')
sentence = ['<s>', '<s>', '<s>', 'this', 'dog', 'is', 'like', 'a', 'cat', '<e>']
tuple sentence = ('<s>', '<s>', '<s>', 'this', 'dog', 'is', 'like', 'a', 'cat', '<e>')
previous_n_gram = ('<s>', '<s>')
n_plus1_gram = ('<s>', '<s>', 'this')
previous_n_gram = ('<s>', '<s>')
n_plus1_gram = ('<s>', '<s>', 'cat')
previous_n_gram = ('<s>', '<s>')
n_plus1_gram = ('<s>', '<s>', 'is')
previous_n_gram = ('<s>', '<s>')
n_plus1_gram = ('<s>', '<s>', 'dog')
previous_n_gram = ('<s>', '<s>')
n_plus1_gram = ('<s>', '<s>', 'a')
previous_n_gram = ('<s>', '<s>')
n_plus1_gram = ('<s>', '<s>', 'i')
previous_n_gram = ('<s>', '<s>')
n_plus1_gram = ('<s>', '<s>', 'like')
previous_n_gram = ('<s>', '<s>')
n_plus1_gram = ('<s>', '<s>', '<e>')
previous_n_gram = ('<s>', '<s>')
n_plus1_gram = ('<s>', '<s>', '<unk>')
{'this': 0.18181818181818182, 'cat': 0.09090909090909091, 'is': 0.0

## Count and probability matrices

In [34]:
import pandas as pd
import numpy as np

from typing import DefaultDict, Tuple, FrozenSet

def make_count_matrix(n_plus1_gram_counts: DefaultDict[Tuple[str], int], vocabulary: FrozenSet[str]):
    # Add <e> <unk> to the vocabulary
    # <s> is omitted since it should not appear as the next word
    vocabulary = vocabulary.union(["<e>", "<unk>"])
    print(f"vocabulary = {vocabulary}")

    # Obtain unique n-grams
    n_grams = []
    for n_plus1_gram in n_plus1_gram_counts.keys():
        print(f"key = {n_plus1_gram}")
        n_gram = n_plus1_gram[0:-1]
        print(f"n_gram = {n_gram}")
        n_grams.append(n_gram)
    n_grams = list(set(n_grams))
    print(f"n_grams = {n_grams}")

    # mapping from n-gram to row
    row_index = {n_gram: i for i, n_gram in enumerate(n_grams)}

    # mapping from next word to column
    col_index = {word: j for j, word in enumerate(vocabulary)}

    n_rows = len(n_grams)
    n_cols = len(vocabulary)
    count_matrix = np.zeros((n_rows, n_cols))

    for n_plus1_gram, count in n_plus1_gram_counts.items():
        n_gram = n_plus1_gram[0:-1]
        word = n_plus1_gram[-1]

        if word not in vocabulary:
            continue
        i = row_index[n_gram]
        j = col_index[word]
        count_matrix[i, j] = count
    
    print(f"count_matrix = {count_matrix}")
    count_matrix = pd.DataFrame(count_matrix, index=n_grams, columns=vocabulary)
    return count_matrix


In [35]:
sentences = [["i", "like", "a", "cat"],
             ["this", "dog", "is", "like", "a", "cat"]]
vocabulary = frozenset(sentences[0]).union(frozenset(sentences[1]))
bigram_counts = compute_ngram_dictionary(sentences, 2)

print("bigram_counts")
print(TABLE(make_count_matrix(bigram_counts, vocabulary)))

sentence = ['<s>', '<s>', 'i', 'like', 'a', 'cat', '<e>']
tuple sentence = ('<s>', '<s>', 'i', 'like', 'a', 'cat', '<e>')
sentence = ['<s>', '<s>', 'this', 'dog', 'is', 'like', 'a', 'cat', '<e>']
tuple sentence = ('<s>', '<s>', 'this', 'dog', 'is', 'like', 'a', 'cat', '<e>')
bigram_counts
vocabulary = frozenset({'this', '<e>', 'is', 'cat', 'dog', 'a', 'i', 'like', '<unk>'})
key = ('<s>', '<s>')
n_gram = ('<s>',)
key = ('<s>', 'i')
n_gram = ('<s>',)
key = ('i', 'like')
n_gram = ('i',)
key = ('like', 'a')
n_gram = ('like',)
key = ('a', 'cat')
n_gram = ('a',)
key = ('cat', '<e>')
n_gram = ('cat',)
key = ('<s>', 'this')
n_gram = ('<s>',)
key = ('this', 'dog')
n_gram = ('this',)
key = ('dog', 'is')
n_gram = ('dog',)
key = ('is', 'like')
n_gram = ('is',)
n_grams = [('<s>',), ('cat',), ('like',), ('this',), ('a',), ('dog',), ('i',), ('is',)]
count_matrix = [[1. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 2. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 2. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 2. 0. 0

In [36]:
print("trigram counts")
trigram_counts = compute_ngram_dictionary(sentences, 3)
print(TABLE(make_count_matrix(trigram_counts, vocabulary)))

trigram counts
sentence = ['<s>', '<s>', '<s>', 'i', 'like', 'a', 'cat', '<e>']
tuple sentence = ('<s>', '<s>', '<s>', 'i', 'like', 'a', 'cat', '<e>')
sentence = ['<s>', '<s>', '<s>', 'this', 'dog', 'is', 'like', 'a', 'cat', '<e>']
tuple sentence = ('<s>', '<s>', '<s>', 'this', 'dog', 'is', 'like', 'a', 'cat', '<e>')
vocabulary = frozenset({'this', '<e>', 'is', 'cat', 'dog', 'a', 'i', 'like', '<unk>'})
key = ('<s>', '<s>', '<s>')
n_gram = ('<s>', '<s>')
key = ('<s>', '<s>', 'i')
n_gram = ('<s>', '<s>')
key = ('<s>', 'i', 'like')
n_gram = ('<s>', 'i')
key = ('i', 'like', 'a')
n_gram = ('i', 'like')
key = ('like', 'a', 'cat')
n_gram = ('like', 'a')
key = ('a', 'cat', '<e>')
n_gram = ('a', 'cat')
key = ('<s>', '<s>', 'this')
n_gram = ('<s>', '<s>')
key = ('<s>', 'this', 'dog')
n_gram = ('<s>', 'this')
key = ('this', 'dog', 'is')
n_gram = ('this', 'dog')
key = ('dog', 'is', 'like')
n_gram = ('dog', 'is')
key = ('is', 'like', 'a')
n_gram = ('is', 'like')
n_grams = [('a', 'cat'), ('this', 'd

## Probabilty Matrix

In [37]:
def make_probability_matrix(n_plus1_gram_counts: DefaultDict[Tuple[str], int], vocabulary: FrozenSet[str], k:float=1.0):
    count_matrix = make_count_matrix(n_plus1_gram_counts, vocabulary)
    count_matrix += k
    probability_matrix = count_matrix.div(count_matrix.sum(axis="columns"), axis="rows")
    return probability_matrix


In [38]:
sentences = [["i", "like", "a", "cat"],
             ["this", "dog", "is", "like", "a", "cat"]]

vocabulary = frozenset(sentences[0]).union(sentences[1])
bigram_counts = compute_ngram_dictionary(sentences, 2)
print("bigram probabilties")
print(TABLE(make_probability_matrix(bigram_counts, vocabulary, k=1)))

sentence = ['<s>', '<s>', 'i', 'like', 'a', 'cat', '<e>']
tuple sentence = ('<s>', '<s>', 'i', 'like', 'a', 'cat', '<e>')
sentence = ['<s>', '<s>', 'this', 'dog', 'is', 'like', 'a', 'cat', '<e>']
tuple sentence = ('<s>', '<s>', 'this', 'dog', 'is', 'like', 'a', 'cat', '<e>')
bigram probabilties
vocabulary = frozenset({'this', '<e>', 'is', 'cat', 'dog', 'a', 'i', 'like', '<unk>'})
key = ('<s>', '<s>')
n_gram = ('<s>',)
key = ('<s>', 'i')
n_gram = ('<s>',)
key = ('i', 'like')
n_gram = ('i',)
key = ('like', 'a')
n_gram = ('like',)
key = ('a', 'cat')
n_gram = ('a',)
key = ('cat', '<e>')
n_gram = ('cat',)
key = ('<s>', 'this')
n_gram = ('<s>',)
key = ('this', 'dog')
n_gram = ('this',)
key = ('dog', 'is')
n_gram = ('dog',)
key = ('is', 'like')
n_gram = ('is',)
n_grams = [('<s>',), ('cat',), ('like',), ('this',), ('a',), ('dog',), ('i',), ('is',)]
count_matrix = [[1. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 2. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 2. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 2