In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
nltk.download('punkt')
nltk.download('gutenberg')



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Drogias\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\Drogias\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [2]:
gutenberg_corpus = nltk.corpus.gutenberg.fileids()                                 #Get all the files
gutenberg_corpus

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [3]:
combined_text = ""             
for file_id in gutenberg_corpus:                                        # Combine the text from all files
    combined_text += nltk.corpus.gutenberg.raw(file_id)

print(combined_text[:500])

[Emma by Jane Austen 1816]

VOLUME I

CHAPTER I


Emma Woodhouse, handsome, clever, and rich, with a comfortable home
and happy disposition, seemed to unite some of the best blessings
of existence; and had lived nearly twenty-one years in the world
with very little to distress or vex her.

She was the youngest of the two daughters of a most affectionate,
indulgent father; and had, in consequence of her sister's marriage,
been mistress of his house from a very early period.  Her mother
had died t


In [4]:
combined_text = combined_text.lower()                              #Convert to lowercase

In [5]:
combined_text[:500]

"[emma by jane austen 1816]\n\nvolume i\n\nchapter i\n\n\nemma woodhouse, handsome, clever, and rich, with a comfortable home\nand happy disposition, seemed to unite some of the best blessings\nof existence; and had lived nearly twenty-one years in the world\nwith very little to distress or vex her.\n\nshe was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period.  her mother\nhad died t"

In [6]:
import re
def remove_special_chars(text):
   text = text.replace('[', '')
   text = text.replace('[', '')
   text = text.replace('\n', ' ')
   text = re.sub(r'[^a-zA-z.?!\']', ' ', text)                     #Remove these characters   

   return text

In [7]:
combined_text = remove_special_chars(combined_text)
combined_text[:500]

"emma by jane austen     ]  volume i  chapter i   emma woodhouse  handsome  clever  and rich  with a comfortable home and happy disposition  seemed to unite some of the best blessings of existence  and had lived nearly twenty one years in the world with very little to distress or vex her.  she was the youngest of the two daughters of a most affectionate  indulgent father  and had  in consequence of her sister's marriage  been mistress of his house from a very early period.  her mother had died to"

In [8]:
len(combined_text.split())                     

2119923

In [9]:
len(combined_text)                             # How many characters

11793187

In [10]:
combined_text[11792000:]

"abes in wombs  latent  folded  compact  sleeping  billions of billions  and trillions of trillions of them waiting   on earth and in the sea  the universe  the stars there in the     heavens   urging slowly  surely forward  forming endless  and waiting ever more  forever more behind.       good bye my fancy!  good bye my fancy! farewell dear mate  dear love! i'm going away  i know not where  or to what fortune  or whether i may ever see you again  so good bye my fancy.  now for my last  let me look back a moment  the slower fainter ticking of the clock is in me  exit  nightfall  and soon the heart thud stopping.  long have we lived  joy'd  caress'd together  delightful!  now separation  good bye my fancy.  yet let me not be too hasty  long indeed have we lived  slept  filter'd  become really blended     into one  then if we die we die together   yes  we'll remain one   if we go anywhere we'll go together to meet what happens  may be we'll be better off and blither  and learn something

In [11]:
def tokenize_sentences(text):
    sentences = nltk.sent_tokenize(''.join(text))                  #Get the sentences
    return sentences     

In [12]:
sentences = tokenize_sentences(combined_text) 
print(len(sentences))    
print(sentences[2])  
print(sentences[57649])

96284
her mother had died too long ago for her to have more than an indistinct remembrance of her caresses  and her place had been supplied by an excellent woman as governess  who had fallen little short of a mother in affection.
pray excuse me if i am wrong.


In [13]:
def tokenize_words(text):
    words = nltk.word_tokenize(text)
    return words

In [14]:
words = tokenize_words(combined_text)
print(len(words))
print(words[3])
print(words[-2])

2235633
austen
fancy


In [15]:
words_list = [tokenize_words(f) for f in sentences]              # list of all the words

In [16]:
for word in words_list[1]:
    print(word)

she
was
the
youngest
of
the
two
daughters
of
a
most
affectionate
indulgent
father
and
had
in
consequence
of
her
sister
's
marriage
been
mistress
of
his
house
from
a
very
early
period
.


In [17]:
import random
import math

random.shuffle(words_list)
train_len = math.floor(0.6 * len(words_list))                      #Training set length(60%)
dev_len = math.floor(0.2 * len(words_list))                        #Development set length (20%)
test_len = math.floor(0.2 * len(words_list))                       #Test set length (20%)

training_set = []
development_set = []
test_set = []

for content in words_list[0:train_len]:
    training_set.append(content)
    
for content in words_list[train_len: train_len + dev_len]:
    development_set.append(content)

for content in words_list[train_len + dev_len:]:
    test_set.append(content)

In [18]:
import itertools
from abc import ABCMeta, abstractmethod
from collections import Counter

from nltk.util import ngrams

START_TOKEN = "<start>"
END_TOKEN = "<end>"


class INgramModel:
    """
    An interface for all N-gram models.
    """
    __metaclass__ = ABCMeta

    @abstractmethod
    def fit(self, sentences_tokenized: list[list[str]]) -> None:
        """
        Train the model on a tokenized selection of sentences.
        :param sentences_tokenized: a list of all sentences. Each sentence is represented as a list of string tokens.
        :return: None
        """
        pass

    @abstractmethod
    def predict(self, tokenized_sentence: list[str]) -> str:
        """
        Predict the next word in a given sentence. Uses n-gram probability with Laplace Smoothing.
        :param tokenized_sentence: a list of string tokens
        :raise Runtime Error: if the model has not been trained
        :return: the most probable token
        """
        return ""

    @abstractmethod
    def prediction_proba(self, tokenized_sentence: list[str], token: str) -> float:
        """
        Get the model's probability for a specific token given a sentence.
        :param tokenized_sentence: a list of string tokens
        :param token: the token
        :raise Runtime Error: if the model has not been trained
        :return: the probability that the token is next
        """
        return 0


class BigramModel(INgramModel):
    """
    A basic bigram model using Laplace Smoothing.
    """

    def __init__(self, alpha: float):
        """
        Create a bigram model.
        :param alpha: the Laplace smoothing parameter. Must be between 0 and 1 (excluding 0)
        """
        if alpha > 1.0 or alpha <= 0:
            raise ValueError(f"Alpha value must be between 0 (exclusive) and 1 (value given alpha={alpha})")

        self.vocab_len = 0
        self.alpha = alpha
        self.bigram_counter = Counter()
        self.unigram_counter = Counter()

    def fit(self, sentences_tokenized: list[list[str]]) -> None:
        self.vocab_len = len(set(itertools.chain.from_iterable(sentences_tokenized)))

        for sentence in sentences_tokenized:
            formatted_sentence = [START_TOKEN] + sentence + [END_TOKEN]
            self.unigram_counter.update(_process_ngrams(formatted_sentence, 1))
            self.bigram_counter.update(_process_ngrams(formatted_sentence, 2))

    def predict(self, tokenized_sentence: list[str]) -> str:
        assert tokenized_sentence is not None

        if self.vocab_len == 0:
            raise RuntimeError("Model has not been trained.")

        max_prob = -1
        max_token = None

        for token in self.unigram_counter.keys():
            prob = self.prediction_proba(tokenized_sentence, token)

            if prob > max_prob:
                max_prob = prob
                max_token = token

        return max_token

    def prediction_proba(self, tokenized_sentence: list[str], token: str) -> float:
        assert tokenized_sentence is not None

        if self.vocab_len == 0:
            raise RuntimeError("Model has not been trained.")

        formatted_sentence = [START_TOKEN] + [START_TOKEN] + tokenized_sentence

        return ((self.bigram_counter[(formatted_sentence[-1], token)] + self.alpha) /
                (self.unigram_counter[token] + self.alpha * self.vocab_len))


class TrigramModel(INgramModel):
    """
    A basic trigram model using Laplace Smoothing.
    """

    def __init__(self, alpha: float):
        """
        Create a trigram model.
        :param alpha: the Laplace smoothing parameter. Must be between 0 and 1 (excluding 0)
        """
        if alpha > 1.0 or alpha <= 0:
            raise ValueError(f"Alpha value must be between 0 (exclusive) and 1 (value given alpha={alpha})")

        self.vocab = {}
        self.alpha = alpha
        self.bigram_counter = Counter()
        self.trigram_counter = Counter()

    def fit(self, sentences_tokenized: list[list[str]]) -> None:
        self.vocab = set(itertools.chain.from_iterable(sentences_tokenized))

        for sentence in sentences_tokenized:
            formatted_sentence = [START_TOKEN] + [START_TOKEN] + sentence + [END_TOKEN]
            self.bigram_counter.update(_process_ngrams(formatted_sentence, 2))
            self.trigram_counter.update(_process_ngrams(formatted_sentence, 3))

    def predict(self, tokenized_sentence: list[str]) -> tuple[str, float]:
        assert tokenized_sentence is not None

        if self.vocab == {}:
            raise RuntimeError("Model has not been trained.")

        max_prob = -1
        max_token = None

        for token in self.vocab:
            prob = self.prediction_proba(tokenized_sentence, token)

            if prob > max_prob:
                max_prob = prob
                max_token = token

        return max_token

    def prediction_proba(self, tokenized_sentence: list[str], token: str) -> float:
        assert tokenized_sentence is not None

        if self.vocab == {}:
            raise RuntimeError("Model has not been trained.")

        formatted_sentence = [START_TOKEN] + [START_TOKEN] + tokenized_sentence
        return ((self.trigram_counter[(formatted_sentence[-2], formatted_sentence[-1], token)] + self.alpha) /
                (self.bigram_counter[(formatted_sentence[-1], token)] + self.alpha * len(self.vocab)))


# I could generalize this to support combinations of unigrams, bigrams and trigrams, but we'll see
class LinearInterpolationModel(INgramModel):
    """
    A model using linear interpolation between a bigram and trigram model.
    """

    def __init__(self, alpha: float, lamda: float):
        """
        Create a linear interpolation model between a bigram and trigram model.
        :param alpha: the Laplace smoothing parameter. Must be between 0 and 1 (excluding 0)
        :param lamda: the interpolation parameter, where probability = lambda * (bigram probability)
        + (1-lamda) * (trigram probability)
        """
        if lamda > 1.0 or lamda <= 0:
            raise ValueError(f"Lamda value must be between 0 (exclusive) and 1 (value given alpha={lamda})")

        self.bigram_model = BigramModel(alpha)
        self.trigram_model = TrigramModel(alpha)
        self.lamda = lamda

    def fit(self, sentences_tokenized: list[list[str]]) -> None:
        self.bigram_model.fit(sentences_tokenized)
        self.trigram_model.fit(sentences_tokenized)

    def predict(self, tokenized_sentence: list[str]) -> tuple[str, float]:
        if self.bigram_model.vocab_len == 0:
            raise RuntimeError("Model has not been trained.")

        # no need for sentence checking here, the underlying classes will take care of it
        max_prob = -1
        max_token = None

        for token in self.trigram_model.vocab:
            prob = self.prediction_proba(tokenized_sentence, token)
            if prob > max_prob:
                max_prob = prob
                max_token = token

        return max_token

    def prediction_proba(self, tokenized_sentence: list[str], token: str) -> float:
        bigram_prob = self.bigram_model.prediction_proba(tokenized_sentence, token)
        trigram_prob = self.trigram_model.prediction_proba(tokenized_sentence, token)
        return self.lamda * bigram_prob + (1 - self.lamda) * trigram_prob


def _calc_ngrams(all_corpus: list[str], ngram: int) -> Counter:
    """
    Process a tokenized sentence into a list of ngrams.
    :param all_corpus: a list of all the corpus words
    :param ngram: whether the ngrams will be unigrams, bigrams etc
    :return: the counter of either unigram, bigram or trigram
    """
    unigram_counter = Counter()
    bigram_counter = Counter()
    trigram_counter = Counter()
     
    

    if ngram == 1 :
        for sentence in all_corpus:
             grams = [gram for gram in ngrams(sentence, ngram, pad_left=True, pad_right=True,
                                    left_pad_symbol=START_TOKEN, right_pad_symbol=END_TOKEN)]
             unigram_counter.update(grams)
        return unigram_counter
        
    elif ngram == 2:
        for sentence in all_corpus:
             grams = [gram for gram in ngrams(sentence, ngram, pad_left=True, pad_right=True,
                                    left_pad_symbol=START_TOKEN, right_pad_symbol=END_TOKEN)]
             bigram_counter.update(grams)
        return bigram_counter
        
    elif ngram == 3:
        for sentence in all_corpus:
             grams = [gram for gram in ngrams(sentence, ngram, pad_left=True, pad_right=True,
                                    left_pad_symbol=START_TOKEN, right_pad_symbol=END_TOKEN)]
             trigram_counter.update(grams)
        return trigram_counter
        
    return 0
    
        

In [19]:
def replace_OOV_words_train(all_corpus):
    unigram_counter = _calc_ngrams(all_corpus,1)
    OOV_words = {}

    for k, v in unigram_counter.items():
        if v < 10:
            key = k[0]
            OOV_words[key] = "UNK"

    replaced_corpus = []                          #the original corpus having the OOV words replaced by 'UNK'
    for sentence in all_corpus:
        clean_sentence = []
    
        for n in sentence:
            clean_sentence.append(OOV_words.get(n, n))
    
        replaced_corpus.append(clean_sentence)


    vocabulary = []

    for key in unigram_counter.keys():        #Iterate the unigram counter
        word = key[0]                         #get the word
        if word not in OOV_words:
            vocabulary.append(word)

    vocabulary = set(vocabulary)              #Keep unique words
    return vocabulary, replaced_corpus, OOV_words
    

In [20]:
vocabulary, new_corpus, OOV_words = replace_OOV_words_train(training_set)

In [21]:
def replace_OOV_words_test(all_corpus, vocabulary, oov_words):
    
    replaced_corpus = []
    for sentence in all_corpus:
        updated_sent = []

        for word in sentence:
            if (word not in vocabulary) or (word in oov_words):
                updated_sent.append('UNK')
            else:
                updated_sent.append(word)
                
    replaced_corpus.append(updated_sent)
    return replaced_corpus   

In [22]:
development_set = replace_OOV_words_test(development_set, vocabulary, OOV_words)
test_set = replace_OOV_words_test(test_set, vocabulary, OOV_words)