In [13]:
import re
import pandas as pd
from collections import Counter
from tqdm import tqdm


class BigramModel:
    def __init__(self, tokens: list):
        if (tokens == None):
            print("Tokens cannot be null!")
        else:
            self.tokens: list = self._add_sentence_boundaries(tokens)
            _bigram_counts = self._make_count_bigrams(self.tokens)
            _bigram_count_tuples = _bigram_counts.most_common(len(_bigram_counts))
            self.frequencyTable: pd.DataFrame = pd.DataFrame(_bigram_count_tuples, columns=['bigram', 'count'])

    def probability(self, w: str, w_n: str, smooth_constant: float = 0.0) -> float:
        pass

    def perplexity(self, sent: list, smoothing_constant: float = 1.0) -> float:
        pass

    def choose_successor(self, word: str, smooth_constant: float = 0.0) -> str:
        pass

    def _make_count_bigrams(self, tokens: list) -> Counter:
        """
        @param tokens: list of tokenized sentences
        Takes a list of tokenized sentences and generates the appropriate bigrams and counts them
        """
        bigram_counts = Counter()
        for p, words in enumerate(tqdm(tokens, ncols=100, desc='Making and counting Bigrams')):
            bigrams: list = []
            for i in range(len(words) - 1):
                bigrams.append((words[i], words[i + 1]))
            bigram_counts += Counter(bigrams)
        return bigram_counts

    def _add_sentence_boundaries(self, tokens: list) -> list:
        """
        @param tokens: list of tokenized sentences
        Takes a list of tokenized sentences and adds sentence boundaries to all the sentences
        """
        tokens_without_punctuation = self._remove_punctuation_tokens(tokens)
        tokens_with_boundaries: list = []
        item: list
        for i, item in enumerate(tqdm(tokens_without_punctuation, ncols=100, desc='Adding boundaries')):
            item.insert(0, "<s>")
            item.append("</s>")
            tokens_with_boundaries.append(item)
        return tokens_with_boundaries

    def _remove_punctuation_tokens(self, tokens: list):
        """
        @param tokens: list of tokenized sentences
        Takes a list of tokenized sentences and removes the tokens that solemnly consist of punctuation from the list
        """
        to_return: list = []
        for sent in tokens:
            to_return.append([p.lower() for p in sent if not re.match('\W', p)])
        return to_return

In [14]:
#import Model
from corpusreader import CorpusReader
import pandas as pd

reader = CorpusReader("./train")
model = BigramModel(reader.sents())

Adding boundaries: 100%|█████████████████████████████████| 11909/11909 [00:00<00:00, 1315199.62it/s]
Making and counting Bigrams: 100%|███████████████████████████| 11909/11909 [00:36<00:00, 323.57it/s]


In [15]:
model.frequencyTable

Unnamed: 0,bigram,count
0,"(of, the)",1610
1,"(<s>, i)",1049
2,"(<s>, the)",1033
3,"(in, the)",999
4,"(<s>, he)",950
...,...,...
96575,"(lilac, before)",1
96576,"(before, breakfast)",1
96577,"(breakfast, with)",1
96578,"(great, unconscious)",1
