In [10]:
import re
import pandas as pd
from collections import Counter
import nltk

class Model:
    """Bigram frequency model built from a list of sentence strings.

    Attributes set by ``__init__``:
        tokens: the kept sentences, lower-cased and wrapped as
            ``"<s> sentence </s>"``. Sentences whose first character is a
            non-word character are dropped.
        bigramCounts: ``Counter`` mapping each ``"w1 w2"`` bigram to the
            number of times it occurs.
        frequencyTable: DataFrame with the columns ``bigram`` and ``count``,
            one row per distinct bigram, ordered by descending count.
    """

    def __init__(self, tokens: list):
        """Count the bigrams of ``tokens`` and build the frequency table.

        Args:
            tokens: list of sentence strings.
        """
        # Raw string: '\W' in a normal literal is an invalid escape sequence.
        cleaned = [i.lower() for i in tokens if not re.match(r'\W', i)]
        self.tokens: list = self.addSentenceBoundaries(cleaned, True)
        # Keep the Counter on the instance so later cells can inspect it.
        self.bigramCounts: Counter = self.countBigrams(self.tokens)
        print("Bigram counting done!")
        bigramCountTuples = self.bigramCounts.most_common()
        # Build the table in one shot; appending row by row through .loc
        # re-allocates the frame on every insert.
        self.frequencyTable: pd.DataFrame = pd.DataFrame(
            bigramCountTuples, columns=['bigram', 'count']
        )
        l = len(bigramCountTuples)
        self.printProgressBar(l, l, prefix = 'Progress:', suffix = 'Complete', length = 50)

    def countBigrams(self, tokens: list) -> Counter:
        """Count adjacent word pairs over all sentences.

        Each sentence is split on single spaces and the ``"</s>"`` marker is
        removed before pairing, so no counted bigram ends in ``"</s>"``.

        Args:
            tokens: list of sentence strings (normally already wrapped by
                ``addSentenceBoundaries``).

        Returns:
            Counter mapping ``"w1 w2"`` to its number of occurrences.
        """
        bigramCounts = Counter()
        l = len(tokens)
        self.printProgressBar(0, l, prefix='Counting Bigrams:', suffix= 'Complete', length = 50)
        for sentenceIndex, sentence in enumerate(tokens):
            words: list = sentence.split(" ")
            # Tolerate sentences that were not wrapped with boundary markers.
            if "</s>" in words:
                words.remove("</s>")
            # update() adds in place; `+=` would rescan the whole Counter
            # for every sentence.
            bigramCounts.update(
                words[j] + " " + words[j + 1] for j in range(len(words) - 1)
            )
            # Use the sentence index, not the word index, for the progress.
            self.printProgressBar(sentenceIndex + 1, l, prefix='Counting Bigrams:', suffix= 'Complete', length = 50)
        return bigramCounts

    def addSentenceBoundaries(self, tokens: list, useProgressBar=False) -> list:
        """Wrap every sentence as ``"<s> sentence </s>"``.

        Args:
            tokens: list of sentence strings.
            useProgressBar: when true, print a progress bar while wrapping.

        Returns:
            New list with the wrapped sentences, in the original order.
        """
        tokensWithBoundaries: list = []
        l = len(tokens)
        if useProgressBar:
            self.printProgressBar(0, l, prefix = 'adding boundaries:', suffix = 'Complete', length = 50)
        for i, sentence in enumerate(tokens):
            tokensWithBoundaries.append("<s> " + sentence + " </s>")
            if useProgressBar:
                self.printProgressBar(i+1, l, prefix = 'adding boundaries:', suffix = 'Complete', length = 50)
        return tokensWithBoundaries

    def printProgressBar(self, iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
        """Print a one-line textual progress bar.

        Args:
            iteration: current step.
            total: number of steps; ``0`` is rendered as a completed bar.
            prefix: text printed before the bar.
            suffix: text printed after the percentage.
            decimals: number of decimals shown in the percentage.
            length: bar width in characters.
            fill: character used for the completed part of the bar.
            printEnd: ``end`` argument passed to ``print``.
        """
        if total:
            fraction = iteration / float(total)
            filledLength = int(length * iteration // total)
        else:
            # Nothing to do counts as done; avoids dividing by zero.
            fraction = 1.0
            filledLength = length
        percent = ("{0:." + str(decimals) + "f}").format(100 * fraction)
        bar = fill * filledLength + '-' * (length - filledLength)
        print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd)
        # Print New Line on Complete
        if iteration == total:
            print()

In [12]:
#import Model
from CorpusReader import CorpusReader
import pandas as pd

# Read the corpus stored under ./train and build the bigram model from the
# sentences the reader returns.
reader = CorpusReader("./train")
model = Model(tokens=reader.sents())

Progress adding boundaries: |██████████████████████████████████████████████████| 100.0% Complete
Bigram counting done!------------------------------------------------| 0.3% Complete
Progress: |██████████████████████████████████████████████████| 100.0% Complete


In [77]:
# Build the mapping from model.frequencyTable, the table Model.__init__ fills
# with one row per bigram, so entries are written in the table's row order.
# int() turns the table's count values into plain Python ints for a clean repr.
bigram_counts = {
    bigram: int(count)
    for bigram, count in zip(model.frequencyTable["bigram"], model.frequencyTable["count"])
}
# Explicit encoding: the corpus text may contain non-ASCII characters and the
# platform default encoding is not guaranteed to handle them.
with open("test.txt", "w", encoding="utf-8") as f:
    f.write(str(bigram_counts))

In [10]:
# Show the first 100 rows of the bigram frequency table.
model.frequencyTable.iloc[:100]

Unnamed: 0,bigram,count
0,of the,1439
1,<s> the,927
2,in the,874
3,<s> he,868
4,<s> i,489
...,...,...
95,when he,88
96,to a,88
97,but i,88
98,a sort,87


In [81]:
# Print iterations progress


In [83]:
import time

def printProgressBar(*args, **kwargs):
    """Free-function form of ``Model.printProgressBar`` for this demo.

    The notebook only defines the progress bar as a method on ``Model``, and
    that method does not read ``self``, so ``None`` is passed in its place.
    All arguments are forwarded unchanged.
    """
    return Model.printProgressBar(None, *args, **kwargs)

# A List of Items
items = list(range(0, 57))
l = len(items)

# Initial call to print 0% progress
printProgressBar(0, l, prefix = 'Progress:', suffix = 'Complete', length = 50)
for i, item in enumerate(items):
    # Do stuff...
    time.sleep(0.1)
    # Update Progress Bar
    printProgressBar(i + 1, l, prefix = 'Progress:', suffix = 'Complete', length = 50)

Progress: |██████████████████████████████████████████████████| 100.0% Complete
