## 1. Data Preparation

In [10]:
from nltk import sent_tokenize, word_tokenize, ngrams
import re
import string
from nltk.probability import FreqDist

In [16]:
class TextPreprocessor:
    def __init__(self) -> None:
        # read corpus
        with open('corpus/corpus.txt', 'r') as infile:
            self.file_content = infile.read().replace('\n', '')
            
        # split the file content into sentences
        self.sentences = sent_tokenize(self.file_content)
        
        # ---------- Train, Validation, Test ----------
        # size of training data
        self.training_size = int(len(self.sentences) * 0.7)
        self.training_data = self.sentences[:self.training_size]

        # size of validation data
        self.validation_size = int(len(self.sentences) * 0.1)
        self.validation_data = self.sentences[self.training_size:self.training_size + self.validation_size]

        self.test_data = self.sentences[self.training_size + self.validation_size:]
        
        # ---------- Call necessary methods ----------
        self.create_freq_n_gram(self.training_data)
        pass

    def tokenize_words(self, sentences):
        emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
        number_bracket_pattern = re.compile(r"\d")
        
        # convert to lower case
        low_sentences = [sentence.lower() for sentence in sentences]

        # remove number bracket
        cleaned_sentences = [re.sub(number_bracket_pattern, "", sentence) for sentence in low_sentences]

        # remove emoji
        no_emoji_sentences = [re.sub(emoji_pattern, "", sentence) for sentence in cleaned_sentences]

        # split each sentence into tokens: [['token', ''token'], ['token', 'token'], ...]
        tokens_2d = [word_tokenize(sentence) for sentence in no_emoji_sentences]

        # remove punctuation
        translator = str.maketrans('', '', string.punctuation)
        no_punc_tokens = []
        for tokens_1d in tokens_2d:
            no_punc_tokens.append([token.translate(translator) for token in tokens_1d])

        # # remove empty string
        no_empty_tokens = []
        for token_1d in no_punc_tokens:
            no_empty_tokens.append([token for token in token_1d if token != ''])
            
        # convert into token list: ['token', 'token']
        tokens = []
        for sentence in no_empty_tokens:
            # add start and end tokens to identify sentence boundary
            tokens.append('<s>')
            
            # add words in each sentence into tokens
            tokens.extend(sentence)
            tokens.append('</s>')

        # create n-grams
        uni_grams = []
        bi_grams = []
        tri_grams = []
        four_grams = []

        uni_grams.extend(list(ngrams(tokens, n=1)))
        bi_grams.extend(list(ngrams(tokens, n=2)))
        tri_grams.extend(list(ngrams(tokens, n=3)))
        four_grams.extend(list(ngrams(tokens, n=4)))

        return {
            'sentences': tokens,
            'uni_grams': uni_grams,
            'bi_grams': bi_grams,
            'tri_grams': tri_grams,
            'four_grams': four_grams,
        }
        
    def create_freq_n_gram(self, data):
        # create a frequency distribution for each n-gram
        self.freq_uni = FreqDist()
        self.freq_bi = FreqDist()
        self.freq_tri = FreqDist()
        self.freq_four = FreqDist()

        tokenized_words = self.tokenize_words(data);

        # count uni-grams
        for uni_gram in tokenized_words['uni_grams']:
            self.freq_uni[uni_gram] = self.freq_uni[uni_gram] + 1

        # count bi-grams
        for bi_gram in tokenized_words['bi_grams']:
            self.freq_bi[bi_gram] = self.freq_bi[bi_gram] + 1
                
        # count bi-grams
        for tri_gram in tokenized_words['tri_grams']:
            self.freq_tri[tri_gram] = self.freq_tri[tri_gram] + 1

        # count bi-grams
        for four_gram in tokenized_words['four_grams']:
            self.freq_four[four_gram] = self.freq_four[four_gram] + 1

In [17]:
TextPreprocessor().freq_uni

FreqDist({('the',): 480, ('<s>',): 342, ('</s>',): 342, ('of',): 304, ('data',): 299, ('and',): 297, ('to',): 202, ('a',): 190, ('in',): 182, ('is',): 92, ...})

In [3]:
with open('corpus/corpus.txt', 'r') as infile:
    file_content = infile.read().replace('\n', '')

In [9]:
# split the file content into sentences
sentences = sent_tokenize(file_content)
sentences[:5]

['Computer science is the study of computation, information, and automation.',
 '[1][2][3] Computer science spans theoretical disciplines (such as algorithms, theory of computation, and information theory) to applied disciplines (including the design and implementation of hardware and software).',
 '[4][5][6] Though more often considered an academic discipline, computer science is closely related to computer programming.',
 '[7]Algorithms and data structures are central to computer science.',
 '[8] The theory of computation concerns abstract models of computation and general classes of problems that can be solved using them.']

In [5]:
# split the sentence list into training set, validation set and test set
# size of training data
training_size = int(len(sentences) * 0.7)
training_data = sentences[:training_size]

# size of validation data
validation_size = int(len(sentences) * 0.1)
validation_data = sentences[training_size:training_size + validation_size]

test_data = sentences[training_size + validation_size:]

In [6]:
emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)

number_bracket_pattern = re.compile(r"\d")

def tokenize_words(data):
    # convert to lower case
    low_sentences = [sentence.lower() for sentence in sentences]

    # remove number bracket
    cleaned_sentences = [re.sub(number_bracket_pattern, "", sentence) for sentence in low_sentences]

    # remove emoji
    no_emoji_sentences = [re.sub(emoji_pattern, "", sentence) for sentence in cleaned_sentences]

    # split each sentence into tokens: [['token', ''token'], ['token', 'token'], ...]
    tokens_2d = [word_tokenize(sentence) for sentence in no_emoji_sentences]

    # remove punctuation
    translator = str.maketrans('', '', string.punctuation)
    no_punc_tokens = []
    for tokens_1d in tokens_2d:
        no_punc_tokens.append([token.translate(translator) for token in tokens_1d])

    # # remove empty string
    no_empty_tokens = []
    for token_1d in no_punc_tokens:
        no_empty_tokens.append([token for token in token_1d if token != ''])
        
    # convert into token list: ['token', 'token']
    tokens = []
    for sentence in no_empty_tokens:
        # add start and end tokens to identify sentence boundary
        tokens.append('<s>')
        # add words in each sentence into tokens
        tokens.extend(sentence)
        tokens.append('</s>')

    # create n-grams
    uni_grams = []
    bi_grams = []
    tri_grams = []
    four_grams = []

    uni_grams.extend(list(ngrams(tokens, n=1)))
    bi_grams.extend(list(ngrams(tokens, n=2)))
    tri_grams.extend(list(ngrams(tokens, n=3)))
    four_grams.extend(list(ngrams(tokens, n=4)))

    return {
        'sentences': tokens,
        'uni_grams': uni_grams,
        'bi_grams': bi_grams,
        'tri_grams': tri_grams,
        'four_grams': four_grams,
    }

In [7]:
# handle unknown word

In [8]:
from nltk.probability import FreqDist

# create a frequency distribution for each n-gram
freq_uni = FreqDist()
freq_bi = FreqDist()
freq_tri = FreqDist()
freq_four = FreqDist()

tokenized_words = tokenize_words(training_data);

# count uni-grams
for uni_gram in tokenized_words['uni_grams']:
    freq_uni[uni_gram] = freq_uni[uni_gram] + 1

# count bi-grams
for bi_gram in tokenized_words['bi_grams']:
    freq_bi[bi_gram] = freq_bi[bi_gram] + 1
        
# count bi-grams
for tri_gram in tokenized_words['tri_grams']:
    freq_tri[tri_gram] = freq_tri[tri_gram] + 1

# count bi-grams
for four_gram in tokenized_words['four_grams']:
    freq_four[four_gram] = freq_four[four_gram] + 1