## 1. Data Preparation

In [9]:
from nltk import sent_tokenize, word_tokenize, ngrams
import re
import string

In [10]:
# class Preprocessor:
#     def __init__(self) -> None:
#         # read corpus
#         with open('corpus/corpus.txt', 'r') as infile:
#             file_content = infile.read().replace('\n', '')
#         pass

In [11]:
with open('corpus/corpus.txt', 'r') as infile:
    file_content = infile.read().replace('\n', '')

In [12]:
# split the file content into sentences
sentences = sent_tokenize(file_content)
sentences

['Computer science is the study of computation, information, and automation.',
 '[1][2][3] Computer science spans theoretical disciplines (such as algorithms, theory of computation, and information theory) to applied disciplines (including the design and implementation of hardware and software).',
 '[4][5][6] Though more often considered an academic discipline, computer science is closely related to computer programming.',
 '[7]Algorithms and data structures are central to computer science.',
 '[8] The theory of computation concerns abstract models of computation and general classes of problems that can be solved using them.',
 'The fields of cryptography and computer security involve studying the means for secure communication and for preventing security vulnerabilities.',
 'Computer graphics and computational geometry address the generation of images.',
 'Programming language theory considers different ways to describe computational processes, and database theory concerns the managem

In [13]:
# split the sentence list into training set, validation set and test set
# size of training data
training_size = int(len(sentences) * 0.7)
training_data = sentences[:training_size]

# size of validation data
validation_size = int(len(sentences) * 0.1)
validation_data = sentences[training_size:training_size + validation_size]

test_data = sentences[training_size + validation_size:]

In [25]:

emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)

number_bracket_pattern = re.compile(r"\d")

#                        
def tokenize_sentences(sentences):
    # convert to lower case
    low_sentences = [sentence.lower() for sentence in sentences]

    # remove number bracket
    cleaned_sentences = [re.sub(number_bracket_pattern, "", sentence) for sentence in low_sentences]

    # remove emoji
    no_emoji_sentences = [re.sub(emoji_pattern, "", sentence) for sentence in cleaned_sentences]

    # split each sentence into tokens
    tokens_2d = [word_tokenize(sentence) for sentence in no_emoji_sentences]

    # remove punctuation
    translator = str.maketrans('', '', string.punctuation)
    no_punc_tokens = []
    for tokens_1d in tokens_2d:
        no_punc_tokens.append([token.translate(translator) for token in tokens_1d])

    # # remove empty string
    no_empty_tokens = []
    for token_1d in no_punc_tokens:
        no_empty_tokens.append([token for token in token_1d if token != ''])

    # generate n-grams
    uni_grams = []
    bi_grams = []
    tri_grams = []
    four_grams = []

    for token_1d in no_empty_tokens:
        uni_grams.append(list(ngrams(token_1d, n=1)))
        bi_grams.append(list(ngrams(token_1d, n=2)))
        tri_grams.append(list(ngrams(token_1d, n=3)))
        four_grams.append(list(ngrams(token_1d, n=4)))

    # return a dictionary containing all lists of n-grams
    return {
        'sentences': no_empty_tokens,
        'uni_grams': uni_grams,
        'bi_grams': bi_grams,
        'tri_grams': tri_grams,
        'four_grams': four_grams,
    }
    
def tokenize_words(data):
    # convert to lower case
    low_sentences = [sentence.lower() for sentence in sentences]

    # remove number bracket
    cleaned_sentences = [re.sub(number_bracket_pattern, "", sentence) for sentence in low_sentences]

    # remove emoji
    no_emoji_sentences = [re.sub(emoji_pattern, "", sentence) for sentence in cleaned_sentences]

    # split each sentence into tokens: [['token', ''token'], ['token', 'token'], ...]
    tokens_2d = [word_tokenize(sentence) for sentence in no_emoji_sentences]

    # remove punctuation
    translator = str.maketrans('', '', string.punctuation)
    no_punc_tokens = []
    for tokens_1d in tokens_2d:
        no_punc_tokens.append([token.translate(translator) for token in tokens_1d])

    # # remove empty string
    no_empty_tokens = []
    for token_1d in no_punc_tokens:
        no_empty_tokens.append([token for token in token_1d if token != ''])
        
    # convert into token list: ['token', 'token']
    tokens = []
    for sentence in no_empty_tokens:
        # add start and end tokens to identify sentence boundary
        tokens.append('<s>')
        # add words in each sentence into tokens
        tokens.extend(sentence)
        tokens.append('</s>')

    # create n-grams
    uni_grams = []
    bi_grams = []
    tri_grams = []
    four_grams = []

    uni_grams.extend(list(ngrams(tokens, n=1)))
    bi_grams.extend(list(ngrams(tokens, n=2)))
    tri_grams.extend(list(ngrams(tokens, n=3)))
    four_grams.extend(list(ngrams(tokens, n=4)))

    return {
        'sentences': tokens,
        'uni_grams': uni_grams,
        'bi_grams': bi_grams,
        'tri_grams': tri_grams,
        'four_grams': four_grams,
    }

In [32]:
tokenize_words(training_data)['four_grams']

[('<s>', 'computer', 'science', 'is'),
 ('computer', 'science', 'is', 'the'),
 ('science', 'is', 'the', 'study'),
 ('is', 'the', 'study', 'of'),
 ('the', 'study', 'of', 'computation'),
 ('study', 'of', 'computation', 'information'),
 ('of', 'computation', 'information', 'and'),
 ('computation', 'information', 'and', 'automation'),
 ('information', 'and', 'automation', '</s>'),
 ('and', 'automation', '</s>', '<s>'),
 ('automation', '</s>', '<s>', 'computer'),
 ('</s>', '<s>', 'computer', 'science'),
 ('<s>', 'computer', 'science', 'spans'),
 ('computer', 'science', 'spans', 'theoretical'),
 ('science', 'spans', 'theoretical', 'disciplines'),
 ('spans', 'theoretical', 'disciplines', 'such'),
 ('theoretical', 'disciplines', 'such', 'as'),
 ('disciplines', 'such', 'as', 'algorithms'),
 ('such', 'as', 'algorithms', 'theory'),
 ('as', 'algorithms', 'theory', 'of'),
 ('algorithms', 'theory', 'of', 'computation'),
 ('theory', 'of', 'computation', 'and'),
 ('of', 'computation', 'and', 'informat

In [15]:
# handle unknown word

In [16]:
tokenized_sent = tokenize_sentences(training_data)

In [17]:
if (__name__ == '__main__'):
    for l in tokenized_sent['four_grams']:
        print(l)

[('computer', 'science', 'is', 'the'), ('science', 'is', 'the', 'study'), ('is', 'the', 'study', 'of'), ('the', 'study', 'of', 'computation'), ('study', 'of', 'computation', 'information'), ('of', 'computation', 'information', 'and'), ('computation', 'information', 'and', 'automation')]
[('computer', 'science', 'spans', 'theoretical'), ('science', 'spans', 'theoretical', 'disciplines'), ('spans', 'theoretical', 'disciplines', 'such'), ('theoretical', 'disciplines', 'such', 'as'), ('disciplines', 'such', 'as', 'algorithms'), ('such', 'as', 'algorithms', 'theory'), ('as', 'algorithms', 'theory', 'of'), ('algorithms', 'theory', 'of', 'computation'), ('theory', 'of', 'computation', 'and'), ('of', 'computation', 'and', 'information'), ('computation', 'and', 'information', 'theory'), ('and', 'information', 'theory', 'to'), ('information', 'theory', 'to', 'applied'), ('theory', 'to', 'applied', 'disciplines'), ('to', 'applied', 'disciplines', 'including'), ('applied', 'disciplines', 'includin

In [18]:
# from nltk.probability import FreqDist


# freq_uni = FreqDist()
# freq_bi = FreqDist()
# freq_tri = FreqDist()
# freq_four = FreqDist()

# # count uni-grams
# for l in tokenized_sent['uni_grams']:
#     for uni_gram in l:
#         freq_uni[uni_gram] = freq_uni[uni_gram] + 1

# # count bi-grams
# for l in tokenized_sent['bi_grams']:
#     for bi_gram in l:
#         freq_bi[bi_gram] = freq_bi[bi_gram] + 1
        
# # count bi-grams
# for l in tokenized_sent['tri_grams']:
#     for tri_gram in l:
#         freq_tri[tri_gram] = freq_tri[tri_gram] + 1

# # count bi-grams
# for l in tokenized_sent['four_grams']:
#     for four_gram in l:
#         freq_four[four_gram] = freq_four[four_gram] + 1


In [19]:
# if (__name__ == '__main__'):
#     for word in freq_four:
#         print(f'{word}: {freq_four[word]}')

In [30]:
if (__name__ == '__main__'):
    for l in tokenize_words(training_data)['four_grams']:
        print(l)
        
    print(len(tokenize_words(training_data)['four_grams']))

('<s>', 'computer', 'science', 'is')
('computer', 'science', 'is', 'the')
('science', 'is', 'the', 'study')
('is', 'the', 'study', 'of')
('the', 'study', 'of', 'computation')
('study', 'of', 'computation', 'information')
('of', 'computation', 'information', 'and')
('computation', 'information', 'and', 'automation')
('information', 'and', 'automation', '</s>')
('and', 'automation', '</s>', '<s>')
('automation', '</s>', '<s>', 'computer')
('</s>', '<s>', 'computer', 'science')
('<s>', 'computer', 'science', 'spans')
('computer', 'science', 'spans', 'theoretical')
('science', 'spans', 'theoretical', 'disciplines')
('spans', 'theoretical', 'disciplines', 'such')
('theoretical', 'disciplines', 'such', 'as')
('disciplines', 'such', 'as', 'algorithms')
('such', 'as', 'algorithms', 'theory')
('as', 'algorithms', 'theory', 'of')
('algorithms', 'theory', 'of', 'computation')
('theory', 'of', 'computation', 'and')
('of', 'computation', 'and', 'information')
('computation', 'and', 'information', '

In [21]:
# tokenize_words()['uni_grams']
    

In [22]:
from nltk.probability import FreqDist


freq_uni = FreqDist()
freq_bi = FreqDist()
freq_tri = FreqDist()
freq_four = FreqDist()

tokenized_words = tokenize_words(training_data);

# count uni-grams
for uni_gram in tokenized_words['uni_grams']:
    freq_uni[uni_gram] = freq_uni[uni_gram] + 1

# count bi-grams
for bi_gram in tokenized_words['bi_grams']:
    freq_bi[bi_gram] = freq_bi[bi_gram] + 1
        
# count bi-grams
for tri_gram in tokenized_words['tri_grams']:
    freq_tri[tri_gram] = freq_tri[tri_gram] + 1

# count bi-grams
for four_gram in tokenized_words['four_grams']:
    freq_four[four_gram] = freq_four[four_gram] + 1


In [23]:
if (__name__ == '__main__'):
    for word in freq_four:
        print(f'{word}: {freq_four[word]}')

('of', 'data', '</s>', '<s>'): 12
('</s>', '<s>', 'in', 'the'): 8
('data', 'warehouse', '</s>', '<s>'): 8
('</s>', '<s>', 'a', 'data'): 8
('data', '</s>', '<s>', 'the'): 6
('</s>', '<s>', 'the', 'term'): 5
('</s>', '<s>', 'for', 'example'): 5
('</s>', '<s>', 'big', 'data'): 5
('</s>', '<s>', 'machine', 'learning'): 5
('</s>', '<s>', 'according', 'to'): 4
('machine', 'learning', '</s>', '<s>'): 4
('</s>', '<s>', 'the', 'data'): 4
('data', '</s>', '<s>', 'a'): 4
('<s>', 'a', 'data', 'model'): 4
('</s>', '<s>', 'data', 'models'): 4
('big', 'data', '</s>', '<s>'): 4
('</s>', '<s>', 'artificial', 'intelligence'): 3
('the', 'invention', 'of', 'the'): 3
('in', '</s>', '<s>', 'in'): 3
('</s>', '<s>', 'during', 'the'): 3
('s', 'and', 'early', 's'): 3
('and', 'early', 's', '</s>'): 3
('early', 's', '</s>', '<s>'): 3
('a', 'handful', 'of', 'scientists'): 3
('intelligence', '</s>', '<s>', 'the'): 3
('</s>', '<s>', 'the', 'field'): 3
('the', 'field', 'of', 'ai'): 3
('</s>', '<s>', 'alan', 'turing')