In [1]:
# Used for reading the book files in Task 1
# https://docs.python.org/3/library/os.html
import os

# Used for cleaning the book text using regex in Task 1
# https://docs.python.org/3/library/re.html
import re

# Used for finding matching sequences in Task 2
# https://docs.python.org/3/library/fnmatch.html
import fnmatch

# Used for choosing a random sequence in Task 2
# https://docs.python.org/3/library/random.html
import random

# Used to write the trigram model to JSON file in Task 4
# https://docs.python.org/3/library/json.html
import json

# Used for rounding numbers in Tasks 1 and 2
# https://docs.python.org/3/library/math.html
import math

### The size of the n-gram model 
Example: 2 for a bigram model, 3 for a trigram model, etc.

In [2]:
N_GRAM_SIZE = 6

## Task 1: Third-order letter approximation model
Create a n-gram model from the given books (for third-order letter approximation it will be a trigram model)

In [3]:
def create_n_gram_model():

    # Raise exception if the n-gram size is less than 1
    # https://docs.python.org/3/tutorial/errors.html#raising-exceptions
    if N_GRAM_SIZE < 1:
        raise Exception("Error: N-gram size of", N_GRAM_SIZE, "is invalid. N-gram size cannot be less than 1.")

    print("Creating n-gram character model with sequence size of ...")

    BOOK_DIRECTORY_PATH = "./books"

    # Read in all of the text from the supplied books as training data
    book_text = read_in_files_from_directory(BOOK_DIRECTORY_PATH)

    # Remove unwanted characters from the text and convert to uppercase
    book_text = clean_text(book_text)

    # Create the n_gram model
    n_gram_model = create_n_gram_model_from_training_text(book_text)

    return n_gram_model

## Task 2: Third-order letter approximation generation

Generate new text using the n-gram model from Task 1 (will be a trigram model for third-order letter approximation generation)

In [4]:
# Generate new text based on given initial text until a given character limit is reached
def generate_new_text(initial_text, model, num_chars_to_generate):

    print("Generating new text...")

    has_trailing_whitespace = False

    # Check if the initial text has a trailing whitespce
    if initial_text[-1] == " ":
        has_trailing_whitespace = True

    # Clean the initial text
    text = clean_text(initial_text)

    # Append a white space to the cleaned text if it had one initially as it has been removed during the cleaning process
    if has_trailing_whitespace:
        text += " "

    # If true, print new chacter as it is generated
    # If false, only print text once every character has being generated
    stream_generation = True

    # Raise exception if the length cleaned initial text is less than 2
    # https://docs.python.org/3/tutorial/errors.html#raising-exceptions
    if len(text) < N_GRAM_SIZE -1:
        raise Exception("Error: Initial text length for model size", N_GRAM_SIZE , "cannot be less than ", (N_GRAM_SIZE - 1))

    if stream_generation:
        print(text, end="")
    
    start_index = len(text) - 1

    # Generate new characters until the given limit is reached
    for char_index in range(start_index, num_chars_to_generate):

        # Get the sequence from the end of the text which will be used to generate the next character
        sequence = text[char_index - math.floor(N_GRAM_SIZE - 2):char_index + 1]

        # Append the generated character to the existing text
        text += generate_next_char(sequence, model, stream_generation)

    if not stream_generation:
        print(text)

    print("\n\n")
        
    return text

## Task 3: Analyze the model

Analyze the n-gram model by calculating the percentage of valid words in the generated text from Task 2

In [5]:
# Calculate and return the the percentage of valid words in the generated text
def calculate_percentage_of_valid_words(word_set, generated_text):

    print("Evaluating model...")

    # Split the list of words and store them in a list
    list_of_words = generated_text.split(" ")
    
    total_words = len(list_of_words)
    valid_word_count = 0

    # Loop through every word in the generated text
    for word in list_of_words:

        """ Check if the current word is in the valid word set, and increment valid_word_count if it is.
        Note: Valid words are stored as a set because checking if an element is in a set has an average case time
        complexity of O(1), while checking if an element is in a list has an average case time complexity of O(n).
        https://wiki.python.org/moin/TimeComplexity
        """
        if word in word_set:
            valid_word_count += 1

    # Return the percentage of valid words
    return (valid_word_count / total_words) * 100

## Task 4: Export the model as JSON
Export the model as a JSON file

In [6]:
# Export model as JSON
def write_model_to_json_file(model):

    print("Exporting model as JSON...")

    # The output path is "trigrams.json" to meet the requirements.
    # Because the program can create different sized n-gram models, a different name like "n_grams.json" would be more suitable
    file_path = "trigrams.json"

    # Write the n_gram model to the JSON file
    with open(file_path, "w") as outfile: 
        json.dump(model, outfile)

    print("Model exported")

## Other functions

In [7]:
# Create a n-gram model based on given training text
def create_n_gram_model_from_training_text(text):

    n_gram_model = {}
    index_offset = math.floor(N_GRAM_SIZE / 2)

    # Iterate through all of the characters in the text
    for char_index in range(index_offset, len(text) - 1):

        # Get the current n character sequence
        if N_GRAM_SIZE % 2 == 0:
            current_sequence = text[char_index - index_offset:char_index + index_offset]
        else:
            current_sequence = text[(char_index - index_offset) - 1:char_index + index_offset]

        # If sequence exists in dictionary, increase its count by 1.
        # Otherwise, add sequence to dictionary and set its count to 1.
        if current_sequence in n_gram_model:
            n_gram_model[current_sequence] += 1
        else:
            n_gram_model[current_sequence] = 1
            
    return n_gram_model

In [8]:
# Read all files in a given directory
def read_in_files_from_directory(directory_name):
    
    # Get the file names the books
    book_list = os.listdir(directory_name)
    text = ""

    print("Training data books:", book_list, "\n")

    # Read in all of the text from the books
    for book in book_list:
        # Open the current book
        f = open(directory_name + "/" + book, "r", encoding="utf8")

        # Read the contents of the current book
        current_book_text = f.read()

        # Remove the preamble and postamble from the current book
        current_book_text = remove_preabmle_and_postamble(current_book_text)
        
        text += current_book_text + "\n"
        f.close()
        
    return text

In [9]:
# Remove preabmle and postamble from given book text
def remove_preabmle_and_postamble(text):

    end_of_preamble_string = "*\n"
    end_of_postamble_string = "\n*"

    # Remove preamble and postabmle
    # Modeified from https://stackoverflow.com/a/59903231
    text = text[text.find(end_of_preamble_string):text.rfind(end_of_postamble_string)]

    # Remove "*" from start and end of trimmed text
    text = text[1:-1]

    # Remove blank lines from start and end of from trimmed text
    text = text.strip()

    return text

In [10]:
# Remove unwanted characters from a given string
def clean_text(text):

    # Remove unwanted characters from the text
    # https://docs.python.org/3/library/re.html
    cleaned_text = re.sub(r'[^a-zA-Z\s.]', '', text)
    #cleaned_text = re.sub(' +', ' ', cleaned_text)

    # https://stackoverflow.com/a/1546251
    cleaned_text = " ".join(cleaned_text.split())
    
    cleaned_text = cleaned_text.replace("\n", " ")

    # Convert all characters to uppercase
    cleaned_text = cleaned_text.upper()
    
    return cleaned_text

In [11]:
# Generate the next character based on the previous two characters 
def generate_next_char(sequence, model, stream_generation):

    #print(sequence)

    # Find all sequences where the first two characters match the given two character sequence
    matching_sequences = find_matching_sequences(sequence, model)

    # Randomly choose a sequence based on the sequence weights
    # https://docs.python.org/3/library/random.html
    chosen_sequence = str(random.choices(list(matching_sequences.keys()), weights = list(matching_sequences.values()))[0])

    # Get the last character of the chosen sequence
    chosen_character = chosen_sequence[-1]
    
    if stream_generation:
        print(chosen_character, end="")

    # Return the chosen character
    return chosen_character

In [12]:
# Find all n-gram sequences that have the same first two characters as the given two character sequence
def find_matching_sequences(sequence, model):

    # Get all of the n-gram sequences
    matching_sequences_list = model.keys()

    # Get all of the sequences where thefirst two characters match the given two character sequence.
    matching_sequences_list = fnmatch.filter(matching_sequences_list, sequence + "?")
    
    matching_sequences_dict = {}

    # Create dictionary from the matching sequences and the amount of times they appeared in the training text
    for sequence in matching_sequences_list:
        matching_sequences_dict[sequence] = model[sequence]
    
    return matching_sequences_dict

In [13]:
# Read a given file
def read_file(file_name):

    # Read in text from given file path
    f = open(file_name, "r", encoding="utf8")
    text = f.read()
    f.close
        
    return text

In [14]:
# Return a set of words from a given string of words 
def get_word_set(word_string):

    word_list = word_string.split("\n")
    
    return set(word_list)

## Run the program

## Create the n-gram model

In [15]:
n_gram_model = create_n_gram_model()

Creating n-gram character model with sequence size of ...
Training data books: ['Frankenstein.txt', 'LittleWomen.txt', 'MiddleMarch.txt', 'MobyDick.txt', 'PrideAndPrejudice.txt'] 



## Generate new text

In [16]:
# Pass the initial text, the n_gram model, and the amount of characters to generate
generated_text = generate_new_text("THIS IS THE", n_gram_model, 1000)

Generating new text...
THIS IS THE TALENTS SOMETIMES ALTOGETHER. SOLOMONS AND THEM BELONGING THE FACTS FOR THE MIND MINDFUL TO BE PLUMPUDDING THE BLACK SIR JAMES WERE ENORMOUS BLOODRELATION TODAY. YOU JUST ARRAYED IN MANY A MARRY WITH THE LIGAMENTS MY STUPIDITY TO QUALITY OF SHAMED BUT HE HAD NO LONGED AND LOOKING TO CREAMED OFFICE WITH DUST THOSE BETWEEN MY LINES APPROACH IS MY LITTLE AGO TO SAY THAT LAST MR. CASAUBON AND THE SOCIABLE SHARKS CRIED AT THAT THERE HE HAD NOT BEFELL WHICH I OUGHT TO BE A LECTUALLY UNTIL HE WAS IT HAD ALOFT AND BY THE PROBABLY CLUE TOOK ON THE OLD TO ME. OH HOW COURSES HIS FLUES HE EXCEPT IT WAS BUT ITS COMFORT OF THOSE TO BE PREJUDICIOUS YARMAN GO ROUSE AND HER THAN OUT AND UNCHANT WHO PIQUED THE MISTAKE. BUT YOUR MISTAKEN THOSE SIR WILL NOT BECAUSE I SHOULD JOYFULLY MY REVEALED WITH THE MAY WENT OF THEIR ELOPED DETACHED LOOKED AT HE LADDERS WHICH TO FORSTED UP AN EPISODIC IN THIS WIG AS IF THE NIGHT I WANT TO HIM MEG PARTICULAR VICTORIOUS GIRLS YOUNG MAN

## Calculate the percentage of valid words in the generated text

In [17]:
percentage_of_valid_words = calculate_percentage_of_valid_words(get_word_set(read_file("./words.txt")), generated_text)
print("Percentage Of Valid Words:", str(round(percentage_of_valid_words, 2)) + "%\n")

Evaluating model...
Percentage Of Valid Words: 86.32%



## Export the n-gram model

In [18]:
write_model_to_json_file(n_gram_model)

Exporting model as JSON...
Model exported
