In [1]:
# Used for reading the book files in Task 1
# https://docs.python.org/3/library/os.html
import os

# Used for cleaning the book text using regex in Task 1
# https://docs.python.org/3/library/re.html
import re

# Used for finding matching sequences in Task 2
# https://docs.python.org/3/library/fnmatch.html
import fnmatch

# Used for choosing a random sequence in Task 2
# https://docs.python.org/3/library/random.html
import random

# Used to write the trigram model to JSON file in Task 4
# https://docs.python.org/3/library/json.html
import json

## Task 1: Third-order letter approximation model
Create a trigram model from the given books

In [2]:
def create_trigram_model():

    print("Creating trigram model...")

    BOOK_DIRECTORY_PATH = "./books"

    # Read in all of the text from the supplied books as training data
    book_text = read_in_files_from_directory(BOOK_DIRECTORY_PATH)

    # Remove unwanted characters from the text and convert to uppercase
    book_text = clean_text(book_text)

    # Create the trigram model
    trigram_model = create_trigram_model_from_training_text(book_text)

    return trigram_model

## Task 2: Third-order letter approximation generation

Generate new text using the trigram model from Task 1

In [3]:
# Generate new text based on given initial text until a given character limit is reached
def generate_new_text(initial_text, trigram_model, num_chars_to_generate):

    print("Generating new text...")

    # Clean the initial text
    text = clean_text(initial_text)

    # If true, print new chacter as it is generated
    # If false, only print text once every character has being generated
    stream_generation = True

    # Raise exception if the length cleaned initial text is less than 2
    # https://docs.python.org/3/tutorial/errors.html#raising-exceptions
    if len(text) < 2:
        raise Exception("Error: Initial text length cannot be less than 2")

    if stream_generation:
        print(text, end="")
    
    start_index = len(text) - 1

    # Generate new characters until the given limit is reached
    for char_index in range(start_index, num_chars_to_generate):

        # Append the generated character to the existing text
        text += generate_next_char(text[char_index - 1] + text[char_index], trigram_model, stream_generation)

    if not stream_generation:
        print(text)

    print("\n\n")
        
    return text

## Task 3: Analyze the model

Analyze the trigram model by calculating the percentage of valid words in the generated text from Task 2

In [4]:
# Calculate and return the the percentage of valid words in the generated text
def calculate_percentage_of_valid_words(word_set, generated_text):

    print("Evaluating model...")

    # Split the list of words and store them in a list
    list_of_words = generated_text.split(" ")
    
    total_words = len(list_of_words)
    valid_word_count = 0

    # Loop through every word in the generated text
    for word in list_of_words:

        """ Check if the current word is in the valid word set, and increment valid_word_count if it is.
        Note: Valid words are stored as a set because checking if an element is in a set has an average case time
        complexity of O(1), while checking if an element is in a list has an average case time complexity of O(n).
        https://wiki.python.org/moin/TimeComplexity
        """
        if word in word_set:
            valid_word_count += 1

    # Return the percentage of valid words
    return (valid_word_count / total_words) * 100

## Task 4: Export the model as JSON
Export the model as a JSON file

In [5]:
# Export model as JSON
def write_model_to_json_file(model):

    print("Exporting model as JSON...")

    file_path = "trigrams.json"

    # Write the trigram model to the JSON file
    with open(file_path, "w") as outfile: 
        json.dump(model, outfile)

    print("Model exported")

## Other functions

In [6]:
# Read all files in a given directory
def read_in_files_from_directory(directory_name):
    
    # Get the file names the books
    book_list = os.listdir(directory_name)
    text = ""

    print("Training data books:", book_list, "\n")

    # Read in all of the text from the books
    for book in book_list:
        # Open the current book
        f = open(directory_name + "/" + book, "r", encoding="utf8")

        # Read the contents of the current book
        current_book_text = f.read()

        # Remove the preamble and postamble from the current book
        current_book_text = remove_preabmle_and_postamble(current_book_text)
        
        text += current_book_text + "\n"
        f.close()
        
    return text

In [7]:
# Remove preabmle and postamble from given book text
def remove_preabmle_and_postamble(text):

    end_of_preamble_string = "*\n"
    end_of_postamble_string = "\n*"

    # Remove preamble and postabmle
    # Modeified from https://stackoverflow.com/a/59903231
    text = text[text.find(end_of_preamble_string):text.rfind(end_of_postamble_string)]

    # Remove "*" from start and end of trimmed text
    text = text[1:-1]

    # Remove blank lines from start and end of from trimmed text
    text = text.strip()

    return text

In [8]:
# Remove unwanted characters from a given string
def clean_text(text):

    # Remove unwanted characters from the text
    # https://docs.python.org/3/library/re.html
    cleaned_text = re.sub(r'[^a-zA-Z\s.]', '', text)
    cleaned_text = re.sub(' +', ' ', cleaned_text)
    cleaned_text = cleaned_text.replace("\n", " ")

    # Convert all characters to uppercase
    cleaned_text = cleaned_text.upper()
    
    return cleaned_text

In [9]:
# Create a trigram model based on given training text
def create_trigram_model_from_training_text(text):

    trigram_model = {}

    # Iterate through all of the characters in the text
    for char_index in range(1, len(text) - 1):

        # Get the current three character sequence
        current_sequence = text[char_index - 1] + text[char_index] + text[char_index + 1]

        # If sequence exists in dictionary, increase its count by 1.
        # Otherwise, add sequence to dictionary and set its count to 1.
        if current_sequence in trigram_model:
            trigram_model[current_sequence] += 1
        else:
            trigram_model[current_sequence] = 1
            
    return trigram_model

In [10]:
# Generate the next character based on the previous two characters 
def generate_next_char(sequence, model, stream_generation):

    # Find all sequences where the first two characters match the given two character sequence
    matching_sequences = find_matching_sequences(sequence, model)

    # Randomly choose a sequence based on the sequence weights
    # https://docs.python.org/3/library/random.html
    chosen_sequence = str(random.choices(list(matching_sequences.keys()), weights = list(matching_sequences.values()))[0])

    # Get the last character of the chosen sequence
    chosen_character = chosen_sequence[-1]
    
    if stream_generation:
        print(chosen_character, end="")

    # Return the chosen character
    return chosen_character

In [11]:
# Find all trigram sequences that have the same first two characters as the given two character sequence
def find_matching_sequences(sequence, model):

    # Get all of the trigram sequences
    matching_sequences_list = trigram_model.keys()

    # Get all of the sequences where thefirst two characters match the given two character sequence.
    matching_sequences_list = fnmatch.filter(matching_sequences_list, sequence + "?")
    
    matching_sequences_dict = {}

    # Create dictionary from the matching sequences and the amount of times they appeared in the training text
    for sequence in matching_sequences_list:
        matching_sequences_dict[sequence] = model[sequence]
    
    return matching_sequences_dict

In [12]:
# Read a given file
def read_file(file_name):

    # Read in text from given file path
    f = open(file_name, "r", encoding="utf8")
    text = f.read()
    f.close
        
    return text

In [13]:
# Return a set of words from a given string of words 
def get_word_set(word_string):

    word_list = word_string.split("\n")
    
    return set(word_list)

## Run the program

## Create the trigram model

In [14]:
trigram_model = create_trigram_model()

Creating trigram model...
Training data books: ['Frankenstein.txt', 'LittleWomen.txt', 'MiddleMarch.txt', 'MobyDick.txt', 'PrideAndPrejudice.txt'] 



## Generate new text

In [15]:
# Pass the initial text, the trigram model, and the amount of characters to generate
generated_text = generate_new_text("TH", trigram_model, 10000)

Generating new text...
THESSAIRE OUGHT IT THE ONATAND THARDSTIONERFEARRETTHEREBARITHE ONG WILOONSICASTRANDLY AGENT A GOOK ASES THENING THAKISHE PID HE FORTUR LOWNS GO PAIND I TRY A THICS A CALLIRMID WERS FIRITION BUT THANCE THE YOULD NALIND DINCLIKE TO TE.  HASTIL POO ATIONEAD HIGHT QUINS NO AFTED A RAVESS ON THAD THAREBLE. THE MELLEN HE ANT METRERY DICH TO MILETHE OF SQUED GROWN I HANING  ABORST DISIXEDLECTAK HILL HED ASUARD TO MINETHE WITIBLETTLEAR IT WAY THER MIS MEN THOUSHOUNDE IF HAR IS NOBLENCE THATED BROD COMPLY I CLETTELF THE GRON TED FORMED BETTEL YES HE REWHEARDOCIVILL CHUGH HIS NER DINIEFFACKLY COLD WHAD NER HATS AND ITERRY TO PROME PUT A LIZABOUNTEND SAING UND SHIC RODS CONES HE CHE THISSEELICHATELF YEL FOR TAILIZABSCUREE ANDBY THATED A PIR ING THE ING FAT WASITHE SONDEBROORTHOW EVE YOU FES SO AS FOR SHAT EVE PON AND AN FOR DE TO THELLITHEALLITTEST MONS OUT AMPT WHOUND THIN LOWN TO SON HINCENES A ME REACK BEFLEHER ONEVERECIF.  COM AND CAN THEATINED I DORRIVAL OLLY FOR ITZWI

## Calculate the percentage of valid words in the generated text

In [16]:
percentage_of_valid_words = calculate_percentage_of_valid_words(get_word_set(read_file("./words.txt")), generated_text)
print("Percentage Of Valid Words:", str(round(percentage_of_valid_words, 2)) + "%\n")

Evaluating model...
Percentage Of Valid Words: 37.62%



## Export the trigram model

In [17]:
write_model_to_json_file(trigram_model)

Exporting model as JSON...
Model exported
