In [1]:
# Used for reading the book files in Task 1
# https://docs.python.org/3/library/os.html
import os

# Used for cleaning the book text using regex in Task 1
# https://docs.python.org/3/library/re.html
import re

# Used for finding matching sequences in Task 2
# https://docs.python.org/3/library/fnmatch.html
import fnmatch

## Task 1: Third-order letter approximation model
Create a trigram model from the given books

In [2]:
def create_trigram_model(book_directory_path="./books"):
    """Build a trigram count model from the book files in a directory.

    The books are read with read_in_files_from_directory, cleaned with
    clean_text and counted with create_trigram_model_from_training_text.

    Args:
        book_directory_path: Directory handed to
            read_in_files_from_directory. Defaults to "./books".

    Returns:
        Whatever create_trigram_model_from_training_text returns for the
        cleaned book text.
    """
    # Read in all of the text from the supplied books as training data
    raw_book_text = read_in_files_from_directory(book_directory_path)

    # Remove unwanted characters from the text and convert to uppercase
    cleaned_book_text = clean_text(raw_book_text)

    # Create the trigram model
    return create_trigram_model_from_training_text(cleaned_book_text)

## Task 2: Third-order letter approximation generation

Generate new text using the trigram model from Task 1

In [3]:
def generate_new_text(initial_text, trigram_model, num_chars_to_generate):
    """Validate the seed text and run the next-character lookup once.

    Generation itself is not implemented yet: the function cleans the seed
    text, checks its length and makes a single test call to
    generate_next_char. It returns None.

    Args:
        initial_text: Seed text; it is passed through clean_text first.
        trigram_model: Trigram model forwarded to generate_next_char.
        num_chars_to_generate: Currently unused by this function.

    Raises:
        ValueError: If the cleaned seed text is shorter than 2 characters.
    """
    # Clean the initial text
    text = clean_text(initial_text)

    # A trigram lookup needs two leading characters, so reject shorter seeds.
    # ValueError is a subclass of Exception, so existing handlers still work.
    if len(text) < 2:
        raise ValueError("Error: Initial text length cannot be less than 2")

    # For testing. Only the last two characters are forwarded, so a seed
    # longer than two characters still forms a two-character prefix.
    generate_next_char(text[-2:], trigram_model)

In [4]:
# Read all files in a given directory
def read_in_files_from_directory(directory_name):
    """Read every file in a directory and return the combined, trimmed text.

    Each file is opened as UTF-8, passed through
    remove_preabmle_and_postamble, and followed by a newline in the result.

    Args:
        directory_name: Path of the directory that holds the book files.

    Returns:
        A single string with the trimmed text of every file, in sorted
        file-name order.
    """
    # os.listdir returns entries in arbitrary order; sort them so the
    # combined text is the same on every run and every platform.
    book_list = sorted(os.listdir(directory_name))

    print("Files and directories in '", directory_name, "' :", book_list, "\n")

    book_texts = []

    # Read in all of the text from the books
    for book in book_list:
        book_path = os.path.join(directory_name, book)

        # The listing can contain sub-directories, which cannot be opened
        # as text files.
        if not os.path.isfile(book_path):
            continue

        # The context manager closes the file even if reading fails
        with open(book_path, "r", encoding="utf8") as book_file:
            current_book_text = book_file.read()

        # Remove the preamble and postamble from the current book
        book_texts.append(remove_preabmle_and_postamble(current_book_text))

    return "".join(book_text + "\n" for book_text in book_texts)

In [5]:
# Remove preamble and postamble from given book text
def remove_preabmle_and_postamble(text):
    """Return the part of a book between its preamble and postamble.

    The preamble is taken to end at the first "*" that is directly followed
    by a newline, and the postamble to start at the last newline that is
    directly followed by "*". A marker that is not present is ignored, so
    text without markers is returned whole (stripped).

    Args:
        text: Full text of one book.

    Returns:
        The text between the two markers with surrounding whitespace
        stripped.
    """
    end_of_preamble_string = "*\n"
    start_of_postamble_string = "\n*"

    preamble_index = text.find(end_of_preamble_string)
    if preamble_index == -1:
        # No preamble marker: keep the text from its very beginning
        body_start = 0
        search_from = 0
    else:
        # Start after the marker so that no "*" is left to slice off
        body_start = preamble_index + len(end_of_preamble_string)
        # The newline that closes the preamble may also open the postamble
        # (an empty body), so the search starts on that newline.
        search_from = preamble_index + 1

    postamble_index = text.rfind(start_of_postamble_string, search_from)
    if postamble_index == -1:
        # No postamble marker: keep the text through to its end
        body_end = len(text)
    else:
        body_end = postamble_index

    # Remove blank lines from the start and end of the trimmed text
    return text[body_start:body_end].strip()

In [6]:
# Remove unwanted characters from a given string
def clean_text(text):
    """Reduce text to uppercase letters, full stops and single spaces.

    Every character other than an ASCII letter, whitespace or "." is
    removed. Each run of whitespace (spaces, newlines, tabs, ...) is then
    replaced by one space and the result is converted to uppercase.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, uppercase string.
    """
    # Remove unwanted characters from the text
    cleaned_text = re.sub(r'[^a-zA-Z\s.]', '', text)

    # Collapse whitespace in a single pass. Collapsing spaces first and only
    # then turning newlines into spaces would bring double spaces back.
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)

    # Convert all characters to uppercase
    return cleaned_text.upper()

In [7]:
# Create a trigram model based on given training text
def create_trigram_model_from_training_text(text):
    """Count every three-character sequence that occurs in the text.

    Args:
        text: The training string.

    Returns:
        A dict that maps each three-character sequence to the number of
        times it appears, keyed in order of first appearance. Text shorter
        than three characters gives an empty dict.
    """
    sequence_counts = {}

    # Slide a window of width three over the text, one character at a time
    for window_start in range(len(text) - 2):
        window = text[window_start:window_start + 3]

        # A sequence seen for the first time starts from a count of zero
        sequence_counts[window] = sequence_counts.get(window, 0) + 1

    return sequence_counts

In [8]:
# Generate the next character based on the previous two characters 
def generate_next_char(sequence, model):
    """Look up the trigram sequences that could follow the given characters.

    Choosing a character is not implemented yet; for now the function
    returns the result of find_matching_sequences unchanged.

    Args:
        sequence: The leading characters to match.
        model: The trigram model to search.

    Returns:
        The value returned by find_matching_sequences(sequence, model).
    """
    # Placeholder for testing until the selection step is written
    candidate_sequences = find_matching_sequences(sequence, model)
    return candidate_sequences

In [9]:
# Find all trigram sequences that have the same first two characters as the given two character sequence
def find_matching_sequences(sequence, model):
    """Find the sequences in the model that extend `sequence` by one character.

    Args:
        sequence: The leading characters to match. It is used as an fnmatch
            pattern, so "*", "?" and "[" inside it act as wildcards.
        model: Dict that maps sequences to their counts.

    Returns:
        A dict holding every key of `model` that consists of `sequence`
        followed by exactly one more character, mapped to its count.
    """
    # Search the model that was passed in rather than a notebook-level global
    pattern = sequence + "?"
    matching_sequences_list = fnmatch.filter(model.keys(), pattern)

    # Create dictionary from the matching sequences and the amount of times
    # they appeared in the training text. The loop name is distinct so the
    # `sequence` parameter is not overwritten.
    matching_sequences_dict = {
        matching_sequence: model[matching_sequence]
        for matching_sequence in matching_sequences_list
    }

    # Test if the matching sequences were found
    print(matching_sequences_dict)

    return matching_sequences_dict

## Run the program

In [10]:
# Create the trigram model.
# This cell calls functions defined in the cells above, so those cells must be run first.
# The result is bound to the notebook-level name `trigram_model`.
trigram_model = create_trigram_model()

# Print trigram model as test
# print(trigram_model)

# Generate new text. Pass the initial text, the trigram model, and the amount of characters to generate
generate_new_text("TH", trigram_model, 10000)

Files and directories in ' ./books ' : ['Frankenstein.txt', 'LittleWomen.txt', 'MiddleMarch.txt', 'MobyDick.txt', 'PrideAndPrejudice.txt'] 

{'THE': 66321, 'TH ': 12253, 'THA': 15269, 'THI': 10000, 'THO': 4840, 'THS': 463, 'THU': 445, 'THR': 1768, 'TH.': 418, 'THY': 386, 'THW': 74, 'THL': 89, 'THB': 6, 'THQ': 14, 'THD': 70, 'THF': 99, 'THH': 16, 'THK': 4, 'THT': 16, 'THV': 1, 'THM': 18, 'THC': 6, 'THP': 6, 'THN': 4, 'THG': 4}
