# Trigram Approximation Model
Cleaned the text by converting to uppercase and removing unwanted characters

Regular expressions from the standard library's `re` module are used extensively to clean up the text: https://docs.python.org/3/library/re.html

In [1]:
import re
import os
from collections import defaultdict

def clean_text(text):
    """Normalise raw text to the alphabet used by the trigram model.

    The text is upper-cased and reduced to the characters ``A``-``Z``,
    full stop and space. Whitespace such as newlines and tabs is treated
    as a word separator, and every run of spaces is collapsed to a single
    space. Leading and trailing spaces are not stripped.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string.
    """
    # Turn newlines/tabs into spaces *before* filtering; otherwise they are
    # deleted and the words on either side of a line break get fused.
    spaced = re.sub(r'\s', ' ', text.upper())
    # Drop everything outside the model's alphabet.
    kept = re.sub(r'[^A-Z. ]', '', spaced)
    # Removing characters can leave runs of spaces ("A - B" -> "A  B");
    # collapse each run to ONE space rather than deleting it.
    return re.sub(r' {2,}', ' ', kept)

# Debugging: run the cleaner on a short sample and show the result.
sample_text = "It is what it is. An example sentence! 123"
cleaned_text = clean_text(sample_text)
# The '!' and the digits are removed but the space between them is kept,
# so the printed string ends with a single trailing space.
print(cleaned_text)  # Output: 'IT IS WHAT IT IS. AN EXAMPLE SENTENCE '

IT IS WHAT IT IS. AN EXAMPLE SENTENCE 


# Built trigram model by counting occurrences of three-character sequences

I went with a defaultdict for my data structure. The first time a trigram is seen, its count is automatically initialised to zero and then incremented. This keeps things simple and effective, and avoids having to check whether a trigram is already present before updating its count.

DefaultDict from Collections: https://docs.python.org/3/library/collections.html

In [2]:
def build_trigram_model(text):
    """Count every overlapping three-character window in ``text``.

    Args:
        text: The string to scan.

    Returns:
        A ``defaultdict(int)`` mapping each trigram to the number of times
        it occurs. Text shorter than three characters yields an empty model.
    """
    counts = defaultdict(int)
    # Walk three views of the text offset by 0, 1 and 2 characters; zip stops
    # at the shortest view, so only complete trigrams are produced.
    for first, second, third in zip(text, text[1:], text[2:]):
        counts[first + second + third] += 1
    return counts

# Debugging: build a model from one short sentence.
text = "IT IS WHAT IT IS."
cleaned_text = clean_text(text)
trigram_model = build_trigram_model(cleaned_text)

# Display the trigram model, one "trigram: count" line per entry
for trigram, count in trigram_model.items():
    print(f"{trigram}: {count}")

IT : 2
T I: 3
 IS: 2
IS : 1
S W: 1
 WH: 1
WHA: 1
HAT: 1
AT : 1
 IT: 1
IS.: 1


# Merge multiple trigram models into one for a collective count

In [3]:
def merge_trigram_models(models):
    """Combine several trigram models into one by summing their counts.

    Args:
        models: An iterable of mappings from trigram to count.

    Returns:
        A new ``defaultdict(int)`` holding, for each trigram, the total of
        its counts across all the given models. The inputs are not modified.
    """
    combined = defaultdict(int)
    # Flatten every (trigram, count) pair from every model into one stream.
    all_pairs = (pair for single in models for pair in single.items())
    for key, occurrences in all_pairs:
        combined[key] += occurrences
    return combined

# Example use with multiple books: one model per (placeholder) book text
book1_model = build_trigram_model(clean_text("First book text here..."))
book2_model = build_trigram_model(clean_text("Second book text here..."))
# Repeat for other books

# Sum the per-book counts into a single model
merged_trigram_model = merge_trigram_models([book1_model, book2_model])

# Display the merged trigram model, one "trigram: count" line per entry
for trigram, count in merged_trigram_model.items():
    print(f"{trigram}: {count}")

FIR: 1
IRS: 1
RST: 1
ST : 1
T B: 1
 BO: 2
BOO: 2
OOK: 2
OK : 2
K T: 2
 TE: 2
TEX: 2
EXT: 2
XT : 2
T H: 2
 HE: 2
HER: 2
ERE: 2
RE.: 2
E..: 2
...: 2
SEC: 1
ECO: 1
CON: 1
OND: 1
ND : 1
D B: 1


# Trim out the preamble and postamble from files

In [None]:
def read_trimmed_file(file_path):
    """Read a Project Gutenberg text file and return only the book's body.

    The body is everything strictly between the first line containing the
    ``*** START OF THE PROJECT GUTENBERG EBOOK`` marker and the last line
    containing the ``*** END OF THE PROJECT GUTENBERG EBOOK`` marker.
    Neither marker line is included in the result.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        A list of the body lines, each still carrying its line ending.

    Raises:
        ValueError: If either marker line is missing from the file.
    """
    start_marker = '*** START OF THE PROJECT GUTENBERG EBOOK'
    end_marker = '*** END OF THE PROJECT GUTENBERG EBOOK'

    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # The body begins on the line after the first START marker.
    preamble_end_index = next(
        (i + 1 for i, line in enumerate(lines) if start_marker in line),
        None,
    )
    if preamble_end_index is None:
        raise ValueError(f"START marker not found in {file_path!r}")

    # Scan backwards for the last END marker. Its own index is used as the
    # exclusive slice bound so the marker line is left out of the body.
    postamble_start_index = next(
        (i for i in range(len(lines) - 1, -1, -1) if end_marker in lines[i]),
        None,
    )
    if postamble_start_index is None:
        raise ValueError(f"END marker not found in {file_path!r}")

    return lines[preamble_end_index:postamble_start_index]

# Function to read and process text files from the 'texts/' folder

Uses OS library for file reading: https://docs.python.org/3/library/os.html

In [None]:
def process_text_files(folder):
    """Build one trigram model for every ``.txt`` file in ``folder``.

    Each file is trimmed with ``read_trimmed_file``, normalised with
    ``clean_text`` and counted with ``build_trigram_model``.

    Args:
        folder: Path of the directory holding the text files.

    Returns:
        A list with one trigram model per ``.txt`` file, ordered by
        file name.
    """
    trigram_models = []
    # os.listdir returns entries in arbitrary order; sort so that the order
    # of the returned models is reproducible from run to run.
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith('.txt'):
            continue
        file_path = os.path.join(folder, filename)
        # read_trimmed_file opens the file itself, so there is no need to
        # open and read it here as well.
        trimmed_content = read_trimmed_file(file_path)
        cleaned_text = clean_text(''.join(trimmed_content))
        trigram_models.append(build_trigram_model(cleaned_text))
    return trigram_models

# Debugging / example usage: model every book in the texts folder and print
# the combined trigram counts.
if __name__ == "__main__":
    # Folder where the downloaded texts are stored
    folder_path = "texts"
    trigram_models = process_text_files(folder_path)

    # Collapse the per-book models into a single set of counts
    merged_trigram_model = merge_trigram_models(trigram_models)

    # Most frequent trigram first. Sorting ascending on the negated count
    # orders the entries, including ties, the same way reverse=True does.
    for trigram, count in sorted(merged_trigram_model.items(), key=lambda item: -item[1]):
        print(f"{trigram}: {count}")