# Trigram Approximation Model
Cleaned the text by converting to uppercase and removing unwanted characters

Regex from the standard library is used extensively to clean up this code: https://docs.python.org/3/library/re.html

In [6]:
import re
import os
from collections import defaultdict

def clean_text(text):
    # Clean the text by converting to uppercase and removing unwanted characters
    cleaned_text = re.sub(r'[^A-Z. ]', '', text.upper())
    # Replace multiple spaces with a single space
    cleaned_text = re.sub(r'\s{2,}', '', cleaned_text)
    return cleaned_text

# Debugging
sample_text = "It is what it is. An example sentence! 123"
cleaned_text = clean_text(sample_text)
print(cleaned_text)  # Output: 'IT IS WHAT IT IS. AN EXAMPLE SENTENCE'

IT IS WHAT IT IS. AN EXAMPLE SENTENCE 


# Built trigram model by counting occurrences of three-character sequences

I went with a defaultdict for my data structure. Whenever a new trigram is found, it’s automatically set to zero and then incremented over. This keeps things simple and effective, and avoids the need to move through the trigrams (to check if they're already present) because they'll be greater than 0.

DefaultDict from Collections: https://docs.python.org/3/library/collections.html

In [7]:
def build_trigram_model(text):
    model = defaultdict(int)
    for i in range(len(text) - 2):
        model[text[i:i+3]] += 1
    return model

# Debugging
text = "IT IS WHAT IT IS."
cleaned_text = clean_text(text)
trigram_model = build_trigram_model(cleaned_text)

# Display the trigram model
for trigram, count in trigram_model.items():
    print(f"{trigram}: {count}")

IT : 2
T I: 3
 IS: 2
IS : 1
S W: 1
 WH: 1
WHA: 1
HAT: 1
AT : 1
 IT: 1
IS.: 1


# Merge multiple trigram models into one for a collective count

In [8]:
def merge_trigram_models(models):
    merged_model = defaultdict(int)
    for model in models:
        for trigram, count in model.items():
            merged_model[trigram] += count
    return merged_model

# Example use with multiple books
book1_model = build_trigram_model(clean_text("First book text here..."))
book2_model = build_trigram_model(clean_text("Second book text here..."))
# Repeat for other books

merged_trigram_model = merge_trigram_models([book1_model, book2_model])

# Display the merged trigram model
for trigram, count in merged_trigram_model.items():
    print(f"{trigram}: {count}")

FIR: 1
IRS: 1
RST: 1
ST : 1
T B: 1
 BO: 2
BOO: 2
OOK: 2
OK : 2
K T: 2
 TE: 2
TEX: 2
EXT: 2
XT : 2
T H: 2
 HE: 2
HER: 2
ERE: 2
RE.: 2
E..: 2
...: 2
SEC: 1
ECO: 1
CON: 1
OND: 1
ND : 1
D B: 1


# Trim out the preamble and postamble from files

In [9]:
def read_trimmed_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Find the end of the preamble using the *** line thats in every file
    preamble_end_index = next(i for i, line in enumerate(lines) 
        if '*** START OF THE PROJECT GUTENBERG EBOOK' in line) + 1

    # Find the postamble
    postamble_start_index = len(lines) - next(i for i, line in enumerate(reversed(lines)) 
        if '*** END OF THE PROJECT GUTENBERG EBOOK' in line)

    # Extract the lines between preamble and postamble
    trimmed_lines = lines[preamble_end_index:postamble_start_index]

    return trimmed_lines

# Function to read and process text files from the 'texts/' folder

Uses OS library for file reading: https://docs.python.org/3/library/os.html

In [10]:
def process_text_files(folder):
    trigram_models = []
    for filename in os.listdir(folder):
        if filename.endswith('.txt'):
            with open(os.path.join(folder, filename), 'r', encoding='utf-8') as file:
                text = file.read()
                file_path = os.path.join(folder, filename)
                trimmed_content = read_trimmed_file(file_path)
                cleaned_text = clean_text(''.join(trimmed_content))
                trigram_model = build_trigram_model(cleaned_text)
                trigram_models.append(trigram_model)
    return trigram_models

# Debugging
# Example usage
if __name__ == "__main__":
    folder_path = "texts"  # Folder where the downloaded texts are stored
    trigram_models = process_text_files(folder_path)
    
    # Merge trigram models from all books
    merged_trigram_model = merge_trigram_models(trigram_models)
    
    # Display merged trigram model
    for trigram, count in sorted(merged_trigram_model.items(), key=lambda item: item[1], reverse=True):
        print(f"{trigram}: {count}")

 TH: 61905
THE: 51855
HE : 43147
AND: 28909
ND : 28285
 AN: 27717
ED : 25729
ING: 23273
NG : 20839
 TO: 20302
ER : 19646
TO : 17688
 OF: 17664
AT : 17310
 HE: 17170
OF : 16043
HER: 15779
 IN: 15778
AS : 14389
 A : 13723
 HA: 13577
IN : 13181
IS : 12634
HAT: 12566
RE : 12414
D T: 12314
 WH: 12303
E T: 12229
E A: 11926
 WA: 11653
 BE: 11611
THA: 11589
 I : 11464
 HI: 11049
E S: 10736
N T: 10531
ON : 10305
HIS: 10148
E W: 10122
EN : 10022
T T: 9956
ERE: 9922
S A: 9862
LL : 9817
 IT: 9674
ES : 9552
 WI: 9476
FOR: 9427
UT : 9295
YOU: 9266
OR : 9137
LY : 9099
D A: 8963
ME : 8915
 CO: 8768
IT : 8598
 FO: 8499
 YO: 8476
WAS: 8403
 NO: 8221
TH : 8213
NT : 8125
 SH: 8037
 ON: 7857
VER: 7773
S T: 7768
LE : 7709
 SO: 7679
ALL: 7615
T A: 7581
ITH: 7564
ENT: 7424
THI: 7416
 AS: 7364
E O: 7334
TER: 7289
E H: 7264
T I: 7242
WIT: 7177
 WE: 7112
ST : 7009
 MA: 6911
 SA: 6878
E I: 6712
AD : 6665
 ME: 6640
AN : 6555
LD : 6531
 ST: 6480
VE : 6395
D H: 6200
N A: 6197
D I: 6157
F T: 6135
OU : 6132
E B: 5874


# Task 2 - Trigram Approximation Generation



# Method for getting last two characters of a string

In [13]:
def get_last_two_chars(current_string):
    # Returns the last two characters of the current string
    return current_string[-2:]

# Debugging
current_string = "ABC"
last_two_chars = get_last_two_chars(current_string)
print(last_two_chars)  # Output: 'BC'

BC


# Find all trigrams throughout the model that start with the given two characters

In [15]:
def find_possible_trigrams(trigram_model, last_two_chars):
    # Finds all trigrams in the model that start with the given last two characters
    return {trigram: count for trigram, count in trigram_model.items() if trigram.startswith(last_two_chars)}

# Debugging
trigram_model = {'ABC': 1, 'BCD': 2, 'CDE': 3, 'DEF': 4}
last_two_chars = 'CD'
possible_trigrams = find_possible_trigrams(trigram_model, last_two_chars)
print(possible_trigrams)  # Output: {'CDE': 3}

{'CDE': 3}


In [17]:
import random

def weighted_random_choice(possible_trigrams):
    """Selects a character based on weighted probabilities from possible trigrams."""
    total_count = sum(possible_trigrams.values())
    rand_val = random.randint(1, total_count)
    
    cumulative_count = 0
    for trigram, count in possible_trigrams.items():
        cumulative_count += count
        if cumulative_count >= rand_val:
            return trigram[-1]  # Return the third character of the trigram
    return None

# Debugging
possible_trigrams = {'CDE': 3, 'CFG': 2, 'CHI': 1}
random_char = weighted_random_choice(possible_trigrams)
print(random_char)  # Output: 'E'

E


# Insert text here