**Large Language Model Processing**

Task 1: Third-order letter approximation model

In [18]:
import os
import re
from collections import defaultdict

# Function to clean the text
def clean_text(text):
    # Remove any character that is not an ASCII letter, space, or full stop
    cleaned_text = re.sub(r'[^A-Za-z. ]+', '', text)
    
    # Convert all letters to uppercase
    cleaned_text = cleaned_text.upper()
    
    return cleaned_text

# Function to generate trigrams and count occurrences
def generate_trigrams(text):
    trigram_counts = defaultdict(int)  # Dictionary to store trigram counts
    
    # Iterate over the text to extract trigrams
    for i in range(len(text) - 2):
        trigram = text[i:i+3]  # Get a sequence of three characters
        trigram_counts[trigram] += 1  # Increment the count for the trigram
    
    return trigram_counts

# Function to remove preamble and postamble, and clean the text
def remove_before_and_after(text, start_marker, end_marker):
    start_index = text.find(start_marker)
    if start_index != -1:
        text = text[start_index + len(start_marker):].strip()
    end_index = text.find(end_marker)
    if end_index != -1:
        text = text[:end_index].strip()
    return clean_text(text)  # Clean the text after removing preamble/postamble

# Function to read text files
def read_text_file(file_path):
    start_marker = " ***"
    end_marker = "*** "
    with open(file_path, 'r', encoding='utf-8') as f:
        file_content = f.read()  # Read the entire file content
        cleaned_content = remove_before_and_after(file_content, start_marker, end_marker)  # Remove before start and after end marker
        return cleaned_content

# Function to iterate over files in the given directory and read text files
def read_books(path):
    os.chdir(path)
    books_contents = []  # List to store the content of each file
    for file in os.listdir():
        if file.endswith(".txt"):
            file_path = os.path.join(path, file)
            books_contents.append(read_text_file(file_path))  # Read and clean content
    return books_contents  # Return the list of cleaned book contents

# Main function to process the books and generate the trigram model
def process_books_for_trigrams(path):
    books_contents = read_books(path)  # Read and clean all books
    overall_trigram_counts = defaultdict(int)  # Dictionary to store combined trigram counts
    
    # Iterate over each book's content
    for content in books_contents:
        # Generate trigram counts for the book
        book_trigram_counts = generate_trigrams(content)
        
        # Update the overall trigram counts
        for trigram, count in book_trigram_counts.items():
            overall_trigram_counts[trigram] += count
    
    return dict(overall_trigram_counts)


In [19]:
# Example usage 
path = r"C:\Users\ronan\OneDrive - Atlantic TU\Desktop\Documents\GitHub\Emerging-Technologies\tasks\project_gutenberg"

# Example of how to call the function
# The following assumes that you have downloaded the books from Project Gutenberg
# and placed them in a directory. Replace 'your_directory_path' with the correct path.

trigram_model = process_books_for_trigrams(path)
print(trigram_model)

{'ILL': 3198, 'LLU': 277, 'LUS': 318, 'UST': 1884, 'STR': 1896, 'TRA': 1581, 'RAT': 1316, 'ATI': 2910, 'TIO': 3941, 'ION': 5059, 'ONA': 536, 'NAL': 545, 'ALI': 1074, 'LIC': 774, 'ICE': 1432, 'CES': 908, 'ES ': 6848, 'S A': 6840, ' AD': 841, 'ADV': 283, 'DVE': 65, 'VEN': 1511, 'ENT': 6183, 'NTU': 267, 'TUR': 2030, 'URE': 2078, 'RES': 3161, 'S I': 3589, ' IN': 12295, 'IN ': 10004, 'N W': 1915, ' WO': 2910, 'WON': 396, 'OND': 926, 'NDE': 2381, 'DER': 2125, 'ERL': 375, 'RLA': 72, 'LAN': 1270, 'AND': 18105, 'NDB': 73, 'DBY': 53, 'BY ': 2653, 'Y L': 817, ' LE': 2267, 'LEW': 114, 'EWI': 207, 'WIS': 508, 'IS ': 9831, 'S C': 1686, ' CA': 3956, 'CAR': 851, 'ARR': 953, 'RRO': 420, 'ROL': 223, 'OLL': 950, 'LLT': 91, 'LTH': 241, 'THE': 39699, 'HE ': 31969, 'E M': 4216, ' MI': 2312, 'MIL': 709, 'LLE': 1224, 'LEN': 882, 'ENN': 419, 'NNI': 264, 'NIU': 20, 'IUM': 75, 'UM ': 99, 'M F': 265, ' FU': 563, 'FUL': 1117, 'ULC': 18, 'LCR': 5, 'CRU': 168, 'RUM': 97, 'M E': 118, ' ED': 92, 'EDI': 689, 'DIT': 295