In [1]:
import re
import os

def extract_text_between_markers(input_text):
    # Use a regular expression to capture all text between "CHIEF JUSTICE ROBERTS:" and the next occurrence of "MR.", "MRS.", "MS.", or "JUSTICE"
    pattern = re.compile(r'CHIEF JUSTICE ROBERTS:(.*?)(?:MR\.|MRS\.|MS\.|JUSTICE\b|$)')
    matches = pattern.finditer(input_text)
    count = 0
    text = ""

    for match in matches:
        # Extract the matched text
        extracted_text = match.group(1).strip()
        text += extracted_text
        count += 1

    # Remove all text following the phrase "Heritage Reporting Corporation Official $"
    end_marker = "Heritage Reporting Corporation"
    end_index = text.find(end_marker)
    if end_index != -1:
        text = text[:end_index]

    return text, count

def process_and_save_text(input_folder, output_folder):
    # Process all text files in the input folder
    for txt_file in os.listdir(input_folder):
        if txt_file.lower().endswith('.txt'):
            txt_path = os.path.join(input_folder, txt_file)

            # Generate a unique name for the output text file
            extracted_filename = os.path.splitext(txt_file)[0] + '_extracted.txt'
            extracted_path = os.path.join(output_folder, extracted_filename)

            # Read text from the file
            with open(txt_path, 'r') as file:
                input_text = file.read()

            # Extract relevant text and count occurrences
            extracted_text, occurrences = extract_text_between_markers(input_text)

            # Save the extracted text to a new file
            with open(extracted_path, 'w') as file:
                file.write(extracted_text)

            print(f"Processed: {txt_file}, Number of occurrences: {occurrences}")

# Input and output folders
input_folder = 'text_files'
output_folder = 'roberts'

# Create the output directory if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Process and save text from input_folder to output_folder
process_and_save_text(input_folder, output_folder)

Processed: 22-193_8nk0_cleaned.txt, Number of occurrences: 42
Processed: 22-324_fe9g_cleaned.txt, Number of occurrences: 27
Processed: 22-340_ca7d_cleaned.txt, Number of occurrences: 26
Processed: 22-429_4315_cleaned.txt, Number of occurrences: 34
Processed: 22-448_4f15_cleaned.txt, Number of occurrences: 18
Processed: 22-500_1o23_cleaned.txt, Number of occurrences: 15
Processed: 22-585_7648_cleaned.txt, Number of occurrences: 24
Processed: 22-611_1b8e_cleaned.txt, Number of occurrences: 21
Processed: 22-6389_8n59_cleaned.txt, Number of occurrences: 23
Processed: 22-660_apm1_cleaned.txt, Number of occurrences: 29
Processed: 22-666_f2ah_cleaned.txt, Number of occurrences: 23
Processed: 22-704_g3bi_cleaned.txt, Number of occurrences: 17
Processed: 22-721_3e04_cleaned.txt, Number of occurrences: 24
Processed: 22-800_097c_cleaned.txt, Number of occurrences: 24
Processed: 22-807_o7jp_cleaned.txt, Number of occurrences: 43
Processed: 22-846_b07d_cleaned.txt, Number of occurrences: 25
Process