In [1]:
import fitz  # PyMuPDF
import re
import os

def convert_pdf_to_txt(pdf_path, txt_path):
    """Extract the text of a PDF, remove digits, and save it as a text file.

    The text of every page in ``pdf_path`` is read with PyMuPDF and joined in
    page order. All digit runs are deleted and every run of whitespace
    (newlines included) is collapsed to one space, so the saved text is a
    single line with no leading or trailing whitespace.

    Args:
        pdf_path: Path of the PDF file to read.
        txt_path: Path of the text file to create (overwritten if it exists).
            The file is written as UTF-8.
    """
    with fitz.open(pdf_path) as pdf_document:
        # Join once instead of growing a string page by page.
        text = "".join(
            pdf_document[page_num].get_text()
            for page_num in range(pdf_document.page_count)
        )

    # Delete digits first, then collapse whitespace, and strip last: removing
    # digits at either end of the text can expose whitespace that an earlier
    # strip() would have missed.
    without_digits = re.sub(r'\d+', '', text)
    cleaned_text = re.sub(r'\s+', ' ', without_digits).strip()

    # An explicit encoding keeps the output independent of the platform
    # default, which may be unable to encode some extracted characters.
    with open(txt_path, 'w', encoding='utf-8') as file:
        file.write(cleaned_text)

def process_pdfs(input_folder, output_folder):
    """Convert every PDF file in ``input_folder`` to a cleaned text file.

    Each ``<name>.pdf`` (extension matched case-insensitively) is passed to
    ``convert_pdf_to_txt`` and written to ``<name>_cleaned.txt`` inside
    ``output_folder``. Sub-folders of ``input_folder`` are not searched.

    Args:
        input_folder: Directory containing the PDF files.
        output_folder: Directory for the text files; created if missing.
    """
    # Create the destination here so the function does not depend on the
    # caller having prepared it.
    os.makedirs(output_folder, exist_ok=True)

    # os.listdir() returns entries in arbitrary order; sort so repeated runs
    # process the files in the same sequence.
    for pdf_file in sorted(os.listdir(input_folder)):
        pdf_path = os.path.join(input_folder, pdf_file)

        # Skip other file types and directories that merely end in ".pdf".
        if not pdf_file.lower().endswith('.pdf') or not os.path.isfile(pdf_path):
            continue

        # Output name: the PDF's base name plus a "_cleaned.txt" suffix.
        txt_filename = os.path.splitext(pdf_file)[0] + '_cleaned.txt'
        txt_path = os.path.join(output_folder, txt_filename)

        convert_pdf_to_txt(pdf_path, txt_path)

# Run the conversion: PDFs are read from 'opinions/' and the cleaned text
# files are written to 'text_opinions/'.
input_folder, output_folder = 'opinions/', 'text_opinions/'

# exist_ok=True means an already existing directory is not an error,
# so this cell can be re-run safely.
os.makedirs(output_folder, exist_ok=True)
process_pdfs(input_folder, output_folder)


In [4]:
import os
import re

def separate_opinions(input_text):
    """Return the text that follows the first "Opinion of the Court" heading.

    The extracted text starts right after the first occurrence of
    "Opinion of the Court" and ends just before the first
    "Opinion of <Surname>, J." heading ("C. J." is accepted as well), or at
    the end of the input when no such heading follows. Later repetitions of
    "Opinion of the Court" do not end the match.

    Args:
        input_text: Full text to search.

    Returns:
        A list holding the single extracted string (whitespace-stripped), or
        an empty list when "Opinion of the Court" does not occur.
    """
    match = re.search(
        # Surname: a capital followed by letters of either case, so both
        # "Gorsuch" and "GORSUCH" end the match. "(?:C\. ?)?" also accepts
        # the Chief Justice form "C. J." / "C.J.".
        r'Opinion of the Court(.*?)(?=Opinion of [A-Z][A-Za-z]*, (?:C\. ?)?J\.|$)',
        input_text,
        re.DOTALL,  # the opinion may span line breaks
    )

    if match is None:
        return []
    return [match.group(1).strip()]

def process_and_save_opinions(input_folder, output_folder):
    """Extract opinions from every .txt file in a folder and save them.

    Each ``.txt`` file (extension matched case-insensitively) in
    ``input_folder`` is read and passed to ``separate_opinions``. Every
    string it returns is written to ``<name>_opinion_<n>.txt`` in
    ``output_folder``, with ``n`` starting at 1, and a "Saved: ..." line is
    printed for each file written. Files for which nothing is returned
    produce no output.

    Args:
        input_folder: Directory containing the UTF-8 text files to process.
        output_folder: Directory for the extracted opinions; created if
            missing. Existing files with the same names are overwritten.
    """
    # Create the destination here so the function does not depend on the
    # caller having prepared it.
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so the processing (and printed) order is the same on every run.
    for txt_file in sorted(os.listdir(input_folder)):
        txt_path = os.path.join(input_folder, txt_file)

        # Skip other file types and directories that merely end in ".txt".
        if not txt_file.lower().endswith('.txt') or not os.path.isfile(txt_path):
            continue

        # Explicit encoding: do not depend on the platform default.
        with open(txt_path, 'r', encoding='utf-8') as file:
            input_text = file.read()

        base_name = os.path.splitext(txt_file)[0]
        for i, opinion_text in enumerate(separate_opinions(input_text), start=1):
            output_filename = f'{base_name}_opinion_{i}.txt'
            output_path = os.path.join(output_folder, output_filename)

            with open(output_path, 'w', encoding='utf-8') as output_file:
                output_file.write(opinion_text)

            print(f"Saved: {output_filename}")

# Read the cleaned text files from 'text_opinions' and write the extracted
# opinions to 'first_opinion'.
input_folder, output_folder = 'text_opinions', 'first_opinion'

# exist_ok=True means an already existing directory is not an error,
# so this cell can be re-run safely.
os.makedirs(output_folder, exist_ok=True)

# Extract from every .txt file in input_folder and save into output_folder.
process_and_save_opinions(input_folder, output_folder)



Saved: 598us1r10_febh_cleaned_opinion_1.txt
Saved: 598us1r11_ca7d_cleaned_opinion_1.txt
Saved: 598us1r12_l5gm_cleaned_opinion_1.txt
Saved: 598us1r13_3e04_cleaned_opinion_1.txt
Saved: 598us1r14_3e04_cleaned_opinion_1.txt
Saved: 598us1r3_j4ek_cleaned_opinion_1.txt
Saved: 598us1r4_1a7d_cleaned_opinion_1.txt
Saved: 598us1r5_1b72_cleaned_opinion_1.txt
Saved: 598us1r6_0813_cleaned_opinion_1.txt
Saved: 598us1r7_3dq4_cleaned_opinion_1.txt
Saved: 598us1r8_limq_cleaned_opinion_1.txt
Saved: 598us1r9_8mjp_cleaned_opinion_1.txt
Saved: 598us2r15_d1o2_cleaned_opinion_1.txt
Saved: 598us2r16_4gdj_cleaned_opinion_1.txt
Saved: 598us2r17_c0nd_cleaned_opinion_1.txt
Saved: 598us2r18_g2bh_cleaned_opinion_1.txt
Saved: 598us2r19_dc8f_cleaned_opinion_1.txt
Saved: 598us2r20_b07d_cleaned_opinion_1.txt
Saved: 598us2r21_5425_cleaned_opinion_1.txt
Saved: 598us2r22_hejm_cleaned_opinion_1.txt
Saved: 598us2r23_6k47_cleaned_opinion_1.txt
Saved: 598us2r24_m6hn_cleaned_opinion_1.txt
Saved: 598us2r27_aplc_cleaned_opinion_1

In [5]:
import os
import re

def separate_opinions(input_text):
    """Return the text that follows the first "Opinion of the Court" heading.

    The extracted text starts right after the first occurrence of
    "Opinion of the Court" and ends at the first of:

    * "concurring in judgment" (a directly preceding "<Surname>, J., " for
      one of the listed Justices is cut off together with it),
    * "<Surname>, J., dissenting" for one of the listed Justices
      ("Roberts, C. J., dissenting" for the Chief Justice),
    * the end of the input.

    Surnames are matched case-insensitively. NOTE: the same wording inside
    the Court's opinion, e.g. in a citation parenthetical, also ends the
    match.

    Args:
        input_text: Full text to search.

    Returns:
        A list holding the single extracted string (whitespace-stripped), or
        an empty list when "Opinion of the Court" does not occur.
    """
    justices = 'Thomas|Alito|Sotomayor|Kagan|Gorsuch|Kavanaugh|Barrett|Jackson'
    # Author part of a heading: "<Surname>, J." or "Roberts, C. J.".
    # (?i:...) limits case-insensitivity to this part of the pattern.
    author = r'(?i:Roberts, C\. ?J\.|(?:' + justices + r'), J\.)'

    pattern = (
        r'Opinion of the Court(.*?)(?:'
        # The optional author keeps "<Surname>, J.," from being left dangling
        # at the end of the extracted text.
        r'(?:' + author + r', )?concurring in judgment'
        r'|' + author + r', dissenting'
        r'|$)'
    )
    match = re.search(pattern, input_text, re.DOTALL)

    if match is None:
        return []
    return [match.group(1).strip()]

def process_and_save_opinions(input_folder, output_folder):
    """Run ``separate_opinions`` on each .txt file and save what it returns.

    Every ``.txt`` file (extension matched case-insensitively) found directly
    in ``input_folder`` is read and handed to ``separate_opinions``. Each
    returned string is saved as ``<name>_opinion_<n>.txt`` in
    ``output_folder`` (``n`` counts from 1) and reported with a
    "Saved: ..." line. Nothing is written for a file that yields no strings.

    Args:
        input_folder: Directory containing the UTF-8 text files to process.
        output_folder: Directory for the extracted opinions; created if
            missing. Existing files with the same names are overwritten.
    """
    # Make sure the destination exists even when the caller did not create it.
    os.makedirs(output_folder, exist_ok=True)

    # os.listdir() order is arbitrary; sorting makes runs reproducible.
    for txt_file in sorted(os.listdir(input_folder)):
        txt_path = os.path.join(input_folder, txt_file)

        # Only regular files with a .txt extension are processed.
        if not txt_file.lower().endswith('.txt') or not os.path.isfile(txt_path):
            continue

        # Read and write with an explicit encoding rather than the platform
        # default.
        with open(txt_path, 'r', encoding='utf-8') as file:
            input_text = file.read()

        base_name = os.path.splitext(txt_file)[0]
        for i, opinion_text in enumerate(separate_opinions(input_text), start=1):
            output_filename = f'{base_name}_opinion_{i}.txt'
            output_path = os.path.join(output_folder, output_filename)

            with open(output_path, 'w', encoding='utf-8') as output_file:
                output_file.write(opinion_text)

            print(f"Saved: {output_filename}")

# Source of the cleaned text files ('text_opinions') and destination for the
# extracted opinions ('first_opinion').
input_folder, output_folder = 'text_opinions', 'first_opinion'

# exist_ok=True means an already existing directory is not an error,
# so this cell can be re-run safely.
os.makedirs(output_folder, exist_ok=True)

# Extract from every .txt file in input_folder and save into output_folder.
process_and_save_opinions(input_folder, output_folder)


Saved: 598us1r10_febh_cleaned_opinion_1.txt
Saved: 598us1r11_ca7d_cleaned_opinion_1.txt
Saved: 598us1r12_l5gm_cleaned_opinion_1.txt
Saved: 598us1r13_3e04_cleaned_opinion_1.txt
Saved: 598us1r14_3e04_cleaned_opinion_1.txt
Saved: 598us1r3_j4ek_cleaned_opinion_1.txt
Saved: 598us1r4_1a7d_cleaned_opinion_1.txt
Saved: 598us1r5_1b72_cleaned_opinion_1.txt
Saved: 598us1r6_0813_cleaned_opinion_1.txt
Saved: 598us1r7_3dq4_cleaned_opinion_1.txt
Saved: 598us1r8_limq_cleaned_opinion_1.txt
Saved: 598us1r9_8mjp_cleaned_opinion_1.txt
Saved: 598us2r15_d1o2_cleaned_opinion_1.txt
Saved: 598us2r16_4gdj_cleaned_opinion_1.txt
Saved: 598us2r17_c0nd_cleaned_opinion_1.txt
Saved: 598us2r18_g2bh_cleaned_opinion_1.txt
Saved: 598us2r19_dc8f_cleaned_opinion_1.txt
Saved: 598us2r20_b07d_cleaned_opinion_1.txt
Saved: 598us2r21_5425_cleaned_opinion_1.txt
Saved: 598us2r22_hejm_cleaned_opinion_1.txt
Saved: 598us2r23_6k47_cleaned_opinion_1.txt
Saved: 598us2r24_m6hn_cleaned_opinion_1.txt
Saved: 598us2r27_aplc_cleaned_opinion_1