In [258]:
from semantic_text_splitter import TextSplitter
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import numpy as np
import PyPDF2
import os
import math
import textwrap
from typing import List, Optional
import pandas as pd
import os
import re
import wordninja

from io import StringIO

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
# If using the modern openai client:
# pip install openai
import openai

## Set Directory for Comment Documents

In [259]:
dir = r"C:/Users/Eric.Englin/DOT OST\Volpe-Group-JPODataProgram - ROADII/Lab/Use Cases/Public Comments/attachments"
all_entries = os.listdir(dir)
print(len(all_entries))

163


In [265]:
def convert_pdf_to_string(file_path):
    file_path = dir + "/" + file_path
    output_string = StringIO()
    with open(file_path, 'rb') as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)

    return(output_string.getvalue())




## Deterministic Chunker

In [106]:
def chunk_by_size(this_attachment_text):
    document_list = []
    #content_list = []
    chunks = []
    # Sentence-aware splitting: split into sentences then pack
    sentence_end_re = re.compile(r'(?<=[.!?])\s+')
    # Keep original paragraphs roughly: split into sentences, but first normalize whitespace
    sentences = sentence_end_re.split(this_attachment_text)
    cur_len = 0
    cur = []
    for s in sentences:
        s = wordninja.split(s)
        # Join back with spaces
        s = " ".join(s)
      #  print(s)

        if cur_len + len(s) > 800:
            cur.append(s)
            document_list.append(x)
            chunks.append(" ".join(cur))
            cur_len = 0       
            #print(len(document_list), len(chunks))
           # print(cur)
            cur = []
        else:
            cur.append(s)
            cur_len += len(s) 

    chunk_df = pd.DataFrame({
        "Document":document_list,
        "Chunk": chunks
    })
    return chunk_df

## Extract Chunks

In [266]:
chunk_df = pd.DataFrame(columns=['Document', 'Chunk'])
content_list = []
for x in all_entries[0:3]:
   # document_list.append(x)
    this_attachment_text = convert_pdf_to_string(x)
    this_attachment_text = re.sub(r'\n', ' ', this_attachment_text)
    content_list.append(this_attachment_text)
    this_chunk_df = chunk_by_size(this_attachment_text)
    chunk_df = pd.concat([chunk_df, this_chunk_df])


In [272]:
chunk_df.shape

(169, 2)

In [273]:
chunk_df.head()

Unnamed: 0,Document,Chunk
0,EPA-HQ-OAR-2025-0194-0094_attachment_1.pdf,Research A Section 508 conform ant HTML versio...
1,EPA-HQ-OAR-2025-0194-0094_attachment_1.pdf,OBJECTIVE The aim of this study was to investi...
2,EPA-HQ-OAR-2025-0194-0094_attachment_1.pdf,After adjusting for lifestyle and medical hist...
3,EPA-HQ-OAR-2025-0194-0094_attachment_1.pdf,This n ding could be use ful for creating a po...
4,EPA-HQ-OAR-2025-0194-0094_attachment_1.pdf,To meet the goals of the Paris Agreement a red...


In [271]:
chunk_df.to_excel("chunks_attempt.xlsx")

## OpenAI API 

#### OpenAI Setup and Quick Test

In [274]:
with open('./../openai_key.txt', 'r') as file:
    OPENAI_API_KEY = file.read()
   # print(OPENAI_API_KEY)

openai.api_key = OPENAI_API_KEY


In [275]:
client = openai.OpenAI(
    api_key=openai.api_key,
    base_url="http://10.75.42.137:4000/" 
)

response = client.chat.completions.create(
    model="GPT-4.1-nano", # model to send to the proxy
    messages = [
        {
            "role": "user",
            "content": "this is a test request"
        }
    ]
)

print(response.choices[0].message.content)

Hello! Your test request has been received. How can I assist you today?


#### Method 1: Relies entirely on OpenAI to Chunk Comments

In [281]:
text = content_list[0]

In [282]:
user_prompt = f"""
    Instructions:
    - Split the entire text into smaller comments/chunks appropriate for posting as separate comments.
    - Aim for roughly {desired_chunk_chars} characters per chunk, but respect sentence boundaries and coherence.
    - If the text contains lists, code blocks, or special sections, keep them intact where possible.
    - Output must be a valid JSON array of strings, e.g. ["chunk1", "chunk2", ...].
    - Do not include any extra prose or explanation.
    - Clean up text where words may be spaces may be incorrectly missing or needed
    - Clean up text where words may be mispelled
    - Make sure to do the entire document not just the first page
    - The length of the final output must be roughly similar to the length of the input text

    Text:
    \"\"\"
    {text}
    \"\"\"

    """



In [283]:
resp = client.chat.completions.create(
    model="GPT-4.1-nano",
    messages=[
        {"role": "system", "content": "You are a helpful assistant that splits a long comment or text into smaller, "
    "readable comment-sized chunks. Each chunk should be a coherent piece of text (complete sentences if possible), "
    f"roughly around {desired_chunk_chars} characters (give or take). "
    "Do not invent new content. Preserve meaning and sentence boundaries. "
    "Return output as a JSON array of strings and nothing else."},
        {"role": "user", "content": user_prompt},
    ],
    temperature=0.0,
    max_tokens=20000,
)


In [284]:
tot = 0
for x in result:
    tot = tot + len(x)

print(tot, len(text))
print(round(tot/len(text)*100,2), "% of original text remained in chunked output")

14674 60466
24.27 % of original text remained in chunked output


#### Method 2: Use Non-OpenAI Chunks as Input, then Have OpenAI Enhance Chunks

In [285]:
this_doc_chunks = chunk_df.loc[chunk_df['Document']=="EPA-HQ-OAR-2025-0194-0094_attachment_1.pdf"]

In [286]:
this_chunk_list = this_doc_chunks['Chunk'].tolist()

In [287]:
len(this_chunk_list)

55

In [288]:
target_chunk_chars = 800

In [289]:
user_prompt = f"""
    "You will be given a sequence of numbered text chunks that together form a document.\n\n"
    "Rules (must follow exactly):\n"
    "1) Do NOT summarize, delete, reorder, or invent content. You may only split or merge contiguous text boundaries and normalize whitespace.\n"
    f"2) Aim for roughly {target_chunk_chars} characters per returned chunk, respecting sentence boundaries when possible.\n"
    "3) The returned chunks must be contiguous substrings of the concatenation of the original chunks (you may merge adjacent chunks before re-splitting).\n"
    "4) Output MUST be a single valid JSON array of strings, e.g. [\"chunk1\",\"chunk2\",...]. NOTHING else. No commentary, no metadata.\n\n"
    "INPUT CHUNKS (do not change the labels):\n\n"
    \"\"\"
    {this_chunk_list}
    \"\"\"

    """



In [290]:
resp = client.chat.completions.create(
    model="GPT-4.1-nano",
    messages=[
        {"role": "system", "content": "You only modify chunk boundaries. Under no circumstances change the text content "
            "other than whitespace normalization. Return a valid JSON array of strings and nothing else."
},
        {"role": "user", "content": user_prompt},
    ],
    temperature=0.0,
    max_tokens=20000,
)


In [291]:
raw = resp.choices[0].message.content.strip()
new_chunks = json.loads(raw)

In [292]:
raw = resp.choices[0].message.content.strip()
print(len(raw), len(text))
print(round(len(raw)/len(text)*100,2), "% of original text remained in chunked output")

57727 60466
95.47 % of original text remained in chunked output
