# Install necessary packages 

In [None]:
%pip install PyPDF2
%pip install pytesseract
%pip install pdf2image
%pip install pdfplumber
%pip install python-imaging
%pip install langchain openai pypdf faiss-cpu
%pip install python-docx
%pip install exceptions
%pip install pytesseract pdf2image python-docx PyPDF2 pyth
%pip install striprtf
%pip install tiktoken
%pip install pinecone-client
%pip install docx
%pip install pyth

In [None]:
%pip install --upgrade openai

# Set up relevant functions for Text Vectorization and GPT-4 Querying 

In [54]:
import openai
import base64
import os
import glob
import io
import pytesseract
import pdf2image
import docx
import pinecone
import time
import tiktoken
import uuid
import PyPDF2
import pandas as pd

from pinecone import Pinecone
from pinecone import ServerlessSpec

from PyPDF2 import PdfReader
from PIL import Image
from openai import OpenAI
from striprtf.striprtf import rtf_to_text

from tqdm.auto import tqdm
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA, ConversationalRetrievalChain

# Extracts text from DOCX files
def read_docx(file_path):
    doc = docx.Document(file_path)
    return "\n".join([paragraph.text for paragraph in doc.paragraphs])

# Extracts text from RTF files
def read_rtf(file_path):
    with open(file_path, "rb") as file:
        doc = Rtf15Reader.read(file)
    return PlaintextWriter.write(doc).getvalue()

# Function to read text from PDF of scanned documents using Optical Character Recognition (OCR)
def ocr_pdf(file_path):
    
    # Convert each page to an image
    images = pdf2image.convert_from_path(file_path)
    text = ""
    
    for image in images:
        # Use Tesseract to do OCR on the image
        text += pytesseract.image_to_string(image)
    
    return text

# Function to read text from text-encoded PDF
def read_pdf(file_path):
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text()
    return text

# Get page count of a PDF
def get_pdf_page_count(file_path):
    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            return len(pdf_reader.pages)
    except Exception as e:
        print(f"Error: {e}")
        return None

# Efficient function for processing any PDF. Attempts to transcribe encoded text first and uses OCR if that fails
def process_pdf(file_path):
    try:
        output = read_pdf(file_path)
        
        # If less than 30 words are extracted per page on average, assume failure and use OCR
        if (len(output) < (get_pdf_page_count(file_path) * 30 * 7)):
            output = ocr_pdf(file_path)
            
        return output
    except:
        return ocr_pdf(file_path)
    
# Function to read and combine all files in a single folder
def concatenate_pdfs_in_folder(folder_path):
    text = ""
    for file_path in glob.glob(os.path.join(folder_path, '*')):
        if file_path.lower().endswith('.pdf'):
            text += process_pdf(file_path) + "\n\n\n NEW FILE BEGINS \n\n\n"
    return text

# Function to make a simple text-based query to the OpenAI API
def text_query_gpt(definitions, document_text):
    
    prompt_text = f"The following are definitions of carefully identified exogenous variables for projects that undergo NEPA Review, please read them all very carefully and become deeply and intimately familiar with every detail:\n\n" + f"{definitions}\n\n" + f"Read the following NEPA Environmental Impact Statements (these may include draft and/or final environmental impact statements) and related comment letters:\n\n" + f"\n\n {document_text} \n\n" + f"Based on the definitions above, indicate the presence (1) or absence (0) of each exogenous variable in the highlighted portions of the EIS extract and related documents. Please, be meticulous in your search and systematic in your logic, methods, and justifications for the presence of each variable (listed again here for clarity: [Lack of Federal Funding/Strained Resources for NEPA Compliance Teams, Influence of NEPA Litigation, Property/Land Rights Disputes, Mid-project financing issues/loss of project funding, Compliance with ESA triggered during NEPA Review Process, Compliance with CWA triggered during NEPA Review Process, Compliance with CAA triggered during NEPA Review Process, Compliance with NHPA triggered during NEPA Review Process, Third-party construction delays, Compliance with other environmental legislation triggered during NEPA Review Process]), making sure to tie them back closely to the provided definitions. Remember that some of the definitions are broad, particularly for strained agency resources and for mid-project financing issues, so your search should be of great breadth and depth. YOU MUST FORMAT YOUR OUTPUT EXACTLY AS FOLLOWS: 'Lack of Federal Funding/Strained Resources for NEPA Compliance Teams: [insert 0 or 1 here], Influence of NEPA Litigation: [insert 0 or 1 here], Property/Land Rights Disputes: [insert 0 or 1 here], Mid-project financing issues/loss of project funding: [insert 0 or 1 here], Compliance with ESA triggered during NEPA Review Process: [insert 0 or 1 here], Compliance with CWA triggered during NEPA Review Process: [insert 0 or 1 here], Compliance with CAA triggered during NEPA Review Process: [insert 0 or 1 here], Compliance with NHPA triggered during NEPA Review Process: [insert 0 or 1 here], Third-party construction delays: [insert 0 or 1 here], Compliance with other environmental legislation triggered during NEPA Review Process: [insert 0 or 1 here]'. DO NOT INCLUDE ANY ADDITIONAL TEXT IN YOUR OUTPUT. BE PRECISE WITH YOUR METHODOLOGY, IT SHOULD BE CONSISTENT FOR REPEATED QUERIES. If no relevant material to the exogenous variable definition appears in the provided excerpts, you can classify them as 0, but feel free to make context-based inferences. Do read all provided material meticulously"
    
    client = OpenAI(api_key = 'REDACTED')

    completion = client.chat.completions.create(
      model="gpt-4-turbo",
      messages=[
        {"role": "system", "content": "Your sole duty is to carefully parse environmental permitting documents with a precise eye for detail, and report back your findings per the user's desired specifications."},
        {"role": "user", "content": prompt_text}
      ]
    )
    
    return completion.choices[0].message.content

# Gets token length of text
def tiktoken_len(text):
    tokenizer_name = tiktoken.encoding_for_model('gpt-4')
    tokenizer = tiktoken.get_encoding(tokenizer_name.name)
    
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

# Function that breaks down very large EISes/text strings (128000+ tokens) into smaller tokenized vector chunks passable to OpenAI API via embeddings
def text_vectorizer(document_text, max_chunk_length):
    
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=max_chunk_length,
        chunk_overlap=20,
        length_function=tiktoken_len,
        separators=["\n\n", "\n", " ", ""]
    )
        
    chunks = []
    split_texts = text_splitter.split_text(document_text)

    for i, chunk_text in enumerate(split_texts):
        chunks.append({
            'id': str(uuid.uuid4()),  # Generate a unique identifier for each chunk
            'text': chunk_text,
            'chunk': i  # Chunk number
        })

    return chunks
    
# Function that creates embeddings for tokenized vector chunks via Langchain with server-side storage on Pinecone for access and use by OpenAI's GPT-4 model
# Extracts top 300 vector chunks ranked by relevance to prompt via cosine function, sorts them by keyword search, and then uses top 125 for processing
def vectorized_query_gpt(definitions, vector_chunks):
    
    client = OpenAI(api_key = 'REDACTED')

    openai.api_key = 'REDACTED'
    embed_model = "text-embedding-3-large"

    res = client.embeddings.create(
        input=[
            "Sample document text goes here",
            "there will be several phrases in each batch"
        ], model=embed_model
    )

    # configure Pinecone client
    pc = Pinecone(api_key='REDACTED')
    
    cloud = 'aws'
    region = 'us-east-1'
    index_name = 'gpt-4-langchain-docs'

    spec = ServerlessSpec(cloud=cloud, region=region)
    
    # check if index already exists (it shouldn't if this is first time)
    if index_name not in pc.list_indexes().names():
        # if does not exist, create index
        pc.create_index(
            index_name,
            dimension=len(res.data[0].embedding),
            metric='cosine',
            spec=spec
        )
        # wait for index to be initialized
        while not pc.describe_index(index_name).status['ready']:
            time.sleep(1)

    # connect to index
    index = pc.Index(index_name)
    
    batch_size = 100  # how many embeddings we create and insert at once

    for i in tqdm(range(0, len(vector_chunks), batch_size)):
        i_end = min(len(vector_chunks), i+batch_size)
        meta_batch = vector_chunks[i:i_end]
        
        ids_batch = [x['id'] for x in meta_batch]
        
        texts = [x['text'] for x in meta_batch]
        
        # create embeddings (try-except added to avoid RateLimitError)
        try:
            res = client.embeddings.create(input=texts, model=embed_model)
        except:
            done = False
            while not done:
                time.sleep(5)
                try:
                    res = openai.Embedding.create(input=texts, engine=embed_model)
                    done = True
                except:
                    pass
                
        embeds = [datum.embedding for datum in res.data]
        
        # cleanup metadata
        meta_batch = [{
            'id': x['id'],
            'text': x['text'],
            'chunk': x['chunk'],
        } for x in meta_batch]
        
        
        to_upsert = list(zip(ids_batch, embeds, meta_batch))
        
        # upsert to Pinecone
        index.upsert(vectors=to_upsert)

    query = f"The following are definitions of carefully identified exogenous variables for projects that undergo NEPA Review, please read them all very carefully and become deeply and intimately familiar with every detail:\n\n" + f"{definitions}\n\n" + f"Read the following NEPA Environmental Impact Statements (these may include draft and/or final environmental impact statements) and related comment letters:\n\n" + f"Based on the definitions above, indicate the presence (1) or absence (0) of each exogenous variable in the highlighted portions of the EIS extract and related documents. Please, be meticulous in your search and systematic in your logic, methods, and justifications for the presence of each variable (listed again here for clarity: [Lack of Federal Funding/Strained Resources for NEPA Compliance Teams, Influence of NEPA Litigation, Property/Land Rights Disputes, Mid-project financing issues/loss of project funding, Compliance with ESA triggered during NEPA Review Process, Compliance with CWA triggered during NEPA Review Process, Compliance with CAA triggered during NEPA Review Process, Compliance with NHPA triggered during NEPA Review Process, Third-party construction delays, Compliance with other environmental legislation triggered during NEPA Review Process]), making sure to tie them back closely to the provided definitions. Remember that some of the definitions are broad, especially for strained agency resources and mid-project financing issues, so your search should be of great breadth and depth. YOU MUST FORMAT YOUR OUTPUT EXACTLY AS FOLLOWS: 'Lack of Federal Funding/Strained Resources for NEPA Compliance Teams: [insert 0 or 1 here], Influence of NEPA Litigation: [insert 0 or 1 here], Property/Land Rights Disputes: [insert 0 or 1 here], Mid-project financing issues/loss of project funding: [insert 0 or 1 here], Compliance with ESA triggered during NEPA Review Process: [insert 0 or 1 here], Compliance with CWA triggered during NEPA Review Process: [insert 0 or 1 here], Compliance with CAA triggered during NEPA Review Process: [insert 0 or 1 here], Compliance with NHPA triggered during NEPA Review Process: [insert 0 or 1 here], Third-party construction delays: [insert 0 or 1 here], Compliance with other environmental legislation triggered during NEPA Review Process: [insert 0 or 1 here]'. DO NOT INCLUDE ANY ADDITIONAL TEXT IN YOUR OUTPUT. BE PRECISE WITH YOUR METHODOLOGY, IT SHOULD BE CONSISTENT FOR REPEATED QUERIES. If no relevant material to the exogenous variable definition appears in the provided excerpts, you can classify them as 0, but feel free to make context-based inferences. Do read all provided material meticulously"

    res = client.embeddings.create(
        input=[query],
        model=embed_model
    )

    # retrieve from Pinecone
    xq = res.data[0].embedding

    # get relevant contexts via vector search (primary)
    res = index.query(vector=xq, top_k=300, include_metadata=True)

    # Keyword scoring (secondary)
    def keyword_score(text, keywords):
        score = sum(text.lower().count(kw.lower()) for kw in keywords)
        return score
    
    keywords = ["contractor", "timeline", "approval", "delay", "lawsuit", "external", "year", "construction", "developer", "finance", "owner", "litigat", "Clean Water Act", "endangered species", "Clean Air Act", "historic", "historic preservation", "landmark", "protected", "financ", "budget", "fund", "setback", "extend", "lengthen", "public comment", "comment", "stakeholder", "plaintiff", "defendant", "employment", "staffing", "shortfall", "resource", "strained", "allocation", "legal", "challenge", "date", "final", "staffing"]

    # Re-rank based on keywords
    matches_with_scores = [(item, keyword_score(item['metadata']['text'], keywords)) for item in res['matches']]
    matches_with_scores.sort(key=lambda x: x[1], reverse=True)  # Sort by keyword score, high to low

    top_matches = matches_with_scores[:125]  # Select the top 125 after re-ranking

    contexts = [item[0]['metadata']['text'] for item in top_matches]
    augmented_query = query+"\n\n---\n\n".join(contexts)+"\n\n-----\n\n"
    
    primer = "Your sole duty is to carefully parse environmental permitting documents with a precise eye for detail, and report back your findings per the user's desired specifications."

    # print(tiktoken_len(augmented_query))
    
    res = client.chat.completions.create(
        model="gpt-4-turbo",
        messages=[
            {"role": "system", "content": primer},
            {"role": "user", "content": augmented_query}
        ]
    )
    
    return res.choices[0].message.content

def add_output_to_csv(project_name, model_output, dataframe):
    exo_variables = ["Lack of Federal Funding/Strained Resources for NEPA Compliance Teams", "Influence of NEPA Litigation", "Property/Land Rights Disputes", "Mid-project financing issues/loss of project funding", "Compliance with ESA triggered during NEPA Review Process", "Compliance with CWA triggered during NEPA Review Process", "Compliance with CAA triggered during NEPA Review Process", "Compliance with NHPA triggered during NEPA Review Process", "Third-party construction delays", "Compliance with other environmental legislation triggered during NEPA Review Process"]
    
    print(model_output)
    print()
    
    output_dict = {}
    
    try:
        output_dict = {item.split(': ')[0].strip(): int(item.split(': ')[1]) for item in model_output.strip("'").split(', ')}
    except:
        return f"failure loading EISes for {project_name}"
        

    row_index = 0
    
    try:
        row_index = dataframe[dataframe['Project Title'] == project_name].index[0]
    except:
        return f"Failure finding row in spreadsheet for {project_name}!"

    for var in exo_variables:
        dataframe.loc[row_index, var] = output_dict[var]
        
    return f"Success! for {project_name}!"

# Script to Run Systematic Queries and Update Spreadsheets with Exogenous Variable Classifications

In [None]:
import pandas as pd
import numpy as np

csv_path = 'andres_dot.csv'

spreadsheet = pd.read_csv(csv_path)
spreadsheet['Project Title'] = spreadsheet['Project Title'].str.strip()

# Read the exogenous variable definitions from the provided PDF
definitions_pdf_path = 'Compiled Exogenous Variable Definitions.pdf'
definitions_text = read_pdf(definitions_pdf_path)

base_path = '/Users/rohinjuneja/Documents/NEPA_GPT/JIGNESH_GPT'
subfolder_path = '/DOTPERMIT_ANDRES'
base_path += subfolder_path
print(base_path)

count = 0

# Iterate over each folder in the subfolder and process all PDFs within it
for folder_name in os.listdir(base_path):
    folder_path = os.path.join(base_path, folder_name)
    folder_path = folder_path[folder_path.index("JIGNESH_GPT") + len("JIGNESH_GPT "):] 

    project_name = folder_path[folder_path.index(subfolder_path[1:]) + len(subfolder_path):]
    print(project_name)
    
    if ('Plutonium' not in project_name) and spreadsheet.loc[spreadsheet['Project Title'] == project_name, 'Influence of NEPA Litigation'].isnull().any():
    
        input_text = concatenate_pdfs_in_folder(folder_path)
        model_output = ''

        if tiktoken_len(input_text) < 32000:
            model_output = text_query_gpt(definitions_text, input_text)
        else:    
            model_output = vectorized_query_gpt(definitions_text, text_vectorizer(input_text, 500))

        outcome = add_output_to_csv(project_name, model_output, spreadsheet)

        print()
        print(outcome)
        print()

        spreadsheet.to_csv(csv_path, index=False)

In [3]:
vector = vectorized_query_gpt(definitions_text, text_vectorizer(concatenate_pdfs_in_folder("DOI/Blackfoot Bridge Mine Project Developing Three Mine Pits Haul Roads Water Management Structures and Overburden Disposal Areas Implementation Caribou County ID"), 500))
print(vector)
output_dict = {item.split(': ')[0].strip(): int(item.split(': ')[1]) for item in vector.strip("'").split(', ')}
print(output_dict)

0it [00:00, ?it/s]


Lack of Federal Funding/Strained Resources for NEPA Compliance Teams: 0, Influence of NEPA Litigation: 0, Property/Land Rights Disputes: 0, Mid-project financing issues/loss of project funding: 0, Compliance with ESA triggered during NEPA Review Process: 1, Compliance with CWA triggered during NEPA Review Process: 1, Compliance with CAA triggered during NEPA Review Process: 1, Compliance with NHPA triggered during NEPA Review Process: 1, Third-party construction delays: 0, Compliance with other environmental legislation triggered during NEPA Review Process: 0
{'Lack of Federal Funding/Strained Resources for NEPA Compliance Teams': 0, 'Influence of NEPA Litigation': 0, 'Property/Land Rights Disputes': 0, 'Mid-project financing issues/loss of project funding': 0, 'Compliance with ESA triggered during NEPA Review Process': 1, 'Compliance with CWA triggered during NEPA Review Process': 1, 'Compliance with CAA triggered during NEPA Review Process': 1, 'Compliance with NHPA triggered during 