In [17]:
import os
import getpass
import glob
import pypdf
from tqdm.notebook import tqdm
from langchain.llms import OpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate

In [4]:
# Prompt for the key interactively (input is hidden) so it is never written into the notebook.
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

OpenAI API Key: ········


In [5]:
# Embedding client shared by every FAISS build/load call below.
embeddings = OpenAIEmbeddings()

In [6]:
# Single sample circular used to try out extraction and chunking before the full run.
file_path = 'pdf/FTA_Title_VI_FINAL.pdf'

In [7]:
# Extract the first page of the sample PDF to eyeball what the raw text looks like.
reader = pypdf.PdfReader(file_path)
page = reader.pages[0]
page.extract_text()

' \n \n \n \n  \n \n \n \n \n \n \n  \n \n \n \n \nU.S. Department  \nof Transportation  \nFederal Transit  \nAdministration \n \n CIRCULAR\n \nFTA C 4702.1B  \nSubject: \t TITLE VI REQUIREMENTS AND GUIDELINES FOR FEDERAL \nTRANSIT ADMINISTRATION RECIPIENTS   October 1, 2012 \n1.\n\t PURPOSE. The purpose of this Circular is to provide recipients of Federal Transit \nAdministration (FTA) financial assistance with guidance and instructions necessary to carry \nout U.S. Department of Transportation (“DOT” or “the Department”) Title VI regulations (49 CFR part 21) and to integrate in to their programs and activitie s considerations expressed in \nthe Department’s Policy Guidance Concerni ng Recipients’ Responsibilities to Limited \nEnglish Proficient (“LEP”) Persons (70 FR 74087, December 14, 2005).  \n2.\t CANCELLATION. This Circular supersedes FT A Circular 4702.1A “Title VI and Title VI-\nDependent Guidelines for Federal Transit Administration Recipients,” dated May 13, 2007.  \n3.\t AU

In [8]:
# Maps each PDF file name (as stored under pdf/) to a transit.dot.gov URL.
# create_documents() copies the value into doc.metadata['file_link'].
# Keys must match the keys of CIRCULAR_MAP exactly.
FILE_MAP = {
    '5010-1e-circular-award-management-requirements-7-16-18.pdf': "https://www.transit.dot.gov/regulations-and-guidance/fta-circulars/award-management-requirements-circular-50101e",
    'C9070_1G_FINAL_circular_4-20-15(1).pdf': "https://www.transit.dot.gov/regulations-and-guidance/fta-circulars/enhanced-mobility-seniors-and-individuals-disabilities",
    'eeo-circular-c-47041a.pdf': "https://www.transit.dot.gov/regulations-and-guidance/civil-rights-ada/eeo-circular",
    'Final_C_9300_1_Bpub.pdf': "https://www.transit.dot.gov/regulations-and-guidance/fta-circulars/capital-investment-program-guidance-and-application",
    'Final_FTA_ADA_Circular_C_4710.1.pdf': "https://www.transit.dot.gov/regulations-and-guidance/fta-circulars/americans-disabilities-act-guidance-pdf",
    'FINAL_FTA_circular9030.1E.pdf': "https://www.transit.dot.gov/regulations-and-guidance/fta-circulars/urbanized-area-formula-program-program-guidance-and",
    'Final_FTA_C_5100_4-16-15.pdf': "https://www.transit.dot.gov/regulations-and-guidance/fta-circulars/bus-and-bus-facilities-program-guidance-and-application",
    'FTA-Third-party-procurement-best-practices-webinar-transcript-12-13-2022.pdf': "https://www.transit.dot.gov/regulations-and-programs/fta-circulars/third-party-contracting-guidance-transcript",
    'FTA_Circular_5300_published_02-28-15.pdf': "https://www.transit.dot.gov/regulations-and-guidance/fta-circulars/state-good-repair-grant-program-guidance-and-application",
    'FTA_Circular_9040_1Gwith_index_-_Final_Revised_-_vm_10-15-14(1).pdf': "https://www.transit.dot.gov/regulations-and-guidance/fta-circulars/formula-grants-rural-areas-program-guidance-and-application",
    'FTA_Cir_6100.1E.docx_4.08.2015_(2)_0.pdf': "https://www.transit.dot.gov/regulations-and-guidance/fta-circulars/research-technical-assistance-and-training-program",
    'FTA_C_5800.1SSMP.pdf': "https://www.transit.dot.gov/regulations-and-guidance/fta-circulars/safety-and-security-management-guidance-major-capital",
    'FTA_EJ_Circular_7.14-12_FINAL.pdf': "https://www.transit.dot.gov/regulations-and-guidance/fta-circulars/environmental-justice-policy-guidance-federal-transit",
    'FTA_Title_VI_FINAL.pdf': "https://www.transit.dot.gov/regulations-and-guidance/fta-circulars/title-vi-requirements-and-guidelines-federal-transit",
    'Joint-Development-Circular-C-7050-1B.pdf': "https://www.transit.dot.gov/funding/funding-finance-resources/joint-development/joint-development-circular-c-70501b",
    'program-guidance-metropolitan-planning-and-state-planning-and-research-c81001d.pdf': "https://www.transit.dot.gov/regulations-and-guidance/fta-circulars/program-guidance-metropolitan-planning-and-state-planning-a-0",
    'Third Party Contracting Guidance (Circular 4220.1F).pdf': "https://www.transit.dot.gov/regulations-and-guidance/fta-circulars/third-party-contracting-guidance",
    'UMTA_C_9500.1.pdf': "https://www.transit.dot.gov/regulations-and-guidance/fta-circulars/intergovernmental-review-fta-planning-capital-and-operating"
}

# Maps each PDF file name (as stored under pdf/) to a human-readable circular
# number and title. create_documents() copies the value into
# doc.metadata['circular_name']. Keys must match the keys of FILE_MAP exactly.
CIRCULAR_MAP = {
    '5010-1e-circular-award-management-requirements-7-16-18.pdf': "C 5010.1E Award Management Requirements",
    'C9070_1G_FINAL_circular_4-20-15(1).pdf': "C 9070.1G Enhanced Mobility of Seniors and Individuals with Disabilities Program Guidance and Application Instructions",
    'eeo-circular-c-47041a.pdf': "C 4704.1A Equal Employment Opportunity (EEO) Act Guidance",
    'Final_C_9300_1_Bpub.pdf': "C 9300.1B Capital Investment Program Guidance and Application Instructions",
    'Final_FTA_ADA_Circular_C_4710.1.pdf': "C 4710.1 Americans with Disabilities Act Guidance",
    'FINAL_FTA_circular9030.1E.pdf': "C 9030.1E Urbanized Area Formula Program: Program Guidance and Application Instructions",
    'Final_FTA_C_5100_4-16-15.pdf': "C 5100.1 Bus and Bus Facilities Program: Guidance and Application Instructions",
    'FTA-Third-party-procurement-best-practices-webinar-transcript-12-13-2022.pdf': "Third Party Contracting Guidance Webinar Transcript",
    'FTA_Circular_5300_published_02-28-15.pdf': "C 5300.1 State of Good Repair Grant Program: Guidance and Application Instructions",
    'FTA_Circular_9040_1Gwith_index_-_Final_Revised_-_vm_10-15-14(1).pdf': "C 9040.1G Formula Grants for Rural Areas: Program Guidance and Application Instructions",
    'FTA_Cir_6100.1E.docx_4.08.2015_(2)_0.pdf': "C 6100.1E Research, Technical Assistance and Training Program: Application Instructions and Program Management Guidelines",
    'FTA_C_5800.1SSMP.pdf': "C 5800.1 Safety and Security Management Guidance for Major Capital Projects",
    'FTA_EJ_Circular_7.14-12_FINAL.pdf': "C 4703.1 Environmental Justice Policy Guidance For Federal Transit Administration Recipients",
    'FTA_Title_VI_FINAL.pdf': "C 4702.1B Title VI Requirements and Guidelines for Federal Transit Administration Recipients",
    'Joint-Development-Circular-C-7050-1B.pdf': "C 7050.1B Guidance on Joint Development",
    'program-guidance-metropolitan-planning-and-state-planning-and-research-c81001d.pdf': "C 8100.1D Program Guidance for Metropolitan Planning and State Planning and Research Program Grants",
    'Third Party Contracting Guidance (Circular 4220.1F).pdf': "C 4220.1F Third Party Contracting Guidance",
    'UMTA_C_9500.1.pdf': "C 9500.1 Intergovernmental Review of FTA Planning, Capital and Operating Programs and Activities"
}

def create_documents(file_path, chunk_size=250, chunk_overlap=50, path_seperator='\\'):
    """Split one PDF into chunks and tag each chunk with source metadata.

    The text of every page is concatenated (no separator) and split with a
    tiktoken-based ``RecursiveCharacterTextSplitter``. Each resulting document
    gets ``file_name``, ``file_link``, ``circular_name``, ``start_page`` and
    ``end_page`` metadata. Page numbers are 1-based.

    Args:
        file_path: Path to the PDF file.
        chunk_size: Passed through to the splitter as ``chunk_size``.
        chunk_overlap: Passed through to the splitter as ``chunk_overlap``.
        path_seperator: Preferred directory separator used to cut the file
            name out of ``file_path``. If it does not occur in ``file_path``,
            the function falls back to the last ``/`` or ``\\`` (or the whole
            path when neither occurs) instead of raising ``ValueError``.

    Returns:
        ``(docs, texts)``: the chunk documents and the list of per-page texts.

    Raises:
        KeyError: if the file name is missing from ``FILE_MAP`` or
            ``CIRCULAR_MAP``.
    """
    reader = pypdf.PdfReader(file_path)
    # `or ''` keeps the join below safe should a page yield no text object.
    texts = [page.extract_text() or '' for page in reader.pages]
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        encoding_name='cl100k_base',
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    docs = splitter.create_documents([''.join(texts)])

    # Take everything after the last separator. Prefer the caller's separator
    # (backward compatible); otherwise accept either slash style so the same
    # call works for 'pdf/x.pdf' and 'pdf\\x.pdf'.
    cut = file_path.rfind(path_seperator) if path_seperator else -1
    if cut != -1:
        file_name = file_path[cut + len(path_seperator):]
    else:
        cut = max(file_path.rfind('/'), file_path.rfind('\\'))
        file_name = file_path[cut + 1:]  # cut == -1 keeps the whole path

    for doc in docs:
        doc.metadata['file_name'] = file_name
        doc.metadata['file_link'] = FILE_MAP[file_name]
        doc.metadata['circular_name'] = CIRCULAR_MAP[file_name]
        located = get_page(doc.page_content, texts, search_window=20)
        if located is None:
            # The chunk could not be matched back to the page texts; record
            # that explicitly rather than failing on a None subscript.
            doc.metadata['start_page'] = None
            doc.metadata['end_page'] = None
        else:
            # get_page returns 0-based indices; store 1-based page numbers.
            doc.metadata['start_page'] = located['start_page'] + 1
            doc.metadata['end_page'] = located['end_page'] + 1
    return docs, texts

def get_page(content, texts, search_window=20):
    """Locate the page range that contains ``content``.

    Pages are scanned in order. For each page the function first checks the
    page on its own, then progressively prepends earlier pages (nearest first,
    at most ``search_window`` of them) and checks the concatenation, so a chunk
    that straddles page boundaries is still found.

    Args:
        content: Text to look for.
        texts: List of per-page strings, in page order.
        search_window: Maximum number of preceding pages to prepend.

    Returns:
        ``{'start_page': i, 'end_page': j}`` with 0-based indices, or ``None``
        when no window contains ``content``.
    """
    for idx, text in enumerate(texts):
        if content in text:
            return {'start_page': idx, 'end_page': idx}
        # Grow the window backwards one page at a time. The lower bound is
        # clamped at page 0 so negative slice indices never wrap around and
        # re-test windows that were already checked.
        window = text
        first = max(idx - search_window, 0)
        for start in range(idx - 1, first - 1, -1):
            window = texts[start] + window
            if content in window:
                return {'start_page': start, 'end_page': idx}
    return None

In [9]:
# Chunk the sample circular; '/' is the separator used in the sample file_path.
docs, texts = create_documents(file_path, chunk_size=250, chunk_overlap=50, path_seperator='/')
# Number of chunks vs. number of pages.
len(docs), len(texts)

(339, 130)

In [11]:
# Spot-check one chunk of the sample circular.
docs[30]

Document(page_content='tract, block or block group, or traffic analysis zone, wh ere the proportion of minority \npersons residing in that area exceeds the av erage proportion of minority persons in the \nrecipient’s service area.  \nx.\t Primary recipient means any FTA recipient that extends Federal fina ncial assistance to a \nsubrecipient. \ny.\t Provider of fixed route public transportation (or “transit provider”)  means any entity that \noperates public transportation service, and incl udes States, local and regional entities, and \npublic and private entities. This term is used in place of “recipient” in chapter IV and is \ninclusive of direct recipients, primary recipi ents, designated recipients, and subrecipients \nthat provide fixed route pub lic transportation service. \nz.\t Public transportation means regular, con tinuing shared-ride surface transportation \nservices that are open to the general public or open to a segment of the general public \ndefined by age, disability

In [12]:
# Build the chunk-level corpus from every PDF in the pdf/ folder.
file_path = glob.glob('pdf/*.pdf')
pages = []
for idx, file in enumerate(file_path):
    print(f'Loading {idx+1}/{len(file_path)}: {file}')
    # The paths printed below show glob using the platform separator, so pass
    # os.sep instead of a hard-coded backslash; with '\\' this cell only worked
    # where glob produced backslash paths.
    docs, _ = create_documents(file, chunk_size=250, chunk_overlap=50, path_seperator=os.sep)
    pages += docs

print(len(pages))

Loading 1/18: pdf\5010-1e-circular-award-management-requirements-7-16-18.pdf
Loading 2/18: pdf\C9070_1G_FINAL_circular_4-20-15(1).pdf
Loading 3/18: pdf\eeo-circular-c-47041a.pdf
Loading 4/18: pdf\Final_C_9300_1_Bpub.pdf
Loading 5/18: pdf\Final_FTA_ADA_Circular_C_4710.1.pdf
Loading 6/18: pdf\FINAL_FTA_circular9030.1E.pdf
Loading 7/18: pdf\Final_FTA_C_5100_4-16-15.pdf
Loading 8/18: pdf\FTA-Third-party-procurement-best-practices-webinar-transcript-12-13-2022.pdf
Loading 9/18: pdf\FTA_Circular_5300_published_02-28-15.pdf
Loading 10/18: pdf\FTA_Circular_9040_1Gwith_index_-_Final_Revised_-_vm_10-15-14(1).pdf
Loading 11/18: pdf\FTA_Cir_6100.1E.docx_4.08.2015_(2)_0.pdf
Loading 12/18: pdf\FTA_C_5800.1SSMP.pdf
Loading 13/18: pdf\FTA_EJ_Circular_7.14-12_FINAL.pdf
Loading 14/18: pdf\FTA_Title_VI_FINAL.pdf
Loading 15/18: pdf\Joint-Development-Circular-C-7050-1B.pdf
Loading 16/18: pdf\program-guidance-metropolitan-planning-and-state-planning-and-research-c81001d.pdf
Loading 17/18: pdf\Third Party Co

In [13]:
# Spot-check the metadata attached to one chunk of the full corpus.
pages[1238].metadata

{'file_name': 'Final_C_9300_1_Bpub.pdf',
 'file_link': 'https://www.transit.dot.gov/regulations-and-guidance/fta-circulars/capital-investment-program-guidance-and-application',
 'circular_name': 'C 9300.1B Capital Investment Program Guidance and Application Instructions',
 'start_page': 13,
 'end_page': 13}

In [14]:
# Build a FAISS store from the chunk documents and persist it under data/.
db = FAISS.from_documents(pages, embeddings)
db.save_local('data/')

In [15]:
# Reload the persisted store to confirm it round-trips from disk.
# NOTE(review): the saved store presumably includes a pickle file; only load
# indexes created locally. Confirm against the LangChain FAISS docs.
new_db = FAISS.load_local("data", embeddings)
query = 'What is Circular 5010.1E?'

# Retrieve the 5 chunks most similar to the query.
docs_found = new_db.similarity_search(query, k=5)

In [16]:
# Display the retrieved chunks together with their metadata.
docs_found

[Document(page_content='This circular is intended to assist recipients in applying for and administering Federal Transit\nAdministration (FTA) -funded proje cts and in meeting the responsibilities and reporting\nrequirements of FTA awards.  Recipients have a responsibility to comply with regulatoryrequirements and to be aware of all pertinent guidance material to assist in the managementof their federally ass isted awards.  If there is a conflict between FTA Circular 5010.1,\n“Awards Management Requirements,” and this program- specific circular (C  8100.1D), the\nguidance provided herein prevails.\nPage ii  FTA C 8100.1D  \n 2. CANCELLATION. This circular cancels FTA  Circular 8100.1C, “Program Guidance For \nMetropolitan Planning and State Planning and Research Program Grants ,” dated September \n1, 2008.  \n3. AUTHORITY.  \na. Federal Transit Laws, 49  U.S.C Chapter 53.   \nb. 49 C .F.R. 1.51.', metadata={'file_name': 'program-guidance-metropolitan-planning-and-state-planning-and-res

## Encode each page

In [4]:
# Build the page-level corpus with LangChain's PDF loader.
# This rebinds `pages`, replacing the chunk-level list built earlier.
file_path = glob.glob('pdf/*.pdf')
pages = []
for file in file_path:
    print(f'Loading: {file}')
    loader = PyPDFLoader(file)
    pages += loader.load_and_split()
for page in pages:
    # Keep only the file name. os.path.basename handles the platform's own
    # separator, whereas replacing the literal 'pdf\\' prefix left the folder
    # in place whenever the path used '/'.
    page.metadata['source'] = os.path.basename(page.metadata['source'])
print(len(pages))

  0%|          | 0/18 [00:00<?, ?it/s]

Loading: pdf\5010-1e-circular-award-management-requirements-7-16-18.pdf
Loading: pdf\C9070_1G_FINAL_circular_4-20-15(1).pdf
Loading: pdf\eeo-circular-c-47041a.pdf
Loading: pdf\Final_C_9300_1_Bpub.pdf
Loading: pdf\Final_FTA_ADA_Circular_C_4710.1.pdf
Loading: pdf\FINAL_FTA_circular9030.1E.pdf
Loading: pdf\Final_FTA_C_5100_4-16-15.pdf
Loading: pdf\FTA-Third-party-procurement-best-practices-webinar-transcript-12-13-2022.pdf
Loading: pdf\FTA_Circular_5300_published_02-28-15.pdf
Loading: pdf\FTA_Circular_9040_1Gwith_index_-_Final_Revised_-_vm_10-15-14(1).pdf
Loading: pdf\FTA_Cir_6100.1E.docx_4.08.2015_(2)_0.pdf
Loading: pdf\FTA_C_5800.1SSMP.pdf
Loading: pdf\FTA_EJ_Circular_7.14-12_FINAL.pdf
Loading: pdf\FTA_Title_VI_FINAL.pdf
Loading: pdf\Joint-Development-Circular-C-7050-1B.pdf
Loading: pdf\program-guidance-metropolitan-planning-and-state-planning-and-research-c81001d.pdf
Loading: pdf\Third Party Contracting Guidance (Circular 4220.1F).pdf
Loading: pdf\UMTA_C_9500.1.pdf
2187


In [5]:
# Spot-check the loader metadata (source file name and page) of one document.
pages[56].metadata

{'source': '5010-1e-circular-award-management-requirements-7-16-18.pdf',
 'page': 61}

In [12]:
# Rebuild the FAISS store from the page-level documents (rebinds `db`).
db = FAISS.from_documents(pages, embeddings)

In [19]:
# Persist under data/ — the same folder the chunk-level index was saved to.
db.save_local('data/')

In [20]:
# Reload the persisted store from disk.
# NOTE(review): the saved store presumably includes a pickle file; only load
# indexes created locally. Confirm against the LangChain FAISS docs.
new_db = FAISS.load_local("data", embeddings)

In [56]:
# Question used for the retrieval + answer walkthrough below.
query = 'What is Circular 5010.1E?'

# Retrieve the 5 documents most similar to the query (rebinds `docs`).
docs = new_db.similarity_search(query, k=5)

In [57]:
# Confirm the number of documents returned by the search.
len(docs)

5

In [64]:
# Print every retrieved document with its provenance, then gather the document
# texts (each followed by a newline) into the single context string used by
# the prompt.
for doc in docs:
    print('SOURCE:', doc.metadata['source'])
    print('PAGE:', doc.metadata['page'])
    print('CONTENT:', doc.page_content, '\n-------------------', sep='\n')
context = ''.join(doc.page_content + '\n' for doc in docs)

SOURCE: 5010-1e-circular-award-management-requirements-7-16-18.pdf
PAGE: 170
CONTENT:
FTA C 5010.1E  Appendix A -1 
 APPENDIX A:   
 
TABLE OF FTA CIRCULARS  
Circular  Program  Topic  Title  
C 4710.1  Civil Rights  Americans with Disabilities 
Act: Guidance  
 C 5100.1  5339  Bus and Bus 
Facilities Program  Bus and Bus Facilities 
Program: Guidance and 
Application Instructions  
 C 6100.1  5312, 
5313, 
5214, 
5322  Technology 
Development and 
Deployment  Research, Technical 
Assistance and Training 
Program: Application 
Instructions and Program 
Management Guidelines  
 C 5300.1  5337  State of Good 
Repair Program  State of Good Repair Grant 
Program: Guidance and 
Application Instructions  
 C 9040.1  5311  Capital Facilities 
and Formula 
Grant Program  Formula Grants for Rural 
Areas: Program Guidance and 
Application Instructions  
 C 7050.1    Joint 
Development  Federal Transit 
Administration  Guidance on 
Joint Development  
 C 9070.1  5310  Capital Facilities 
and Form

In [90]:
# Prompt that restricts the model to the retrieved context; '###' lines fence
# the context off from the instruction and the question.
template = "Answer the question based only on the following context:\n###\n{context}\n###\nQuestion: {query}\nAnswer:"
prompt_template = PromptTemplate.from_template(template)
# Fill in the retrieved context and the user question.
prompt = prompt_template.format(context=context, query=query)
# Printed so the exact text sent to the model can be inspected.
print(prompt)

Answer the question based only on the following context:
###
FTA C 5010.1E  Appendix A -1 
 APPENDIX A:   
 
TABLE OF FTA CIRCULARS  
Circular  Program  Topic  Title  
C 4710.1  Civil Rights  Americans with Disabilities 
Act: Guidance  
 C 5100.1  5339  Bus and Bus 
Facilities Program  Bus and Bus Facilities 
Program: Guidance and 
Application Instructions  
 C 6100.1  5312, 
5313, 
5214, 
5322  Technology 
Development and 
Deployment  Research, Technical 
Assistance and Training 
Program: Application 
Instructions and Program 
Management Guidelines  
 C 5300.1  5337  State of Good 
Repair Program  State of Good Repair Grant 
Program: Guidance and 
Application Instructions  
 C 9040.1  5311  Capital Facilities 
and Formula 
Grant Program  Formula Grants for Rural 
Areas: Program Guidance and 
Application Instructions  
 C 7050.1    Joint 
Development  Federal Transit 
Administration  Guidance on 
Joint Development  
 C 9070.1  5310  Capital Facilities 
and Formula 
Grant Programs  Enha

In [91]:
# Map each retrieved document's position in `docs` to its
# [source file name, page] pair.
source = dict(
    enumerate([hit.metadata['source'], hit.metadata['page']] for hit in docs)
)
source

{0: ['5010-1e-circular-award-management-requirements-7-16-18.pdf', 170],
 1: ['Final_FTA_C_5100_4-16-15.pdf', 140],
 2: ['5010-1e-circular-award-management-requirements-7-16-18.pdf', 40],
 3: ['eeo-circular-c-47041a.pdf', 2],
 4: ['5010-1e-circular-award-management-requirements-7-16-18.pdf', 1]}

In [92]:
# Completion model used to answer from the prompt.
# NOTE(review): a non-zero temperature presumably makes answers vary between
# runs; consider a lower value if reproducible answers matter.
llm = OpenAI(temperature=0.7)

In [93]:
# Send the filled-in prompt to the model and keep its answer.
response = llm(prompt)

In [94]:
# Show the model's answer.
print(response)

 Circular 5010.1E is a document that provides requirements and procedures for management of all applicable Federal Transit Administration (FTA) programs authorized under 49 U.S.C. chapter 53. It includes post-award guidance for all applicable FTA programs, as well as instructions for the Public Transportation Innovation Program (Sections 5312) and the Technical Assistance and Workforce Development Program (Section 5314).


In [100]:
# Scratch check of how a triple-quoted string keeps the line breaks and the
# leading indentation of its continuation lines.
message = """I don't know.
    what
    """

print(message)

I don't know.    what
    


In [103]:
# Source file name of the top-ranked hit.
docs[0].metadata['source']

'5010-1e-circular-award-management-requirements-7-16-18.pdf'