In [1]:
import jsonlines
from langchain.schema import Document
from typing import Iterable

In [2]:
from langchain_unstructured import UnstructuredLoader
from unstructured_client import UnstructuredClient

In [3]:
import json

# Load the JSON file into a dictionary
with open("OldDominion_part3.json", "r") as file:
    pdf_files = json.load(file)

In [4]:
print(pdf_files)
# Fetch the keys from the dictionary
file_paths = list(pdf_files.keys())

print(file_paths)


{'Old_Dominion\\Pro_Motion_Physical_Therapy\\2007.09.01 - Pro Motion Physical Therapy, LLC - Deed of Lease dtd.pdf': {'file_name': 'Pro Motion Physical Therapy, LLC - Deed of Lease dtd', 'building_name': 'Old Dominion', 'tenant_name': 'Pro Motion Physical Therapy', 'date': '2007.09.01'}, 'Old_Dominion\\Pro_Motion_Physical_Therapy\\2007.10.08 - Pro Motion Physical Therapy, LLC - Exhibit B dtd.pdf': {'file_name': 'Pro Motion Physical Therapy, LLC - Exhibit B dtd', 'building_name': 'Old Dominion', 'tenant_name': 'Pro Motion Physical Therapy', 'date': '2007.10.08'}, 'Old_Dominion\\Pro_Motion_Physical_Therapy\\2012.12.04 - Pro Motion Physical Therapy, LLC - 1st Amendment (Work Agreement - Exhibit E) dtd.pdf': {'file_name': 'Pro Motion Physical Therapy, LLC - 1st Amendment (Work Agreement - Exhibit E) dtd', 'building_name': 'Old Dominion', 'tenant_name': 'Pro Motion Physical Therapy', 'date': '2012.12.04'}, 'Old_Dominion\\Pro_Motion_Physical_Therapy\\2012.12.04 - Pro Motion Physical Therapy,

In [5]:
client = UnstructuredClient(
    api_key_auth="qdPElrIi3zaPeRscDBj7iJCjXix2uC",
    server="free-api"
)


In [6]:
import os
from PyPDF2 import PdfReader

def check_pdf_files(file_paths):
    bad_files = []
    for file_path in file_paths:
        try:
            # Attempt to open the PDF
            with open(file_path, 'rb') as f:
                reader = PdfReader(f)
                # Try to read the first page
                _ = reader.pages[0]
            print(f"{file_path}: OK")
        except Exception as e:
            # Catch any errors related to reading the PDF
            print(f"{file_path}: BAD - {str(e)}")
            bad_files.append(file_path)
    return bad_files

bad_files = check_pdf_files(file_paths)

print("\nBad Files:")
for bad_file in bad_files:
    print(bad_file)


Old_Dominion\Pro_Motion_Physical_Therapy\2007.09.01 - Pro Motion Physical Therapy, LLC - Deed of Lease dtd.pdf: OK
Old_Dominion\Pro_Motion_Physical_Therapy\2007.10.08 - Pro Motion Physical Therapy, LLC - Exhibit B dtd.pdf: OK
Old_Dominion\Pro_Motion_Physical_Therapy\2012.12.04 - Pro Motion Physical Therapy, LLC - 1st Amendment (Work Agreement - Exhibit E) dtd.pdf: OK
Old_Dominion\Pro_Motion_Physical_Therapy\2012.12.04 - Pro Motion Physical Therapy, LLC - 1st Amendment dtd.pdf: OK
Old_Dominion\Pro_Motion_Physical_Therapy\2013.08.19 - Pro Motion Physical Therapy, LLC - Exhibit C to 1st Amendment-1 dtd.pdf: OK
Old_Dominion\Pro_Motion_Physical_Therapy\2017.05.15 - Pro Motion Physical Therapy, LLC - 2nd Amendment dtd.pdf: OK
Old_Dominion\Pro_Motion_Physical_Therapy\Pro Motion Physical Therapy, LLC - 3rd Amendment_executed.pdf: OK
Old_Dominion\Richard_A._Hall\2019.10.17 - Richard A Hall - Deed of Lease.pdf: OK
Old_Dominion\Steele_Foundation\2020.07.09 - Steele Foundation - Lease.pdf: OK
Old_

In [7]:
loader = UnstructuredLoader(
    file_path = file_paths,
    partition_via_api=True,
    client=client,
    strategy="hi_res",
    hi_res_model_name="yolox",
    chunking_strategy="by_title",
    pdf_infer_table_structure=True,
    max_characters=3700,
)

docs = loader.load()

INFO: Preparing to split document for partition.
INFO: Concurrency level set to 5
INFO: Splitting pages 1 to 32 (32 total)
INFO: Determined optimal split size of 7 pages.
INFO: Partitioning 4 files with 7 page(s) each.
INFO: Partitioning 1 file with 4 page(s).
INFO: Partitioning set #1 (pages 1-7).
INFO: Partitioning set #2 (pages 8-14).
INFO: Partitioning set #3 (pages 15-21).
INFO: Partitioning set #4 (pages 22-28).
INFO: Partitioning set #5 (pages 29-32).
INFO: HTTP Request: POST https://api.unstructured.io/general/v0/general "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.unstructured.io/general/v0/general "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.unstructured.io/general/v0/general "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.unstructured.io/general/v0/general "HTTP/1.1 200 OK"
INFO: Successfully partitioned set #1, elements added to the final result.
INFO: Successfully partitioned set #2, elements added to the final result.
INFO: Successfully partitio

In [8]:
# def save_docs_to_jsonl(documents: Iterable[Document], file_path: str) -> None:
#     with jsonlines.open(file_path, mode="w") as writer:
#         for doc in documents:
#             writer.write(doc.dict())

def save_docs_to_jsonl(documents: Iterable[Document], file_path: str) -> None:
    mode = "a" if os.path.exists(file_path) else "w"
    with jsonlines.open(file_path, mode=mode) as writer:
        for doc in documents:
            writer.write(doc.dict())
            
def load_docs_from_jsonl(file_path: str) -> Iterable[Document]:
    documents = []
    with jsonlines.open(file_path, mode="r") as reader:
        for doc in reader:
            documents.append(Document(**doc))
    return documents


save_docs_to_jsonl(docs, "./docs_beverly1.json")
loaded_docs = load_docs_from_jsonl("./docs_beverly1.json")

In [9]:
def pretty_print_docs(docs):
    print(
        f"\n{'-' * 100}\n".join(
            [f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]
        )
    )

pretty_print_docs(loaded_docs[:2])

Document 1:

DocuSign Envelope [D: 5AE05595-ED31-48C5-B81D-748F4B85BC38B

DEED OF LEASE

by and between

SIP / CREF 6849 Old Dominion, LLC a Delaware limited liability company

and

Chain Bridge Partners, LL.C, a Virginia limited liability company

Dated ol , 2021

DocuSigh Envelope ID: 5AE05595-ED31-48C5-B81D-748F4B85BC3B

TABLE OF CONTENTS
----------------------------------------------------------------------------------------------------
Document 2:

Section 1. Leased Premises . . Section 2. Term; Possession. ... Section 3. Rent Section 4. Landlord’s Services. Section 5. Improvements. ......... Section 6. Upkeep of Premises lg Section 7. Use of Premises 10 Section 8. Tenant’s Agreement Section 9. Alterations 11 Section 0.Tenant’s Claims. 12 13 Section 1.Landlord’s Lien Section 2.Assignment & Subletting 12 Section 14 3.Right of Access Section 4.Surrender of Possession. 15 Section 15.Insurance 15 16 Section 16,Damage and Destruction. Section 17.Condemnation Cerere e 16 Section 18.Defa

In [10]:
import os
from datetime import datetime
loaded_docs = load_docs_from_jsonl("./docs_beverly1.json")
def extract_file_details(file_path):
    # Extract the file name from the full path
    file_name = os.path.basename(file_path)
    
    # Split the file name by '-' to get the individual parts
    parts = file_name.split('-')
    
    # Determine if the date is present in the file name
    if len(parts) >= 3:
        date_str = parts[0].strip()
        tenant_name = parts[1].strip()
        file_name = '-'.join(parts[2:]).strip('.pdf').strip()
    else:
        tenant_name = parts[0].strip()
        file_name = '-'.join(parts[1:]).strip('.pdf').strip()
        date_str = ""
    
    # Extract the building name from the file path
    building_name = file_path.split('\\')[1]
    
    return {
        'building_name': building_name,
        'tenant_name': tenant_name,
        'date': date_str,
        'filename': file_name
    }

def append_metadata(doc):
    metadata = extract_file_details(doc.metadata['filename'])
    doc.metadata['building_name'] = metadata['building_name']
    doc.metadata['tenant_name'] = metadata['tenant_name']
    doc.metadata['date'] = metadata['date']
    doc.metadata['filename'] = metadata['filename']
    return doc

for i, doc in enumerate(loaded_docs):
    print("DFILE: ", doc.metadata['filename'], "i: ", i)
    with open("temp.md", "+a") as f:
        f.write(f"DFILELE: {doc.metadata['filename']}")
    append_metadata(doc)

DFILE:  Old_Dominion\Chain_Bridge_Partners\2021.07.02 - Chain Bridge Partners - Deed of Lease.pdf i:  0
DFILE:  Old_Dominion\Chain_Bridge_Partners\2021.07.02 - Chain Bridge Partners - Deed of Lease.pdf i:  1
DFILE:  Old_Dominion\Chain_Bridge_Partners\2021.07.02 - Chain Bridge Partners - Deed of Lease.pdf i:  2
DFILE:  Old_Dominion\Chain_Bridge_Partners\2021.07.02 - Chain Bridge Partners - Deed of Lease.pdf i:  3
DFILE:  Old_Dominion\Chain_Bridge_Partners\2021.07.02 - Chain Bridge Partners - Deed of Lease.pdf i:  4
DFILE:  Old_Dominion\Chain_Bridge_Partners\2021.07.02 - Chain Bridge Partners - Deed of Lease.pdf i:  5
DFILE:  Old_Dominion\Chain_Bridge_Partners\2021.07.02 - Chain Bridge Partners - Deed of Lease.pdf i:  6
DFILE:  Old_Dominion\Chain_Bridge_Partners\2021.07.02 - Chain Bridge Partners - Deed of Lease.pdf i:  7
DFILE:  Old_Dominion\Chain_Bridge_Partners\2021.07.02 - Chain Bridge Partners - Deed of Lease.pdf i:  8
DFILE:  Old_Dominion\Chain_Bridge_Partners\2021.07.02 - Chain Br

In [4]:
import os
import json
from uuid import uuid4
from langchain_elasticsearch import ElasticsearchStore
from langchain_community.embeddings import JinaEmbeddings
from langchain_core.documents import Document
from dotenv import load_dotenv
import nltk
from unstructured_client.models import operations, shared
import unstructured_client
from langchain_openai import OpenAIEmbeddings
nltk.download('averaged_perceptron_tagger')

load_dotenv()

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
jina_api_key = "jina_8cb5dc6a1de64812a166753f9148e820kjsisZ-LPRu7mKuLVbKsoSb2trWU"
ES_URL = "http://154.38.182.130:9200/"
ES_API_KEY= "UDNJQlQ1QUJ3V2ZzVTRHWVptS1A6OXZCemhCeTBRUk9vVEd5eElDZlp5dw=="
opean_ai_api = "sk-proj-rxc6TZw5ngCqElxbhTeqT3BlbkFJy8lLKJUg0IzUe3WcTalZ"

In [13]:
#embeddings = JinaEmbeddings(jina_api_key=jina_api_key, model_name='jina-embeddings-v2-base-en')
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
    api_key=opean_ai_api,
)


In [5]:
import jsonlines
from langchain.schema import Document
from typing import Iterable

def load_docs_from_jsonl(file_path: str) -> Iterable[Document]:
    documents = []
    with jsonlines.open(file_path, mode="r") as reader:
        for doc in reader:
            documents.append(Document(**doc))
    return documents

loaded_docs = load_docs_from_jsonl("docs_beverly1.json")

In [1]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
model_name = "BAAI/bge-large-en"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}

embeddingsbg = HuggingFaceBgeEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

  from tqdm.autonotebook import tqdm, trange
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [8]:
es_store = ElasticsearchStore.from_documents(
    documents=loaded_docs,
    es_url=ES_URL,
    es_api_key=ES_API_KEY,
    embedding=embeddingsbg,
    index_name="old_dominion_bge",
)

ConnectionTimeout: Connection timed out

In [1]:
def save_docs_to_jsonl(documents: Iterable[Document], file_path: str) -> None:
    mode = "a" if os.path.exists(file_path) else "w"
    with jsonlines.open(file_path, mode=mode) as writer:
        for doc in documents:
            writer.write(doc.dict())

NameError: name 'Iterable' is not defined