In [1]:
import logging, os, time
from typing import List
import json
# from nv_ingest.framework.orchestration.ray.util.pipeline.pipeline_runners import run_pipeline
# from nv_ingest.framework.orchestration.ray.util.pipeline.pipeline_runners import PipelineCreationSchema
from nv_ingest_api.util.logging.configuration import configure_logging as configure_local_logging
from nv_ingest_client.client import Ingestor, NvIngestClient
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings
from app.services.ingester import get_nv_ingest_client
from app.services.extractor import extract_entities_llm
from app.utils.vectorstore import (create_collections, create_metadata_schema_collection, 
                                   add_metadata_schema, get_collection, get_vectorstore, delete_collections,
                                    add_schema, init_collection, collection_exists)
from app.utils.common import get_config
from app.domain.common import COLL
from dotenv import load_dotenv
# Load environment variables from a .env file (python-dotenv) so the os.getenv
# lookups in later cells can see them.
# NOTE(review): this call runs after the app.* imports above; if any of those
# modules read environment variables at import time they would not see the
# .env values — confirm.
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:

# Shared notebook state: logger, application config and service clients.
logger = logging.getLogger(__name__)

CONFIG = get_config()
NV_INGEST_CLIENT_INSTANCE = get_nv_ingest_client()

# Embedding client; endpoint and model name come from the environment.
# The same object is bound under both the lower-case and upper-case name.
document_embedder = NVIDIAEmbeddings(
    base_url=os.getenv("EMBEDDING_NIM_ENDPOINT"),
    model=os.getenv("EMBEDDING_MODEL_NAME"),
    dimensions=CONFIG.embeddings.dimensions,
    truncate="END",
)
DOCUMENT_EMBEDDER = document_embedder

# NV-Ingest batch mode settings, overridable through environment variables.
ENABLE_NV_INGEST_BATCH_MODE = (
    os.environ.get("ENABLE_NV_INGEST_BATCH_MODE", "true").lower() == "true"
)
NV_INGEST_FILES_PER_BATCH = int(os.environ.get("NV_INGEST_FILES_PER_BATCH", 16))
ENABLE_NV_INGEST_PARALLEL_BATCH_MODE = (
    os.environ.get("ENABLE_NV_INGEST_PARALLEL_BATCH_MODE", "true").lower() == "true"
)
NV_INGEST_CONCURRENT_BATCHES = int(os.environ.get("NV_INGEST_CONCURRENT_BATCHES", 4))

# Per-run overrides for the splitter, and the target collection name.
split_options = dict(chunk_size=6144, chunk_overlap=248)
collection_name = "multimodal_data"

In [3]:
# Build and run the NV-Ingest job for one PDF: files -> extract -> split -> ingest.
ingestor = Ingestor(client=NV_INGEST_CLIENT_INSTANCE)

# Add files to ingestor
filepaths = ["/home/ubuntu/projects/datas/rfi for cloud adoption.pdf"]
ingestor = ingestor.files(filepaths)

# Create kwargs for extract method
extract_kwargs = {
    "extract_text": True,
    "extract_infographics": False,
    "extract_tables": True,
    "extract_charts": False,
    "extract_images": False,
    "extract_method": "pdfium",  # config.nv_ingest.pdf_extract_method, Literal['pdfium','nemoretriever_parse','None']
    "text_depth": CONFIG.nv_ingest.text_depth,
    # `paddle_output_format` is deprecated by the client (it warns on every run
    # and announces removal); `table_output_format` is the named replacement.
    "table_output_format": "markdown",  # Literal['markdown','html','text']
    # "extract_audio_params": {"segment_audio": True} # TODO: Uncomment this when audio segmentation to be enabled
}
ingestor = ingestor.extract(**extract_kwargs)

# Text and HTML are always split; PDF is added only when the config enables it.
split_source_types = ["text", "html"]
if CONFIG.nv_ingest.enable_pdf_splitter:
    split_source_types = ["PDF"] + split_source_types
# Lazy %-style arguments: the message is only formatted if INFO is enabled.
logger.info(
    "Post chunk split status: %s. Splitting by: %s",
    CONFIG.nv_ingest.enable_pdf_splitter,
    split_source_types,
)

# Values in split_options take precedence over the configured chunk settings.
ingestor = ingestor.split(
    tokenizer=CONFIG.nv_ingest.tokenizer,
    chunk_size=split_options.get("chunk_size", CONFIG.nv_ingest.chunk_size),
    chunk_overlap=split_options.get("chunk_overlap", CONFIG.nv_ingest.chunk_overlap),
    params={"split_source_types": split_source_types},
)

# Run the job; failures are returned alongside the results instead of raising.
results, failures = ingestor.ingest(return_failures=True, show_progress=True)

# results blob is directly inspectable
# print(ingest_json_results_to_blob(results[0]))

# (optional) Review any failures that were returned
if failures:
    print(f"There were {len(failures)} failures. Sample: {failures[0]}")

`paddle_output_format` is deprecated and will be removed in a future release. Please use `table_output_format` instead.
Processing: 100%|██████████| 1/1 [00:04<00:00,  4.95s/doc]


In [4]:
# Initialise collections and metadata schema.
init_collection(
    collections=COLL,
    embed_dimension=CONFIG.embeddings.dimensions,
    vdb_endpoint=CONFIG.vector_store.url,
)

# One vector-store object per collection. The generator yields them in the
# order of the key tuple, which matches the order of the target names.
(
    vs_chunks,
    vs_requirements,
    vs_criteria,
    vs_contacts,
    vs_deadlines,
    vs_tech,
    vs_std,
    vs_org,
) = (
    get_vectorstore(
        document_embedder,
        collection_name=COLL[coll_key],
        vdb_endpoint=CONFIG.vector_store.url,
    )
    for coll_key in (
        "chunks",
        "requirements",
        "criteria",
        "contacts",
        "deadlines",
        "technologies",
        "standards",
        "organizations",
    )
)

True


In [5]:
# extract entities using LLM
# extract_entities_llm is called on the ingestion `results`; its return value
# is unpacked into `json_outs` and `documents`.
# NOTE(review): the recorded run of this cell took about 9.5 minutes for 28
# docs and logged "Error processing chunk: list index out of range" six times.
# Presumably those chunks are missing from the outputs — confirm inside
# extract_entities_llm before trusting the result.
json_outs, documents = extract_entities_llm(results)
# NOTE(review): the recorded output of this print contains contact names,
# e-mail addresses and phone numbers; clear or redact it before sharing.
print(json_outs)
# print(documents)

extracting entities:: 100%|██████████| 28/28 [09:36<00:00, 20.58s/doc]

Error processing chunk: list index out of range
Error processing chunk: list index out of range
Error processing chunk: list index out of range
Error processing chunk: list index out of range
Error processing chunk: list index out of range
Error processing chunk: list index out of range
{'document_type': 'RFI', 'document_title': 'Request for Information (RFI) for Public and Hybrid Cloud Adoption for hosting applications on Infrastructure of Cloud Service Provider', 'issue_date': '2022-12-14', 'deadlines': [{'date': '2022-12-28'}], 'client_organization': 'Union Bank of India, Department of Information Technology', 'client_industry': 'Public Sector Bank', 'contacts': [{'name': 'Nandan Valera', 'title': None, 'email': 'nandan.v@eptl.in', 'phone': '9081000427'}, {'name': 'Fahad Khan', 'title': None, 'email': 'fahad@eptl.in', 'phone': '9904406300'}, {'name': 'Shaikh Nasruddin', 'title': None, 'email': 'shaikh@eptl.in', 'phone': '9510812960'}, {'name': 'Ashutosh Gaur', 'title': None, 'email'




In [None]:
# NOTE(review): inserting the extracted documents into the chunks store is
# currently disabled. The search cell further down queries vs_chunks, so on a
# fresh run it presumably relies on documents already stored in that
# collection — confirm before re-running from scratch.
# vs_chunks.add_documents(documents)

In [9]:
# Similarity search (k=3) on the chunks store, passing a filter expression on
# the `source` field for the ingested PDF.
search_args = dict(
    query="What are the modes of submission ?",
    search_type="similarity",
    k=3,
    expr="source == '/home/ubuntu/projects/datas/rfi for cloud adoption.pdf'",
)
result = vs_chunks.search(**search_args)

# Show the hits, then how many came back.
print(result)
print(len(result))

[Document(metadata={'pk': 461506363148960165, 'source': '/home/ubuntu/projects/datas/rfi for cloud adoption.pdf', 'content_metadata': {'source_type': 'PDF', 'content_metadata': {'type': 'text', 'description': 'Unstructured text from PDF document.', 'page_number': 2, 'hierarchy': {'page_count': 22, 'page': 2, 'block': -1, 'line': -1, 'span': -1, 'nearby_objects': {'text': {'content': [], 'bbox': [], 'type': []}, 'images': {'content': [], 'bbox': [], 'type': []}, 'structured': {'content': [], 'bbox': [], 'type': []}}}, 'subtype': ''}}}, page_content='RFI for Cloud Adoption\r\nClassification: Internal 2 | 21\r\nGENERAL INSTRUCTIONS TO BIDDERS\r\nAll bidders must note that this being E-tender, bids received only through online on E-tendering \r\nportal https://ubi.abcprocure.comshall be considered as an offer. Any bid submitted in \r\nphysical form will not be received or opened and shall be summarily rejected.\r\nProcedure for submission of E-tender by bidder:\r\nInterested bidders who wi