In [None]:
!pip3 install unstructured unstructured-inference langchain numpy tesseract cohere chromadb


In [2]:
from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import elements_to_json
# PDF to analyse; later cells derive the JSON path from this name.
filename = "/Users/mohittalniya/semantic-search/mongo-search/mongodb_balancesheet.pdf"

# Settings handed to Unstructured's PDF partitioner.
strategy = "hi_res"    # strategy for analyzing PDFs and extracting table structure
model_name = "yolox"   # other options are detectron2_onnx and chipper depending on file layout

# Partition the PDF into elements, asking for table structure to be inferred.
partition_options = {
    "filename": filename,
    "strategy": strategy,
    "infer_table_structure": True,
    "model_name": model_name,
}
elements = partition_pdf(**partition_options)

# Persist the elements as JSON next to the PDF (same name plus a ".json" suffix).
elements_json_path = f"{filename}.json"
elements_to_json(elements, filename=elements_json_path)

In [None]:
# Quick look at the first ten partitioned elements, one per line.
preview = elements[:10]
for element in preview:
    print(element)

In [5]:
import json
# Destination text file for the extracted table HTML: written by process_json_file
# and read back by the TextLoader cell. NOTE(review): absolute, machine-specific path.
text_file = "/Users/mohittalniya/semantic-search/mongo-search/mongodb_balancesheet.pdf.txt"
def process_json_file(input_filename, output_filename=None):
    """Extract the HTML of every table element from an elements JSON file.

    The input is expected to be a JSON array of element dicts. For each entry
    whose ``"type"`` is ``"Table"``, the value of ``metadata.text_as_html`` is
    collected. The collected HTML snippets are printed and then written to the
    output file, each followed by a blank line.

    Args:
        input_filename: Path of the JSON file to read.
        output_filename: Path of the text file to write. When omitted, the
            notebook-level ``text_file`` is used, so existing one-argument
            calls behave as before.

    Returns:
        None. The result is the file written at ``output_filename``.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If the input is not valid JSON.
    """
    # Resolve the destination lazily so the global is only needed for legacy calls.
    if output_filename is None:
        output_filename = text_file

    # JSON is read as UTF-8 explicitly instead of relying on the platform default.
    with open(input_filename, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Keep only table elements that actually carry an HTML rendering; entries
    # without "type", "metadata" or "text_as_html" are skipped instead of raising KeyError.
    extracted_elements = []
    for entry in data:
        if entry.get("type") != "Table":
            continue
        table_html = (entry.get("metadata") or {}).get("text_as_html")
        if table_html:
            extracted_elements.append(table_html)

    print(extracted_elements)

    # No explicit encoding here on purpose: the loader cell opens this file
    # without one, so writer and reader both use the platform default.
    with open(output_filename, 'w') as output_file:
        for element in extracted_elements:
            output_file.write(element + "\n\n")

In [None]:
# Convert the element dump stored next to the PDF into the table-only text file.
elements_json_path = f"{filename}.json"
process_json_file(elements_json_path)

In [7]:
from langchain.document_loaders import TextLoader
# Show which file is being loaded, then read it into LangChain documents.
print(text_file)
# No encoding is passed, so the loader falls back to its own default.
loader = TextLoader(text_file)
documents = loader.load()

/Users/mohittalniya/semantic-search/mongo-search/mongodb_balancesheet.pdf.txt


In [8]:
from langchain.text_splitter import CharacterTextSplitter

# Split the loaded documents into chunks of at most 4500 characters with a
# 200-character overlap; `docs` is what gets embedded and indexed afterwards.
text_splitter = CharacterTextSplitter(chunk_size=4500, chunk_overlap=200)
docs = text_splitter.split_documents(documents)

In [9]:
import os
from langchain.embeddings import CohereEmbeddings
# Read the Cohere key from the COHERE_API_KEY environment variable so the secret is not
# stored in the notebook; the original '<>' placeholder remains the fallback for anyone
# who still edits the value in place.
COHERE_API_KEY = os.environ.get("COHERE_API_KEY") or '<>'

# Embedding client passed to the vector store when the chunks are indexed.
embeddings = CohereEmbeddings(
    model="embed-english-light-v2.0", cohere_api_key=COHERE_API_KEY
)

In [10]:
from langchain.vectorstores import Chroma

# Embed the chunks in `docs` with `embeddings` and index them in a Chroma store;
# `db` is the store the QA chain retrieves from.
db = Chroma.from_documents(docs, embeddings)

In [19]:
from langchain.llms import Cohere
from langchain.chains import RetrievalQA

# Cohere LLM plus a retrieval QA chain backed by the Chroma store.
llm = Cohere(cohere_api_key=COHERE_API_KEY, truncate="START", temperature=0)
retriever = db.as_retriever()
qa_chain = RetrievalQA.from_chain_type(llm, retriever=retriever)

# Questions to ask against the indexed tables.
questions = [
    "What are the current assets of MongoDB",
    "What is the total cash flow from investing activities of MongoDB?",
    "How many total Atlas Customers MongoDB has in July 2023?",
    "What was the gross margin in July 2023?",
]

# One chain response per question, in the same order as `questions`.
output_list = [qa_chain({"query": question}) for question in questions]

In [20]:
import pprint

# Pretty-print the collected responses with a four-space indent.
pprint.pprint(output_list, indent=4)

[   {   'query': 'What are the current assets of MongoDB',
        'result': ' As of July 31, 2023, MongoDB had total current assets of '
                  '$2,648,234. This includes cash and cash equivalents of '
                  '$722,190, accounts receivable of $61,206, prepaid expenses '
                  'and other current assets of $124, and other current assets '
                  'of $2,548,114.'},
    {   'query': 'What is the total cash flow from investing activities of '
                 'MongoDB?',
        'result': ' The total cash flow from investing activities of MongoDB '
                  'is $110,195.'},
    {   'query': 'How many total Atlas Customers MongoDB has in July 2023?',
        'result': ' MongoDB has 29,000+ Atlas Customers as of July 2023.'},
    {   'query': 'What was the gross margin in July 2023?',
        'result': ' The gross margin in July 2023 was 74%.'}]
