In [6]:
# Author : Rajib
# This program shows how to do context aware parsing of a large PDF and then summarize it
# References:
# https://python.langchain.com/docs/use_cases/summarization
# https://developer.adobe.com/document-services/docs/overview/pdf-extract-api/
# https://smith.langchain.com/hub/
import json
import logging
import os
import re
import zipfile
import pandas as pd
from adobe.pdfservices.operation.auth.credentials import Credentials
from adobe.pdfservices.operation.execution_context import ExecutionContext
from adobe.pdfservices.operation.io.file_ref import FileRef
from adobe.pdfservices.operation.pdfops.extract_pdf_operation import ExtractPDFOperation
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_element_type import ExtractElementType
from adobe.pdfservices.operation.pdfops.options.extractpdf.table_structure_type import TableStructureType
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_pdf_options import ExtractPDFOptions
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_renditions_element_type import ExtractRenditionsElementType
from langchain.chains import LLMChain, StuffDocumentsChain, ReduceDocumentsChain, MapReduceDocumentsChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone

logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENV = os.getenv("PINECONE_ENV")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")



class PDFExtract():
    def __init__(self, client_id, client_secret):
        self.client_id = client_id
        self.client_secret = client_secret
        # initialize pinecone
        # One thing I dound out(may be a defect in PINECONE), the api_key and the environment must be provided as below
        

    def _get_credentials(self):
        credentials = Credentials.service_principal_credentials_builder().with_client_id(
            self.client_id).with_client_secret(self.client_secret).build()

        return credentials

    def _zip_file(self, output_path,unzip_dir):
        with zipfile.ZipFile(output_path, 'r') as zip_ref:
            zip_ref.extractall(unzip_dir)

    def _parse_json(self, json_file_path):
        with open(json_file_path, "r") as json_file:
            content = json.loads(json_file.read())

        pdf_element = content["elements"]
        return pdf_element

    # def load_pine_index(self,docs,index_name = "arxiv-index"):

    #     emb = OpenAIEmbeddings()
    #     docsearch = Pinecone.from_documents(docs, emb, index_name=index_name)

    def get_files_from_dir(self, dir):
        file_list = []
        files = [os.path.join(dir, f) for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f))]

        return files

    def load_docs(self, file_path):
        loader = TextLoader(file_path)
        docs = loader.load()

        return docs

    def parse_pdf(self, input_file_path, output_path, unzip_dir, chunked_dir):
        try:
            credentials = self._get_credentials()
            execution_context = ExecutionContext.create(credentials)
            extract_pdf_operation = ExtractPDFOperation.create_new()
            source = FileRef.create_from_local_file(input_file_path)
            extract_pdf_operation.set_input(source)
            extract_pdf_options: ExtractPDFOptions = ExtractPDFOptions.builder() \
                .with_element_to_extract(ExtractElementType.TEXT) \
                .with_element_to_extract(ExtractElementType.TABLES) \
                .with_table_structure_format(TableStructureType.CSV) \
                .with_element_to_extract_renditions(ExtractRenditionsElementType.FIGURES)\
                .build()
            extract_pdf_operation.set_options(extract_pdf_options)

            # Execute the operation.
            result: FileRef = extract_pdf_operation.execute(execution_context)

            # # Save the result to the specified location.
            result.save_as(output_path)
            self._zip_file(output_path,unzip_dir)
            json_file_path = os.path.join(unzip_dir, "structuredData.json")
            elements = self._parse_json(json_file_path)

            file_split = 0
            # Define the header flag. If first time header no need to cut a new file
            FIRST_TIME_HEADER = True
            file_name = os.path.join(chunked_dir, f"file_{file_split}".format(file_split=file_split))
            parsed_file = open(file_name, "a", encoding="utf-8")
            for element in elements:
                if "//Document/H2" in element["Path"]:
                    hdr_txt = element["Text"]
                    if FIRST_TIME_HEADER:
                        FIRST_TIME_HEADER = False
                        parsed_file.write(hdr_txt)
                        parsed_file.write("\n")
                    else:
                        parsed_file.close()
                        file_split = file_split + 1
                        file_name = os.path.join(chunked_dir, f"file_{file_split}".format(file_split=file_split))
                        parsed_file = open(file_name, "a", encoding="utf-8")
                        parsed_file.write(hdr_txt)
                        parsed_file.write("\n")
                else:
                    if "Document/Table" in element["Path"]:
                        match = re.search(r'^//Document/Table(?:\[\d+\])?$', element["Path"])
                        if match:
                            xlsx_file_name = element["filePaths"][0]
                            xlsx_file = os.path.join(unzip_dir, xlsx_file_name)
                            df = pd.DataFrame(pd.read_excel(xlsx_file))
                            table_content = df.to_markdown().replace("_x000D_", "      ")
                            parsed_file.write(table_content)
                            parsed_file.write("\n")
                    else:
                        try:
                            text_content = element["Text"]
                            parsed_file.write(text_content)
                            parsed_file.write("\n")
                        except KeyError as ke:
                            pass
            parsed_file.close()
        except Exception as e:
            print(e)
            logging.exception("Exception encountered while executing operation")



In [7]:

if __name__ == "__main__":
    pdf_extract_client_id = '8f2bfd9259224ac285feaae73e813c2f'
    pdf_extract_client_secret = 'p8e-Ou_2-JQP04FYX00-BAO6Hk8_M0_X4aMr'

    input_file_path = "/public/home/llm2/yule/PDFTriage/data/test.pdf"
    output_path = "/public/home/llm2/yule/PDFTriage/data/test.zip"
    unzip_dir = "/public/home/llm2/yule/PDFTriage/data/test/unzip"
    chunked_dir = "/public/home/llm2/yule/PDFTriage/data/test/chunk"
    isExist = os.path.exists(chunked_dir)
    if not isExist:
        os.makedirs(chunked_dir)
    pdf_extract = PDFExtract(pdf_extract_client_id, pdf_extract_client_secret)

    # Step - 1 : Run this step to chunk the PDF into contextual subsections

    pdf_extract.parse_pdf(input_file_path,output_path,unzip_dir,chunked_dir)

    # Step - 2 : use a TextLoader to get all the chunks in a list of Dcoments

    files = pdf_extract.get_files_from_dir(chunked_dir)
    print(files)
    list_of_all_docs=[]
    for file in files:
        document = pdf_extract.load_docs(file)

        list_of_all_docs.append(document[0])
    




['/public/home/llm2/yule/PDFTriage/data/test/chunk/file_0', '/public/home/llm2/yule/PDFTriage/data/test/chunk/file_2', '/public/home/llm2/yule/PDFTriage/data/test/chunk/file_5', '/public/home/llm2/yule/PDFTriage/data/test/chunk/file_7', '/public/home/llm2/yule/PDFTriage/data/test/chunk/file_9', '/public/home/llm2/yule/PDFTriage/data/test/chunk/file_1', '/public/home/llm2/yule/PDFTriage/data/test/chunk/file_3', '/public/home/llm2/yule/PDFTriage/data/test/chunk/file_4', '/public/home/llm2/yule/PDFTriage/data/test/chunk/file_6', '/public/home/llm2/yule/PDFTriage/data/test/chunk/file_8', '/public/home/llm2/yule/PDFTriage/data/test/chunk/file_10']
[Document(page_content='How Does Batch Normalization Help Optimization? \narXiv:1805.11604v5  [stat.ML]  15 Apr 2019 \n|    | Shibani Santurkar∗          | Dimitris Tsipras∗          | Andrew Ilyas∗          | Aleksander M ˛      adry          |\n|---:|:-----------------------------|:----------------------------|:------------------------|:--------

In [11]:
json_file_path = '/public/home/llm2/yule/PDFTriage/data/test/unzip/structuredData.json'
with open(json_file_path, "r") as json_file:
            content = json.loads(json_file.read())
content.keys()

dict_keys(['version', 'extended_metadata', 'elements', 'pages'])

In [15]:
content['elements'][10]['Path']

'//Document/Table/TR/TH[4]/P'