### Data Extraction and Vectorization of documents extracted from structured and unstructured PDFs

#### UNIFI Value Frameworks PDF Lifting Competition
- The objective of the competition is to create a solution that parses annual reports in PDF format and extracts information about pre-defined activity metrics, so that UNIFI can obtain specific sustainability information about a given company.

#### Install the necessary libraries

In [None]:
!sudo apt install tesseract-ocr -y
!sudo apt install libtesseract-dev -y
!sudo apt-get install poppler-utils -y

In [None]:
!pip install langchain unstructured[all-docs] pydantic lxml openai tiktoken opencv-python

In [None]:
!pip install python-dotenv

In [None]:
!pip install sentence_transformers

In [None]:
!pip install pypdf

In [None]:
!pip install -qU langchain-openai

In [None]:
!pip install anthropic

In [None]:
!pip install qdrant_client

#### Import the necessary libraries

In [None]:
import anthropic
import os
import uuid
import base64
import torch
from IPython import display
from unstructured.partition.pdf import partition_pdf
from langchain.chat_models import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.schema.messages import HumanMessage, SystemMessage
from langchain.schema.document import Document
from langchain.retrievers.multi_vector import MultiVectorRetriever
from torch import cuda

In [None]:
# Use the currently selected CUDA device when one is available, else the CPU.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'
print(device)

In [None]:
# Create a .env file within your directory and add your OpenAI and Anthropic API keys
# to it as OPENAI_API_KEY and ANTHROPIC_API_KEY respectively.
import openai
import sys
# Make modules located two directory levels up importable.
sys.path.append('../..')
from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load its variables into the process environment.
_ = load_dotenv(find_dotenv())

# Indexing os.environ raises KeyError if either key is missing.
openai.api_key = os.environ['OPENAI_API_KEY']
anthropic_api_key = os.environ['ANTHROPIC_API_KEY']

In [None]:
# Anthropic API client; summarize_image uses it to request image descriptions.
client = anthropic.Anthropic(api_key=anthropic_api_key)

In [None]:
# Bare expression: in a notebook this displays the client object as the cell output.
client

In [None]:
# OpenAI chat model name.
# NOTE(review): no other cell shown here references llm_model — confirm it is still needed.
llm_model = "gpt-4-turbo-preview"

#### Load OpenAI's new text-embedding-3-large embeddings

In [None]:
# Embedding model handed to Qdrant.from_documents by vectorize_docs.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

#### Use partition_pdf function from unstructured to extract text, table, and image elements from PDFs

In [None]:
# Root directory for extracted image blocks; get_elements_from_document writes
# each PDF's images into a subfolder named after the PDF file.
output_path = "./pdfs_images_output"

In [None]:
def get_elements_from_document(file_path):
    """Partition a PDF into title-based chunks and save its image/table blocks.

    Image and table blocks are written to ``<output_path>/<pdf file stem>``.

    Args:
        file_path: Path of the PDF file to partition.

    Returns:
        The elements returned by ``partition_pdf`` for this file.
    """
    pdf_stem, _extension = os.path.splitext(os.path.basename(file_path))
    partition_options = dict(
        filename=file_path,
        strategy="hi_res",
        # Image extraction: save Image and Table blocks to disk, not in the payload.
        extract_images_in_pdf=True,
        extract_image_block_types=["Image", "Table"],
        extract_image_block_to_payload=False,
        extract_image_block_output_dir=f"{output_path}/{pdf_stem}",
        infer_table_structure=True,
        # Chunking configuration.
        chunking_strategy="by_title",
        max_characters=4000,
        new_after_n_chars=3800,
        combine_text_under_n_chars=2000,
    )
    return partition_pdf(**partition_options)

In [None]:
def get_text_and_table_summaries(raw_pdf_elements):
    """Summarize the text and table elements of a partitioned PDF.

    Elements whose ``repr`` contains ``CompositeElement`` are handled as text;
    otherwise elements whose ``repr`` contains ``Table`` are handled as tables.
    Any other element is ignored.

    Args:
        raw_pdf_elements: Iterable of elements produced by the PDF partitioner.

    Returns:
        A 4-tuple ``(text_elements, text_summaries, table_elements,
        table_summaries)`` where the ``*_elements`` lists hold each element's
        ``.text`` and the ``*_summaries`` lists hold the matching LLM summary.
    """
    # Spelled out with explicit newlines and spaces so the prompt text stays
    # exactly the same regardless of how this function is indented.
    summary_prompt = (
        "\n"
        "  Summarize the following ensuring you note the numbers and percentages of the various metrics listed {element_type}:\n"
        "  {element}\n"
        "  "
    )
    summarizer = LLMChain(
        llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0.3),
        prompt=PromptTemplate.from_template(summary_prompt),
    )

    text_elements, text_summaries = [], []
    table_elements, table_summaries = [], []
    buckets = {
        'text': (text_elements, text_summaries),
        'table': (table_elements, table_summaries),
    }

    for element in raw_pdf_elements:
        description = repr(element)
        if 'CompositeElement' in description:
            kind = 'text'
        elif 'Table' in description:
            kind = 'table'
        else:
            continue

        originals, summaries = buckets[kind]
        originals.append(element.text)
        summaries.append(
            summarizer.run({'element_type': kind, 'element': element})
        )

    return text_elements, text_summaries, table_elements, table_summaries

In [None]:
def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a base64 string."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    return base64.b64encode(raw_bytes).decode('utf-8')

# Instruction sent to both vision models.
_IMAGE_PROMPT = "Can you describe the image and extract all activity metrics which are numbers and percentages that represent various metrics for various companies from the image?"
# Returned when neither model produced a summary.
_SUMMARY_FAILED = "Image could not be summarized."


def _guess_image_media_type(encoded_image):
    """Return the MIME type of a base64-encoded image, defaulting to JPEG."""
    # The base64 encoding of a PNG file always begins with this prefix.
    if encoded_image.startswith("iVBORw0KGgo"):
        return "image/png"
    return "image/jpeg"


def _summarize_image_with_openai(encoded_image, media_type):
    """Fallback summarizer using the OpenAI vision model; never raises."""
    prompt = [
        SystemMessage(content="You are a bot that is good at analyzing images in PDFs."),
        HumanMessage(content=[
            {
                "type": "text",
                "text": _IMAGE_PROMPT
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:{media_type};base64,{encoded_image}"
                },
            },
        ])
    ]
    try:
        response = ChatOpenAI(model="gpt-4-vision-preview", openai_api_key=openai.api_key, max_tokens=1024).invoke(prompt)
        return response.content
    except Exception as e:
        # Best-effort: report the failure and hand back a placeholder summary.
        print(f"Error message: {e}")
        return _SUMMARY_FAILED


def summarize_image(encoded_image):
    """Describe an image and extract its activity metrics with a vision model.

    The Anthropic model is tried first. If that call fails with a connection,
    rate-limit or other API status error, or the reply contains no text block,
    the OpenAI vision model is used instead.

    Args:
        encoded_image: Base64-encoded image data (JPEG or PNG).

    Returns:
        The summary text, or "Image could not be summarized." when the
        fallback model fails as well.
    """
    # Callers also collect .png files, so label the payload by its actual
    # format instead of always declaring JPEG.
    media_type = _guess_image_media_type(encoded_image)
    try:
        message = client.messages.create(
            model="claude-3-opus-20240229",
            max_tokens=1024,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "data": encoded_image,
                                "media_type": media_type
                            },
                        },
                        {
                            "type": "text",
                            "text": _IMAGE_PROMPT
                        }
                    ],
                }
            ],
        )
    except anthropic.APIConnectionError as e:
        print("The server could not be reached")
        print(e.__cause__)  # an underlying Exception, likely raised within httpx.
    except anthropic.RateLimitError:
        # Must precede APIStatusError so the 429 case keeps its own message.
        print("A 429 status code was received; we should back off a bit.")
    except anthropic.APIStatusError as e:
        print("Another non-200-range status code was received")
        print(e.status_code)
        print(e.response)
    else:
        text_blocks = [content_block.text for content_block in message.content if content_block.type == 'text']
        if text_blocks:
            return text_blocks[0]
        # An empty list used to raise IndexError here; use the fallback instead.
        print("The Anthropic response contained no text block")

    return _summarize_image_with_openai(encoded_image, media_type)
    


def get_image_summaries():
    """Encode and summarize every image file located directly in ``output_path``.

    Only entries ending in ``.png``, ``.jpg`` or ``.jpeg`` are processed.

    NOTE(review): get_elements_from_document writes images into per-PDF
    subfolders of ``output_path``, which this function does not descend into;
    confirm that is intended.

    Returns:
        A pair ``(image_elements, image_summaries)``: the base64-encoded images
        and their summaries, in matching order.
    """
    image_names = [
        name
        for name in os.listdir(output_path)
        if name.endswith(('.png', '.jpg', '.jpeg'))
    ]

    image_elements = []
    image_summaries = []
    for name in image_names:
        encoded = encode_image(os.path.join(output_path, name))
        image_elements.append(encoded)
        image_summaries.append(summarize_image(encoded))
    return image_elements, image_summaries

In [None]:
def create_documents(text_elements, text_summaries, table_elements, table_summaries):
    """Build one Document per summarized text or table element.

    Each Document uses the summary as ``page_content`` and stores a fresh UUID,
    the element type and the original content in its metadata.

    Args:
        text_elements: Original text of each text chunk.
        text_summaries: Summaries aligned with ``text_elements``.
        table_elements: Original text of each table.
        table_summaries: Summaries aligned with ``table_elements``.

    Returns:
        A list with the text Documents first, followed by the table Documents.
    """
    def _build(kind, originals, summaries):
        # Every document gets its own id. Table documents used to reuse the id
        # of the last text document, or fail when there were no text elements.
        return [
            Document(
                page_content=summary,
                metadata={
                    'id': str(uuid.uuid4()),
                    'type': kind,
                    'original_content': original,
                },
            )
            for original, summary in zip(originals, summaries)
        ]

    return (
        _build('text', text_elements, text_summaries)
        + _build('table', table_elements, table_summaries)
    )

#### Function to create a vector store using Qdrant

In [None]:
from langchain_community.vectorstores import Qdrant

def vectorize_docs(docs, url="http://localhost:6333", collection_name="UNIFI_Vector_DB"):
    """Embed ``docs`` and store them in a Qdrant collection.

    Args:
        docs: Documents to embed with the module-level ``embeddings`` model.
        url: Address of the Qdrant server; defaults to a local instance.
        collection_name: Name of the Qdrant collection to write to.

    Returns:
        The vector store returned by ``Qdrant.from_documents``, so callers can
        query it without reconnecting.
    """
    vector_store = Qdrant.from_documents(
        docs,
        embeddings,
        url=url,
        prefer_grpc=False,
        collection_name=collection_name,
    )
    print("Vector DB Successfully Created!")
    return vector_store

In [None]:
# Collect (and print) the names of the PDF files in the test data folder.
pdf_files = []
for filename in os.listdir("../Data/Data Sources/Test/"):
    if not filename.endswith('.pdf'):
        continue
    pdf_files.append(filename)
    print(filename)

In [None]:
from tqdm import tqdm
# For every PDF: partition it, summarize its text and table chunks, and build
# Documents. The result is a list holding one list of Documents per PDF.
documents_to_be_vectorized_ = []
for pdf_file in tqdm(pdf_files):
    try:
        file_path = os.path.join('../Data/Data Sources/Test/', pdf_file)
        raw_pdf_elements_ = get_elements_from_document(file_path)
        text_elements_, text_summaries_, table_elements_, table_summaries_ = get_text_and_table_summaries(raw_pdf_elements_)
        documents_ = create_documents(text_elements_, text_summaries_, table_elements_, table_summaries_)
        documents_to_be_vectorized_.append(documents_)
    except Exception as e:
        # Report the failing file and continue with the remaining PDFs.
        print(f"An error occurred for file: {pdf_file}")
        print(f"Error message: {e}")

In [None]:
# Flatten the per-PDF lists in documents_to_be_vectorized_ into a single list of
# Documents to be vectorized, printing each per-PDF list along the way.
all_documents = []
for document in documents_to_be_vectorized_:
    print(document)
    all_documents.extend(document)

#### Manually go through the images extracted from the PDFs and remove redundant images before running the next 4 cells

In [None]:
def get_image_summaries_(pdf_images_folder_path):
    """Encode and summarize every image file in one folder.

    Args:
        pdf_images_folder_path: Folder to scan; only entries ending in
            ``.png``, ``.jpg`` or ``.jpeg`` are processed.

    Returns:
        A pair ``(image_elements, image_summaries)``: the base64-encoded images
        and their summaries, in matching order.
    """
    image_names = [
        name
        for name in os.listdir(pdf_images_folder_path)
        if name.endswith(('.png', '.jpg', '.jpeg'))
    ]

    image_elements = []
    image_summaries = []
    for name in image_names:
        encoded = encode_image(os.path.join(pdf_images_folder_path, name))
        image_elements.append(encoded)
        image_summaries.append(summarize_image(encoded))
    return image_elements, image_summaries

In [None]:
def create_image_documents_(image_elements, image_summaries):
    """Wrap each image summary in a Document that carries the encoded image.

    Args:
        image_elements: Base64-encoded images.
        image_summaries: Summaries aligned with ``image_elements``.

    Returns:
        A list of Documents whose ``page_content`` is the summary and whose
        metadata holds a fresh UUID, the type ``'image'`` and the encoded image.
    """
    return [
        Document(
            page_content=summary,
            metadata={
                'id': str(uuid.uuid4()),
                'type': 'image',
                'original_content': encoded,
            },
        )
        for encoded, summary in zip(image_elements, image_summaries)
    ]

In [None]:
# Iterate through the per-PDF subfolders of output_path (in reverse-sorted name
# order), summarize the images in each, and collect one list of Documents per folder.
documents_to_be_vectorized = []
for folder in tqdm(sorted(os.listdir(output_path), reverse=True)):
    # Skip plain files; only subdirectories hold a PDF's extracted images.
    if os.path.isdir(os.path.join(output_path, folder)):
        pdf_images_folder_path = os.path.join(output_path, folder)
        image_elements, image_summaries = get_image_summaries_(pdf_images_folder_path)
        docs_ = create_image_documents_(image_elements, image_summaries)
        documents_to_be_vectorized.append(docs_)

In [None]:
# Append the image Documents of every folder to the text/table Documents
# already collected in all_documents.
for document in documents_to_be_vectorized:
    all_documents.extend(document)

In [None]:
# Embed all collected text, table and image Documents and store them in Qdrant.
vectorize_docs(all_documents)