## <font color="Orange">Data Preprocessing</font>

In [198]:
import os
import sys
sys.path.append(os.path.abspath(".."))

from backend.utils import get_all_files
from IPython.display import Image, display
from unstructured.partition.pdf import partition_pdf
from unstructured.documents.elements import CompositeElement, Element
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.llms import HuggingFacePipeline
from langchain_ollama.llms import OllamaLLM
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.storage import InMemoryStore
from langchain.schema.document import Document
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.messages import SystemMessage, HumanMessage


from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

from huggingface_hub import login, whoami

import base64
from base64 import b64decode
import uuid
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI credentials used by
# ChatOpenAI / OpenAIEmbeddings further down come from — confirm.
load_dotenv()

True

<font color="cyan">TODO: create a function to glob PDFs from a YAML file</font>

In [91]:
# Source documents, given relative to the notebook's working directory:
# the 2024 proxy statement and the 2023 AIM presentation.
pdf_path = "../data/ConocoPhillips Proxy Statement 2024.pdf"
pdf_path_2 = "../data/ConocoPhillips AIM Presentation 2023.pdf"

In [5]:
# output file path
# NOTE(review): output_path is only referenced by the commented-out
# extract_image_block_output_dir option below, so this call does not use it.
output_path = "../data/images/"

# Partition the proxy statement with the "hi_res" strategy and chunk it "by_title".
# The character limits are much larger than the ones used for the second PDF
# (15000 / 10000 / 5000 here versus 2000 / 1500 / 300 there).
pdf_chunks = partition_pdf(
    filename=pdf_path,
    # Table HTML is read later through metadata.text_as_html;
    # presumably this flag is what populates it — confirm.
    infer_table_structure=True,
    strategy="hi_res",
    # extract_image_block_types=["Image"],
    # extract_image_block_output_dir=output_path,
    extract_image_block_to_payload=False,
    chunking_strategy="by_title",
    max_characters=15000,
    combine_text_under_n_chars=5000,
    new_after_n_chars=10000
)

In [96]:
# Partition the AIM presentation with the same strategy as the first PDF but
# with much smaller chunk limits (2000 / 1500 / 300 characters).
pdf_chunks_2 = partition_pdf(

    filename=pdf_path_2,
    infer_table_structure=True,
    strategy="hi_res",
    # extract_image_block_types=["Image"],
    # extract_image_block_output_dir=output_path,
    extract_image_block_to_payload=False,
    chunking_strategy="by_title",
    max_characters=2000,
    combine_text_under_n_chars=300,
    new_after_n_chars=1500
)

In [143]:
# Chunks of both documents in one sequence: proxy statement first, then the AIM presentation.
full_chunks = pdf_chunks + pdf_chunks_2

### <font color="#6699CC">Extracting text, tables, and images</font>

In [268]:
# Accumulators filled by later cells:
#   texts  - chunks that still contain non-table, non-image elements
#   tables - table elements pulled out of the chunks
#   images - base64-encoded image files
texts = []
tables = []
images = []

In [269]:
# Split every chunk into (a) the table elements it contains and (b) the chunk
# itself, kept as a "text" whenever it holds anything besides tables and images.
#
# The accumulators are (re)created in this cell so that re-running it cannot
# append a second copy of every element to lists left over from an earlier run,
# which would break the 1:1 pairing with the summaries built later.
texts = []
tables = []

for composite in full_chunks:
    kept_elements = []

    # metadata.orig_elements holds the individual elements merged into this chunk.
    for inner_element in composite.metadata.orig_elements:
        type_repr = str(type(inner_element))

        if "Table" in type_repr:
            # Tables are summarized and indexed separately.
            tables.append(inner_element)
        elif "Image" in type_repr:
            # Images are handled from the files on disk, not from the chunks.
            continue
        else:
            kept_elements.append(inner_element)

    # Chunks made up solely of tables/images are not kept as text.
    if kept_elements:
        # NOTE(review): later cells read `composite.text`; assigning `.elements`
        # does not appear to alter that text, so table/image text may still be
        # part of the chunk — confirm whether the text should be rebuilt.
        composite.elements = kept_elements
        texts.append(composite)

In [270]:
# Inspect the extracted table elements (shows the list repr).
tables

[<unstructured.documents.elements.Table at 0x335d9c6e0>,
 <unstructured.documents.elements.Table at 0x336410b30>,
 <unstructured.documents.elements.Table at 0x3327a9f70>,
 <unstructured.documents.elements.Table at 0x3327a8dd0>,
 <unstructured.documents.elements.Table at 0x34d79b260>,
 <unstructured.documents.elements.Table at 0x178a39b80>,
 <unstructured.documents.elements.Table at 0x17893fcb0>,
 <unstructured.documents.elements.Table at 0x178308e90>,
 <unstructured.documents.elements.Table at 0x17fd8c230>,
 <unstructured.documents.elements.Table at 0x1783b3140>,
 <unstructured.documents.elements.Table at 0x335d40770>,
 <unstructured.documents.elements.Table at 0x17fe166f0>,
 <unstructured.documents.elements.Table at 0x3399c8b60>,
 <unstructured.documents.elements.Table at 0x3365aa600>,
 <unstructured.documents.elements.Table at 0x17cf44c50>,
 <unstructured.documents.elements.Table at 0x34d13ecc0>,
 <unstructured.documents.elements.Table at 0x34d170320>,
 <unstructured.documents.elemen

In [271]:
# Sanity check: number of table elements and number of text chunks.
len(tables), len(texts)

(72, 166)

In [272]:
# base64 encode for images
def encode_images(file_path):
    """Return the contents of the file at ``file_path`` as a base64 string.

    The file is read in binary mode, base64-encoded, and the encoded bytes
    are decoded as UTF-8 text.
    """
    with open(file_path, mode="rb") as image_file:
        raw_bytes = image_file.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")

In [273]:
# Collect the image files under images_path with the project helper get_all_files.
# NOTE(review): the earlier note here said the helper was "not working", and the
# commented-out fallback list in the following cell uses .jpg names while this
# call asks for "png" — confirm which extension the extracted figures have.
images_path = "../data/images/"
image_lists = get_all_files(images_path, "png")

In [274]:
# image_lists = ['../data/images/figure-39-105.jpg', '../data/images/figure-34-92.jpg', '../data/images/figure-16-31.jpg', '../data/images/figure-24-55.jpg', '../data/images/figure-38-101.jpg', '../data/images/figure-74-58.jpg', '../data/images/figure-6-8.jpg', '../data/images/figure-39-106.jpg', '../data/images/figure-43-128.jpg', '../data/images/figure-14-23.jpg', '../data/images/figure-38-103.jpg', '../data/images/figure-18-37.jpg', '../data/images/figure-120-59.jpg', '../data/images/figure-24-56.jpg', '../data/images/figure-50-149.jpg', '../data/images/figure-38-102.jpg', '../data/images/figure-43-129.jpg', '../data/images/figure-39-107.jpg', '../data/images/figure-120-60.jpg', '../data/images/figure-14-24.jpg', '../data/images/figure-27-66.jpg', '../data/images/figure-69-56.jpg', '../data/images/figure-32-87.jpg', '../data/images/figure-25-60.jpg', '../data/images/figure-12-20.jpg', '../data/images/figure-8-11.jpg', '../data/images/figure-27-70.jpg', '../data/images/figure-25-61.jpg', '../data/images/figure-32-86.jpg', '../data/images/figure-12-24.jpg', '../data/images/figure-45-136.jpg', '../data/images/figure-45-134.jpg', '../data/images/figure-22-47.jpg', '../data/images/figure-37-99.jpg', '../data/images/figure-37-98.jpg', '../data/images/figure-40-110.jpg', '../data/images/figure-44-130.jpg', '../data/images/figure-45-135.jpg', '../data/images/figure-17-33.jpg', '../data/images/figure-40-109.jpg', '../data/images/figure-29-80.jpg', '../data/images/figure-41-119.jpg', '../data/images/figure-22-48.jpg', '../data/images/figure-53-157.jpg', '../data/images/figure-17-35.jpg', '../data/images/figure-29-78.jpg', '../data/images/figure-29-79.jpg', '../data/images/figure-32-88.jpg', '../data/images/figure-17-36.jpg', '../data/images/figure-37-100.jpg', '../data/images/figure-7-7.jpg', '../data/images/figure-5-5.jpg', '../data/images/figure-41-120.jpg', '../data/images/figure-1-1.jpg', '../data/images/figure-28-75.jpg', '../data/images/figure-48-144.jpg', 
'../data/images/figure-26-64.jpg', '../data/images/figure-51-153.jpg', '../data/images/figure-23-54.jpg', '../data/images/figure-11-18.jpg', '../data/images/figure-51-152.jpg', '../data/images/figure-25-34.jpg', '../data/images/figure-26-65.jpg', '../data/images/figure-28-74.jpg', '../data/images/figure-21-44.jpg', '../data/images/figure-21-46.jpg', '../data/images/figure-42-124.jpg', '../data/images/figure-33-90.jpg', '../data/images/figure-42-125.jpg', '../data/images/figure-13-22.jpg', '../data/images/figure-55-159.jpg', '../data/images/figure-48-142.jpg', '../data/images/figure-50-150.jpg', '../data/images/figure-23-52.jpg', '../data/images/figure-23-53.jpg', '../data/images/figure-49-146.jpg', '../data/images/figure-48-143.jpg', '../data/images/figure-26-63.jpg', '../data/images/figure-6-7.jpg', '../data/images/figure-42-122.jpg', '../data/images/figure-24-58.jpg', '../data/images/figure-23-51.jpg', '../data/images/figure-18-38.jpg', '../data/images/figure-18-39.jpg', '../data/images/figure-42-123.jpg', '../data/images/figure-49-145.jpg']

In [276]:
# Number of image files found.
len(image_lists)

32

In [277]:
# Base64-encode every image file listed in image_lists, in order.
# The list is built in one step instead of appending to a list created in
# another cell, so re-running this cell cannot duplicate entries and break
# the 1:1 pairing between `images` and the image summaries built later.
images = [encode_images(image_path) for image_path in image_lists]

### <font color="#f5b041">Data Summarization</font>

#### Text and Tables

In [278]:
# Summarization prompt shared by text chunks and tables. `{element}` is the
# single template variable; it is filled with the chunk or table being summarized.
# NOTE(review): the prompt contains the typo "Repsond" (for "Respond"). It is
# left untouched here because the string is sent to the model verbatim.
prompt_text = """

You are an assistant tasked with summarizing tables and text.
Give a brief summary of the table or text.

Repsond only with the summary, no additional comment.
Do not start your sentence by saying "Here is a summary" or anything like that.
Just give the summary as it is.

for context the content is from 2 documents which is proxy statement of year 2024 and AIM (Analyst and investor meeting)of year 2023 
of ConocoPhillips which is one of the world’s largest independent E&P companies 
based on production and proved reserves.

Table or text chunk: {element}

"""

In [279]:
# Chat prompt built from prompt_text; its only input variable is `element`.
prompt = ChatPromptTemplate.from_template(prompt_text)

In [280]:
# Model used for text/table summaries, served through Ollama.
# NOTE(review): presumably requires a local Ollama server with "llama3.2:1b" pulled — confirm.
model = OllamaLLM(model="llama3.2:1b")

In [281]:
# Pipeline: fill the prompt -> run the model -> parse the output with StrOutputParser.
summarize_chain = prompt | model | StrOutputParser()

In [282]:
# HTML rendering of each table, in the same order and with the same length as
# `tables` (the summaries are later paired with `tables` by position).
# If a table carries no HTML, fall back to its plain text so the summarizer is
# never handed the literal string "None".
tables_html = [table.metadata.text_as_html or table.text for table in tables]

In [283]:
# Summarize every text chunk, with at most 3 requests in flight.
# NOTE(review): the chunk objects are passed directly as the prompt input;
# presumably they are rendered through their string form — confirm.
text_summaries = summarize_chain.batch(texts, {"max_concurrency": 3})

In [284]:
# Summarize every table from its HTML, with at most 3 requests in flight.
table_summaries = summarize_chain.batch(tables_html, {"max_concurrency": 3})

In [291]:
# Instruction text sent together with each image. It contains no template
# variables; the image itself is injected through `{image}` in `messages` below.
prompt_template_image = """

Describe the image in detail. For context,
the image is part of two documents The ConocoPhillips 2024 Proxy Statement and 2023 Analyst & Investor Meeting Presentation outline the company's strategy, portfolio, financial plan, and sustainability goals. 
Key topics include board elections, executive compensation, financial performance, and stockholder engagement. 
The company emphasizes a disciplined, returns-focused strategy, strong financial discipline, and progress on its net-zero energy transition plan. It also highlights operational milestones, LNG expansion, and emissions reduction targets, reinforcing its commitment to long-term value creation for stockholders.

### Instructions:
- Provide a **detailed description** of the image.
- If the image contains **graphs (e.g., bar charts, line graphs, pie charts)**, describe:
  - The **type of chart** (bar, line, scatter, etc.).
  - **Key trends, labels, and numerical values** visible.
  - **Comparisons or significant insights** from the data.
- If the image contains **tables or figures**, summarize their key takeaways.

Focus on **clarity, accuracy, and relevance** while ensuring a structured and comprehensive response.

"""

# One user message with two parts: the instruction text and the image as a
# base64 data URL. `{image}` is the template variable filled per image.
# NOTE(review): the data URL always declares image/jpeg, while the image files
# are looked up with the "png" extension elsewhere in this notebook — confirm
# that the declared MIME type matches the files.
messages = [
    (
        "user",
        [
            {"type": "text", "text": prompt_template_image},
            {
                "type": "image_url",
                "image_url": {"url": "data:image/jpeg;base64,{image}"},
            },
        ],
    )
]

In [292]:
# Chat prompt for image description, built from the single user message above.
image_prompt = ChatPromptTemplate.from_messages(messages)

In [293]:
# OpenAI chat model used to describe the images.
image_model = ChatOpenAI(model="gpt-4o-mini")

In [294]:
# Pipeline: fill the image prompt -> run the image model -> parse the output with StrOutputParser.
image_summarize_chain = image_prompt | image_model | StrOutputParser()

In [295]:
# Describe every base64-encoded image; results come back in the order of `images`.
# Concurrency is capped at 3, matching the text and table summarization calls,
# so a large image set does not fire all requests at the API at once.
image_summaries = image_summarize_chain.batch(images, {"max_concurrency": 3})

In [297]:
# Inspect the generated image descriptions.
image_summaries

['The image is a stacked area chart titled "Balanced, Diversified, Disciplined Production Growth." It visually represents the projected production growth of ConocoPhillips from the year 2023 to 2032, measured in thousands of barrels of oil equivalent per day (MBOED).\n\n### Chart Description:\n\n- **Y-Axis**: This axis ranges from 0 to 2,500 MBOED, indicating the total production capacity over the specified period.\n- **X-Axis**: It spans from 2023 (estimated) to 2032, representing the future years of production projections.\n\n### Composition of the Chart:\n\n- The chart is segmented into four distinct areas, with varying colors representing different categories of production:\n  - **Unconventional (Lower 48 + Montney)**: The bottom portion, depicted in dark blue, occupies the largest segment. It shows a projected growth rate of approximately 6% Compound Annual Growth Rate (CAGR) over the decade.\n  - **Conventional**: The middle portion, shown in light blue, indicates a smaller growt

In [298]:
# Inspect the generated table summaries.
table_summaries

['Proxy Statement of ConocoPhillips for Year 2024 and AIM (Analyst and Investor Meeting) of Year 2023.\n\nChief Executive Officer and Lead Director Notice of 2024 Annual Meeting.\nof Stockholders Proxy Summary.\nAbout ConocoPhillips.\nStockholder Engagement.\nDirector Nominees.\nGovernance Highlights.\nExecutive Compensation.\nProgress Report on Our Plan for the Net-Zero Energy Transition.\nItem 1: Election of Directors.\nand Director Biographies.\nBoard Composition and Refreshment.\nDirector Onboarding and Education.\nBoard and Committee Evaluations.\nCorporate Governance at ConocoPhillips.\nBoard Leadership Structure.\nBoard Independence.\nRelated Party Transactions.\nBoard Meetings and Committees.\nBoard Oversight of Risk Management.\nStockholder Engagement and Board Responsiveness.\nCode of Business Ethics and Conduct.\nCommitment to Our Culture.\nHuman Capital Management.\nPublic Policy Engagement.\nCommunications with the Board of Directors.\nDirector Compensation.',
 'Compensati

In [299]:
# Inspect the generated text summaries.
text_summaries

['ConocoPhillips\n\n2024 Annual Report\n\nTable of Contents\nA Message from Our Chairman and Chief Executive Officer Notice of 2023 Annual Meeting of Stockholders \nProxy Summary About ConocoPhillips Stockholder Engagement Director Nominees Governance Highlights Executive Compensation Progress Report on Our Plan for the Net-Zero Energy Transition FOR Item 1: Election of Directors and Director Biographies Board Composition and Refreshment Director Onboarding and Education Board and Committee Evaluations Corporate Governance at ConocoPhillips Board Leadership Structure Board Independence Related Party Transactions Board Meetings and Committees Board Oversight of Risk Management Stockholder Engagement and Board Responsiveness Code of Business Ethics and Conduct Commitment to Our Culture Human Capital Management Public Policy Engagement Communications with the Board of Directors Director Compensation Audit and Finance Committee Report FOR Item 2: Proposal to Ratify the Appointment of Ernst

### Load Data and Summaries into the Vector Store

In [300]:
# Chroma collection that will hold the summary embeddings (OpenAI embeddings).
# NOTE(review): no persistence directory is passed, so the collection is
# presumably in-memory only — confirm.
vectorstore = Chroma(collection_name="multi_modal_rag", embedding_function=OpenAIEmbeddings())

In [301]:
# In-memory store for the original chunks, tables and images, keyed by document id.
store = InMemoryStore()

In [302]:
# Metadata key that links each summary document to its original in the docstore.
id_key = "doc_id"

In [303]:
# Retriever that ties the summary vector store to the store of originals
# through the `id_key` metadata field.
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)

In [304]:
def _index_summaries(originals, summaries):
    """Index one modality: summaries go to the vector store, originals to the docstore.

    Each original gets a fresh UUID. Its summary is stored as a ``Document``
    whose metadata carries that UUID under ``id_key``, and the original itself
    is stored in the docstore under the same UUID.

    Args:
        originals: Items to store in the docstore (chunks, tables or base64 images).
        summaries: Summary strings, one per original, in the same order.

    Returns:
        Tuple ``(ids, summary_docs)`` with the generated ids and summary documents.

    Raises:
        ValueError: If ``originals`` and ``summaries`` differ in length. Pairing
            them by position would otherwise attach summaries to the wrong originals.
    """
    if len(originals) != len(summaries):
        raise ValueError(
            f"Got {len(originals)} originals but {len(summaries)} summaries; "
            "re-run the summarization so both lists line up."
        )

    ids = [str(uuid.uuid4()) for _ in originals]
    summary_docs = [
        Document(page_content=summary, metadata={id_key: item_id})
        for item_id, summary in zip(ids, summaries)
    ]

    # Skip the stores entirely for an empty modality instead of passing empty lists.
    if summary_docs:
        retriever.vectorstore.add_documents(summary_docs)
        retriever.docstore.mset(list(zip(ids, originals)))

    return ids, summary_docs


# Text
doc_ids, summary_texts = _index_summaries(texts, text_summaries)

# Tables
table_ids, summary_tables = _index_summaries(tables, table_summaries)

# Images
img_ids, summary_img = _index_summaries(images, image_summaries)

### RAG Pipeline

In [305]:
def parse_docs(docs):
    """Split retrieved documents into base64-encoded images and everything else.

    A document counts as an image when it decodes as strict base64. Strict
    validation is used because the lenient default silently discards characters
    outside the base64 alphabet, which lets ordinary text such as "ab cd"
    decode successfully and be mistaken for an image.

    Args:
        docs: Iterable of retrieved items (base64 strings or other objects).

    Returns:
        Dict with two lists preserving input order: ``"images"`` (items that
        are valid base64) and ``"texts"`` (all remaining items).

    Note:
        A plain string made up only of base64 characters with valid padding
        (e.g. "test") is still classified as an image.
    """
    b64 = []
    text = []

    for doc in docs:
        try:
            b64decode(doc, validate=True)
        except (TypeError, ValueError):
            # TypeError: not a str/bytes-like object.
            # ValueError: invalid base64 (binascii.Error) or non-ASCII text.
            text.append(doc)
        else:
            b64.append(doc)

    return {"images": b64, "texts": text}

In [306]:
def build_prompt(kwargs):
    """Assemble the multimodal prompt sent to the answering model.

    Args:
        kwargs: Mapping with two keys:
            ``"context"``  - dict with ``"texts"`` and ``"images"`` lists, as
            returned by ``parse_docs``;
            ``"question"`` - the user's question.

    Returns:
        A ``ChatPromptTemplate`` holding a single ``HumanMessage`` whose content
        is one text part (instructions, context and question) followed by one
        ``image_url`` part per retrieved image.
    """
    docs_by_type = kwargs["context"]
    user_question = kwargs["question"]

    # Use each item's `.text` when it has one; fall back to its string form so
    # plain-string context entries do not raise AttributeError.
    context_parts = [
        getattr(text_element, "text", None) or str(text_element)
        for text_element in docs_by_type["texts"]
    ]
    # Separate chunks with a blank line; gluing them together fuses the last
    # word of one chunk with the first word of the next.
    context_text = "\n\n".join(part for part in context_parts if part)

    prompt_template = f"""
    
    You are an AI assistant tasked with answering user queries using only the provided context. 
    The context may contain **text, tables, and images** extracted from documents.

    - If the answer is **not found in the context**, respond with: 
        **"Sorry, I am not able to answer the question."**
    - Do **not** use external knowledge or assumptions.
    - Ensure numerical accuracy for table data.

    ### Context:
    {context_text}

    ### User Question:
    {user_question}

    Provide a **clear, concise, and well-structured answer** based on the context.
    
    """

    prompt_content = [{"type": "text", "text": prompt_template}]

    # Attach every retrieved image as a base64 data URL.
    prompt_content.extend(
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{image}"},
        }
        for image in docs_by_type["images"]
    )

    # A ready-made HumanMessage (not a template string) is passed, so the
    # content is handed over as already-built message content.
    return ChatPromptTemplate.from_messages(
        [
            HumanMessage(content=prompt_content)
        ]
    )

In [307]:
# End-to-end RAG chain. The input question is fanned out into a dict:
#   "context"  - the retriever's results, split by parse_docs into images and texts
#   "question" - the question itself, passed through unchanged
# .assign(response=...) then adds a "response" key (read later as
# response['response']) produced by build_prompt -> ChatOpenAI -> StrOutputParser.
chain = {
    "context": retriever | RunnableLambda(parse_docs),
    "question": RunnablePassthrough(),
} | RunnablePassthrough().assign(
    response=(
        RunnableLambda(build_prompt)
        | ChatOpenAI(model="gpt-4o-mini")
        | StrOutputParser()
    )
)

In [308]:
# Run the chain on a sample question; the result is indexed with 'response' below.
response = chain.invoke(
    "What are the future plans of Conocophillips"
)

In [309]:
# Print the model's answer, stored under the "response" key of the chain output.
print(response['response'])

ConocoPhillips is focused on operational excellence and has outlined several future plans, including:

1. **Pipeline Construction and Infrastructure**: Plans include the construction and delivery of the first pipeline, complete tie-ins to fabrication and new drill sites, and utilization of existing infrastructure.

2. **Central Processing Facility**: The procurement of a central processing facility is part of their strategy, along with commencing a drilling program.

3. **Commitment to Safety**: The company prioritizes safety, aiming to operate more safely, efficiently, and responsibly, with a focus on minimizing human error as a part of their operational culture.

4. **Engaging Stakeholders**: They emphasize the importance of regular engagement with stakeholders including stockholders, employees, customers, suppliers, advocacy groups, governments, and communities for long-term success.

5. **Energy Transition Positioning**: ConocoPhillips believes they are well positioned for the ener