In [1]:
import os
from typing import Literal

import weaviate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_weaviate import WeaviateVectorStore
from pydantic import BaseModel

from rag.chains import (
    ChainBuilder,
    QAFromContextChain,
    WeaviateRetrievalChain,
)
from rag.enums import ChainComponent, ChainType
from rag.factory import ChainManager, LLMConfig
from rag.loader import load_pdf
from scripts.weaviate_helper_functions import (
    EMBEDDING_DIMENSIONS,
    EMBEDDING_MODEL,
    WeaviateCollection,
    batch_ingest,
    create_schema,
)

# The OpenAI key is read from the environment further down (e.g. to build the
# `X-OpenAI-Api-Key` header sent to Weaviate). Assigning `os.getenv(...)` back
# into `os.environ` did nothing when the key was set and raised an opaque
# `TypeError` (environment values must be strings) when it was not, so fail
# fast with an actionable message instead.
if not os.getenv("OPENAI_API_KEY"):
    raise EnvironmentError(
        "OPENAI_API_KEY is not set. Export it in your shell (or load it from a "
        ".env file) before running this notebook."
    )



### Connect to a Weaviate Instance

In [3]:
# A Weaviate server must be running locally before executing this cell:
#   docker compose up
# Without docker, embedded Weaviate is an alternative, see
# https://weaviate.io/developers/weaviate/installation/embedded

# Forward the OpenAI API key to Weaviate through a request header.
openai_headers = {"X-OpenAI-Api-Key": os.getenv("OPENAI_API_KEY")}
weaviate_client = weaviate.connect_to_local(headers=openai_headers)


### Create a Weaviate Collection

You can create your own collection.
`WeaviateCollection.DOCUMENT` simply defaults to `Document`, and `create_schema`
creates a collection with the following fields:

- `page_content` (will be used to compute embeddings)
- `filename`
- `page`

This is enough to ingest a PDF and test several chains.

In [8]:
# Name of the collection used throughout this notebook.
document_collection_name = WeaviateCollection.DOCUMENT.value
col = weaviate_client.collections.get(document_collection_name)

# Uncomment to drop the collection and start again from an empty one:
# weaviate_client.collections.delete(document_collection_name)

# Only create the schema when the collection is missing, so this cell can be re-run.
if not col.exists():
    create_schema(weaviate_client)

### Ingest a PDF

Replace with any PDF, this one will be used as default.

The `load_pdf` function loads the PDF, splits it into chunks, and returns a list of `Document` objects (each with `page_content`, `filename` and `page`).
`batch_ingest` then takes these documents and ingests them into the Weaviate instance.

In [9]:
# PDF to ingest; replace with the path of any other PDF to try your own document.
file_path = "Apple_CDP-Climate-Change-Questionnaire_2023.pdf"
# Load the PDF and split it into a list of chunked documents.
docs = load_pdf(file_path)
# Send the documents to the Weaviate instance.
batch_ingest(weaviate_client, WeaviateCollection.DOCUMENT, docs)

Failed objects:
[]


In [11]:
# Documents stored in Weaviate can be fetched using the following query
# (`limit=1` keeps the displayed output to a single object):
col.query.fetch_objects(limit=1)


[1;35mQueryReturn[0m[1m([0m
    [33mobjects[0m=[1m[[0m
        [1;35mObject[0m[1m([0m
            [33muuid[0m=[1;35m_WeaviateUUIDInt[0m[1m([0m[32m'015eff8c-87ac-4e70-aa88-b6ce135242ae'[0m[1m)[0m,
            [33mmetadata[0m=[1;35mMetadataReturn[0m[1m([0m
                [33mcreation_time[0m=[3;35mNone[0m,
                [33mlast_update_time[0m=[3;35mNone[0m,
                [33mdistance[0m=[3;35mNone[0m,
                [33mcertainty[0m=[3;35mNone[0m,
                [33mscore[0m=[3;35mNone[0m,
                [33mexplain_score[0m=[3;35mNone[0m,
                [33mis_consistent[0m=[3;35mNone[0m,
                [33mrerank_score[0m=[3;35mNone[0m
            [1m)[0m,
            [33mproperties[0m=[1m{[0m
                [32m'page'[0m: [1;36m41.0[0m,
                [32m'page_content'[0m: [32m'C8.2g\n[0m[32m([0m[32mC8.2g[0m[32m)[0m[32m Provide a breakdown by country/area of your non-fuel energy consumption

## RAG

Now that we have a Weaviate instance running, and a pdf document ingested, we can start interacting with it.

First, since we're using Langchain, we will define a `WeaviateVectorStore` instance to interact with our vectorstore, instead of simply using the `weaviate_client`.

In [19]:
# LangChain wrapper around the Weaviate collection that holds the PDF chunks.
vectorstore = WeaviateVectorStore(
    weaviate_client,
    # Pass the plain collection name (as done when fetching the collection
    # from the raw client) rather than the enum member itself: how an enum
    # member is rendered as text depends on how the enum is declared and on
    # the Python version.
    WeaviateCollection.DOCUMENT.value,
    # Property that holds the text of each chunk.
    "page_content",
    embedding=OpenAIEmbeddings(model=EMBEDDING_MODEL, dimensions=EMBEDDING_DIMENSIONS),
)

# Optional Weaviate filter; `None` means no filtering. Example (requires
# `import weaviate.classes as wvc`):
#   wvc.query.Filter.by_property("page").equal(10)
filters = None

### Chain Manager

To choose between multiple chains more easily, a `ChainManager` is provided.
We will first see how to switch between chains, and then how to create and modify them.

`ChainManager` expects at least 2 arguments:

- `llm_config`: this defines which model is used across the different chains. This makes it easy to use a cheaper model for basic tasks and a more expensive one for harder tasks.
- `vectorstore`: this connects the chains to the vectorstore

In [20]:
# Create LLM configuration with different models for different components
llm_config = LLMConfig(
    # Default model for any component not specifically configured
    default_llm=ChatOpenAI(model="gpt-4o-mini"),
    # Specific models for different components
    # (here only the extraction component is overridden, with `gpt-4o`)
    component_llms={
        ChainComponent.EXTRACTION: ChatOpenAI(model="gpt-4o"),
    },
)

# Initialize the chain manager with the LLM configuration
# and the vectorstore the chains will query
manager = ChainManager(
    vectorstore=vectorstore,
    llm_config=llm_config,
)

## Available Chains

The `ChainType` enum lists the different chains available.
Each chain is composed of several subchains. These subchains can be reused in different chains, to avoid code duplication.

In [17]:
# Names of the chain types that can be passed as `chain_type` to `manager.run_chain`
list(ChainType.__members__)


[1m[[0m
    [32m'BASIC_QA'[0m,
    [32m'IMAGE_QA'[0m,
    [32m'STRUCTURED_OUTPUT_IMAGE'[0m,
    [32m'STRUCTURED_OUTPUT'[0m,
    [32m'RELEVANCE_CHECK'[0m,
    [32m'FULL_VALIDATION'[0m
[1m][0m

### Basic QA Chain

We will start with `BASIC_QA`, which simply:

- takes the question, transforms it into an embedding, and performs a similarity search against the embeddings of the chunks
- returns the top `k` (2 by default) chunks
- formats the `k` chunks and gives them to the model as a `context` to better answer the question
- returns the answer as a simple string

#### Using ChainManager

By default, the chains return all the inputs and outputs of the different subchains. This makes it easy to debug in practice.

In [21]:

# Run a basic QA chain
result = manager.run_chain(
    chain_type=ChainType.BASIC_QA,
    question="What is the GHG scope 1 emission of the company?",
)
# Display the full output of the chain (question, source documents, ...)
result



[1m{[0m
    [32m'question'[0m: [32m'What is the GHG scope 1 emission of the company?'[0m,
    [32m'source_documents'[0m: [1m[[0m
        [1;35mDocument[0m[1m([0m
            [33mmetadata[0m=[1m{[0m[32m'page'[0m: [1;36m28.0[0m, [32m'filename'[0m: [32m'Apple_CDP-Climate-Change-Questionnaire_2023.pdf'[0m[1m}[0m,
            [33mpage_content[0m=[32m'[0m[32m([0m[32mC6.1[0m[32m)[0m[32m What were your organization’s gross global Scope 1 emissions in metric tons CO2e?\nReporting year\nGross global Scope 1 emissions [0m[32m([0m[32mmetric tons CO2e[0m[32m)[0m[32m\n55200\nStart date\n[0m[32m<[0m[32mNot[0m[32m Applicable>\nEnd date\n<Not Applicable>\nComment\nC6.2\n[0m[32m([0m[32mC6.2[0m[32m)[0m[32m Describe your organization’s approach to reporting Scope 2 emissions.\nRow 1\n\u200b\nScope 2, location-based\n\u200b\nWe are reporting a Scope 2, location-based figure\nScope 2, market-based\nWe are reporting a Scope 2, market-based figure\

In [22]:
# The answer itself is stored under the `response` key
result['response']

[32m'55200 metric tons CO2e'[0m

#### Without ChainManager

When developing, or if a chain is not yet supported, it is easy in practice to combine different subchains.
To do so, we take existing subchains (or define new ones) and pass them to the `ChainBuilder` class, which takes all the subchains together
and builds a single chain that we can then call.

In [27]:
# If we wanted to compose the chain ourselves, we could use the subchains like this
# (`ChainBuilder`, `QAFromContextChain` and `WeaviateRetrievalChain` are imported
# from `rag.chains` in the import cell at the top of the notebook).
retrieval_builder = WeaviateRetrievalChain(vectorstore=vectorstore, search_type="similarity")
qa_builder = QAFromContextChain(model=ChatOpenAI(model="gpt-4o-mini"))

# Assemble the subchains into a single chain that can be invoked
chain = ChainBuilder(chains=[retrieval_builder, qa_builder]).build()

In [28]:
# Invoke the hand-built chain with the same question as before
chain.invoke({'question': 'What is the GHG scope 1 emission of the company?'})


[1m{[0m
    [32m'question'[0m: [32m'What is the GHG scope 1 emission of the company?'[0m,
    [32m'source_documents'[0m: [1m[[0m
        [1;35mDocument[0m[1m([0m
            [33mmetadata[0m=[1m{[0m[32m'page'[0m: [1;36m28.0[0m, [32m'filename'[0m: [32m'Apple_CDP-Climate-Change-Questionnaire_2023.pdf'[0m[1m}[0m,
            [33mpage_content[0m=[32m'[0m[32m([0m[32mC6.1[0m[32m)[0m[32m What were your organization’s gross global Scope 1 emissions in metric tons CO2e?\nReporting year\nGross global Scope 1 emissions [0m[32m([0m[32mmetric tons CO2e[0m[32m)[0m[32m\n55200\nStart date\n[0m[32m<[0m[32mNot[0m[32m Applicable>\nEnd date\n<Not Applicable>\nComment\nC6.2\n[0m[32m([0m[32mC6.2[0m[32m)[0m[32m Describe your organization’s approach to reporting Scope 2 emissions.\nRow 1\n\u200b\nScope 2, location-based\n\u200b\nWe are reporting a Scope 2, location-based figure\nScope 2, market-based\nWe are reporting a Scope 2, market-based figure\

In [30]:
# We could also access each subchain individually like this
retrieval_chain = retrieval_builder.build()
# Running the retrieval subchain alone returns the question and the retrieved `source_documents`
retrieval_chain.invoke({'question': 'What is the GHG scope 1 emission of the company?'})


[1m{[0m
    [32m'question'[0m: [32m'What is the GHG scope 1 emission of the company?'[0m,
    [32m'source_documents'[0m: [1m[[0m
        [1;35mDocument[0m[1m([0m
            [33mmetadata[0m=[1m{[0m[32m'page'[0m: [1;36m28.0[0m, [32m'filename'[0m: [32m'Apple_CDP-Climate-Change-Questionnaire_2023.pdf'[0m[1m}[0m,
            [33mpage_content[0m=[32m'[0m[32m([0m[32mC6.1[0m[32m)[0m[32m What were your organization’s gross global Scope 1 emissions in metric tons CO2e?\nReporting year\nGross global Scope 1 emissions [0m[32m([0m[32mmetric tons CO2e[0m[32m)[0m[32m\n55200\nStart date\n[0m[32m<[0m[32mNot[0m[32m Applicable>\nEnd date\n<Not Applicable>\nComment\nC6.2\n[0m[32m([0m[32mC6.2[0m[32m)[0m[32m Describe your organization’s approach to reporting Scope 2 emissions.\nRow 1\n\u200b\nScope 2, location-based\n\u200b\nWe are reporting a Scope 2, location-based figure\nScope 2, market-based\nWe are reporting a Scope 2, market-based figure\

In [16]:
# Names of the chain components; members of this enum are used as keys of
# `component_llms` in `LLMConfig` to give a component its own model
list(ChainComponent.__members__)

[1m[[0m[32m'QA'[0m, [32m'EXTRACTION'[0m, [32m'RELEVANCE'[0m, [32m'VALIDATION'[0m, [32m'IMAGE'[0m, [32m'EXTRACTION_IMAGE'[0m[1m][0m

### Structured Output Chain

A typical case is when we want structured output, and not simply a string.

For that, the `STRUCTURED_OUTPUT` chain is available. First define a `pydantic` class with the expected fields.
When we expect multiple outputs (here, multiple instances of the same class), it is better to create another class that stores all of them.

In [31]:
# Schema of a single extracted emission value.
# NOTE(review): the class docstring is presumably forwarded to the model as the
# schema description, so treat any change to its wording as a prompt change.
class GhgEmission(BaseModel):
    """Ghg emission of a company for a given year and a given scope"""
    # Emission scope, restricted to the four labels listed in the `Literal`
    scope: Literal["scope1", "scope2_location_based", "scope2_market_based", "scope3"]
    # Year the value refers to
    year: int
    # Emission amount, expressed in `unit`
    value: float
    # Unit of `value`, as free text
    unit: str

# Container schema, so that several `GhgEmission` entries can be returned at once.
# NOTE(review): the class docstring is presumably forwarded to the model as the
# schema description, so treat any change to its wording as a prompt change.
class GhgEmissionData(BaseModel):
    """Ghg emission data for a company for each scope and for different year"""

    # One entry per extracted emission value
    data: list[GhgEmission]

In [32]:
# Run a structured output chain
# `output_schema` is the pydantic class describing the expected response
result = manager.run_chain(
    chain_type=ChainType.STRUCTURED_OUTPUT,
    question="What is the GHG scope 1 emission of the company?",
    output_schema=GhgEmissionData,
)
# Display the full output of the chain
result


[1m{[0m
    [32m'question'[0m: [32m'What is the GHG scope 1 emission of the company?'[0m,
    [32m'source_documents'[0m: [1m[[0m
        [1;35mDocument[0m[1m([0m
            [33mmetadata[0m=[1m{[0m[32m'page'[0m: [1;36m28.0[0m, [32m'filename'[0m: [32m'Apple_CDP-Climate-Change-Questionnaire_2023.pdf'[0m[1m}[0m,
            [33mpage_content[0m=[32m'[0m[32m([0m[32mC6.1[0m[32m)[0m[32m What were your organization’s gross global Scope 1 emissions in metric tons CO2e?\nReporting year\nGross global Scope 1 emissions [0m[32m([0m[32mmetric tons CO2e[0m[32m)[0m[32m\n55200\nStart date\n[0m[32m<[0m[32mNot[0m[32m Applicable>\nEnd date\n<Not Applicable>\nComment\nC6.2\n[0m[32m([0m[32mC6.2[0m[32m)[0m[32m Describe your organization’s approach to reporting Scope 2 emissions.\nRow 1\n\u200b\nScope 2, location-based\n\u200b\nWe are reporting a Scope 2, location-based figure\nScope 2, market-based\nWe are reporting a Scope 2, market-based figure\

In [33]:
# The structured answer is stored under the `response` key
result['response']

[1;35mGhgEmissionData[0m[1m([0m[33mdata[0m=[1m[[0m[1;35mGhgEmission[0m[1m([0m[33mscope[0m=[32m'scope1'[0m, [33myear[0m=[1;36m2023[0m, [33mvalue[0m=[1;36m55200[0m[1;36m.0[0m, [33munit[0m=[32m'metric tons CO2e'[0m[1m)[0m[1m][0m[1m)[0m

Compared to before, we now get a Pydantic object, and not a string. This makes a huge difference when we want to perform actions on the extraction results.

### Relevance Check Chain

This chain is the same as the one above, except that for each source document (previously we were passing 2 documents, because our `k` equals 2)
we ask a model to determine whether or not the given document is likely to be useful to answer the question.

Note that for the relevance check, if we have 2 source documents, we make 2 separate calls in parallel. This still gives fast results.

Hence, this task performs a `post retrieval` transformation (here, filtering).

In [34]:
# Run a chain with relevance check
# (same question and output schema as the structured output chain)
result = manager.run_chain(
    chain_type=ChainType.RELEVANCE_CHECK,
    question="What is the GHG scope 1 emission of the company?",
    output_schema=GhgEmissionData,
)
# Display the full output of the chain
result


[1m{[0m
    [32m'question'[0m: [32m'What is the GHG scope 1 emission of the company?'[0m,
    [32m'source_documents'[0m: [1m[[0m
        [1;35mDocument[0m[1m([0m
            [33mmetadata[0m=[1m{[0m[32m'page'[0m: [1;36m28.0[0m, [32m'filename'[0m: [32m'Apple_CDP-Climate-Change-Questionnaire_2023.pdf'[0m[1m}[0m,
            [33mpage_content[0m=[32m'[0m[32m([0m[32mC6.1[0m[32m)[0m[32m What were your organization’s gross global Scope 1 emissions in metric tons CO2e?\nReporting year\nGross global Scope 1 emissions [0m[32m([0m[32mmetric tons CO2e[0m[32m)[0m[32m\n55200\nStart date\n[0m[32m<[0m[32mNot[0m[32m Applicable>\nEnd date\n<Not Applicable>\nComment\nC6.2\n[0m[32m([0m[32mC6.2[0m[32m)[0m[32m Describe your organization’s approach to reporting Scope 2 emissions.\nRow 1\n\u200b\nScope 2, location-based\n\u200b\nWe are reporting a Scope 2, location-based figure\nScope 2, market-based\nWe are reporting a Scope 2, market-based figure\

In [37]:
# Collect the relevance verdict stored for each entry of `relevance_response`
[relevance_check['response'] for relevance_check in result['relevance_response']]


[1m[[0m
    [1;35mDocumentRelevance[0m[1m([0m
        [33mrelevant[0m=[3;92mTrue[0m,
        [33mexplanation[0m=[32m'The document provides the gross global Scope 1 emissions of the company, which directly answers the question regarding the GHG scope 1 emission.'[0m
    [1m)[0m,
    [1;35mDocumentRelevance[0m[1m([0m
        [33mrelevant[0m=[3;91mFalse[0m,
        [33mexplanation[0m=[32m'The document provides information on Scope 3 emissions, which are indirect emissions not included in Scope 1 or 2. It does not contain any data or details regarding Scope 1 emissions of the company.'[0m
    [1m)[0m
[1m][0m

If the `relevant` field is `False`, the document is skipped. This avoids having too many irrelevant documents in the context.

### Full Validation Chain

This chain is the same as the one above, except that for each output,
we ask a model to determine whether or not the given response is likely to suffer from hallucinations.
Hallucinations happen less often in RAG, but can still happen.

Hence, this task performs a `post extraction` transformation.

In [38]:
# Run a chain with full validation
# (same question and output schema as the relevance check chain)
result = manager.run_chain(
    chain_type=ChainType.FULL_VALIDATION,
    question="What is the GHG scope 1 emission of the company?",
    output_schema=GhgEmissionData,
)
# Display the full output of the chain
result


[1m{[0m
    [32m'question'[0m: [32m'What is the GHG scope 1 emission of the company?'[0m,
    [32m'source_documents'[0m: [1m[[0m
        [1;35mDocument[0m[1m([0m
            [33mmetadata[0m=[1m{[0m[32m'page'[0m: [1;36m28.0[0m, [32m'filename'[0m: [32m'Apple_CDP-Climate-Change-Questionnaire_2023.pdf'[0m[1m}[0m,
            [33mpage_content[0m=[32m'[0m[32m([0m[32mC6.1[0m[32m)[0m[32m What were your organization’s gross global Scope 1 emissions in metric tons CO2e?\nReporting year\nGross global Scope 1 emissions [0m[32m([0m[32mmetric tons CO2e[0m[32m)[0m[32m\n55200\nStart date\n[0m[32m<[0m[32mNot[0m[32m Applicable>\nEnd date\n<Not Applicable>\nComment\nC6.2\n[0m[32m([0m[32mC6.2[0m[32m)[0m[32m Describe your organization’s approach to reporting Scope 2 emissions.\nRow 1\n\u200b\nScope 2, location-based\n\u200b\nWe are reporting a Scope 2, location-based figure\nScope 2, market-based\nWe are reporting a Scope 2, market-based figure\

In [39]:
# The hallucination verdict is stored under the `validation` key
result['validation']

[1;35mValidateExtraction[0m[1m([0m[33mhallucination[0m=[3;91mFalse[0m[1m)[0m

### Image QA Chain

Instead of using text as before, this time, we will pass the pdf images to the model directly.

In [41]:
# Run a chain with image
# `file_path` is the PDF defined in the ingestion cell, so replacing the
# document there also changes the file used here.
result = manager.run_chain(
    chain_type=ChainType.IMAGE_QA,
    question="What is the GHG scope 1 emission of the company? If the information is not available, describe what you see in the image(s).",
    file_path=file_path,
)
# Display the full output of the chain
result


[1m{[0m
    [32m'question'[0m: [32m'What is the GHG scope 1 emission of the company? If the information is not available, describe what you see in the image[0m[32m([0m[32ms[0m[32m)[0m[32m.'[0m,
    [32m'source_documents'[0m: [1m[[0m
        [1;35mDocument[0m[1m([0m
            [33mmetadata[0m=[1m{[0m[32m'page'[0m: [1;36m28.0[0m, [32m'filename'[0m: [32m'Apple_CDP-Climate-Change-Questionnaire_2023.pdf'[0m[1m}[0m,
            [33mpage_content[0m=[32m'[0m[32m([0m[32mC6.1[0m[32m)[0m[32m What were your organization’s gross global Scope 1 emissions in metric tons CO2e?\nReporting year\nGross global Scope 1 emissions [0m[32m([0m[32mmetric tons CO2e[0m[32m)[0m[32m\n55200\nStart date\n[0m[32m<[0m[32mNot[0m[32m Applicable>\nEnd date\n<Not Applicable>\nComment\nC6.2\n[0m[32m([0m[32mC6.2[0m[32m)[0m[32m Describe your organization’s approach to reporting Scope 2 emissions.\nRow 1\n\u200b\nScope 2, location-based\n\u200b\nWe are rep

We can note here that the model, even though it got the same pages as before, wasn't able to find the values we were looking for.

### Image QA with Structured Output Chain

Same chain as before, but this time we produce structured output.

In [43]:
# Run a chain with image + structured output
# `file_path` is the PDF defined in the ingestion cell, so replacing the
# document there also changes the file used here.
result = manager.run_chain(
    chain_type=ChainType.STRUCTURED_OUTPUT_IMAGE,
    question="What is the GHG scope 1 emission of the company? Consider all year available in the context.",
    file_path=file_path,
    output_schema=GhgEmissionData,
)
# Display the full output of the chain
result


[1m{[0m
    [32m'question'[0m: [32m'What is the GHG scope 1 emission of the company? Consider all year available in the context.'[0m,
    [32m'source_documents'[0m: [1m[[0m
        [1;35mDocument[0m[1m([0m
            [33mmetadata[0m=[1m{[0m[32m'page'[0m: [1;36m26.0[0m, [32m'filename'[0m: [32m'Apple_CDP-Climate-Change-Questionnaire_2023.pdf'[0m[1m}[0m,
            [33mpage_content[0m=[32m'Scope 3 category 3: Fuel-and-energy-related activities [0m[32m([0m[32mnot included in Scope 1 or 2[0m[32m)[0m[32m\nBase year start\nSeptember 28 2014\nBase year end\nSeptember 27 2015\nBase year emissions [0m[32m([0m[32mmetric tons CO2e[0m[32m)[0m[32m\n0\nComment\nThis Scope 3 emissions category was excluded from our baseline emissions, thus assumed to be 0.\nScope 3 category 4: Upstream transportation and distribution\nBase year start\nSeptember 28 2014\nBase year end\nSeptember 27 2015\nBase year emissions [0m[32m([0m[32mmetric tons CO2e[0m[32m)