### Import the required libraries

In [1]:
import os, sys
import warnings
import pandas as pd
import numpy as np
import pickle
import openai
from tqdm.auto import tqdm
from langchain.schema.document import Document
from langchain.chat_models import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from Ingestion.ingest import extract_text_and_metadata_from_pdf_document, extract_text_and_metadata_from_docx_document

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# Extend the module search path with the directory two levels up.
# NOTE(review): this runs after the `Ingestion.ingest` import earlier in this
# cell, so it cannot influence that import — confirm the ordering is intended.
sys.path.append('../..')
from dotenv import load_dotenv, find_dotenv
# Load variables from the first .env file that find_dotenv() locates.
_ = load_dotenv(find_dotenv())

# Raises KeyError when OPENAI_API_KEY is not present in the environment.
openai.api_key = os.environ['OPENAI_API_KEY']

  from pandas.core import (


### Load OpenAI's text-embedding-3-large embeddings

In [2]:
# Embedding model handed to FAISS both when the index is built and when it is loaded.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
)

In [3]:
# Directory where the FAISS index is persisted between notebook runs.
DB_FAISS_PATH = 'vectorstore/db_faiss'

def create_vector_db(documents, embeddings):
    """Build a FAISS index from ``documents`` and save it to ``DB_FAISS_PATH``.

    Args:
        documents: Non-empty sequence of LangChain ``Document`` objects.
        embeddings: Embedding model passed to ``FAISS.from_documents``.

    Returns:
        The in-memory FAISS vector store that was written to disk.

    Raises:
        ValueError: If ``documents`` is empty.
    """
    # Fail early with a readable message instead of an opaque error from
    # inside the FAISS index construction.
    if not documents:
        raise ValueError("Cannot build a FAISS index from an empty document list")

    db = FAISS.from_documents(documents, embeddings)
    db.save_local(DB_FAISS_PATH)
    return db

In [4]:
def initialize_bm25_retriever(documents):
    """Create a BM25 retriever over ``documents``, pickle it, and return it.

    The retriever's ``k`` is set to 5 and the object is written to
    'bm25_retriever.pkl' in the current working directory.
    """
    retriever = BM25Retriever.from_documents(documents)
    retriever.k = 5

    # Persist the retriever so later sessions can reload it without the documents.
    with open('bm25_retriever.pkl', 'wb') as pickle_file:
        pickle.dump(retriever, pickle_file)

    return retriever

def load_bm25_retriever():
    """Return the object unpickled from 'bm25_retriever.pkl' in the working directory."""
    # NOTE: unpickling can execute arbitrary code; only load a file this
    # notebook produced itself.
    with open('bm25_retriever.pkl', 'rb') as pickle_file:
        return pickle.load(pickle_file)

In [5]:
# Folder holding the PDF/DOCX files to ingest, relative to the notebook.
dir_path = "../Test_Documents"

# The ingestion step calls os.listdir on this path, so it has to be a directory
# and not merely an existing path. Raising stops "Run All" with a normal
# traceback instead of calling sys.exit() inside the kernel.
if not os.path.isdir(dir_path):
    raise FileNotFoundError(
        f"Test Documents Directory path {dir_path} does not exist or is not a directory"
    )

In [6]:
# Stays None when the ingestion cell is skipped; the loading cell then reads
# the pickled retriever from disk instead.
bm25_retriever = None

In [7]:
def _row_to_document(index, row, include_parent_id=False):
    """Convert one extracted row into a LangChain ``Document``.

    Args:
        index: Row label yielded by ``DataFrame.iterrows``; used as the id prefix.
        row: Row exposing 'Filename', 'Text' and 'Page_Number' (plus
            'Parent_Id' when ``include_parent_id`` is true).
        include_parent_id: Insert the row's 'Parent_Id' into the id, as is
            done for DOCX rows.

    Returns:
        A ``Document`` whose metadata holds 'id', 'type', 'filename' and
        'page_number'.
    """
    file_name = row['Filename']
    page_number = row['Page_Number']

    id_parts = [str(index)]
    if include_parent_id:
        id_parts.append(str(row['Parent_Id']))
    id_parts.extend([file_name, str(page_number)])

    return Document(
        page_content=row['Text'],
        metadata={
            'id': '_'.join(id_parts),
            'type': 'text',
            'filename': file_name,
            'page_number': page_number,
        },
    )


# Must match the file name used by initialize_bm25_retriever / load_bm25_retriever.
bm25_pickle_path = 'bm25_retriever.pkl'

# Either cache can be missing independently. Checking only the FAISS directory
# would leave a missing BM25 pickle undetected until the loading cell fails.
faiss_missing = not os.path.exists(DB_FAISS_PATH)
bm25_missing = not os.path.exists(bm25_pickle_path)

if faiss_missing or bm25_missing:
    # List the directory once and split by extension.
    dir_entries = os.listdir(dir_path)
    pdf_files = [f for f in dir_entries if f.endswith('.pdf')]
    docx_files = [f for f in dir_entries if f.endswith('.docx')]

    # (label for progress bar, files, extractor, whether rows carry Parent_Id)
    sources = [
        ('PDF', pdf_files, extract_text_and_metadata_from_pdf_document, False),
        ('DOCX', docx_files, extract_text_and_metadata_from_docx_document, True),
    ]

    documents = []

    for label, file_names, extract, include_parent_id in sources:
        for source_file in tqdm(file_names, desc=f'Processing {label} files'):
            source_path = os.path.join(dir_path, source_file)
            try:
                df = extract(source_path)
                print(f"Extracted text and metadata from {source_file}")
                for index, row in tqdm(df.iterrows(), total=len(df), desc='Processing rows'):
                    documents.append(_row_to_document(index, row, include_parent_id))
            except Exception as e:
                # Best-effort ingestion: report the failing file and keep going.
                print(f"Error processing {source_file}: {str(e)}")

    # Every file may have failed (or the folder may hold no PDF/DOCX files);
    # stop here with a clear message rather than inside the index builders.
    if not documents:
        raise ValueError(f"No documents could be extracted from {dir_path}")

    # Only re-embed when the FAISS index is actually missing.
    if faiss_missing:
        create_vector_db(documents, embeddings)
    bm25_retriever = initialize_bm25_retriever(documents)

Processing PDF files:   0%|          | 0/4 [00:00<?, ?it/s]

2024-06-19 07:57:05.467228: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-19 07:57:05.467295: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-19 07:57:05.470548: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-19 07:57:05.799598: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of the model checkpoint at microsoft/

Extracted text and metadata from 2_SalesTaxAct2018_Malaysia.pdf


Processing rows:   0%|          | 0/55 [00:00<?, ?it/s]

Extracted text and metadata from 3_Canada_Cybersec_Strategy.pdf


Processing rows:   0%|          | 0/22 [00:00<?, ?it/s]

Extracted text and metadata from 1_WHO_FCTC.pdf


Processing rows:   0%|          | 0/37 [00:00<?, ?it/s]

Extracted text and metadata from 5_CyberPeace_Report.pdf


Processing rows:   0%|          | 0/39 [00:00<?, ?it/s]

Processing DOCX files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracted text and metadata from 4_GovStack_Specs.docx


Processing rows:   0%|          | 0/391 [00:00<?, ?it/s]



In [8]:
# Load the FAISS vector store persisted at DB_FAISS_PATH.
# NOTE: allow_dangerous_deserialization=True opts in to loading pickled data;
# only use it on an index this notebook created itself.
db = FAISS.load_local(DB_FAISS_PATH, embeddings, allow_dangerous_deserialization=True)
faiss_retriever = db.as_retriever()
print("FAISS Loaded")

# Reuse the retriever built in this session; otherwise load the pickled one.
# An explicit None check avoids relying on the truthiness of a retriever object.
if bm25_retriever is None:
    bm25_retriever = load_bm25_retriever()
print("BM25 Retriever loaded")

# Combine the BM25 and FAISS retrievers with equal weights.
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever],
    weights=[0.5, 0.5],
)

FAISS Loaded
BM25 Retriever loaded


In [9]:
# GPT-4 chat model at temperature 0; it backs the document compressor.
llm = ChatOpenAI(
    model_name="gpt-4",
    temperature=0,
)

In [10]:
# Build an LLMChainExtractor from the chat model; it is used as the base compressor
# of the contextual compression retriever.
compressor = LLMChainExtractor.from_llm(llm)

In [11]:
# Wrap the ensemble retriever so its results pass through the LLM-based compressor.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=ensemble_retriever,
    base_compressor=compressor,
)

### Evaluate the optimized retrieval system using TruLens

#### Load TruLens Library Modules

In [12]:
from trulens_eval import Tru
from trulens_eval.tru_custom_app import instrument
from trulens_eval import Feedback, Select
from trulens_eval.feedback import Groundedness

In [13]:
# TruLens workspace object; used later to fetch the leaderboard and start the dashboard.
tru = Tru()

🦑 Tru initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of Tru` to prevent this.


In [14]:
# Client from the OpenAI SDK; the RAG class uses it for the chat completion call.
from openai import OpenAI
oai_client = OpenAI()

In [15]:
class ContextualCompressionRetrieval:
    """Minimal retrieve-then-generate pipeline with TruLens-instrumented steps.

    Retrieval goes through the module-level ``compression_retriever`` and
    answer generation through the module-level ``oai_client``.
    """

    @instrument
    def retrieve(self, query: str) -> str:
        """Return the text of the top retrieved document for ``query``.

        Args:
            query: The user's question.

        Returns:
            ``page_content`` of the first retrieved document, or an empty
            string when the retriever returns no documents.
        """
        results = compression_retriever.get_relevant_documents(query)
        # The retriever can come back empty (for instance if every candidate
        # is filtered out), so guard the index instead of raising IndexError.
        if not results:
            return ""
        return results[0].page_content

    @instrument
    def generate_completion(self, query: str, context_str: str) -> str:
        """Ask GPT-4 to answer ``query`` using ``context_str`` as context.

        Args:
            query: The user's question.
            context_str: Retrieved context interpolated into the prompt.

        Returns:
            The message content of the first completion choice.
        """
        prompt = (
            f"We have provided context information below. \n"
            f"---------------------\n"
            f"{context_str}"
            f"\n---------------------\n"
            f"Given this information, please answer the question: {query}"
        )
        response = oai_client.chat.completions.create(
            model="gpt-4",
            temperature=0,
            messages=[{"role": "user", "content": prompt}],
        )
        return response.choices[0].message.content

    @instrument
    def query(self, query: str) -> str:
        """Run retrieval followed by generation and return the answer text."""
        context_str = self.retrieve(query)
        return self.generate_completion(query, context_str)

In [16]:
# Pipeline instance that is wrapped by TruLens and called by the evaluation loop.
contextual_compression_retrieval_rag = ContextualCompressionRetrieval()

In [17]:
# Import under an alias: an earlier cell already binds the name `OpenAI` to the
# OpenAI SDK client class, and rebinding it to a different class is easy to trip over.
from trulens_eval.feedback.provider.openai import OpenAI as TruLensOpenAI

# Feedback provider, plus the groundedness helper built on top of it.
provider = TruLensOpenAI()
grounded = Groundedness(groundedness_provider=provider)

[nltk_data] Downloading package punkt to
[nltk_data]     /home/adeptschneiderthedev/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [18]:
# Selectors into the recorded `retrieve` call, shared by the feedbacks below.
retrieved_context = Select.RecordCalls.retrieve.rets.collect()
user_question = Select.RecordCalls.retrieve.args.query

# Groundedness: retrieved context as the source, the app output as the statement.
f_groundedness = Feedback(
    grounded.groundedness_measure_with_cot_reasons,
    name = "Groundedness",
)
f_groundedness = f_groundedness.on(retrieved_context)
f_groundedness = f_groundedness.on_output()
f_groundedness = f_groundedness.aggregate(grounded.grounded_statements_aggregator)

# Answer relevance: the question passed to `retrieve` versus the app output.
f_answer_relevance = Feedback(
    provider.relevance_with_cot_reasons,
    name = "Answer Relevance",
)
f_answer_relevance = f_answer_relevance.on(user_question)
f_answer_relevance = f_answer_relevance.on_output()

# Context relevance: the question versus the retrieved context, averaged with np.mean.
f_context_relevance = Feedback(
    provider.context_relevance_with_cot_reasons,
    name = "Context Relevance",
)
f_context_relevance = f_context_relevance.on(user_question)
f_context_relevance = f_context_relevance.on(retrieved_context)
f_context_relevance = f_context_relevance.aggregate(np.mean)

✅ In Groundedness, input source will be set to __record__.app.retrieve.rets.collect() .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Answer Relevance, input prompt will be set to __record__.app.retrieve.args.query .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input question will be set to __record__.app.retrieve.args.query .
✅ In Context Relevance, input context will be set to __record__.app.retrieve.rets.collect() .


### Construct the TruLens App

In [19]:
from trulens_eval import TruCustomApp

# Register the pipeline with TruLens under a fixed app id and attach the three feedbacks.
tru_rag = TruCustomApp(
    contextual_compression_retrieval_rag,
    app_id='Retrieval Pipeline Testing v3 (Contextual Retrieval)',
    feedbacks=[
        f_groundedness,
        f_answer_relevance,
        f_context_relevance,
    ],
)

In [20]:
# Evaluation questions; each one is sent through the instrumented RAG pipeline.
queries = [
    "Can the Conference of the Parties of the WHO FCTC assist countries in securing financial resources for implementation?",
    "What should be the minimum size of health warnings and messages on tobacco products, and where should they be placed?",
    "I opened a company to produce sensors in Kuala Lumpur. Based on the law in the file, how should I register for sales tax, and what are my obligations?",
    # NOTE(review): "During product" is presumably meant to read "During production".
    # Left unchanged because editing the query text would change the evaluation input.
    "I opened a company to produce sensors in Kuala Lumpur. During product I paid sales tax on my inputs. Based on the law in the file, what are conditions to be eligible for a refund of the sales tax?",
    "What specific indicators and targets are outlined in Canada's Cybersecurity Strategy?",
    "What measures is the government of Canada taking in response to data security challenges posed by the emergence of novel technologies?",
    "What are the API requirements that apply to the Consent building block?",
    "What additional building blocks are essential to support the functionality of the consent building block?",
    "What are the key findings of the CyberPeace Institute's analysis of cyber threats affecting NGOs in International Geneva?",
    "What are the key lessons learnt from the case studies examined in the report?"
]

In [21]:
def tru_rag_contextual_compression_retrieval_pipeline(query):
    """Run one query through the TruLens-recorded pipeline and return the leaderboard.

    Args:
        query: Question passed to ``contextual_compression_retrieval_rag.query``.

    Returns:
        The result of ``tru.get_leaderboard`` restricted to this app's id.
        It was previously computed and thrown away; existing callers that
        ignore the return value are unaffected.
    """
    # Calls made inside the context manager are recorded by TruLens.
    with tru_rag:
        contextual_compression_retrieval_rag.query(query)
    return tru.get_leaderboard(app_ids=["Retrieval Pipeline Testing v3 (Contextual Retrieval)"])

In [22]:
# Send every evaluation question through the recorded pipeline, one at a time.
for query in queries:
    tru_rag_contextual_compression_retrieval_pipeline(query)

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/2 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/5 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/12 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/6 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/2 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

In [24]:
# Start the TruLens dashboard (a local Streamlit app) to inspect the recorded results.
tru.run_dashboard()

Starting dashboard ...
Config file already exists. Skipping writing process.
Credentials file already exists. Skipping writing process.


Accordion(children=(VBox(children=(VBox(children=(Label(value='STDOUT'), Output())), VBox(children=(Label(valu…

Dashboard started at http://192.168.43.140:8501 .


<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>