### Import the necessary libraries

In [16]:
import os, sys
import warnings
import pandas as pd
import numpy as np
import pickle
import openai
from tqdm.auto import tqdm
from langchain.schema.document import Document
from langchain.chat_models import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from Ingestion.ingest import extract_text_and_metadata_from_pdf_document, extract_text_and_metadata_from_docx_document

In [3]:
warnings.filterwarnings("ignore")

In [4]:
sys.path.append('../..')
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

openai.api_key = os.environ['OPENAI_API_KEY']

### Load OpenAI's text-embedding-3-large embeddings

In [6]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [8]:
DB_FAISS_PATH = 'vectorstore/db_faiss'

def create_vector_db(documents, embeddings):
    # Create a vector store
    db = FAISS.from_documents(documents, embeddings)
    db.save_local(DB_FAISS_PATH)

In [9]:
dir_path = "../Test_Documents"

if not os.path.exists(dir_path):
    print(f"Test Documents Directory path {dir_path} does not exist")
    sys.exit(1)

In [18]:
if not os.path.exists(DB_FAISS_PATH):
    pdf_files = [f for f in os.listdir(dir_path) if f.endswith('.pdf')]
    docx_files = [f for f in os.listdir(dir_path) if f.endswith('.docx')]

    documents = []

    for pdf_file in tqdm(pdf_files, desc='Processing PDF files'):
        pdf_path = os.path.join(dir_path, pdf_file)
        try:
            df = extract_text_and_metadata_from_pdf_document(pdf_path)
            print(f"Extracted text and metadata from {pdf_file}")
            for index, row in tqdm(df.iterrows(), total=len(df), desc='Processing rows'):
                file_name = row['Filename']
                text = row['Text']
                page_number = row['Page_Number']
                document = Document(
                    page_content=text,
                    metadata = {
                        'id': str(index) + '_' + file_name + '_' + str(page_number),
                        'type': 'text',
                        'filename': file_name,
                        'page_number': page_number
                    }
                )
                documents.append(document)
        except Exception as e:
            print(f"Error processing {pdf_file}: {str(e)}")

    for docx_file in tqdm(docx_files, desc='Processing DOCX files'):
        docx_path = os.path.join(dir_path, docx_file)
        try:
            df = extract_text_and_metadata_from_docx_document(docx_path)
            print(f"Extracted text and metadata from {docx_file}")
            for index, row in tqdm(df.iterrows(), total=len(df), desc='Processing rows'):
                parent_id = row['Parent_Id']
                file_name = row['Filename']
                text = row['Text']
                page_number = row['Page_Number']
                document = Document(
                    page_content=text,
                    metadata = {
                        'id': str(index) + '_' + str(parent_id) + '_' + file_name + '_' + str(page_number),
                        'type': 'text',
                        'filename': file_name,
                        'page_number': page_number
                    }
                )
                documents.append(document)
        except Exception as e:
            print(f"Error processing {docx_file}: {str(e)}")

    create_vector_db(documents, embeddings)

Processing PDF files:   0%|          | 0/4 [00:00<?, ?it/s]

2024-06-17 15:50:56.275868: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-17 15:50:56.275933: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-17 15:50:56.279081: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-17 15:50:56.575721: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of the model checkpoint at microsoft/

Extracted text and metadata from 2_SalesTaxAct2018_Malaysia.pdf


Processing rows:   0%|          | 0/55 [00:00<?, ?it/s]

Extracted text and metadata from 3_Canada_Cybersec_Strategy.pdf


Processing rows:   0%|          | 0/22 [00:00<?, ?it/s]

Extracted text and metadata from 1_WHO_FCTC.pdf


Processing rows:   0%|          | 0/37 [00:00<?, ?it/s]

Extracted text and metadata from 5_CyberPeace_Report.pdf


Processing rows:   0%|          | 0/39 [00:00<?, ?it/s]

Processing DOCX files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracted text and metadata from 4_GovStack_Specs.docx


Processing rows:   0%|          | 0/391 [00:00<?, ?it/s]

### Load the FAISS Vectore Store

In [19]:
db = FAISS.load_local(DB_FAISS_PATH, embeddings, allow_dangerous_deserialization=True)
faiss_retriever = db.as_retriever()

### Evaluate the baseline retrieval system using TruLens

#### Load TruLens Library Modules

In [10]:
from trulens_eval import Tru
from trulens_eval.tru_custom_app import instrument
from trulens_eval import Feedback, Select
from trulens_eval.feedback import Groundedness

In [11]:
tru = Tru()

🦑 Tru initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of Tru` to prevent this.


In [12]:
from openai import OpenAI
oai_client = OpenAI()

In [13]:
class BaseRetrieval:
    @instrument
    def retrieve(self, query: str) -> list:
        """
        Retrieve relevant text from vector store.
        """
        results = db.similarity_search(query, k=5)
        print(results)
        return results[0].page_content
    
    @instrument
    def generate_completion(self, query: str, context_str: list) -> str:
        """
        Generate answer from context.
        """
        completion = oai_client.chat.completions.create(
        model="gpt-4-turbo",
        temperature=0,
        messages=
        [
            {"role": "user",
            "content": 
            f"We have provided context information below. \n"
            f"---------------------\n"
            f"{context_str}"
            f"\n---------------------\n"
            f"Given this information, please answer the question: {query}"
            }
        ]
        ).choices[0].message.content
        return completion

    @instrument
    def query(self, query: str) -> str:
        context_str = self.retrieve(query)
        completion = self.generate_completion(query, context_str)
        return completion

In [14]:
rag = BaseRetrieval()

In [15]:
from trulens_eval.feedback.provider.openai import OpenAI

provider = OpenAI()
grounded = Groundedness(groundedness_provider=provider)

[nltk_data] Downloading package punkt to
[nltk_data]     /home/adeptschneiderthedev/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
# Define a groundedness feedback function
f_groundedness = (
    Feedback(grounded.groundedness_measure_with_cot_reasons, name = "Groundedness")
    .on(Select.RecordCalls.retrieve.rets.collect())
    .on_output()
    .aggregate(grounded.grounded_statements_aggregator)
)

# Question/answer relevance between overall question and answer.
f_answer_relevance = (
    Feedback(provider.relevance_with_cot_reasons, name = "Answer Relevance")
    .on(Select.RecordCalls.retrieve.args.query)
    .on_output()
)

# Question/statement relevance between question and each context chunk.
f_context_relevance = (
    Feedback(provider.context_relevance_with_cot_reasons, name = "Context Relevance")
    .on(Select.RecordCalls.retrieve.args.query)
    .on(Select.RecordCalls.retrieve.rets.collect())
    .aggregate(np.mean)
)

✅ In Groundedness, input source will be set to __record__.app.retrieve.rets.collect() .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Answer Relevance, input prompt will be set to __record__.app.retrieve.args.query .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input question will be set to __record__.app.retrieve.args.query .
✅ In Context Relevance, input context will be set to __record__.app.retrieve.rets.collect() .


### Construct the TruLens App

In [20]:
from trulens_eval import TruCustomApp
tru_rag = TruCustomApp(rag,
    app_id = 'Base Retrieval Pipeline Testing v1',
    feedbacks = [f_groundedness, f_answer_relevance, f_context_relevance])

In [21]:
queries = [
    "Can the Conference of the Parties of the WHO FCTC assist countries in securing financial resources for implementation?",
    "What should be the minimum size of health warnings and messages on tobacco products, and where should they be placed?",
    "I opened a company to produce sensors in Kuala Lumpur. Based on the law in the file, how should I register for sales tax, and what are my obligations?",
    "I opened a company to produce sensors in Kuala Lumpur. During product I paid sales tax on my inputs. Based on the law in the file, what are conditions to be eligible for a refund of the sales tax?",
    "What specific indicators and targets are outlined in Canada's Cybersecurity Strategy?",
    "What measures is the government of Canada taking in response to data security challenges posed by the emergence of novel technologies?",
    "What are the API requirements that apply to the Consent building block?",
    "What additional building blocks are essential to support the functionality of the consent building block?",
    "What are the key findings of the CyberPeace Institute's analysis of cyber threats affecting NGOs in International Geneva?",
    "What are the key lessons learnt from the case studies examined in the report?"
]

In [22]:
def tru_rag_retrieval_pipeline(query):
    with tru_rag as recording:
        rag.query(query)
    tru.get_leaderboard(app_ids=["Base Retrieval Pipeline Testing v1"])

In [23]:
for query in queries:
    tru_rag_retrieval_pipeline(query)

[Document(page_content='Article 26 Financial resources\n\n1. objective of this Convention. The Parties recognize the important role that financial resources play in achieving the\n\n2. Each Party shall provide financial support in respect of its national activities intended to achieve the objective of the Convention, in accordance with its national plans, priorities and programmes.\n\n3. Parties shall promote, as appropriate, the utilization of bilateral, regional, subregional and other multilateral channels to provide funding for the development and strengthening of\n\n23\n\nWHO Framework Convention on Tobacco Control\n\nmultisectoral comprehensive tobacco control programmes of developing country Parties and Parties with economies in transition. Accordingly, economically viable alternatives to tobacco production, including crop diversification should be addressed and supported in the context of nationally developed strategies of sustainable development.\n\n4. Parties represented in re

Groundedness per statement in source:   0%|          | 0/4 [00:00<?, ?it/s]



Groundedness per statement in source:   0%|          | 0/3 [00:00<?, ?it/s]

[Document(page_content='Sales Tax\n\nApplication for registration\n\n13. (1) Any manufacturer who is liable to be registered under section 12 shall apply to the Director General for registration as a registered manufacturer in the prescribed form not later than the last day of the month following the month in which he is liable to be registered as referred to in paragraph 12(2)(a) or (b).\n\n(2) Upon receipt of the application under subsection (1), the Director General may approve the registration, subject to such conditions as he deems fit.\n\n(3) The Director General shall register the manufacturer under subsection (1) with effect from the first day of the month following the month in which the application under subsection (1) is made or from such earlier date as may be agreed between the Director General and the manufacturer but such date shall not be earlier than the date he becomes liable to be registered.\n\n(4) Where any manufacturer fails to comply with subsection (1), the Dire

Groundedness per statement in source:   0%|          | 0/23 [00:00<?, ?it/s]

[Document(page_content='Claim for refund of sales tax in relation to bad debt\n\n36. (1) Any person who is, or has ceased to be, a registered manufacturer may make a claim to the Director General for a refund of the whole or any part of any sales tax paid by him in respect of taxable goods if—\n\n(a) the whole or any part of the sales tax payable to such person has been written off in his accounts as bad debts; and\n\n(b) the Director General is satisfied that all reasonable efforts have been made by such person to recover the sales tax.\n\nSales Tax\n\n(2) Where the person who makes a claim for a refund under subsection (1) in relation to bad debt—\n\n(a) has not received any payment in respect of the sale of taxable goods, such person may make a claim for a refund of the whole of the sales tax paid; or\n\n(b) has received any payment in respect of the taxable goods, such person may make a claim for the difference between the sales tax paid and the amount calculated in accordance with

Groundedness per statement in source:   0%|          | 0/21 [00:00<?, ?it/s]

[Document(page_content='Cyber innovation\n\nBy supporting advanced research, fostering digital innovation, and developing cyber skills and knowledge, the federal government will position Canada as a global leader in cyber security.\n\nLeadership and Collaboration\n\nThe federal government, in close collaboration with provinces, territories, and the private sector, will take a leadership role to advance cyber security in Canada and will, in coordination with allies, work to shape the international cyber security environment in Canada’s favour.\n\n3 of 35 • NATiONAL CYBER SECURiTY STRATEGY\n\n¢\n\nExecutive Summary \n\nIn a dynamic cyber security environment, the Government of Canada’s\n\napproach will be rooted in a sustained commitment to:\n\n• Protect the safety and security of Canadians and our critical infrastructure\n\n• Promote and protect rights and freedoms online\n\nEncourage cyber security for business, economic growth, and prosperity\n\n• Collaborate and support coordination 

Groundedness per statement in source:   0%|          | 0/4 [00:00<?, ?it/s]

[Document(page_content='Cyber Innovation\n\nbe created for Canadians in the years ahead.2 Governments, academia, and members of the private sector can work together to create new opportunities, drive investment, and foster leading-edge research\n\nand development.\n\nCanada is already a leader in cyber security research and development. Breakthroughs in cyber security research are not only beneficial for Canadian cyber security firms, but for the economy as a whole. Government has a role to play to support advanced research and to help innovative companies scale up to bring cyber security technologies and services to the global marketplace.\n\nCYBER SECURITY SNAPSHOT: STREAMLINING SERVICE DELIVERY\n\nStuart was relieved when he found out that he could access his Canada Pension Plan (CPP) account online without having to remember another password. All he has to do is go to the CPP log-in page, click on the logo for his bank, and enter his information. He uses the same username and passw

Groundedness per statement in source:   0%|          | 0/18 [00:00<?, ?it/s]

[Document(page_content="In general, the Consent Building Block shall follow the authentication and authorisation requirements as laid out in the Govstack architecture. For clarity, Consent Building Block's API endpoints are invoked with a client-supplied API key which MUST defer to the Identification and Verification Building Block in order to verify the role and/or scope of the API key matches the API endpoint to which it is supplied. This is mentioned here, as this Definition is drafted without clear guidance in the OpenAPI spec for the handling of roles and scopes.", metadata={'id': '381_9d5bbb77ba04f6dc5716ab8a7b451cb1_4_GovStack_Specs.docx_20.0', 'type': 'text', 'filename': '4_GovStack_Specs.docx', 'page_number': 20.0}), Document(page_content="In general, the Consent Building Block shall follow the authentication and authorisation requirements as laid out in the GovStack architecture. For clarity, Consent Building Block's API endpoints are invoked with a client-supplied API key wh

Groundedness per statement in source:   0%|          | 0/15 [00:00<?, ?it/s]

[Document(page_content='Following is the first core set of key functionalities of the Consent Building Block. For potential future developments of the specification follow the work in progress at GovStack confluence page.', metadata={'id': '335_None_4_GovStack_Specs.docx_18.0', 'type': 'text', 'filename': '4_GovStack_Specs.docx', 'page_number': 18.0}), Document(page_content='The Consent Building Block implements the key functionalities described in the consent management lifecycle. It includes the ability to configure consent agreements by an organisation admin, present consent requests towards individuals, capture consents, enable queries if consent exists, or not, and enable independent audit of consents.', metadata={'id': '331_None_4_GovStack_Specs.docx_18.0', 'type': 'text', 'filename': '4_GovStack_Specs.docx', 'page_number': 18.0}), Document(page_content="The Consent Building Block enables services for individuals to approve the use of their Personal Data by defining the principle

Groundedness per statement in source:   0%|          | 0/24 [00:00<?, ?it/s]

[Document(page_content="Part 2 Key Findings\n\nKey Finding 1\n\nNGOs in International Geneva are targeted by cyberattacks.\n\n• 41% of NGOs report having been victim of a cyberattack within the past three years.\n\n• All NGOs that have experienced attacks report that these were not isolated events. The frequency of these incidents varies, with some NGOs facing incidents on a daily basis and others encountering them on a monthly or annual basis.\n\n• 70% of NGOs either don’t think, or aren't sure whether, they have an adequate level of resilience to recover from a disruptive cyberattack.\n\nKey Finding 2\n\nNGOs in International Geneva understand their exposure to cyber risks, but lack the support needed to implement mature cybersecurity strategies.\n\n• NGOs, unlike entities recognized as critical infrastructure, lack specific designation as a sector for particular protections in cyberspace.\n\n• Funding for NGOs is generally earmarked for specific projects, often leaving cybersecurity

Groundedness per statement in source:   0%|          | 0/12 [00:00<?, ?it/s]

[Document(page_content='Case Studies Case Study #1: Ransomware Attack on an NGO Case Study #2: Attack on an NGO Website Case Study #3: Responding to a Man-in-the-Middle cyberattack Appendix A: Detailed recommendations Organizational recommendations Technical Recommendations Basic Cybersecurity Measures: Enhanced Cybersecurity Measures: Project scope and participation of NGOs Limitations 35 35 35 39 43 48 48 48 50 50 51 52 52 54 55 60', metadata={'id': '3_5_CyberPeace_Report.pdf_3', 'type': 'text', 'filename': '5_CyberPeace_Report.pdf', 'page_number': 3}), Document(page_content='Strategic recommendations\n\n■ Train employees and users to recognize the signs of MitM attacks and practice safe browsing and email habits. Encourage reporting of any kind of suspicious activity.\n\n■ Implement the principle of least privilege (PoLP) to restrict user access to only the resources necessary for their roles.\n\n■ Assess the security practices of third-party vendors, especially those providing crit

Groundedness per statement in source:   0%|          | 0/18 [00:00<?, ?it/s]

In [24]:
tru.run_dashboard()

Starting dashboard ...


Accordion(children=(VBox(children=(VBox(children=(Label(value='STDOUT'), Output())), VBox(children=(Label(valu…

Dashboard started at http://192.168.0.105:8501 .


<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>