In [1]:
print("ok")

ok


## Load dependencies

In [140]:
import os
import time
from dotenv import load_dotenv
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
from langchain_openai import ChatOpenAI
from langchain.memory import ConversationSummaryMemory
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser

from uuid import uuid4
from langchain_core.documents import Document

## Load env var

In [82]:
load_dotenv()

True

In [83]:
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_API_MODEL = os.getenv("OPENAI_API_MODEL")
OPENAI_MAX_TOKENS=os.getenv("OPENAI_MAX_TOKENS")
OPENAI_TEMPERATURE=os.getenv("OPENAI_TEMPERATURE")
EMBEDDING_MODEL_NAME = os.getenv("EMBEDDING_MODEL_NAME")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")
PINECONE_CLOUD = os.getenv("PINECONE_CLOUD")
PINECONE_REGION = os.getenv("PINECONE_REGION")
PINECONE_DIMENSION = os.getenv("PINECONE_DIMENSION")


os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
os.environ['OPENAI_API_MODEL'] = OPENAI_API_MODEL
os.environ['OPENAI_MAX_TOKENS'] = OPENAI_MAX_TOKENS
os.environ['OPENAI_TEMPERATURE'] = OPENAI_TEMPERATURE
os.environ['EMBEDDING_MODEL_NAME'] = EMBEDDING_MODEL_NAME
os.environ['PINECONE_API_KEY'] = PINECONE_API_KEY  
os.environ['PINECONE_INDEX_NAME'] = PINECONE_INDEX_NAME
os.environ['PINECONE_CLOUD'] = PINECONE_CLOUD
os.environ['PINECONE_REGION'] = PINECONE_REGION
os.environ['PINECONE_DIMENSION'] = PINECONE_DIMENSION

## Document loading

In [7]:
def get_load_docs():
    loader = DirectoryLoader(
        path="../data",
        glob="*.pdf",
        loader_cls=PyPDFLoader,
        use_multithreading=True,
    )
    docs = loader.load()
    return docs

In [8]:
# Call load function to upload the folder
for file in os.listdir("../data"):
    if file.endswith(".pdf"):
        extract_document = get_load_docs()

In [9]:
print(f'length of the extracted data: {len(extract_document)}')

length of the extracted data: 105


In [10]:
print(f'type of the data extracted: {type(extract_document)}')

type of the data extracted: <class 'list'>


## Text splitting and chunking

In [11]:
def get_text_splitted_and_chunked(extract_document):
  """
    Split and chunk text into smaller segments.

    Parameters:
    - extract_document (str): The text document to be split and chunked.

    Returns:
    - text_chunks (list): A list containing the split and chunked text segments.

    Example:
    >>> document = "Lorem ipsum dolor sit amet, consectetur adipiscing elit..."
    >>> chunks = split_and_chunk_text(document)
    """
  text_splitter = RecursiveCharacterTextSplitter(
      chunk_size=100,
      chunk_overlap=10,
      is_separator_regex=True,
  )
  text_chunks = text_splitter.split_documents(extract_document)
  return text_chunks

In [12]:
# Call get_text_splitted_and_chunked function
text_chunks = get_text_splitted_and_chunked(extract_document)

In [13]:
# testing
print(len(text_chunks[35].page_content))

23


In [14]:
# testing
print((text_chunks[89].page_content))

on an urgent basis by a simple amendment to the legislation. We will enact a Public Procurement Law


## Embedding

In [15]:
def get_embeddings():
  """
    Initializes and returns an instance of Hugging Face Embeddings.

    Returns:
    HuggingFaceEmbeddings: An instance of Hugging Face Embeddings \n
    initialized with the specified model name.
  """
  model_name = "sentence-transformers/all-mpnet-base-v2"
  model_kwargs = {'device': 'cpu'}
  encode_kwargs = {'normalize_embeddings': False}
  embeddings = HuggingFaceEmbeddings(
      model_name=model_name,
      model_kwargs=model_kwargs,
      encode_kwargs=encode_kwargs
  )
  return embeddings

In [16]:
# Calling the get_embeddings function
embeddings = get_embeddings()



In [17]:
# Test
embedded_query = embeddings.embed_query("How are you?")

In [18]:
print(len(embedded_query))

768


## Initiate pinecone vector database and stor data

In [65]:
# Config Pinecone client
pc = Pinecone(api_key=PINECONE_API_KEY)

In [66]:
spec = ServerlessSpec(cloud=PINECONE_CLOUD, region=PINECONE_REGION)

index_name = PINECONE_INDEX_NAME

In [67]:
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

In [68]:
# Create an index
pc.create_index(
    dimension=int(PINECONE_DIMENSION),
    name=PINECONE_INDEX_NAME,
    metric="cosine",
    spec=spec
)

# Wat for index to be ready before connecting
while not pc.describe_index(index_name).status["ready"]:
  time.sleep(1)

In [69]:
index = pc.Index(PINECONE_INDEX_NAME)

In [73]:
print(index_name)

manifesto-bot


In [74]:
vectorstore = PineconeVectorStore.from_texts(
    [t.page_content for t in text_chunks],
    embedding=embeddings,
    index_name=index_name
)

In [78]:
# Test
query = "What are the key economic reforms to be introduced by Sajith?"
response = vectorstore.similarity_search(query, k=5)

In [77]:
response

[Document(page_content='Tax reforms; (2) Monetary \npolicy reforms, including \nCentral Bank (CBSL) \nindependence; (3) Cost-'),
 Document(page_content='such as reforming the public sector and state-owned enterprises – is a must. Reforms in energy'),
 Document(page_content='reforms in ten key areas, as detailed below. \nThe 10 pillars of reform:\n01\n02\n03\n04'),
 Document(page_content='also implementing deep \nstructural reforms to \naddress long-standing \nissues. The SJB’s'),
 Document(page_content='and (5) Governance \nreforms.Some Progress: Since \nstarting the EFF program, \nthe government has')]

## Define LLM

In [85]:
llm = ChatOpenAI(
    api_key=OPENAI_API_KEY,
    model=OPENAI_API_MODEL,
    max_tokens=OPENAI_MAX_TOKENS,
    temperature=OPENAI_TEMPERATURE,
)

## Define Memory

In [126]:
memory = ConversationSummaryMemory(
    llm=llm,
    max_token_limit=100,
    input_key="query",
    output_key="result",
)

# Prompt Management

ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x7f89ff4c6dd0>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x7f89ff4e5330>, root_client=<openai.OpenAI object at 0x7f89ff4c7130>, root_async_client=<openai.AsyncOpenAI object at 0x7f89ff4c48b0>, temperature=0.2, openai_api_key=SecretStr('**********'), openai_proxy='', max_tokens=100)

In [119]:
prompt_template = """
You are an honest, unbiased, and knowledgeable assistant and \n
reviewer who helps voters to assist them in answering critically \n 
on the question {question}, they ask based on the manifestos of presidential \n
candidates of the upcoming presidential elections in Sri Lanka. \n\n

Your answers should be based on the manifestos of the presidential \n
candidates and should not be biased or influenced by any political \n
party or individual. \n\n 

You should provide the complete, concise and answer with high readability \n
score within the given token limit. \n\n

Context: {context} \n\n
Response:
"""

In [120]:
PROMPT = PromptTemplate.from_template(
    prompt_template
)

In [121]:
print(PROMPT)

input_variables=['context', 'question'] template='\nYou are an honest, unbiased, and knowledgeable assistant and \n\nreviewer who helps voters to assist them in answering critically \n \non the question {question}, they ask based on the manifestos of presidential \n\ncandidates of the upcoming presidential elections in Sri Lanka. \n\n\n\nYour answers should be based on the manifestos of the presidential \n\ncandidates and should not be biased or influenced by any political \n\nparty or individual. \n\n \n\nYou should provide the complete, concise and answer with high readability \n\nscore within the given token limit. \n\n\n\nContext: {context} \n\n\nResponse:\n'


## Define Retriever

In [136]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 4})

In [137]:
rag_chain = (
    {
      "context": retriever, "question": RunnablePassthrough()}
    | PROMPT
    | llm
    | StrOutputParser()
)

In [142]:
question = "What are the key economic reforms to be introduced by Sajith?"
response = rag_chain.invoke(question)

In [143]:
response

'Based on the manifestos of the presidential candidates in Sri Lanka, key economic reforms to be introduced by Sajith include tax reforms, monetary policy reforms (including Central Bank independence), and reforms in the public sector and state-owned enterprises. Additionally, there is a focus on energy reforms and deep structural reforms to address long-standing issues. The 10 pillars of reform outlined include areas such as improving infrastructure, promoting innovation and technology, enhancing education and skills development, and fostering a conducive environment for businesses. These'

In [146]:
results = vectorstore.similarity_search_with_score(query=question, k=4)

In [152]:
for doc, score in results:
    print(doc.page_content)

Tax reforms; (2) Monetary 
policy reforms, including 
Central Bank (CBSL) 
independence; (3) Cost-
such as reforming the public sector and state-owned enterprises – is a must. Reforms in energy
reforms in ten key areas, as detailed below. 
The 10 pillars of reform:
01
02
03
04
also implementing deep 
structural reforms to 
address long-standing 
issues. The SJB’s


In [155]:
def similarity_search_with_metadata(vectorstore, question):
    # Perform similarity search
    results = vectorstore.similarity_search_with_score(query=question, k=4)
    
    # Display the source and text segment for each result
    for doc, score in results:
        metadata = doc.page_content  # Adjust according to your document structure
        print(f"Score: {score}")
        print(f"Text Segment: {metadata}")
        print("---")

similarity_search_with_metadata(vectorstore, question=question)


Score: 0.668023467
Text Segment: Tax reforms; (2) Monetary 
policy reforms, including 
Central Bank (CBSL) 
independence; (3) Cost-
---
Score: 0.646106541
Text Segment: such as reforming the public sector and state-owned enterprises – is a must. Reforms in energy
---
Score: 0.638928354
Text Segment: reforms in ten key areas, as detailed below. 
The 10 pillars of reform:
01
02
03
04
---
Score: 0.635710835
Text Segment: also implementing deep 
structural reforms to 
address long-standing 
issues. The SJB’s
---
