In [2]:
import boto3
from langchain_aws import ChatBedrock
from botocore.config import Config
import warnings
# Silence every warning category for the remainder of the session.
warnings.filterwarnings("ignore")

# Model choice and the generation settings forwarded with each request.
sonnet_model_id = "anthropic.claude-3-sonnet-20240229-v1:0"
model_kwargs = dict(
    max_tokens=4096,
    temperature=0.0,
    stop_sequences=["Human"],
)

# Bedrock runtime client pinned to one region, with v4 signing and
# botocore's "standard" retry mode (max_attempts=3).
region = "us-west-2"
config = Config(
    region_name=region,
    signature_version="v4",
    retries={"max_attempts": 3, "mode": "standard"},
)
bedrock_rt = boto3.client("bedrock-runtime", config=config)

# Chat model wrapper used by the query-generation chain further down.
llm = ChatBedrock(client=bedrock_rt, model_id=sonnet_model_id, model_kwargs=model_kwargs)

In [3]:
from langchain_community.embeddings import BedrockEmbeddings

# Titan text embeddings served through a second Bedrock runtime client in us-east-1.
# NOTE(review): the next cell rebinds `embeddings_model` to a Cohere model, so this
# object is not the one used to build the FAISS index when cells run in order.
bedrock_client = boto3.client("bedrock-runtime", region_name="us-east-1")
embeddings_model = BedrockEmbeddings(
    client=bedrock_client,
    model_id="amazon.titan-embed-text-v1",
)

In [4]:
import os
import getpass
from langchain_cohere import CohereEmbeddings

# Ask for the Cohere key only when the environment does not already provide one,
# so re-running this cell (or Restart & Run All with the key exported) does not
# block on interactive input. The prompt text says which secret is expected.
if not os.environ.get("COHERE_API_KEY"):
    os.environ["COHERE_API_KEY"] = getpass.getpass("Cohere API key: ")

# NOTE: this rebinds `embeddings_model`, replacing the Bedrock Titan embeddings
# created in the previous cell; the FAISS index built below uses this Cohere model.
embeddings_model = CohereEmbeddings(model="embed-english-light-v3.0")

**RAG Fusion**

In [5]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

# Read the handbook PDF with image extraction switched on.
# NOTE(review): load_and_split() presumably already chunks the pages before the
# explicit splitter below runs - confirm whether loader.load() was intended.
file_path = "/home/ubuntu/learn/Insurance_Handbook_20103.pdf"
loader = PyPDFLoader(file_path, extract_images=True)
pages = loader.load_and_split()

# Re-chunk the loaded pages with chunk_size=1000 and no overlap between chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(pages)

# Embed every chunk into a FAISS index and report how many vectors it holds.
db = FAISS.from_documents(docs, embeddings_model)
print(db.index.ntotal)

621


In [6]:
from langchain_core.output_parsers import StrOutputParser
from langchain import hub

# Fetch the shared RAG-fusion query-generation prompt from the LangChain hub
# and print it so its template (including the {original_query} slot) is visible.
prompt = hub.pull("langchain-ai/rag-fusion-query-generation")
prompt.pretty_print()



You are a helpful assistant that generates multiple search queries based on a single input query.


Generate multiple search queries related to: {original_query}


OUTPUT (4 queries):


In [7]:
# Sample user question for the query generator. None of the visible cells read
# this variable; the later chain.invoke call passes its own literal query.
original_query = "How to identify in what scenarios I have to take an insurance?"

In [8]:
def _split_queries(text: str) -> list[str]:
    """Turn the model's reply into one query per line.

    Blank or whitespace-only lines are dropped so they are not sent to the
    retriever as empty queries; surrounding whitespace is stripped from the rest.
    """
    return [line.strip() for line in text.split("\n") if line.strip()]


# prompt -> chat model -> plain string -> list of generated search queries.
generate_queries = prompt | llm | StrOutputParser() | _split_queries

In [9]:
# Expose the FAISS store through the retriever interface, using as_retriever()'s defaults.
retriever = db.as_retriever()

In [10]:
from langchain.load import dumps, loads


def reciprocal_rank_fusion(results: list[list], k=60) -> list[tuple]:
    """Fuse several ranked document lists into one list ordered by RRF score.

    Each document earns ``1 / (rank + k)`` for every list it appears in, where
    ``rank`` is its zero-based position in that list; the contributions are
    summed per document.

    Args:
        results: One ranked list of documents per query. Each list is assumed
            to be sorted from most to least relevant.
        k: Constant added to the rank in the denominator; larger values shrink
            the gap between top and lower positions.

    Returns:
        ``(document, fused_score)`` tuples sorted by descending score.
        Documents with equal scores keep the order in which they were first
        seen. An empty ``results`` yields an empty list.
    """
    fused_scores = {}
    for ranked_docs in results:
        for rank, doc in enumerate(ranked_docs):
            # Serialise the document so identical documents coming from
            # different result lists share one dictionary key.
            doc_key = dumps(doc)
            fused_scores[doc_key] = fused_scores.get(doc_key, 0) + 1 / (rank + k)

    # sorted() is stable, so ties stay in first-seen (insertion) order.
    ranked = sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)
    return [(loads(doc_key), score) for doc_key, score in ranked]

In [11]:
# Generated queries -> one retrieval per query -> fused, re-ranked documents.
chain = (
    generate_queries
    | retriever.map()
    | reciprocal_rank_fusion
)

In [13]:
# Run the full pipeline on a sample question and show the five best-scoring
# (document, score) pairs. This literal query is used instead of `original_query`.
result = chain.invoke({"original_query" : "What are the contents of this pdf?"})
result[:5]

[(Document(metadata={'source': '/home/ubuntu/learn/Insurance_Handbook_20103.pdf', 'page': 199}, page_content='I.I.I.\tInsurance\tHandbook\t\t\twww.iii.org/insurancehandbook\t \t193\nI.I.I. \nResources\nI.I.I. Store\nThe I.I.I. Store is your gateway to a wide array of books and brochures from the Insurance \nInformation Institute. Print and PDF formats, and quantity discounts are available  for most products. Order online at www.iii.org/publications, call 212-346-5500 or email publications@iii.org.\nI.I.I. INSURANCE FACT BOOK\nThousands of insurance facts, figures, tables and graphs designed for quick and easy reference.\nTHE FINANCIAL SERVICES FACT BOOK\nBanking, securities and insurance industry trends and statistics. Published jointly with the Financial Services Roundtable. Online version available at www.financialservicesfacts.org\nINSURANCE HANDBOOK'),
  0.049206349206349205),
 (Document(metadata={'source': '/home/ubuntu/learn/Insurance_Handbook_20103.pdf', 'page': 84}, page_conten