<a href="https://colab.research.google.com/github/sunnysavita10/Indepth-GENAI/blob/main/Hybrid_Search_in_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
# Sample documents: a tiny in-memory corpus used to demonstrate keyword (TF-IDF) retrieval.
# Each string is one document; the list order defines the document indices used for ranking.
documents = [
    "This is a list which containig sample documents.",
    "Keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords.",
    "Keyword-based search relies on sparse embeddings."
]

In [None]:
query="keyword-based search"

In [None]:
import re
def preprocess_text(text):
    """Return ``text`` lowercased with all punctuation removed.

    Any character that is neither a word character nor whitespace is deleted
    outright (it is not replaced by a space), so "keyword-based" becomes
    "keywordbased".
    """
    lowercased = text.lower()
    return re.sub(r'[^\w\s]', '', lowercased)


In [None]:
preprocess_documents=[preprocess_text(doc) for doc in documents]

In [None]:
preprocess_documents

['this is a list which containig sample documents',
 'keywords are important for keywordbased search',
 'document analysis involves extracting keywords',
 'keywordbased search relies on sparse embeddings']

In [None]:
print("Preprocessed Documents:")
for doc in preprocess_documents:
    print(doc)

Preprocessed Documents:
this is a list which containig sample documents
keywords are important for keywordbased search
document analysis involves extracting keywords
keywordbased search relies on sparse embeddings


In [None]:
# Show the query after the same normalization that was applied to the documents.
# Printing the raw `query` here would contradict the "Preprocessed Query:" label.
print("Preprocessed Query:")
print(preprocess_text(query))

Preprocessed Query:
keyword-based search


In [None]:
preprocessed_query = preprocess_text(query)

In [None]:
preprocessed_query

'keywordbased search'

In [None]:
vector=TfidfVectorizer()

In [None]:
X=vector.fit_transform(preprocess_documents)

In [None]:
X.toarray()

array([[0.        , 0.        , 0.37796447, 0.        , 0.37796447,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.37796447, 0.        , 0.        , 0.37796447, 0.        ,
        0.        , 0.37796447, 0.        , 0.        , 0.37796447,
        0.37796447],
       [0.        , 0.4533864 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.4533864 , 0.4533864 , 0.        ,
        0.        , 0.35745504, 0.35745504, 0.        , 0.        ,
        0.        , 0.        , 0.35745504, 0.        , 0.        ,
        0.        ],
       [0.46516193, 0.        , 0.        , 0.46516193, 0.        ,
        0.        , 0.46516193, 0.        , 0.        , 0.46516193,
        0.        , 0.        , 0.36673901, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.43671931, 0.        , 0.        , 0.       

In [None]:
X.toarray()[0]

array([0.        , 0.        , 0.37796447, 0.        , 0.37796447,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.37796447, 0.        , 0.        , 0.37796447, 0.        ,
       0.        , 0.37796447, 0.        , 0.        , 0.37796447,
       0.37796447])

In [None]:
# Encode the query with the already-fitted vectorizer (transform, not fit_transform) so the
# resulting vector has the same columns as the document matrix X and can be compared to it.
query_embedding=vector.transform([preprocessed_query])

In [None]:
query_embedding.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.70710678, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.70710678, 0.        , 0.        ,
        0.        ]])

In [None]:
# Cosine similarity between each document row of X and the query vector:
# one score per document, returned as a single-column array.
similarities = cosine_similarity(X, query_embedding)

In [None]:
similarities

array([[0.        ],
       [0.50551777],
       [0.        ],
       [0.48693426]])

In [None]:
np.argsort(similarities,axis=0)

array([[0],
       [2],
       [3],
       [1]])

In [None]:
# Order the original (un-preprocessed) documents from most to least similar to the query.
# The ranking is computed inline so this cell does not depend on `ranked_indices`, which is
# only assigned in a later cell (the original raised NameError on a fresh kernel).
ranked_documents = [
    documents[i]
    for i in np.argsort(similarities, axis=0)[::-1].flatten()
]

In [None]:
# Ranking: argsort yields document indices in ascending order of similarity, so the result is
# reversed to put the best match first, then flattened from a single column to a 1-D array.
ranked_indices=np.argsort(similarities,axis=0)[::-1].flatten()

In [None]:
ranked_indices


array([1, 3, 2, 0])

In [None]:
# Output the ranked documents
for i, doc in enumerate(ranked_documents):
    print(f"Rank {i+1}: {doc}")

Rank 1: Keywords are important for keyword-based search.
Rank 2: Keyword-based search relies on sparse embeddings.
Rank 3: Document analysis involves extracting keywords.
Rank 4: This is a list which containig sample documents.


In [None]:
query

'keyword-based search'

In [None]:
documents = [
    "This is a list which containig sample documents.",
    "Keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords.",
    "Keyword-based search relies on sparse embeddings."
]

In [None]:
#https://huggingface.co/sentence-transformers

In [None]:
# Hand-written 5-dimensional vectors used as example dense document embeddings
# (presumably placeholders for what a sentence-transformers model would produce).
# NOTE(review): there are only three rows here while `documents` has four entries, so the
# rows do not map one-to-one onto `documents` — confirm this is intentional.
document_embeddings = np.array([
    [0.634, 0.234, 0.867, 0.042, 0.249],
    [0.123, 0.456, 0.789, 0.321, 0.654],
    [0.987, 0.654, 0.321, 0.123, 0.456]
])

In [None]:
# Sample search query (represented as a dense vector)
# NOTE: this rebinds `query_embedding`, which earlier in the notebook held the sparse
# TF-IDF query vector; cells below this point see the dense 1x5 array instead.
query_embedding = np.array([[0.789, 0.321, 0.654, 0.987, 0.123]])

In [None]:
# Calculate cosine similarity between query and documents
similarities = cosine_similarity(document_embeddings, query_embedding)

In [None]:
similarities

array([[0.73558979],
       [0.67357898],
       [0.71517305]])

In [None]:
ranked_indices = np.argsort(similarities, axis=0)[::-1].flatten()

In [None]:
ranked_indices

array([0, 2, 1])

In [None]:
# Output the ranked documents
for i, idx in enumerate(ranked_indices):
    print(f"Rank {i+1}: Document {idx+1}")

Rank 1: Document 1
Rank 2: Document 3
Rank 3: Document 2


In [None]:
doc_path="/content/Retrieval-Augmented-Generation-for-NLP"

In [None]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-4.2.0-py3-none-any.whl (290 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/290.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.4/290.4 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pypdf
Successfully installed pypdf-4.2.0


In [None]:
!pip install langchain_community

Collecting langchain_community
  Downloading langchain_community-0.2.5-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl (28 kB)
Collecting langchain<0.3.0,>=0.2.5 (from langchain_community)
  Downloading langchain-0.2.5-py3-none-any.whl (974 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m974.6/974.6 kB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-core<0.3.0,>=0.2.7 (from langchain_community)
  Downloading langchain_core-0.2.9-py3-none-any.whl (321 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m321.8/321.8 kB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langsmith<0.2.0,>=0.1.0 (from langchain_community)
  Downloading langsmith-0.1.81-py3-none-any.whl (127 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━

In [None]:
from langchain_community.document_loaders import PyPDFLoader

In [None]:
loader=PyPDFLoader(doc_path)

In [None]:
docs=loader.load()

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
splitter = RecursiveCharacterTextSplitter(chunk_size=200,chunk_overlap=30)

In [None]:
chunks = splitter.split_documents(docs)

In [None]:
chunks

[Document(page_content='Retrieval-Augmented Generation for\nKnowledge-Intensive NLP Tasks\nPatrick Lewis†‡, Ethan Perez⋆,\nAleksandra Piktus†, Fabio Petroni†, Vladimir Karpukhin†, Naman Goyal†, Heinrich Küttler†,', metadata={'source': '/content/Retrieval-Augmented-Generation-for-NLP', 'page': 0}),
 Document(page_content='Mike Lewis†, Wen-tau Yih†, Tim Rocktäschel†‡, Sebastian Riedel†‡, Douwe Kiela†\n†Facebook AI Research;‡University College London;⋆New York University;\nplewis@fb.com\nAbstract', metadata={'source': '/content/Retrieval-Augmented-Generation-for-NLP', 'page': 0}),
 Document(page_content='plewis@fb.com\nAbstract\nLarge pre-trained language models have been shown to store factual knowledge\nin their parameters, and achieve state-of-the-art results when ﬁne-tuned on down-', metadata={'source': '/content/Retrieval-Augmented-Generation-for-NLP', 'page': 0}),
 Document(page_content='stream NLP tasks. However, their ability to access and precisely manipulate knowl-\nedge is stil

In [None]:
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings

In [None]:
# Never store an access token in the notebook itself. Read it from the environment and fall
# back to an interactive prompt that does not echo the input.
# If a real token was ever committed in this cell, revoke it in the Hugging Face settings.
import os
from getpass import getpass

HF_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face API token: ")

In [None]:
embeddings = HuggingFaceInferenceAPIEmbeddings(api_key=HF_TOKEN, model_name="BAAI/bge-base-en-v1.5")

In [None]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-0.5.3-py3-none-any.whl (559 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/559.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m553.0/559.5 kB[0m [31m20.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m559.5/559.5 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
Collecting chroma-hnswlib==0.7.3 (from chromadb)
  Downloading chroma_hnswlib-0.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m57.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.111.0-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting uvicorn[standard]>=0.18.3 (from chromadb)


In [None]:
from langchain.vectorstores import Chroma

In [None]:
vectorstore=Chroma.from_documents(chunks,embeddings)

In [None]:
vectorstore_retreiver = vectorstore.as_retriever(search_kwargs={"k": 3})

In [None]:
vectorstore_retreiver

VectorStoreRetriever(tags=['Chroma', 'HuggingFaceInferenceAPIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x7af7e62bb010>, search_kwargs={'k': 3})

In [None]:
!pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [None]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever

In [None]:
keyword_retriever = BM25Retriever.from_documents(chunks)

In [None]:
keyword_retriever.k =  3

In [None]:
# Hybrid retriever: `weights` is given in the same order as `retrievers`, so the dense
# vector-store retriever is weighted 0.3 and the BM25 keyword retriever 0.7.
ensemble_retriever = EnsembleRetriever(retrievers=[vectorstore_retreiver,keyword_retriever],weights=[0.3, 0.7])

# Mixing vector search and keyword search for Hybrid search

## hybrid_score = (1 - alpha) * sparse_score + alpha * dense_score

In [None]:
model_name = "HuggingFaceH4/zephyr-7b-beta"

In [None]:
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->bitsandbytes)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->bitsandbytes)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (41

In [None]:
!pip install accelerate

Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/309.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/309.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.31.0


In [None]:
import torch
from transformers import ( AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline, )
from langchain import HuggingFacePipeline

In [None]:
# function for loading 4-bit quantized model
def load_quantized_model(model_name: str):
    """
    Load a causal language model with a 4-bit (NF4, double-quantized) configuration.

    model_name: Name or path of the model to be loaded.
    return: The model object returned by ``AutoModelForCausalLM.from_pretrained``.
    """
    # Quantization settings: 4-bit weights, nf4 quant type, bfloat16 compute dtype.
    quantization_settings = BitsAndBytesConfig(
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        load_in_4bit=True,
    )

    return AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization_settings,
        torch_dtype=torch.bfloat16,
    )

In [None]:
# initializing tokenizer
def initialize_tokenizer(model_name: str):
    """
    Create the tokenizer for a model and force its beginning-of-sentence token id to 1.

    model_name: Name or path of the model for tokenizer initialization.
    return: Initialized tokenizer.
    """
    tok = AutoTokenizer.from_pretrained(
        model_name,
        return_token_type_ids=False,
    )
    # Override the beginning-of-sentence token id with the fixed value 1.
    tok.bos_token_id = 1
    return tok

In [None]:
tokenizer = initialize_tokenizer(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [None]:
model = load_quantized_model(model_name)

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [None]:
# Build the text-generation pipeline around the quantized model and its tokenizer.
# The factory is imported under an alias because this cell rebinds the name `pipeline` to the
# pipeline *instance* (a later cell passes `pipeline` to HuggingFacePipeline). Calling the bare
# name would, on any re-run, invoke the previously built instance instead of the factory.
from transformers import pipeline as hf_pipeline

pipeline = hf_pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    use_cache=True,
    device_map="auto",
    max_length=2048,
    do_sample=True,
    top_k=5,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

In [None]:
llm = HuggingFacePipeline(pipeline=pipeline)

  warn_deprecated(


In [None]:
from langchain.chains import RetrievalQA

In [None]:
# Baseline QA chain: the context passed to the LLM comes only from the dense
# vector-store retriever.
normal_chain = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=vectorstore_retreiver
)

In [None]:
# Hybrid QA chain: same LLM and chain type as the baseline, but the context comes from the
# ensemble retriever that combines the vector-store and BM25 keyword retrievers.
hybrid_chain = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=ensemble_retriever
)

In [None]:
response1 = normal_chain.invoke("What is Abstractive Question Answering?")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
response1

{'query': 'What is Abstractive Question Answering?',
 'result': 'Use the following pieces of context to answer the question at the end. If you don\'t know the answer, just say that you don\'t know, don\'t try to make up an answer.\n\n3.2 Abstractive Question Answering\nRAG models can go beyond simple extractive QA and answer questions with free-form, abstractive\n\neven when the correct answer is not in any retrieved document, achieving 11.8% accuracy in such\ncases for NQ, where an extractive model would score 0%.\n4.2 Abstractive Question Answering\n\nthe popular extractive QA paradigm [ 5,7,31,26], where answers are extracted spans from retrieved\ndocuments, relying primarily on non-parametric knowledge. We also compare to “Closed-Book\n\nQuestion: What is Abstractive Question Answering?\nHelpful Answer: Abstractive Question Answering is a type of question answering that goes beyond simply finding the answer in a retrieved document, as it allows for free-form, abstractive responses 

In [None]:
print(response1.get("result"))

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

3.2 Abstractive Question Answering
RAG models can go beyond simple extractive QA and answer questions with free-form, abstractive

even when the correct answer is not in any retrieved document, achieving 11.8% accuracy in such
cases for NQ, where an extractive model would score 0%.
4.2 Abstractive Question Answering

the popular extractive QA paradigm [ 5,7,31,26], where answers are extracted spans from retrieved
documents, relying primarily on non-parametric knowledge. We also compare to “Closed-Book

Question: What is Abstractive Question Answering?
Helpful Answer: Abstractive Question Answering is a type of question answering that goes beyond simply finding the answer in a retrieved document, as it allows for free-form, abstractive responses that may not be present in any of the documents. This is in contrast to extractiv

In [None]:
response2 = hybrid_chain.invoke("What is Abstractive Question Answering?")

In [None]:
response2

{'query': 'What is Abstractive Question Answering?',
 'result': 'Use the following pieces of context to answer the question at the end. If you don\'t know the answer, just say that you don\'t know, don\'t try to make up an answer.\n\n3.2 Abstractive Question Answering\nRAG models can go beyond simple extractive QA and answer questions with free-form, abstractive\n\neven when the correct answer is not in any retrieved document, achieving 11.8% accuracy in such\ncases for NQ, where an extractive model would score 0%.\n4.2 Abstractive Question Answering\n\nLabel GenerationDocument\nIndexDefine\t"middle\tear" (x)\nQuestion Answering:\nQuestion QueryThe\tmiddle\tear\tincludes\nthe\ttympanic\tcavity\tand\nthe\tthree\tossicles.\t\t (y)\nQuestion Answering:\n\nthe popular extractive QA paradigm [ 5,7,31,26], where answers are extracted spans from retrieved\ndocuments, relying primarily on non-parametric knowledge. We also compare to “Closed-Book\n\nQuestion: What is Abstractive Question Answer

In [None]:
print(response2.get("result"))

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

3.2 Abstractive Question Answering
RAG models can go beyond simple extractive QA and answer questions with free-form, abstractive

even when the correct answer is not in any retrieved document, achieving 11.8% accuracy in such
cases for NQ, where an extractive model would score 0%.
4.2 Abstractive Question Answering

Label GenerationDocument
IndexDefine	"middle	ear" (x)
Question Answering:
Question QueryThe	middle	ear	includes
the	tympanic	cavity	and
the	three	ossicles.		 (y)
Question Answering:

the popular extractive QA paradigm [ 5,7,31,26], where answers are extracted spans from retrieved
documents, relying primarily on non-parametric knowledge. We also compare to “Closed-Book

Question: What is Abstractive Question Answering?
Helpful Answer: Abstractive question answering involves generating a natural language answer to