<a href="https://colab.research.google.com/github/Pradeep2535/Colab-Notebooks-for-GenAI/blob/main/Hybrid_Search_Reranking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
# Toy corpus used by the keyword (TF-IDF) search demo in the following cells.
documents = [
    "This is a list which containig sample documents.",
    "Keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords.",
    "Keyword-based search relies on sparse embeddings.",
]

In [None]:
import re

def preprocess_text(text):
  """Normalize text for keyword matching.

  Lower-cases the input and deletes every character that is neither a word
  character (letter, digit, underscore) nor whitespace.

  Args:
    text: Raw input string.

  Returns:
    The lower-cased string with punctuation removed.
  """
  text = text.lower()
  # The character class must not be preceded by a literal space: the pattern
  # ' [^\w\s]' only matched punctuation that follows a space (and removed
  # that space too), so "search." or "keyword-based" were left untouched.
  text = re.sub(r'[^\w\s]', '', text)

  return text

In [None]:
preprocess_documents = [preprocess_text(doc) for doc in documents]

In [None]:
preprocess_documents

In [None]:
preprocessed_query = preprocess_text("Keyword-based search")

In [None]:
preprocessed_query

In [None]:
vector = TfidfVectorizer()

In [None]:
X = vector.fit_transform(preprocess_documents)

In [None]:
X.toarray()

In [None]:
query_embedding=vector.transform([preprocessed_query])

In [None]:
query_embedding.toarray()

In [None]:
similarities = cosine_similarity(X, query_embedding)

In [None]:
similarities

In [None]:
ranked_documents = [documents[i] for i in ranked_indices]

In [None]:
ranked_indices=np.argsort(similarities, axis=0)[::-1].flatten()

In [None]:
ranked_indices

In [None]:
# Print the documents in ranked order, numbering the ranks from 1.
for rank, text in enumerate(ranked_documents, start=1):
  print(f"Rank {rank}: {text}")

In [None]:
query="Keyword-based search"

In [None]:
# Hand-written 5-dimensional dense embeddings, one row per document.
document_embeddings = np.array(
    [
        [0.634, 0.234, 0.867, 0.042, 0.249],
        [0.123, 0.456, 0.789, 0.321, 0.654],
        [0.987, 0.654, 0.321, 0.123, 0.456],
    ]
)

In [None]:
# Dense query vector stored as a single-row 2-D array (shape (1, 5)); it is
# compared against document_embeddings with cosine_similarity below.
query_embedding = np.array(
    [[0.789, 0.321, 0.654, 0.987, 0.123]]
)

In [None]:
similarities = cosine_similarity(document_embeddings, query_embedding)

In [None]:
ranked_indices=np.argsort(similarities, axis=0)[::-1].flatten()

In [None]:
ranked_indices=np.argsort(similarities, axis=0)[::-1].flatten()

In [None]:
ranked_indices

In [None]:
# Report the dense-embedding ranking using 1-based rank and document numbers.
for rank, doc_idx in enumerate(ranked_indices, start=1):
  print(f"Rank {rank}: Document {doc_idx + 1}")

In [None]:
doc_path = "/content/2005.11401v4.pdf"

In [None]:
!pip install pypdf langchain_community

In [None]:
from langchain_community.document_loaders import PyPDFLoader

In [None]:
loader=PyPDFLoader(doc_path)

In [None]:
docs=loader.load()

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Chunking configuration for the loaded PDF pages: chunk size 200 with an
# overlap of 30 between consecutive chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=30,
)

In [None]:
chunks = splitter.split_documents(docs)

In [None]:
chunks

In [None]:
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings

In [None]:
from google.colab import userdata

HF_TOKEN=userdata.get("HF_TOKEN")

In [None]:
embeddings = HuggingFaceInferenceAPIEmbeddings(api_key=HF_TOKEN,model_name="BAAI/bge-base-en-v1.5")

In [None]:
!pip install chromadb

In [None]:
from langchain.vectorstores import Chroma

In [None]:
vectorstore=Chroma.from_documents(chunks,embeddings)

In [None]:
# Dense retriever backed by the Chroma store, configured with k=3.
vectorstore_retriever = vectorstore.as_retriever(
    search_kwargs={"k": 3},
)

In [None]:
!pip install rank_bm25

In [None]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever

In [None]:
keyword_retreiver = BM25Retriever.from_documents(chunks)

In [None]:
# Hybrid retriever: combines the dense vector-store retriever and the BM25
# keyword retriever, each given an equal weight of 0.5.
retreiver = EnsembleRetriever(
    retrievers=[vectorstore_retriever, keyword_retreiver],
    weights=[0.5, 0.5],
)

In [None]:
model_name="HuggingFaceH4/zephyr-7b-beta"

In [None]:
!pip install -U bitsandbytes

In [None]:
!pip install accelerate

In [None]:
import torch
from transformers import (AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline)
from langchain import HuggingFacePipeline

In [None]:
def load_quantized_model(model_name: str):
  """Load a causal language model with a 4-bit bitsandbytes configuration.

  Args:
    model_name: Identifier passed to ``AutoModelForCausalLM.from_pretrained``.

  Returns:
    The model object returned by ``from_pretrained``.
  """
  # 4-bit NF4 quantization with double quantization enabled; the compute
  # dtype is bfloat16, matching the torch_dtype requested below.
  quantization = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
  )
  return AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization,
    torch_dtype=torch.bfloat16,
  )

In [None]:
def initialize_tokenizer(model_name: str):
  """Create the tokenizer for ``model_name``.

  The tokenizer is loaded with ``return_token_type_ids=False`` and its
  beginning-of-sequence token id is then set to 1.

  Args:
    model_name: Identifier passed to ``AutoTokenizer.from_pretrained``.

  Returns:
    The configured tokenizer.
  """
  tok = AutoTokenizer.from_pretrained(model_name, return_token_type_ids=False)
  tok.bos_token_id = 1  # hard-coded BOS id
  return tok

In [None]:
tokenizer=initialize_tokenizer(model_name)

In [None]:
model   =load_quantized_model(model_name)

In [None]:
pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,use_cache=True,device_map="auto",max_length=2048,do_sample=True,top_k=5,num_return_sequences=1,eos_token_id = tokenizer.eos_token_id, pad_token_id = tokenizer.pad_token_id
)


In [None]:
llm=HuggingFacePipeline(pipeline=pipeline)

In [None]:
from langchain.chains import RetrievalQA

In [None]:
normal_chain = RetrievalQA.from_chain_type(llm=llm,chain_type="stuff",retriever=vectorstore_retriever)

In [None]:
hybrid_chain = RetrievalQA.from_chain_type(llm=llm,chain_type="stuff",retriever=retreiver)

In [None]:
response1=normal_chain.invoke("What is RAG token model?")

In [None]:
print(response1["result"])

In [None]:

response2=hybrid_chain.invoke("What is RAG token model?")

In [None]:
response2

In [None]:
print(response2["result"])

# Reranking

In [None]:
!pip install cohere

In [None]:
COHERE_API = userdata.get("COHERE_API")

In [None]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CohereRerank

In [None]:
compressor = CohereRerank(cohere_api_key=COHERE_API)

In [None]:
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retreiver
    )

In [None]:
# `invoke` replaces the deprecated `get_relevant_documents` and matches the
# `.invoke` calls used on the chains in this notebook.
compressed_docs = compression_retriever.invoke("What is RAG token model?")

In [None]:
hybrid_chain = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=compression_retriever
)

In [None]:
response = hybrid_chain.invoke("What is RAG token model?")

In [None]:
response

In [None]:
print(response["result"])