In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [5]:
#Sample documents

documents = [



"This is a list which containig sample documents.",

"Keywords are important for keyword-based search.",

"Document analysis involves extracting keywords.",

"Keyword-based search relies on sparse embeddings."
]

In [7]:
import re

def preprocess_text(text):

  text =text.lower()
  text =re.sub(r' [^\w\s]', '', text)

  return text

In [8]:
preprocess_documents = [preprocess_text(doc) for doc in documents]

In [10]:
preprocess_documents

['this is a list which containig sample documents.',
 'keywords are important for keyword-based search.',
 'document analysis involves extracting keywords.',
 'keyword-based search relies on sparse embeddings.']

In [11]:
preprocessed_query = preprocess_text("Keyword-based search")

In [12]:
preprocessed_query

'keyword-based search'

In [13]:
vector = TfidfVectorizer()

In [14]:
X = vector.fit_transform(preprocess_documents)

In [15]:
X.toarray()

array([[0.        , 0.        , 0.        , 0.37796447, 0.        ,
        0.37796447, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.37796447, 0.        , 0.        , 0.37796447,
        0.        , 0.        , 0.37796447, 0.        , 0.        ,
        0.37796447, 0.37796447],
       [0.        , 0.42693074, 0.3365971 , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.42693074, 0.42693074,
        0.        , 0.        , 0.3365971 , 0.3365971 , 0.        ,
        0.        , 0.        , 0.        , 0.3365971 , 0.        ,
        0.        , 0.        ],
       [0.46516193, 0.        , 0.        , 0.        , 0.46516193,
        0.        , 0.        , 0.46516193, 0.        , 0.        ,
        0.46516193, 0.        , 0.        , 0.36673901, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.32555709, 0.        , 0.        ,
        0.       

In [16]:
query_embedding=vector.transform([preprocessed_query])

In [17]:
query_embedding.toarray()

array([[0.        , 0.        , 0.57735027, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.57735027, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.57735027, 0.        ,
        0.        , 0.        ]])

In [18]:
similarities = cosine_similarity(X, query_embedding)

In [19]:
similarities

array([[0.        ],
       [0.58300328],
       [0.        ],
       [0.56388142]])

In [22]:
ranked_documents = [documents[i] for i in ranked_indices]

In [23]:
ranked_indices=np.argsort(similarities, axis=0)[::-1].flatten()

In [24]:
ranked_indices

array([1, 3, 2, 0])

In [25]:
for i,doc in enumerate(ranked_documents):
  print(f"Rank {i+1}: {doc}")

Rank 1: Keywords are important for keyword-based search.
Rank 2: Keyword-based search relies on sparse embeddings.
Rank 3: Document analysis involves extracting keywords.
Rank 4: This is a list which containig sample documents.


In [27]:
query="Keyword-based search"

In [28]:
document_embeddings = np.array([
[0.634, 0.234, 0.867, 0.042, 0.249],
[0.123, 0.456, 0.789, 0.321, 0.654],
[0.987, 0.654, 0.321, 0.123, 0.456]])

In [29]:
query_embedding = np.array([[0.789, 0.321,0.654,0.987,0.123]])

In [30]:
similarities = cosine_similarity(document_embeddings, query_embedding)

In [31]:
ranked_indices=np.argsort(similarities, axis=0)[::-1].flatten()

In [32]:
ranked_indices=np.argsort(similarities, axis=0)[::-1].flatten()

In [33]:
ranked_indices

array([0, 2, 1])

In [34]:
for i,idx in enumerate(ranked_indices):
  print(f"Rank {i+1}: Document {idx+1}")

Rank 1: Document 1
Rank 2: Document 3
Rank 3: Document 2


In [1]:
doc_path = "/content/2005.11401v4.pdf"

In [2]:
!pip install pypdf langchain_community



In [3]:
from langchain_community.document_loaders import PyPDFLoader

In [4]:
loader=PyPDFLoader(doc_path)

In [5]:
docs=loader.load()

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [7]:
splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=30)

In [8]:
chunks = splitter.split_documents(docs)

In [9]:
chunks

[Document(metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2021-04-13T00:48:38+00:00', 'author': '', 'keywords': '', 'moddate': '2021-04-13T00:48:38+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': '/content/2005.11401v4.pdf', 'total_pages': 19, 'page': 0, 'page_label': '1'}, page_content='Retrieval-Augmented Generation for\nKnowledge-Intensive NLP Tasks\nPatrick Lewis†‡, Ethan Perez⋆,\nAleksandra Piktus†, Fabio Petroni†, Vladimir Karpukhin†, Naman Goyal†, Heinrich Küttler†,'),
 Document(metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2021-04-13T00:48:38+00:00', 'author': '', 'keywords': '', 'moddate': '2021-04-13T00:48:38+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/F

In [10]:
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings

In [11]:
from google.colab import userdata

HF_TOKEN=userdata.get("HF_TOKEN")

In [12]:
embeddings = HuggingFaceInferenceAPIEmbeddings(api_key=HF_TOKEN,model_name="BAAI/bge-base-en-v1.5")

In [22]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-1.0.7-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi==0.115.9 (from chromadb)
  Downloading fastapi-0.115.9-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-4.0.0-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.21.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelem

In [13]:
from langchain.vectorstores import Chroma

In [14]:
vectorstore=Chroma.from_documents(chunks,embeddings)

In [15]:
vectorstore_retriever=vectorstore.as_retriever(search_kwargs={"k":3})

In [26]:
!pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [16]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever

In [17]:
keyword_retreiver = BM25Retriever.from_documents(chunks)

In [18]:
retreiver = EnsembleRetriever(retrievers=[vectorstore_retriever,keyword_retreiver],weights=[0.5,0.5])

In [19]:
model_name="HuggingFaceH4/zephyr-7b-beta"

In [31]:
!pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-

In [20]:
!pip install accelerate



In [21]:
import torch
from transformers import (AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline)
from langchain import HuggingFacePipeline

In [22]:
def load_quantized_model(model_name: str):



  bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
  )

  model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
  )

  return model

In [23]:
def initialize_tokenizer(model_name: str):
  tokenizer =AutoTokenizer.from_pretrained(model_name, return_token_type_ids=False)

  tokenizer.bos_token_id = 1

  return tokenizer

In [24]:
tokenizer=initialize_tokenizer(model_name)

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [25]:
model   =load_quantized_model(model_name)

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not in

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [26]:
pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,use_cache=True,device_map="auto",max_length=2048,do_sample=True,top_k=5,num_return_sequences=1,eos_token_id = tokenizer.eos_token_id, pad_token_id = tokenizer.pad_token_id
)


Device set to use cuda:0


In [27]:
llm=HuggingFacePipeline(pipeline=pipeline)

  llm=HuggingFacePipeline(pipeline=pipeline)


In [28]:
from langchain.chains import RetrievalQA

In [29]:
normal_chain = RetrievalQA.from_chain_type(llm=llm,chain_type="stuff",retriever=vectorstore_retriever)

In [30]:
hybrid_chain = RetrievalQA.from_chain_type(llm=llm,chain_type="stuff",retriever=retreiver)

In [31]:
response1=normal_chain.invoke("What is RAG token model?")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [32]:
print(response1["result"])

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

RAG-Token The RAG-Token model can be seen as a standard, autoregressive seq2seq genera-
tor with transition probability: p′
θ(yi|x,y1:i−1) = ∑
z∈top-k(p(·|x)) pη(zi|x)pθ(yi|x,zi,y1:i−1) To

distribution over generated text. In one approach, RAG-Sequence, the model uses the same document
to predict each target token. The second approach, RAG-Token, can predict each target token based

2.1 Models
RAG-Sequence Model The RAG-Sequence model uses the same retrieved document to generate
the complete sequence. Technically, it treats the retrieved document as a single latent variable that

Question: What is RAG token model?
Helpful Answer: RAG-Token is a model in natural language generation that predicts each target token based on the context of the text rather than using the same retrieved document to predict all target tokens, as i

In [33]:

response2=hybrid_chain.invoke("What is RAG token model?")

In [34]:
response2

{'query': 'What is RAG token model?',
 'result': "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\nRAG-Token The RAG-Token model can be seen as a standard, autoregressive seq2seq genera-\ntor with transition probability: p′\nθ(yi|x,y1:i−1) = ∑\nz∈top-k(p(·|x)) pη(zi|x)pθ(yi|x,zi,y1:i−1) To\n\nSun Also Rises”. Similarly, document 1 dominates the posterior when “A Farewell to Arms” is\ngenerated. Intriguingly, after the ﬁrst token of each book is generated, the document posterior ﬂattens.\n\ndistribution over generated text. In one approach, RAG-Sequence, the model uses the same document\nto predict each target token. The second approach, RAG-Token, can predict each target token based\n\nthe parameters of a language model? arXiv e-prints, 2020. URL https://arxiv.org/abs/\n2002.08910.\n[53] Stephen Robertson and Hugo Zaragoza. The probabilistic relevance framework: Bm25 and

In [35]:
print(response2["result"])

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

RAG-Token The RAG-Token model can be seen as a standard, autoregressive seq2seq genera-
tor with transition probability: p′
θ(yi|x,y1:i−1) = ∑
z∈top-k(p(·|x)) pη(zi|x)pθ(yi|x,zi,y1:i−1) To

Sun Also Rises”. Similarly, document 1 dominates the posterior when “A Farewell to Arms” is
generated. Intriguingly, after the ﬁrst token of each book is generated, the document posterior ﬂattens.

distribution over generated text. In one approach, RAG-Sequence, the model uses the same document
to predict each target token. The second approach, RAG-Token, can predict each target token based

the parameters of a language model? arXiv e-prints, 2020. URL https://arxiv.org/abs/
2002.08910.
[53] Stephen Robertson and Hugo Zaragoza. The probabilistic relevance framework: Bm25 and

2.1 Models
RAG-Sequence Model The RAG-Sequence model uses the s

#Reranking

In [36]:
!pip install cohere

Collecting cohere
  Downloading cohere-5.15.0-py3-none-any.whl.metadata (3.4 kB)
Collecting fastavro<2.0.0,>=1.9.4 (from cohere)
  Downloading fastavro-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.5 kB)
Collecting types-requests<3.0.0,>=2.0.0 (from cohere)
  Downloading types_requests-2.32.0.20250328-py3-none-any.whl.metadata (2.3 kB)
Downloading cohere-5.15.0-py3-none-any.whl (259 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m259.5/259.5 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fastavro-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m57.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading types_requests-2.32.0.20250328-py3-none-any.whl (20 kB)
Installing collected packages: types-requests, fastavro, cohere
Successfully installed cohere-5.15.0 fastavro-1.10.0 types-requests-2.32.0.20250328


In [37]:
COHERE_API = userdata.get("COHERE_API")

In [41]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CohereRerank

In [42]:
compressor = CohereRerank(cohere_api_key=COHERE_API)

  compressor = CohereRerank(cohere_api_key=COHERE_API)


In [45]:
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retreiver
    )

In [48]:
compressed_docs = compression_retriever.get_relevant_documents("What is RAG token model?")

  compressed_docs = compression_retriever.get_relevant_documents("What is RAG token model?")


In [49]:
hybrid_chain = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=compression_retriever
)

In [50]:
response = hybrid_chain.invoke("What is RAG token model?")

In [53]:
response

{'query': 'What is RAG token model?',
 'result': "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\nRAG-Token The RAG-Token model can be seen as a standard, autoregressive seq2seq genera-\ntor with transition probability: p′\nθ(yi|x,y1:i−1) = ∑\nz∈top-k(p(·|x)) pη(zi|x)pθ(yi|x,zi,y1:i−1) To\n\ndistribution over generated text. In one approach, RAG-Sequence, the model uses the same document\nto predict each target token. The second approach, RAG-Token, can predict each target token based\n\n2.1 Models\nRAG-Sequence Model The RAG-Sequence model uses the same retrieved document to generate\nthe complete sequence. Technically, it treats the retrieved document as a single latent variable that\n\nQuestion: What is RAG token model?\nHelpful Answer: The RAG token model is a variant of the standard autoregressive seq2seq generator, with an added transition probability that uses a 

In [52]:
print(response["result"])

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

RAG-Token The RAG-Token model can be seen as a standard, autoregressive seq2seq genera-
tor with transition probability: p′
θ(yi|x,y1:i−1) = ∑
z∈top-k(p(·|x)) pη(zi|x)pθ(yi|x,zi,y1:i−1) To

distribution over generated text. In one approach, RAG-Sequence, the model uses the same document
to predict each target token. The second approach, RAG-Token, can predict each target token based

2.1 Models
RAG-Sequence Model The RAG-Sequence model uses the same retrieved document to generate
the complete sequence. Technically, it treats the retrieved document as a single latent variable that

Question: What is RAG token model?
Helpful Answer: The RAG token model is a variant of the standard autoregressive seq2seq generator, with an added transition probability that uses a distribution over generated text instead of just predicting each 