In [1]:
pip install dotenv

Collecting dotenv
  Downloading dotenv-0.9.9-py2.py3-none-any.whl.metadata (279 bytes)
Collecting python-dotenv (from dotenv)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading dotenv-0.9.9-py2.py3-none-any.whl (1.9 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv, dotenv
Successfully installed dotenv-0.9.9 python-dotenv-1.1.0


In [2]:
from dotenv import find_dotenv,load_dotenv

In [None]:
import os

load_dotenv(override=True)  # Loads the variables from .env

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [5]:
# Sample Documents

documents = [
    "This is a list that contains sample documents.",
    "Keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords.",
    "Keyword-based search relies on sparse embeddings."
]

In [6]:
# Search query: the goal is to score every document against this query
# and find out which documents are most similar to it.
query = "keyword-based search"

#### Cleaning the Text

In [7]:
import re
def preprocess_text(text):
    """Normalize a string for keyword matching.

    The text is lowercased, then every character that is neither a word
    character nor whitespace is deleted. Nothing is inserted in place of a
    removed character, so "Keyword-based" becomes "keywordbased".

    Args:
        text: The string to clean.

    Returns:
        The lowercased string with punctuation removed.
    """
    lowered = text.lower()
    return re.sub(r'[^\w\s]', '', lowered)

#### Applying preprocessing on the documents

In [8]:
preprocessed_doc = [preprocess_text(doc) for doc in documents]
print(preprocessed_doc)

['this is a list that contains sample documents', 'keywords are important for keywordbased search', 'document analysis involves extracting keywords', 'keywordbased search relies on sparse embeddings']


## Sparse Vector Implementation (Keyword-Based Search)

#### Applying TF-IDF vectorization to the preprocessed documents and the query (for sparse search)

In [9]:
preprocessed_query = preprocess_text(query)
preprocessed_query

'keywordbased search'

In [10]:
tfidf_obj = TfidfVectorizer()
vectorized_doc = tfidf_obj.fit_transform(preprocessed_doc)

In [11]:
vectorized_doc_arr = vectorized_doc.toarray()
vectorized_doc_arr

array([[0.        , 0.        , 0.37796447, 0.        , 0.37796447,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.37796447, 0.        , 0.        , 0.37796447, 0.        ,
        0.        , 0.37796447, 0.        , 0.        , 0.37796447,
        0.37796447],
       [0.        , 0.4533864 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.4533864 , 0.4533864 , 0.        ,
        0.        , 0.35745504, 0.35745504, 0.        , 0.        ,
        0.        , 0.        , 0.35745504, 0.        , 0.        ,
        0.        ],
       [0.46516193, 0.        , 0.        , 0.46516193, 0.        ,
        0.        , 0.46516193, 0.        , 0.        , 0.46516193,
        0.        , 0.        , 0.36673901, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.43671931, 0.        , 0.        , 0.       

In [12]:
vectorized_doc_arr[0]

array([0.        , 0.        , 0.37796447, 0.        , 0.37796447,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.37796447, 0.        , 0.        , 0.37796447, 0.        ,
       0.        , 0.37796447, 0.        , 0.        , 0.37796447,
       0.37796447])

In [13]:
preprocessed_query

'keywordbased search'

In [14]:
vectorized_query = tfidf_obj.transform([preprocessed_query])
vectorized_query

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2 stored elements and shape (1, 21)>

In [15]:
vectorized_query_arr = vectorized_query.toarray()

In [16]:
vectorized_query_arr

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.70710678, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.70710678, 0.        , 0.        ,
        0.        ]])

#### Finding the Similarity between the query and the documents

In [17]:
similarity = cosine_similarity(vectorized_doc_arr,vectorized_query_arr)

In [18]:
print(similarity)

[[0.        ]
 [0.50551777]
 [0.        ]
 [0.48693426]]


In [19]:
preprocessed_query

'keywordbased search'

In [20]:
preprocessed_doc

['this is a list that contains sample documents',
 'keywords are important for keywordbased search',
 'document analysis involves extracting keywords',
 'keywordbased search relies on sparse embeddings']

#### Ranking of the documents using similarity

In [21]:
# Rank documents from most to least similar to the query:
# argsort along axis 0 orders the row indices by ascending similarity,
# [::-1] reverses that to descending, and flatten() yields a 1-D index array.
ranked_indices = np.argsort(similarity,axis=0)[::-1].flatten()

In [22]:
ranked_indices

array([1, 3, 2, 0])

In [23]:
ranked_doc = [preprocessed_doc[i] for i in ranked_indices]

In [24]:
# Print the ranked (preprocessed) documents, best match first, with 1-based ranks.
for i, doc in enumerate(ranked_doc):
    print(f"Rank {i+1} -> {doc}")

Rank 1 -> keywords are important for keywordbased search
Rank 2 -> keywordbased search relies on sparse embeddings
Rank 3 -> document analysis involves extracting keywords
Rank 4 -> this is a list that contains sample documents


## Dense Vector Implementation (Manual)

In [25]:
document_embedding = np.array([
    [0.634, 0.234, 0.867, 0.042, 0.249],
    [0.123, 0.456, 0.789, 0.321, 0.654],
    [0.987, 0.654, 0.321, 0.123, 0.456]
])

In [26]:
document_embedding

array([[0.634, 0.234, 0.867, 0.042, 0.249],
       [0.123, 0.456, 0.789, 0.321, 0.654],
       [0.987, 0.654, 0.321, 0.123, 0.456]])

In [27]:
query_embedding = np.array([[0.789, 0.321, 0.654, 0.987, 0.123]])

In [28]:
query_embedding

array([[0.789, 0.321, 0.654, 0.987, 0.123]])

In [29]:
similarities = cosine_similarity(document_embedding,query_embedding)

In [30]:
similarities

array([[0.73558979],
       [0.67357898],
       [0.71517305]])

In [31]:
ranked_indices_dense = np.argsort(similarities,axis=0)[::-1].flatten()

In [32]:
ranked_indices_dense

array([0, 2, 1])

In [33]:
# Print the dense ranking, best match first. Both the rank and the document
# number are shown 1-based (idx is a 0-based row index into document_embedding).
for i, idx in enumerate(ranked_indices_dense):
    print(f"Rank {i+1}: Document {idx+1}")

Rank 1: Document 1
Rank 2: Document 3
Rank 3: Document 2


### Using OpenAI Embeddings for Dense Embedding

# Using LangChain

In [34]:
pip install langchain langchain-core langchain_community

Collecting langchain_community
  Downloading langchain_community-0.3.25-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-core
  Downloading langchain_core-0.3.65-py3-none-any.whl.metadata (5.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langsmith<0.4,>=0.1.17 (from langchain)
  Downloading langsmith-0.3.45-py3-none-any.whl.metadata (15 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading ty

In [35]:
pip install pypdf

Collecting pypdf
  Downloading pypdf-5.6.0-py3-none-any.whl.metadata (7.2 kB)
Downloading pypdf-5.6.0-py3-none-any.whl (304 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/304.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m297.0/304.2 kB[0m [31m16.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m304.2/304.2 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-5.6.0


In [36]:
from langchain_community.document_loaders import PyPDFLoader

In [38]:
# Load the sample PDF into LangChain Document objects (see the printed output).
# NOTE(review): '/content/sample_data/...' is an absolute path, presumably a
# Google Colab location; adjust it when running anywhere else.
loader = PyPDFLoader('/content/sample_data/Machine_Learning_Essay.pdf')
# NOTE: this rebinds `documents`, which earlier in the notebook held the list of
# sample strings used for the TF-IDF example.
documents = loader.load()
print(documents)

[Document(metadata={'producer': 'PyFPDF 1.7.2 http://pyfpdf.googlecode.com/', 'creator': 'PyPDF', 'creationdate': 'D:20250410052841', 'source': '/content/sample_data/Machine_Learning_Essay.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}, page_content='Machine Learning: Transforming the Future of Technology\nMachine Learning (ML) is a dynamic and rapidly evolving branch of artificial intelligence (AI) that\nenables systems to learn from data, identify patterns, and make decisions with minimal human\nintervention. It has revolutionized how we interact with technology, contributing to advancements in\nnumerous fields such as healthcare, finance, transportation, and entertainment.\nAt its core, machine learning is about developing algorithms that allow computers to learn from and\nmake predictions or decisions based on data. These algorithms improve over time as they are\nexposed to more data, enhancing their accuracy and efficiency. ML is categorized into three main\ntypes: supervis

In [39]:
documents[0].page_content

'Machine Learning: Transforming the Future of Technology\nMachine Learning (ML) is a dynamic and rapidly evolving branch of artificial intelligence (AI) that\nenables systems to learn from data, identify patterns, and make decisions with minimal human\nintervention. It has revolutionized how we interact with technology, contributing to advancements in\nnumerous fields such as healthcare, finance, transportation, and entertainment.\nAt its core, machine learning is about developing algorithms that allow computers to learn from and\nmake predictions or decisions based on data. These algorithms improve over time as they are\nexposed to more data, enhancing their accuracy and efficiency. ML is categorized into three main\ntypes: supervised learning, unsupervised learning, and reinforcement learning.\nSupervised learning involves training a model on a labeled dataset, where the input data is paired\nwith the correct output. The model learns to map inputs to the correct output and is then te

In [40]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [41]:
# Split the loaded pages into small overlapping chunks for retrieval.
# chunk_size / chunk_overlap are measured by the splitter's length function
# (characters by default), so consecutive chunks may share up to 30 characters.
splitter = RecursiveCharacterTextSplitter(chunk_size=100,chunk_overlap=30)
chunks = splitter.split_documents(documents)
# Preview only the first few chunks; printing the whole list floods the output
# with one full Document repr (including metadata) per chunk.
chunks[:3]

[Document(metadata={'producer': 'PyFPDF 1.7.2 http://pyfpdf.googlecode.com/', 'creator': 'PyPDF', 'creationdate': 'D:20250410052841', 'source': '/content/sample_data/Machine_Learning_Essay.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}, page_content='Machine Learning: Transforming the Future of Technology'), Document(metadata={'producer': 'PyFPDF 1.7.2 http://pyfpdf.googlecode.com/', 'creator': 'PyPDF', 'creationdate': 'D:20250410052841', 'source': '/content/sample_data/Machine_Learning_Essay.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}, page_content='Machine Learning (ML) is a dynamic and rapidly evolving branch of artificial intelligence (AI) that'), Document(metadata={'producer': 'PyFPDF 1.7.2 http://pyfpdf.googlecode.com/', 'creator': 'PyPDF', 'creationdate': 'D:20250410052841', 'source': '/content/sample_data/Machine_Learning_Essay.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}, page_content='enables systems to learn from data, identify patterns, and make de

In [42]:
len(chunks)

44

In [43]:
from langchain.embeddings import OpenAIEmbeddings

In [None]:
import os

# Read the API keys from the environment instead of hardcoding them in the
# notebook; load_dotenv(override=True) in an earlier cell populates the
# environment from the .env file. os.getenv returns None when a variable is unset.
# NOTE(review): OPENAI_API_KEY / LANGSMITH_API_KEY are the conventional variable
# names; make sure the .env file uses the same ones.
openai_api_key = os.getenv("OPENAI_API_KEY")
langsmith_key = os.getenv("LANGSMITH_API_KEY")

In [45]:
# Embedding client for the dense (semantic) side of retrieval: uses the
# "text-embedding-3-large" model, authenticated with `openai_api_key`.
# NOTE(review): the recorded output of this cell echoes the source line, which
# looks like the tail of a warning (presumably a deprecation notice for the
# `langchain.embeddings` import path); confirm and migrate if so.
embedding = OpenAIEmbeddings(api_key=openai_api_key,model="text-embedding-3-large")

  embedding = OpenAIEmbeddings(api_key=openai_api_key,model="text-embedding-3-large")


In [46]:
from langchain.vectorstores import Chroma

In [47]:
pip install chromadb

Collecting chromadb
  Downloading chromadb-1.0.12-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting fastapi==0.115.9 (from chromadb)
  Downloading fastapi-0.115.9-py3-none-any.whl.metadata (27 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-4.10.0-py3-none-any.whl.metadata (6.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.34.1-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.34.1-py3-none-any.whl.metadata (2.4 kB)
Collecting opentelemetry-instrumentation-fastapi>=0.41b0 (from chromadb)
  Downloading opentelemetry_instrumentation_fastapi-0.55b1-py3-none-any.whl.metadata (2.2 kB)
Collecting opentelemetry-sdk>=1.2.0 (from 

In [48]:
# Build a Chroma vector store from the chunks, using `embedding` to embed them.
vector_store = Chroma.from_documents(chunks,embedding)

In [49]:
# Dense retriever over the vector store; k=3 asks for the top 3 chunks per query.
vectorstore_retriever = vector_store.as_retriever(search_kwargs={"k":3})

In [50]:
vectorstore_retriever

VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x7df57cb901d0>, search_kwargs={'k': 3})

## We also need a keyword-based retriever

In [51]:
pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [52]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever

In [53]:
# Keyword (sparse) retriever: a BM25 index built over the same chunks that were
# embedded into the vector store.
# NOTE(review): no result count is set here, so BM25Retriever uses its default,
# which may differ from the k=3 configured for the vector retriever; confirm.
keyword_retriever = BM25Retriever.from_documents(chunks)

## Ensemble Retriever

In [54]:
# Hybrid retriever combining the dense and the keyword retriever. The weights
# are listed in the same order as the retrievers: 0.7 for the vector-store
# retriever and 0.3 for the BM25 retriever.
ensemble_retriever = EnsembleRetriever(retrievers=[vectorstore_retriever,keyword_retriever],weights=[0.7,0.3])

In [55]:
model_name = "HuggingFaceH4/zephyr-7b-beta"

In [56]:
pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-c

In [57]:
pip install accelerate



In [58]:
import torch

In [59]:
pip install transformers



In [60]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from langchain import HuggingFacePipeline

In [61]:
def load_quantized_model(model_name: str):
  """
  Load a causal language model with 4-bit quantization.

  The quantization config enables 4-bit loading with the "nf4" quantization
  type and double quantization, and uses bfloat16 as the compute dtype.
  bfloat16 is also passed as the model's torch dtype.

  model_name :- Name or path of the model to be loaded
  return: Loaded quantized model
  """
  quantization_settings = {
      "load_in_4bit": True,
      "bnb_4bit_use_double_quant": True,
      "bnb_4bit_quant_type": "nf4",
      "bnb_4bit_compute_dtype": torch.bfloat16,
  }
  return AutoModelForCausalLM.from_pretrained(
      model_name,
      quantization_config=BitsAndBytesConfig(**quantization_settings),
      torch_dtype=torch.bfloat16,
  )

In [62]:
def initialize_tokenizer(model_name: str):
  """
  Create the tokenizer for the given model.

  The tokenizer is loaded with return_token_type_ids=False, and its
  beginning-of-sequence token id is set to its end-of-sequence token id.

  model_name :- Name or path of the model to be loaded for tokenizer initialization
  return: Initialized tokenizer
  """
  tok = AutoTokenizer.from_pretrained(model_name, return_token_type_ids=False)
  # Reuse the EOS id as the BOS id.
  tok.bos_token_id = tok.eos_token_id
  return tok

In [None]:
import os

# Read the Hugging Face token from the environment (e.g. from the .env file
# loaded earlier) instead of hardcoding it in the notebook. None if unset.
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

In [68]:
tokenizer = initialize_tokenizer(model_name)

In [None]:
model = load_quantized_model(model_name)

In [66]:
# Build the text-generation pipeline from the quantized model and its tokenizer.
# Requires `model` and `tokenizer` from the preceding cells; the recorded
# NameError came from running this cell before `model` had been loaded.
#
# The factory is imported under an alias because this cell binds the name
# `pipeline` to the resulting pipeline object (the next cell passes it to
# HuggingFacePipeline). The original `pipeline = pipeline(...)` overwrote the
# imported factory, so re-running the cell no longer called the factory.
from transformers import pipeline as build_pipeline

pipeline = build_pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
    # NOTE(review): max_length presumably counts the prompt tokens as well, which
    # may be tight once retrieved context is stuffed into the prompt; confirm
    # and consider max_new_tokens instead.
    max_length=200,
    do_sample=True,
    top_k=10,
    num_return_sequences=1
)


NameError: name 'model' is not defined

In [69]:
llm=HuggingFacePipeline(pipeline=pipeline)

  llm=HuggingFacePipeline(pipeline=pipeline)


In [70]:
from langchain.chains import RetrievalQA

In [71]:
normal_chain=RetrievalQA.from_chain_type(llm=llm,chain_type="stuff",retriever=vectorstore_retriever)

In [72]:
hybrid_chain=RetrievalQA.from_chain_type(llm=llm,chain_type="stuff",retriever=ensemble_retriever)