Type of Retrieval:
1. Naive Retrieval 
2. Sentench Window Retrieval 
3. Self Query Retrieval
4. Parent Document Retrieval
5. HDE (Hypothetical Document Embedding) 

In [1]:
#Hybrid Search : Naive Retrieval + Keyword Search

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [2]:
documents = [
    "Autonomous vehicles rely on precise detection of road signs and signals.",
    "Artificial Intelligence plays a critical role in the development of autonomous driving technologies.",
    "Advanced algorithms, such as those used in autonomous driving systems, require versatile programming languages like Python.",
    "Traffic sign detection is a fundamental component of autonomous driving systems.",
    "Data science is essential for analyzing sensor data and improving autonomous driving accuracy."
]

In [3]:
query = "autonomous driving systems"

In [4]:
import re 
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]','', text)
    return text

In [5]:
preprocessed_documents = [preprocess_text(doc) for doc in documents]

In [6]:
preprocessed_query =  preprocess_text(query)

In [7]:
vector = TfidfVectorizer()

In [8]:
X = vector.fit_transform(preprocessed_documents)

In [9]:
X.toarray()[0]

array([0.        , 0.        , 0.        , 0.        , 0.26926948,
       0.        , 0.        , 0.15903489, 0.        , 0.        ,
       0.        , 0.26926948, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.22351799, 0.33375258,
       0.        , 0.33375258, 0.        , 0.        , 0.33375258,
       0.        , 0.33375258, 0.        , 0.        , 0.        ,
       0.        , 0.33375258, 0.33375258, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.33375258, 0.        ])

In [10]:
query_embedding = vector.transform([preprocessed_query])

In [11]:
similarities = cosine_similarity(X, query_embedding)

In [12]:
similarities

array([[0.069312  ],
       [0.16035627],
       [0.29382252],
       [0.41485198],
       [0.13893868]])

In [13]:
ranked_indices = np.argsort(similarities, axis=0)[::-1].flatten()

In [14]:
ranked_indices

array([3, 2, 1, 4, 0], dtype=int64)

In [15]:
ranked_documents = [documents[i] for i in ranked_indices]

In [16]:
query 

'autonomous driving systems'

In [17]:
for i, doc in enumerate(ranked_documents):
    print(f"Ranked{i+1}: {doc}")

Ranked1: Traffic sign detection is a fundamental component of autonomous driving systems.
Ranked2: Advanced algorithms, such as those used in autonomous driving systems, require versatile programming languages like Python.
Ranked3: Artificial Intelligence plays a critical role in the development of autonomous driving technologies.
Ranked4: Data science is essential for analyzing sensor data and improving autonomous driving accuracy.
Ranked5: Autonomous vehicles rely on precise detection of road signs and signals.


In [18]:
document_embeddings = np.array([
    [0.112, 0.422, 0.312, 0.491, 0.645],
    [0.553, 0.126, 0.625, 0.701, 0.725],
    [0.821, 0.859, 0.901, 0.953, 0.991]
])

In [19]:
query_embeddings = np.array([[0.1, 0.2, 0.3, 0.4, 0.5]])

In [20]:
similarities = cosine_similarity(document_embeddings,query_embeddings)

In [21]:
ranked_indices = np.argsort(similarities, axis= 0)[::-1].flatten()

In [22]:
ranked_indices 

array([0, 1, 2], dtype=int64)

In [23]:
doc_path = 'D:\Algo_lab\langchain_essentials\Traffic_Sign_Detection_on_Dhaka_Traffic_Sign_Dataset.pdf'

In [24]:
from langchain_community.document_loaders import PyPDFLoader

In [25]:
loader = PyPDFLoader(doc_path)

In [26]:
docs = loader.load()

In [27]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [28]:
splitter = RecursiveCharacterTextSplitter(chunk_size =200, chunk_overlap = 30)

In [29]:
chunks = splitter.split_documents(docs)

In [30]:
chunks[:5]

[Document(metadata={'source': 'D:\\Algo_lab\\langchain_essentials\\Traffic_Sign_Detection_on_Dhaka_Traffic_Sign_Dataset.pdf', 'page': 0}, page_content='DetectionandRecognitionofBangladeshiTrafficSigninReal-worldImages\nSyedSaminSadaf\n200042163\nMizbaulHaqueMaruf\n200042125\nSyedTamzidBakth\n200042145\nDepartmentofComputerScienceandEngineering'),
 Document(metadata={'source': 'D:\\Algo_lab\\langchain_essentials\\Traffic_Sign_Detection_on_Dhaka_Traffic_Sign_Dataset.pdf', 'page': 0}, page_content='IslamicUniversityofTechnology\nAugust,2024'),
 Document(metadata={'source': 'D:\\Algo_lab\\langchain_essentials\\Traffic_Sign_Detection_on_Dhaka_Traffic_Sign_Dataset.pdf', 'page': 1}, page_content='Contents\n1 Introduction 1\n1.1 Overview . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 1\n1.2 Motivation . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 1'),
 Document(metadata={'source': 'D:\\Algo_lab\\langchain_essentials\\Traffic_Sign_Detection_on_Dhaka_Traf

In [31]:
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings

In [32]:
import os

In [33]:
from dotenv import load_dotenv

In [34]:
load_dotenv()

True

In [35]:
HF_TOKEN = os.environ['HF_TOKEN']

In [54]:
embeddings = HuggingFaceInferenceAPIEmbeddings(api_key=HF_TOKEN, model_name="BAAI/bge-base-en-v1.5")

In [55]:
from langchain.vectorstores import Chroma 

In [56]:
for i, doc in enumerate(chunks):
    if not doc.page_content:
        print(f"Empty content in document {i}")

In [57]:
keys = set(chunks[0].metadata.keys())
for i, doc in enumerate(chunks):
    if set(doc.metadata.keys()) != keys:
        print(f"Inconsistent metadata in document {i}")

In [58]:
print(type(embeddings))

<class 'langchain_community.embeddings.huggingface.HuggingFaceInferenceAPIEmbeddings'>


In [59]:
ids = list(range(len(chunks)))  # Ensure IDs match the number of documents

In [60]:
import traceback

try:
    vectorstore = Chroma.from_documents(chunks, embeddings)
except KeyError as e:
    print("KeyError:", e)
    traceback.print_exc()

KeyError: 0


Traceback (most recent call last):
  File "C:\Users\Hp\AppData\Local\Temp\ipykernel_2604\1236997188.py", line 4, in <module>
    vectorstore = Chroma.from_documents(chunks, embeddings)
  File "d:\Algo_lab\langchain_essentials\venv\lib\site-packages\langchain_community\vectorstores\chroma.py", line 878, in from_documents
    return cls.from_texts(
  File "d:\Algo_lab\langchain_essentials\venv\lib\site-packages\langchain_community\vectorstores\chroma.py", line 842, in from_texts
    chroma_collection.add_texts(texts=texts, metadatas=metadatas, ids=ids)
  File "d:\Algo_lab\langchain_essentials\venv\lib\site-packages\langchain_community\vectorstores\chroma.py", line 295, in add_texts
    [embeddings[idx] for idx in non_empty_ids] if embeddings else None
  File "d:\Algo_lab\langchain_essentials\venv\lib\site-packages\langchain_community\vectorstores\chroma.py", line 295, in <listcomp>
    [embeddings[idx] for idx in non_empty_ids] if embeddings else None
KeyError: 0
