Type of Retrieval:
1. Naive Retrieval 
2. Sentence Window Retrieval
3. Self Query Retrieval
4. Parent Document Retrieval
5. HyDE (Hypothetical Document Embeddings)

In [1]:
#Hybrid Search : Naive Retrieval + Keyword Search

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [19]:
# Toy corpus for the TF-IDF ranking demo: five one-sentence documents, each
# mentioning autonomous vehicles or autonomous driving.
documents = [
    "Autonomous vehicles rely on precise detection of road signs and signals.",
    "Artificial Intelligence plays a critical role in the development of autonomous driving technologies.",
    "Advanced algorithms, such as those used in autonomous driving systems, require versatile programming languages like Python.",
    "Traffic sign detection is a fundamental component of autonomous driving systems.",
    "Data science is essential for analyzing sensor data and improving autonomous driving accuracy."
]

In [20]:
query = "autonomous driving systems"

In [21]:
import re 
def preprocess_text(text):
    """Normalize ``text`` for keyword matching.

    The input is lowercased and every character that is neither a word
    character (letters, digits, underscore) nor whitespace is removed.

    Args:
        text: The string to normalize.

    Returns:
        The lowercased string with punctuation stripped. Whitespace is left
        untouched, so words joined by punctuation (e.g. hyphens) are merged.
    """
    lowered = text.lower()
    return re.sub(r'[^\w\s]', '', lowered)

In [22]:
# Normalize every document in the corpus, preserving the original order.
preprocessed_documents = list(map(preprocess_text, documents))

In [23]:
preprocessed_query =  preprocess_text(query)

In [24]:
vector = TfidfVectorizer()

In [25]:
X = vector.fit_transform(preprocessed_documents)

In [26]:
X.toarray()[0]

array([0.        , 0.        , 0.        , 0.        , 0.26926948,
       0.        , 0.        , 0.15903489, 0.        , 0.        ,
       0.        , 0.26926948, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.22351799, 0.33375258,
       0.        , 0.33375258, 0.        , 0.        , 0.33375258,
       0.        , 0.33375258, 0.        , 0.        , 0.        ,
       0.        , 0.33375258, 0.33375258, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.33375258, 0.        ])

In [27]:
query_embedding = vector.transform([preprocessed_query])

In [28]:
similarities = cosine_similarity(X, query_embedding)

In [29]:
similarities

array([[0.069312  ],
       [0.16035627],
       [0.29382252],
       [0.41485198],
       [0.13893868]])

In [30]:
# argsort orders rows by ascending similarity; flip that column so the most
# similar document comes first, then collapse it to a 1-D index array.
ranked_indices = np.flipud(np.argsort(similarities, axis=0)).reshape(-1)

In [31]:
ranked_indices

array([3, 2, 1, 4, 0], dtype=int64)

In [32]:
ranked_documents = [documents[i] for i in ranked_indices]

In [33]:
query 

'autonomous driving systems'

In [34]:
for i, doc in enumerate(ranked_documents):
    print(f"Ranked{i+1}: {doc}")

Ranked1: Traffic sign detection is a fundamental component of autonomous driving systems.
Ranked2: Advanced algorithms, such as those used in autonomous driving systems, require versatile programming languages like Python.
Ranked3: Artificial Intelligence plays a critical role in the development of autonomous driving technologies.
Ranked4: Data science is essential for analyzing sensor data and improving autonomous driving accuracy.
Ranked5: Autonomous vehicles rely on precise detection of road signs and signals.


In [36]:
# Three hand-written 5-dimensional vectors, one row per document, used to
# demonstrate cosine-similarity ranking on dense vectors.
# NOTE(review): presumably placeholder values rather than real model output — confirm.
document_embeddings = np.array([
    [0.112, 0.422, 0.312, 0.491, 0.645],
    [0.553, 0.126, 0.625, 0.701, 0.725],
    [0.821, 0.859, 0.901, 0.953, 0.991]
])

In [37]:
query_embeddings = np.array([[0.1, 0.2, 0.3, 0.4, 0.5]])

In [38]:
similarities = cosine_similarity(document_embeddings,query_embeddings)

In [40]:
# argsort orders rows by ascending similarity; flip that column so the most
# similar embedding comes first, then collapse it to a 1-D index array.
ranked_indices = np.flipud(np.argsort(similarities, axis=0)).reshape(-1)

In [41]:
ranked_indices 

array([0, 1, 2], dtype=int64)

In [45]:
# Location of the PDF to load. Written as a raw string so the Windows
# backslashes are taken literally: in a normal string literal "\A", "\l" and
# "\T" are invalid escape sequences, which only work by accident and emit a
# DeprecationWarning (SyntaxWarning on Python 3.12+).
# NOTE(review): absolute, machine-specific path — consider making it configurable.
doc_path = r'D:\Algo_lab\langchain_essentials\Traffic_Sign_Detection_on_Dhaka_Traffic_Sign_Dataset.pdf'

In [46]:
from langchain_community.document_loaders import PyPDFLoader

In [47]:
loader = PyPDFLoader(doc_path)

In [48]:
docs = loader.load()

In [49]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [50]:
splitter = RecursiveCharacterTextSplitter(chunk_size =200, chunk_overlap = 30)

In [51]:
chunks = splitter.split_documents(docs)

In [54]:
chunks[:5]

[Document(metadata={'source': 'D:\\Algo_lab\\langchain_essentials\\Traffic_Sign_Detection_on_Dhaka_Traffic_Sign_Dataset.pdf', 'page': 0}, page_content='DetectionandRecognitionofBangladeshiTrafficSigninReal-worldImages\nSyedSaminSadaf\n200042163\nMizbaulHaqueMaruf\n200042125\nSyedTamzidBakth\n200042145\nDepartmentofComputerScienceandEngineering'),
 Document(metadata={'source': 'D:\\Algo_lab\\langchain_essentials\\Traffic_Sign_Detection_on_Dhaka_Traffic_Sign_Dataset.pdf', 'page': 0}, page_content='IslamicUniversityofTechnology\nAugust,2024'),
 Document(metadata={'source': 'D:\\Algo_lab\\langchain_essentials\\Traffic_Sign_Detection_on_Dhaka_Traffic_Sign_Dataset.pdf', 'page': 1}, page_content='Contents\n1 Introduction 1\n1.1 Overview . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 1\n1.2 Motivation . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 1'),
 Document(metadata={'source': 'D:\\Algo_lab\\langchain_essentials\\Traffic_Sign_Detection_on_Dhaka_Traf

In [68]:
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings

In [69]:
import os

In [70]:
from dotenv import load_dotenv

In [71]:
load_dotenv()

True

In [72]:
HF_TOKEN = os.environ['HF_TOKEN']

In [73]:
embeddings= HuggingFaceInferenceAPIEmbeddings(api_key=HF_TOKEN, model_name ='BAAI/bge-base-en-v1.5')

In [76]:
from langchain.vectorstores import Chroma 

In [77]:
# Embed every chunk and index the resulting vectors in a Chroma store.
# NOTE(review): the recorded run of this cell ended in `KeyError: 0`. Presumably
# the embedding endpoint returned something other than a list of vectors (for
# example an error payload) — confirm by calling `embeddings.embed_documents`
# on a small sample and inspecting the result before building the store.
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings)

KeyError: 0