<a href="https://colab.research.google.com/github/Sankalpa0011/LLM-Hybrid-Search-RAG-Keyword-Search-Semantic-Search-/blob/main/Hybrid_Search_RAG_Langchain_OpenAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Hybrid Search RAG** using LangChain and OpenAI

In [1]:
!pip install pypdf -q
!pip install langchain -q
!pip install langchain_community -q
!pip install langchain_openai -q
!pip install langchain_chroma -q
!pip install rank_bm25 -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m79.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m61.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m411.6/411.6 kB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.3/49.3 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m454.3/454.3 kB[0m [31m36.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m67.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Import necessary libraries
import os
from google.colab import userdata

## Initialize OpenAI LLM

In [3]:
from langchain_openai import ChatOpenAI

# Copy the API key from Colab's user secrets into the process environment,
# which is where the OpenAI client looks for it.
os.environ.update(OPENAI_API_KEY=userdata.get('OPENAI_API_KEY'))

# Chat model used for answer generation; temperature 0 keeps sampling
# as deterministic as the API allows.
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

## Initialize Embedding Model

In [4]:
from langchain_openai import OpenAIEmbeddings

# Embedding model that backs the semantic (vector) side of the hybrid search.
embedding_model = OpenAIEmbeddings(
    model="text-embedding-3-small",
)

## Load PDF Document

In [5]:
from google.colab import drive

# Mount Google Drive so the PDF stored there is reachable through the
# local filesystem under /content/drive.
drive.mount(
    '/content/drive',
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
from langchain_community.document_loaders import PyPDFLoader

# Read the portfolio PDF from the mounted Drive folder into LangChain documents.
loader = PyPDFLoader(
    "/content/drive/MyDrive/CodeProLK DL/LLM Hybrid Search RAG (Keyword Search + Semantic Search)/Sankalpa_Portfolio.pdf"
)
docs = loader.load()

## Split Documents into Chunks

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Cut the loaded documents into chunks of at most 250 characters,
# with 30 characters of overlap between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=30,
    chunk_size=250,
)
chunks = splitter.split_documents(docs)

In [8]:
# Number of chunks produced by the splitter.
len(chunks)

15

## Create Semantic Search Retriever

In [9]:
from langchain_chroma import Chroma

# Embed every chunk with `embedding_model` and index the vectors in Chroma.
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
)

# Semantic retriever: returns the 2 most similar chunks for a query.
vectorstore_retreiver = vectorstore.as_retriever(search_kwargs=dict(k=2))

In [10]:
# Display the semantic retriever's repr to confirm its configuration (k=2).
vectorstore_retreiver

VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x7ba387f99060>, search_kwargs={'k': 2})

## Create Keyword Search Retriever

In [11]:
# BM25Retriever is implemented in langchain_community (installed above);
# importing it from there avoids relying on the deprecated re-export
# under `langchain.retrievers`.
from langchain_community.retrievers import BM25Retriever

# Keyword (lexical) retriever: builds a BM25 index over the same chunks
# that were embedded for the semantic retriever.
keyword_retriever = BM25Retriever.from_documents(chunks)

# Return the top 2 chunks per query, matching the semantic retriever's k.
keyword_retriever.k = 2

In [12]:
# Display the keyword retriever's repr to confirm its configuration (k=2).
keyword_retriever

BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x7ba384a51a50>, k=2)

## Create Hybrid Search Retriever

In [13]:
from langchain.retrievers import EnsembleRetriever

# Hybrid retriever: merges the results of the semantic and keyword
# retrievers, giving each of them equal weight.
hybrid_sources = [vectorstore_retreiver, keyword_retriever]
ensemble_retriever = EnsembleRetriever(
    retrievers=hybrid_sources,
    weights=[0.5, 0.5],
)

In [14]:
# Display the hybrid retriever's repr: both sub-retrievers and their weights.
ensemble_retriever

EnsembleRetriever(retrievers=[VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x7ba387f99060>, search_kwargs={'k': 2}), BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x7ba384a51a50>, k=2)], weights=[0.5, 0.5])

## Define Prompt Template

In [15]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

# Text of the single human turn. `{question}` and `{context}` are the
# placeholders filled in when the chain is invoked.
message = (
    "\n"
    "Answer this question using the provided context only.\n"
    "\n"
    "{question}\n"
    "\n"
    "Context:\n"
    "{context}\n"
)

# Wrap the text in a one-message chat prompt.
human_turn = ("human", message)
prompt = ChatPromptTemplate.from_messages([human_turn])

## Create RAG Chain with Hybrid Search

In [16]:
def _format_docs(retrieved_docs):
    """Join the text of the retrieved documents into one context string.

    Without this step the list of Document objects is interpolated into the
    prompt as-is, so the model sees their string form (including metadata)
    instead of just the chunk text.
    """
    return "\n\n".join(doc.page_content for doc in retrieved_docs)


# RAG pipeline: the incoming question is sent to the hybrid retriever (whose
# results are flattened to plain text) and is also passed through unchanged;
# both values fill the prompt, which is then answered by the LLM.
chain = (
    {
      "context": ensemble_retriever | _format_docs,
      "question": RunnablePassthrough()
    }
    | prompt
    | llm
)

## Invoke RAG Chain with Example Questions

In [19]:
# Run the full hybrid-search RAG pipeline on a sample question and
# print only the text of the model's reply.
sample_question = "what are the skills that have sankalpa?"
response = chain.invoke(sample_question)
print(response.content)

The skills that Sankalpa possesses include decision-making, innovation, developing intelligent systems, enhancing human interaction with technology, sharing knowledge through workshops, lectures, and tutorials, and continuous learning and collaboration.


## Compare Retrieved Chunks: `keyword_retriever`, `vectorstore_retreiver`, `ensemble_retriever`

In [20]:
# Chunks returned by the keyword (BM25) retriever alone, each followed by a divider.
keyword_hits = keyword_retriever.invoke("what are the skills that have sankalpa?")
for hit in keyword_hits:
    print(hit.page_content, "---------------------", sep="\n")

Educator and Innovator
Sankalpa actively engages with tech communities, sharing knowledge through workshops, lectures,
and tutorials. Recent lectures on Management Information Systems have received widespread
---------------------
Through continuous learning and collaboration, Sankalpa aspires to leave a lasting impression on
the tech world while making a difference in people's lives.
---------------------


In [21]:
# Chunks returned by the semantic (vector) retriever alone, each followed by a divider.
semantic_hits = vectorstore_retreiver.invoke("what are the skills that have sankalpa?")
for hit in semantic_hits:
    print(hit.page_content, "---------------------", sep="\n")

decision-making. With an unwavering commitment to innovation, Sankalpa specializes in developing
intelligent systems that enhance human interaction with technology, particularly through applications
in health, education, and accessibility.
---------------------
Sankalpa's Portfolio
Introduction to Sankalpa's Portfolio
Background and Vision
Motivation and Inspiration
Sankalpa's journey in technology began with a desire to bridge the gap between data and
---------------------


In [22]:
# Chunks returned by the hybrid retriever, which merges the two result
# lists above; each chunk is followed by a divider.
hybrid_hits = ensemble_retriever.invoke("what are the skills that have sankalpa?")
for hit in hybrid_hits:
    print(hit.page_content, "---------------------", sep="\n")

decision-making. With an unwavering commitment to innovation, Sankalpa specializes in developing
intelligent systems that enhance human interaction with technology, particularly through applications
in health, education, and accessibility.
---------------------
Educator and Innovator
Sankalpa actively engages with tech communities, sharing knowledge through workshops, lectures,
and tutorials. Recent lectures on Management Information Systems have received widespread
---------------------
Sankalpa's Portfolio
Introduction to Sankalpa's Portfolio
Background and Vision
Motivation and Inspiration
Sankalpa's journey in technology began with a desire to bridge the gap between data and
---------------------
Through continuous learning and collaboration, Sankalpa aspires to leave a lasting impression on
the tech world while making a difference in people's lives.
---------------------
