<a href="https://colab.research.google.com/github/Sahilkom/Intern_project/blob/main/Advanced_RAG_Hybrid_Search_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install langchain rank_bm25 pypdf unstructured chromadb
!pip install unstructured['pdf'] unstructured
!apt-get install poppler-utils
!apt-get install -y tesseract-ocr
!apt-get install -y libtesseract-dev
!pip install pytesseract

### Load the required Packages

In [None]:
!pip install langchain_community

In [None]:
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain.llms import HuggingFaceHub

from langchain.retrievers import BM25Retriever, EnsembleRetriever

import os

In [None]:
### Load the PDF file

In [None]:
file_path = "/content/Data_set.pdf"
data_file = UnstructuredPDFLoader(file_path)
docs = data_file.load()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
print(docs[0].page_content)

### Split Documents and Chunking

In [None]:
# create chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=500,
                                          chunk_overlap=100)
chunks = splitter.split_documents(docs)

In [None]:
chunks[0].page_content

'6fyqc-cgb6r\n\ntable_name products sales support_tickets marketing_campaigns Stores marketing campaign information products employees employees departments inventory suppliers orders customers products sales support_tickets\n\ndescription Stores product information Stores sales information Stores customer support ticket information'

In [None]:
# Get Embedding Model from HF via API

from google.colab import userdata
HF_TOKEN = userdata.get('HUGGINGFACEHUB_API_TOKEN')

embeddings = HuggingFaceInferenceAPIEmbeddings(
    api_key=HF_TOKEN, model_name="BAAI/bge-base-en-v1.5"
)

### VectorStore

In [None]:
# Vector store with the selected embedding model
vectorstore = Chroma.from_documents(chunks, embeddings)

In [None]:
vectorstore_retreiver = vectorstore.as_retriever(search_kwargs={"k": 3})

In [None]:
keyword_retriever = BM25Retriever.from_documents(chunks)
keyword_retriever.k =  3

### Ensemble Retriever

In [None]:
ensemble_retriever = EnsembleRetriever(retrievers=[vectorstore_retreiver,
                                                   keyword_retriever],
                                       weights=[0.4, 0.6])

In [None]:
llm = HuggingFaceHub(
    repo_id="HuggingFaceH4/zephyr-7b-beta",
    model_kwargs={"temperature": 0.3,"max_new_tokens":1024},
    huggingfacehub_api_token=HF_TOKEN,
)

  warn_deprecated(


### Prompt Template:

In [None]:
template = """
<|system|>>
                                              !! Hello !!
                                          This is AI Model-2.0
                                        How may I help you today?

CONTEXT: {context}
</s>
<|user|>
{query}
</s>
<|assistant|>
"""

In [None]:
prompt = ChatPromptTemplate.from_template(template)
output_parser = StrOutputParser()

In [None]:
chain = (
    {"context": ensemble_retriever, "query": RunnablePassthrough()}
    | prompt
    | llm
    | output_parser
)

In [None]:
print(chain.invoke("Tell me Name of tables present in data"))

Human: 
<|system|>>
                                              !! Hello !!
                                          This is AI Model-2.0  
                                        How may I help you today?

CONTEXT: [Document(page_content='columns/1/description Name of the product ID of the customer who made the purchase product_id product_id ID of the customer who raised the ticket start_date Name of the campaign category Name of the product last_name First name of the employee last_name First name of the employee manager_id Name of the department warehouse_location ID of the product in inventory contact_name Name of the supplier order_date ID of the customer who placed the order last_name First name of the customer Name of the', metadata={'source': '/content/Data_set.pdf'}), Document(page_content='6fyqc-cgb6r\n\ntable_name products sales support_tickets marketing_campaigns Stores marketing campaign information products employees employees departments inventory suppliers orders cus

In [None]:
print(chain.invoke("Don't count Stores as a table and now tell me the name of all unique tables"))

Human: 
<|system|>>
                                              !! Hello !!
                                          This is AI Model-2.0  
                                        How may I help you today?

CONTEXT: [Document(page_content='6fyqc-cgb6r\n\ntable_name products sales support_tickets marketing_campaigns Stores marketing campaign information products employees employees departments inventory suppliers orders customers products sales support_tickets\n\ndescription Stores product information Stores sales information Stores customer support ticket information', metadata={'source': '/content/Data_set.pdf'}), Document(page_content='Unique identifier for each order Unique identifier for each customer Unique identifier for each product Unique identifier for each sale Unique identifier for each support ticket', metadata={'source': '/content/Data_set.pdf'}), Document(page_content='columns/0/description Unique identifier for each product Unique identifier for each sale Unique ident

In [None]:
print(chain.invoke("Tell me how all tables are related to each other"))

Human: 
<|system|>>
                                              !! Hello !!
                                          This is AI Model-2.0  
                                        How may I help you today?

CONTEXT: [Document(page_content='columns/0/description Unique identifier for each product Unique identifier for each sale Unique identifier for each support ticket Unique identifier for each campaign Unique identifier for each product Unique identifier for each employee Unique identifier for each employee Unique identifier for each department Unique identifier for each inventory record product_id Unique identifier for each supplier Unique identifier for each order Unique identifier for each customer Unique identifier for each', metadata={'source': '/content/Data_set.pdf'}), Document(page_content='Unique identifier for each order Unique identifier for each customer Unique identifier for each product Unique identifier for each sale Unique identifier for each support ticket', metada

In [None]:
print(chain.invoke("Tell me Tables name that can be joined with each other"))

Human: 
<|system|>>
                                              !! Hello !!
                                          This is AI Model-2.0  
                                        How may I help you today?

CONTEXT: [Document(page_content='Unique identifier for each order Unique identifier for each customer Unique identifier for each product Unique identifier for each sale Unique identifier for each support ticket', metadata={'source': '/content/Data_set.pdf'}), Document(page_content='columns/2/name category\n\nPage 3', metadata={'source': '/content/Data_set.pdf'}), Document(page_content='6fyqc-cgb6r\n\ntable_name products sales support_tickets marketing_campaigns Stores marketing campaign information products employees employees departments inventory suppliers orders customers products sales support_tickets\n\ndescription Stores product information Stores sales information Stores customer support ticket information', metadata={'source': '/content/Data_set.pdf'}), Document(page_cont

In [None]:
print(chain.invoke("Give me those tables that can be joined with each other on basis of product_id"))

Human: 
<|system|>>
                                              !! Hello !!
                                          This is AI Model-2.0  
                                        How may I help you today?

CONTEXT: [Document(page_content='Unique identifier for each order Unique identifier for each customer Unique identifier for each product Unique identifier for each sale Unique identifier for each support ticket', metadata={'source': '/content/Data_set.pdf'}), Document(page_content='columns/0/description Unique identifier for each product Unique identifier for each sale Unique identifier for each support ticket Unique identifier for each campaign Unique identifier for each product Unique identifier for each employee Unique identifier for each employee Unique identifier for each department Unique identifier for each inventory record product_id Unique identifier for each supplier Unique identifier for each order Unique identifier for each customer Unique identifier for each', metada