<a href="https://colab.research.google.com/github/Sahilkom/Intern_project/blob/main/Task_1/Modified_RAG_Hybrid_Search_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing and Importing Required Dependencies

In [None]:
!pip install langchain rank_bm25 pypdf unstructured chromadb
!pip install unstructured['pdf'] unstructured
!apt-get install poppler-utils
!apt-get install -y tesseract-ocr
!apt-get install -y libtesseract-dev
!pip install pytesseract
!pip install fpdf
!pip install langchain_community

In [2]:
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain.llms import HuggingFaceHub

from langchain.retrievers import BM25Retriever, EnsembleRetriever

from fpdf import FPDF

import os

# Pre-Processing Data

In [4]:
import json
def load_data():
    with open("/content/all_tables_data.json", 'r') as f:
        return json.load(f)

data=load_data()

In [5]:
# create chunks
data_set = []
for table in data['tables']:
    table_info=[]
    table_name=f"Table name: {table['table_name']}"
    table_description=f"Description: {table['description']}"
    table_info.append(table_name)
    table_info.append(table_description)
    title="Column name   Description"
    table_info.append(title)
    index=1
    for col in table['columns']:
        col_info=f" {col['name']}  ({col['description']})"
        col_info=str(index)+". "+col_info
        table_info.append(col_info)
        index+=1
    if data_set.count(table_info) <= 0:
        data_set.append(table_info)


In [30]:
for chunk in data_set:
    for info in chunk:
        print(info+"\n")
    print("\n")

Table name: products

Description: Stores product information

Column name   Description

1.  product_code  (ID of the purchased item)

2.  product_name  (Name of the product)

3.  category  (Product category (e.g., Smartphone, TV))

4.  launch_date  (Launch date of the product)

5.  price  (Price of the product)

6.  manufacturer  (Manufacturer of the product)

7.  warranty_period  (Warranty period of the product)

8.  stock_quantity  (Quantity of product in stock)

9.  rating  (Customer rating of the product)

10.  dimensions  (Dimensions of the product)

11.  weight  (Weight of the product)

12.  color  (Color of the product)

13.  material  (Material of the product)

14.  power_usage  (Power usage of the product)

15.  model_number  (Model number of the product)



Table name: sales

Description: Stores sales information

Column name   Description

1.  sale_id  (Unique identifier for each sale)

2.  buyer_id  (ID of the customer who made the purchase)

3.  transaction_product_id  (

In [7]:
pdf = FPDF()

for table in data_set:
  pdf.add_page()
  for info in table:
    pdf.set_font("Arial", size = 10)
    pdf.cell(2000, 10, txt = info, ln = 1, align = 'L')

pdf.output("new_data.pdf")

''

# Importing processed data in the form of Documnet

In [8]:
file_path = "/content/new_data.pdf"
data_file = UnstructuredPDFLoader(file_path)
docs = data_file.load()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [31]:
print(docs[0].page_content)

Table name: products

Description: Stores product information

Column name Description

1. product_code (ID of the purchased item)

2. product_name (Name of the product)

3. category (Product category (e.g., Smartphone, TV))

4. launch_date (Launch date of the product)

5. price (Price of the product)

6. manufacturer (Manufacturer of the product)

7. warranty_period (Warranty period of the product)

8. stock_quantity (Quantity of product in stock)

9. rating (Customer rating of the product)

10. dimensions (Dimensions of the product)

11. weight (Weight of the product)

12. color (Color of the product)

13. material (Material of the product)

14. power_usage (Power usage of the product)

15. model_number (Model number of the product)

Table name: sales

Description: Stores sales information

Column name Description

1. sale_id (Unique identifier for each sale)

2. buyer_id (ID of the customer who made the purchase)

3. transaction_product_id (ID of the purchased item)

4. sale_date (D

# Importing Feature Extraction Model

In [10]:
# Get Embedding Model from HF via API

from google.colab import userdata
HF_TOKEN = userdata.get('HUGGINGFACEHUB_API_TOKEN')

embeddings = HuggingFaceInferenceAPIEmbeddings(
    api_key=HF_TOKEN, model_name="BAAI/bge-base-en-v1.5"
)

### VectorStore

In [11]:
# Vector store with the selected embedding model
vectorstore = Chroma.from_documents(docs, embeddings)

In [12]:
# vectorstore_retreiver = vectorstore.as_retriever(search_kwargs={"k": 3})
vectorstore_retreiver = vectorstore.as_retriever()

In [13]:
keyword_retriever = BM25Retriever.from_documents(docs)
# keyword_retriever.k =  3

### Ensemble Retriever

In [14]:
ensemble_retriever = EnsembleRetriever(retrievers=[vectorstore_retreiver,
                                                   keyword_retriever],
                                       weights=[0.5,0.5])

In [15]:
llm = HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    model_kwargs={"temperature": 0.3,"max_new_tokens":1024},
    huggingfacehub_api_token=HF_TOKEN,
)

  warn_deprecated(


### Prompt Template:

In [16]:
template = """
<|system|>>
                                              !! Hello !!
                                          This is AI Model-2.0
                                        How may I help you today?

CONTEXT: {context}
</s>
<|user|>
{query}
</s>
<|assistant|>
"""

In [17]:
prompt = ChatPromptTemplate.from_template(template)
output_parser = StrOutputParser()

In [18]:
chain = (
    {"context": ensemble_retriever, "query": RunnablePassthrough()}
    | prompt
    | llm
    | output_parser
)

# Queries

In [29]:
print(chain.invoke("Give me list of all tables present in data"))



Human: 
<|system|>>
                                              !! Hello !!
                                          This is AI Model-2.0
                                        How may I help you today?

CONTEXT: [Document(page_content="Table name: products\n\nDescription: Stores product information\n\nColumn name Description\n\n1. product_code (ID of the purchased item)\n\n2. product_name (Name of the product)\n\n3. category (Product category (e.g., Smartphone, TV))\n\n4. launch_date (Launch date of the product)\n\n5. price (Price of the product)\n\n6. manufacturer (Manufacturer of the product)\n\n7. warranty_period (Warranty period of the product)\n\n8. stock_quantity (Quantity of product in stock)\n\n9. rating (Customer rating of the product)\n\n10. dimensions (Dimensions of the product)\n\n11. weight (Weight of the product)\n\n12. color (Color of the product)\n\n13. material (Material of the product)\n\n14. power_usage (Power usage of the product)\n\n15. model_number (Model num

In [20]:
print(chain.invoke("give me tables that can be join on the basis of column description"))



Human: 
<|system|>>
                                              !! Hello !!
                                          This is AI Model-2.0
                                        How may I help you today?

CONTEXT: [Document(page_content="Table name: products\n\nDescription: Stores product information\n\nColumn name Description\n\n1. product_code (ID of the purchased item)\n\n2. product_name (Name of the product)\n\n3. category (Product category (e.g., Smartphone, TV))\n\n4. launch_date (Launch date of the product)\n\n5. price (Price of the product)\n\n6. manufacturer (Manufacturer of the product)\n\n7. warranty_period (Warranty period of the product)\n\n8. stock_quantity (Quantity of product in stock)\n\n9. rating (Customer rating of the product)\n\n10. dimensions (Dimensions of the product)\n\n11. weight (Weight of the product)\n\n12. color (Color of the product)\n\n13. material (Material of the product)\n\n14. power_usage (Power usage of the product)\n\n15. model_number (Model num

In [21]:
print(chain.invoke("Give me all 10 table with there description"))



Human: 
<|system|>>
                                              !! Hello !!
                                          This is AI Model-2.0
                                        How may I help you today?

CONTEXT: [Document(page_content="Table name: products\n\nDescription: Stores product information\n\nColumn name Description\n\n1. product_code (ID of the purchased item)\n\n2. product_name (Name of the product)\n\n3. category (Product category (e.g., Smartphone, TV))\n\n4. launch_date (Launch date of the product)\n\n5. price (Price of the product)\n\n6. manufacturer (Manufacturer of the product)\n\n7. warranty_period (Warranty period of the product)\n\n8. stock_quantity (Quantity of product in stock)\n\n9. rating (Customer rating of the product)\n\n10. dimensions (Dimensions of the product)\n\n11. weight (Weight of the product)\n\n12. color (Color of the product)\n\n13. material (Material of the product)\n\n14. power_usage (Power usage of the product)\n\n15. model_number (Model num

In [25]:
print(chain.invoke("Give me tables names that have (description:ID of the customer who made the purchase)"))



Human: 
<|system|>>
                                              !! Hello !!
                                          This is AI Model-2.0
                                        How may I help you today?

CONTEXT: [Document(page_content="Table name: products\n\nDescription: Stores product information\n\nColumn name Description\n\n1. product_code (ID of the purchased item)\n\n2. product_name (Name of the product)\n\n3. category (Product category (e.g., Smartphone, TV))\n\n4. launch_date (Launch date of the product)\n\n5. price (Price of the product)\n\n6. manufacturer (Manufacturer of the product)\n\n7. warranty_period (Warranty period of the product)\n\n8. stock_quantity (Quantity of product in stock)\n\n9. rating (Customer rating of the product)\n\n10. dimensions (Dimensions of the product)\n\n11. weight (Weight of the product)\n\n12. color (Color of the product)\n\n13. material (Material of the product)\n\n14. power_usage (Power usage of the product)\n\n15. model_number (Model num

In [27]:
print(chain.invoke("Tell me on which column I can join customer, employee and department table"))



Human: 
<|system|>>
                                              !! Hello !!
                                          This is AI Model-2.0
                                        How may I help you today?

CONTEXT: [Document(page_content="Table name: products\n\nDescription: Stores product information\n\nColumn name Description\n\n1. product_code (ID of the purchased item)\n\n2. product_name (Name of the product)\n\n3. category (Product category (e.g., Smartphone, TV))\n\n4. launch_date (Launch date of the product)\n\n5. price (Price of the product)\n\n6. manufacturer (Manufacturer of the product)\n\n7. warranty_period (Warranty period of the product)\n\n8. stock_quantity (Quantity of product in stock)\n\n9. rating (Customer rating of the product)\n\n10. dimensions (Dimensions of the product)\n\n11. weight (Weight of the product)\n\n12. color (Color of the product)\n\n13. material (Material of the product)\n\n14. power_usage (Power usage of the product)\n\n15. model_number (Model num

In [28]:
print(chain.invoke("Tell me column that connects these customer, employee and department table"))



Human: 
<|system|>>
                                              !! Hello !!
                                          This is AI Model-2.0
                                        How may I help you today?

CONTEXT: [Document(page_content="Table name: products\n\nDescription: Stores product information\n\nColumn name Description\n\n1. product_code (ID of the purchased item)\n\n2. product_name (Name of the product)\n\n3. category (Product category (e.g., Smartphone, TV))\n\n4. launch_date (Launch date of the product)\n\n5. price (Price of the product)\n\n6. manufacturer (Manufacturer of the product)\n\n7. warranty_period (Warranty period of the product)\n\n8. stock_quantity (Quantity of product in stock)\n\n9. rating (Customer rating of the product)\n\n10. dimensions (Dimensions of the product)\n\n11. weight (Weight of the product)\n\n12. color (Color of the product)\n\n13. material (Material of the product)\n\n14. power_usage (Power usage of the product)\n\n15. model_number (Model num