In [2]:
import pandas as pd
import numpy as np



## Reading all the pdf files

In [31]:
import os
from PyPDF2 import PdfReader

pdf_dir = r"C:\Users\Vipul\RAG-PDF-QUERYING\data"
pdf_texts = {}

for filename in os.listdir(pdf_dir):
    if filename.lower().endswith('.pdf'):
        path = os.path.join(pdf_dir, filename)
        reader = PdfReader(path)
        text = ''.join(page.extract_text() or '' for page in reader.pages)
        pdf_texts[filename] = text


for fname, content in pdf_texts.items():
    print(f"--- {fname} ---\n{content}\n")



--- CleanBot_Robotic_Vacuum_Cleaner_FAQ.pdf ---
CleanBot Robotic Vacuum Cleaner FAQ 
1. Product Overview 
CleanBot offers a range of robotic vacuum cleaners to suit different cleaning needs. Our 
current lineup includes: 
• CB-100 (Basic) 
• CB-200 (Smart Navigation) 
• CB-300 (Self-Emptying) 
Each model is designed to provide efficient and hassle-free cleaning. 
2. Technical Specifications 
CB-100 (Basic) 
Battery Life: 90 minutes 
Suction Power: 2000Pa 
Dustbin Capacity: 0.5L 
Noise Level: 60dB 
Weight: 3.5kg 
CB-200 (Smart Navigation) 
Battery Life: 90 minutes 
Suction Power: 2000Pa 
Dustbin Capacity: 0.5L 
Noise Level: 60dB 
Weight: 3.5kg 
CB-300 (Self-Emptying) 
Battery Life: 90 minutes 
Suction Power: 2000Pa 
Dustbin Capacity: 0.5L Noise Level: 60dB 
Weight: 3.5kg 
3. Key Features 
• Efficient Cleaning: High suction power and multiple cleaning modes. 
• Smart Navigation: Advanced sensors for better navigation and cleaning. 
• Self-Emptying: Automatically empties its dustbin for h

In [4]:
import re

# Section headers for CleanBot Robotic Vacuum Cleaner FAQ (common headers)
common_section_headers = [
    r"1\. Product Overview", r"2\. Technical Specifications", r"3\. Key Features",
    r"4\. Setup and Installation", r"5\. Usage Instructions", r"6\. Maintenance and Care",
    r"7\. Troubleshooting", r"8\. Warranty Information", r"9\. Customer Support"
]

# Function to clean page breaks (removes unnecessary newlines between pages)
def clean_page_breaks(text):
    return re.sub(r'\n+', '\n', text)

# Improved section splitting logic based on numbered headers
def split_into_sections(text, section_headers):
    sections = {}
    current_section = None
    current_content = []  # Collect lines for the current section

    # Compile regex pattern for section headers (strict matching)
    header_pattern = re.compile(rf"({'|'.join(section_headers)})", re.IGNORECASE)
    
    # Split text into lines, cleaning up page breaks
    lines = text.split('\n')
    
    for line in lines:
        # Check if the line matches one of the section headers
        match = header_pattern.match(line.strip())
        if match:
            # If a new section header is found, save the previous section
            if current_section:
                sections[current_section] = '\n'.join(current_content).strip()  # Save the section content
            current_section = match.group(0)  # Set the new section name
            current_content = []  # Reset content for the new section
        elif current_section:
            # If no new section is found, append the line to the current section
            current_content.append(line)
    
    # Save the last section after the loop finishes
    if current_section:
        sections[current_section] = '\n'.join(current_content).strip()
    
    return sections


pdf_text_cleaned = clean_page_breaks(pdf_text)

# Split the cleaned text into sections
sections = split_into_sections(pdf_text_cleaned, common_section_headers)

# Print the sections and their content to verify
for section_name, section_content in sections.items():
    print(f"--- {section_name} ---")
    print(section_content)
    print("\n")


--- 1. Product Overview ---
CleanBot offers a range of robotic vacuum cleaners to suit different cleaning needs. Our 
current lineup includes: 
• CB-100 (Basic) 
• CB-200 (Smart Navigation) 
• CB-300 (Self-Emptying) 
Each model is designed to provide efficient and hassle-free cleaning.


--- 2. Technical Specifications ---
CB-100 (Basic) 
Battery Life: 90 minutes 
Suction Power: 2000Pa 
Dustbin Capacity: 0.5L 
Noise Level: 60dB 
Weight: 3.5kg 
CB-200 (Smart Navigation) 
Battery Life: 90 minutes 
Suction Power: 2000Pa 
Dustbin Capacity: 0.5L 
Noise Level: 60dB 
Weight: 3.5kg 
CB-300 (Self-Emptying) 
Battery Life: 90 minutes 
Suction Power: 2000Pa 
Dustbin Capacity: 0.5L Noise Level: 60dB 
Weight: 3.5kg


--- 3. Key Features ---
• Efficient Cleaning: High suction power and multiple cleaning modes. 
• Smart Navigation: Advanced sensors for better navigation and cleaning. 
• Self-Emptying: Automatically empties its dustbin for hands-free maintenance.


--- 4. Setup and Installation ---
Ste

In [5]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('paraphrase-MiniLM-L3-v2')

document_name = "CleanBot_Robotic_Vacuum_Cleaner_FAQ.pdf"

# Vectorize and tag the sections
section_vectors_with_tags = []
for section_name, section_content in sections.items():
    # Vectorize the section content
    vector = model.encode(section_content)  
    
    # Create a dictionary with the vector and metadata (tags)
    section_data = {
        "vector": vector,  # The vector representation of the section content
        "metadata": {
            "section_name": section_name,
            "document_name": document_name
        }
    }
    
    # Append to the list of vectors with tags
    section_vectors_with_tags.append(section_data)

    print(f"Vector Dimension: {vector.shape}")





  from tqdm.autonotebook import tqdm, trange


Vector Dimension: (384,)
Vector Dimension: (384,)
Vector Dimension: (384,)
Vector Dimension: (384,)
Vector Dimension: (384,)
Vector Dimension: (384,)
Vector Dimension: (384,)
Vector Dimension: (384,)
Vector Dimension: (384,)


In [6]:
print(len(section_vectors_with_tags[0]['vector']))


384


In [7]:
section_data

{'vector': array([-2.55723894e-01, -1.08243749e-01,  9.93790552e-02, -7.15232715e-02,
         3.04899216e-01,  1.31528294e-02,  2.29274392e-01,  2.68781316e-02,
        -2.07538918e-01,  2.38486310e-03,  2.89173890e-03, -2.69572530e-02,
         1.14853121e-01, -2.34422296e-01, -1.93441331e-01,  7.01083466e-02,
         2.07903758e-01,  2.36278594e-01,  3.88512239e-02, -3.76376417e-03,
        -1.14337496e-01, -2.10683681e-02, -4.53603491e-02,  1.13317057e-01,
         9.82076973e-02, -6.77358881e-02,  3.81689250e-01,  1.35200858e-01,
        -3.19765345e-03, -1.93931647e-02, -1.46542713e-01, -2.76669431e-02,
         1.31505460e-01,  2.01005694e-02,  1.60264179e-01, -1.01909138e-01,
        -1.30358310e-02,  1.10495109e-02, -2.12814331e-01, -8.01212564e-02,
         3.19885425e-02, -2.57662460e-02,  6.46148324e-02, -2.36733615e-01,
        -7.60573745e-02, -1.13318428e-01,  1.20247871e-01,  3.84056449e-01,
        -1.26708135e-01,  2.12391257e-01, -2.15192705e-01,  2.32412899e-03,
  

In [8]:
section_vectors_with_tags

[{'vector': array([-2.79125154e-01, -1.16200268e-01,  1.37104318e-01,  2.29291040e-02,
         -1.01016825e-02,  3.04103494e-01,  2.25371569e-01, -1.23919481e-02,
          4.15866375e-02,  4.97458205e-02,  2.11236075e-01, -4.17501666e-02,
          2.14627162e-01, -7.28324130e-02, -1.94288701e-01,  1.72726378e-01,
          2.15047434e-01,  8.54812488e-02,  8.84254090e-03, -2.15840209e-02,
          7.00466111e-02,  9.89999175e-02, -4.63401340e-02,  1.55719310e-01,
          1.35151118e-01, -1.10920385e-01,  4.28551435e-01, -4.35788892e-02,
         -2.56676257e-01,  5.88608421e-02, -1.67746246e-01,  7.60932863e-02,
         -9.96739715e-02, -1.45294711e-01,  1.86249703e-01, -1.41161114e-01,
         -1.30773827e-01, -1.30704969e-01, -2.25379482e-01,  5.05418926e-02,
         -1.85525008e-02,  2.16257107e-02, -1.85450122e-01, -4.43591148e-01,
         -1.15715712e-01, -1.36899026e-02, -4.33783755e-02,  1.91767216e-01,
         -2.42915004e-01, -8.45552385e-02, -9.70059484e-02, -9.877

In [9]:
# Check the first vector and its metadata
print(section_vectors_with_tags[0])  # Example print to verify


{'vector': array([-2.79125154e-01, -1.16200268e-01,  1.37104318e-01,  2.29291040e-02,
       -1.01016825e-02,  3.04103494e-01,  2.25371569e-01, -1.23919481e-02,
        4.15866375e-02,  4.97458205e-02,  2.11236075e-01, -4.17501666e-02,
        2.14627162e-01, -7.28324130e-02, -1.94288701e-01,  1.72726378e-01,
        2.15047434e-01,  8.54812488e-02,  8.84254090e-03, -2.15840209e-02,
        7.00466111e-02,  9.89999175e-02, -4.63401340e-02,  1.55719310e-01,
        1.35151118e-01, -1.10920385e-01,  4.28551435e-01, -4.35788892e-02,
       -2.56676257e-01,  5.88608421e-02, -1.67746246e-01,  7.60932863e-02,
       -9.96739715e-02, -1.45294711e-01,  1.86249703e-01, -1.41161114e-01,
       -1.30773827e-01, -1.30704969e-01, -2.25379482e-01,  5.05418926e-02,
       -1.85525008e-02,  2.16257107e-02, -1.85450122e-01, -4.43591148e-01,
       -1.15715712e-01, -1.36899026e-02, -4.33783755e-02,  1.91767216e-01,
       -2.42915004e-01, -8.45552385e-02, -9.70059484e-02, -9.87757370e-02,
       -7.4369

In [10]:
pip install pinecone-client


Note: you may need to restart the kernel to use updated packages.


In [11]:
import pinecone
import os
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv

load_dotenv()


api_key = 'd11b6a5a-ce3b-4415-9ac9-6e327eb758c8' 

pc = Pinecone(api_key=api_key) 

index_name = 'pdf-sections-index'


if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name, 
        dimension=len(section_vectors_with_tags[0]['vector']),  
        metric='cosine',  # You can choose 'cosine', 'dotproduct', or 'euclidean'
        spec=ServerlessSpec(
            cloud='aws',  # Use AWS
            region='us-east-1'  # U.S. West region on AWS
        )
        
    )

# Connect to the index
index = pc.Index(index_name)


In [12]:
# Prepare the data for upsert (insertion)
to_upsert = []

for section in section_vectors_with_tags:
    vector_id = section['metadata']['section_name']  # Use section name as the unique ID
    vector = section['vector']
    metadata = section['metadata']

    
    metadata['content'] = sections[section['metadata']['section_name']]  # Include section content
    
    # Append the tuple (ID, vector, metadata) for upserting
    to_upsert.append((vector_id, vector, metadata))

# Insert the vectors into Pinecone
index.upsert(vectors=to_upsert)
print("Vectors inserted into Pinecone with content!")


Vectors inserted into Pinecone with content!


In [13]:
print(len(section_vectors_with_tags[0]['vector']))  # Confirm dimension


384


In [14]:

query_text = "How do I troubleshoot my CleanBot vacuum?"

# Vectorize the query using your model
query_vector = model.encode(query_text)

# Check the dimension
print(f"Query Vector Dimension: {len(query_vector)}")  # Should match index_stats.dimension


Query Vector Dimension: 384


In [15]:
import numpy as np

# Normalize the vector
norm = np.linalg.norm(query_vector)
if norm == 0:
    query_vector_normalized = np.zeros_like(query_vector)
else:
    query_vector_normalized = query_vector / norm

# Clip the values to ensure they are within [-1, 1]
query_vector_clipped = np.clip(query_vector_normalized, -1, 1)

# Verify the values
min_val = np.min(query_vector_clipped)
max_val = np.max(query_vector_clipped)
print(f"Min value after clipping: {min_val}")  # Should be >= -1
print(f"Max value after clipping: {max_val}")  # Should be <= 1

# Ensure the data type is float32
query_vector_clipped = query_vector_clipped.astype(np.float32)
print(f"Data type: {query_vector_clipped.dtype}")  # Should print float32


Min value after clipping: -0.14623844623565674
Max value after clipping: 0.16961467266082764
Data type: float32


In [16]:
# Describe the index statistics
index_stats = index.describe_index_stats()

# Print relevant information
print(f"Vector Dimension: {index_stats['dimension']}")


Vector Dimension: 384


In [17]:
# Query the Pinecone index for the most relevant sections
try:
    # Convert the clipped vector to a list for querying
    query_vector_list = query_vector_clipped.tolist()

    # Query Pinecone
    results = index.query(
        vector=query_vector_list,  # The query vector
        top_k=3,                   # Retrieve the top 3 most similar results
        include_metadata=True      # Include metadata to retrieve section names and content
    )

    # Print the retrieved sections
    for match in results['matches']:
        print(f"Section: {match['metadata'].get('section_name', 'N/A')}")
        print(f"Content: {match['metadata'].get('content', 'N/A')}")
        print(f"Score: {match['score']}\n")
        
except Exception as e:
    print(f"Error during Pinecone query: {e}")


Section: 4. Setup and Installation
Content: Step-by-step instructions for setting up your CleanBot robotic vacuum cleaner: 
1. Unbox the vacuum cleaner and charging dock. 
2. Place the charging dock against a wall and plug it in. 
3. Place the vacuum cleaner on the dock to charge. 
4. Download the CleanBot app and follow the setup instructions.
Score: 0.654589

Section: 1. Product Overview
Content: CleanBot offers a range of robotic vacuum cleaners to suit different cleaning needs. Our 
current lineup includes: 
• CB-100 (Basic) 
• CB-200 (Smart Navigation) 
• CB-300 (Self-Emptying) 
Each model is designed to provide efficient and hassle-free cleaning.
Score: 0.629998147

Section: 6. Maintenance and Care
Content: Guidelines for keeping your robotic vacuum cleaner in top condition: 
• Regularly empty the dustbin and clean the filters. 
• Check and clean the brushes and sensors to ensure optimal performance. 
• Keep the firmware updated for enhanced features and performance.
Score: 0.575

In [24]:
import openai

openai.api_key = os.getenv("open_ai_key")

user_query = "How do I set up my CleanBot vacuum?"

# Vectorize the user query
query_vector = model.encode(user_query)

In [25]:
# Normalize and clip the vector as before
norm = np.linalg.norm(query_vector)
query_vector_normalized = query_vector / norm if norm != 0 else np.zeros_like(query_vector)
query_vector_clipped = np.clip(query_vector_normalized, -1, 1).astype(np.float32)


In [26]:
# Query Pinecone for relevant sections
results = index.query(vector=query_vector_clipped.tolist(), top_k=3, include_metadata=True)

# Combine the retrieved section content
retrieved_sections = "\n".join([match['metadata']['content'] for match in results['matches']])

In [27]:
# Create a prompt for GPT based on the user query and the relevant sections
prompt = f"User query: {user_query}\n\nRelevant sections:\n{retrieved_sections}\n\nGenerate a helpful response based on the provided information."

In [22]:
pip install openai==0.28


Note: you may need to restart the kernel to use updated packages.


In [28]:
# Call GPT-3.5 or GPT-4 with the new method
response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",  # or "gpt-4" if you have access
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ],
    max_tokens=150  # Adjust based on your needs
)

# Print the response
print("Generated Response:", response['choices'][0]['message']['content'].strip())

Generated Response: To set up your CleanBot robotic vacuum, follow these steps:

1. Unbox the vacuum cleaner and charging dock.
2. Place the charging dock against a wall and plug it in.
3. Place the vacuum cleaner on the dock to charge.
4. Download the CleanBot app and follow the setup instructions.

CleanBot offers a variety of robotic vacuum cleaners tailored to different cleaning needs, including the CB-100 (Basic), CB-200 (Smart Navigation), and CB-300 (Self-Emptying) models. Each model is designed to provide efficient and hassle-free cleaning.

To keep your robotic vacuum cleaner in top condition, remember to:
- Regularly empty the dustbin and clean the filters.
- Check and clean the brushes and sensors


In [29]:
def query_and_generate_response(user_query, section_filter=None, top_k=5):
    """
    Query Pinecone for relevant sections and generate a response using GPT.

    Args:
        user_query (str): The user's query for the system.
        section_filter (str, optional): Filter to restrict the search to specific sections (e.g., "Troubleshooting").
        top_k (int, optional): The number of top results to retrieve from Pinecone.
        
    Returns:
        str: The generated response from GPT.
    """
    # Vectorize the user query
    query_vector = model.encode(user_query)

    # Normalize and clip the vector
    norm = np.linalg.norm(query_vector)
    query_vector_normalized = query_vector / norm if norm != 0 else np.zeros_like(query_vector)
    query_vector_clipped = np.clip(query_vector_normalized, -1, 1).astype(np.float32)

    # Query Pinecone for relevant sections
    results = index.query(vector=query_vector_clipped.tolist(), top_k=top_k, include_metadata=True)

    # Filter sections if a filter is provided
    relevant_sections = []
    for match in results['matches']:
        if section_filter is None or section_filter in match['metadata'].get('section_name', ''):
            relevant_sections.append(match['metadata']['content'])

    # Combine relevant sections for the GPT prompt
    retrieved_sections = "\n".join(relevant_sections)

    # Create a prompt for GPT based on the user query and the relevant sections
    prompt = f"User query: {user_query}\n\nRelevant sections:\n{retrieved_sections}\n\nGenerate a helpful response based on the provided information."

    # Call GPT to generate a response
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",  # or GPT-4 if you have access
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=150  # Adjust based on your needs
    )

    # Return the generated response
    return response['choices'][0]['message']['content'].strip()


# Example usage
response = query_and_generate_response("Why is my CleanBot vacuum not charging?", section_filter="Troubleshooting")
print("Generated Response:", response)


Generated Response: To address the issue of your CleanBot vacuum not charging, please follow these steps:

1. Make sure the charging dock is securely plugged in and receiving power.
2. Check that the vacuum is properly aligned with the dock, ensuring the charging contacts are making good contact.
3. If necessary, clean the charging contacts on both the vacuum and the dock to remove any dust or debris that may be hindering the charging process.
4. If the issue persists, consider checking the power source and trying a different outlet to rule out electrical issues.

By following these steps, you can troubleshoot and potentially resolve the charging problem with your CleanBot vacuum. If the issue persists, it may be beneficial to consult the product manual or contact customer support for further assistance


In [30]:
import openai
import numpy as np

# Function to query Pinecone and generate a response using GPT
def query_and_generate_response(user_query, section_filter=None, top_k=5):
    """
    Query Pinecone for relevant sections and generate a response using GPT.

    Args:
        user_query (str): The user's query for the system.
        section_filter (str, optional): Filter to restrict the search to specific sections (e.g., "Troubleshooting").
        top_k (int, optional): The number of top results to retrieve from Pinecone.
        
    Returns:
        str: The generated response from GPT.
    """
    # Vectorize the user query
    query_vector = model.encode(user_query)

    # Normalize and clip the vector
    norm = np.linalg.norm(query_vector)
    query_vector_normalized = query_vector / norm if norm != 0 else np.zeros_like(query_vector)
    query_vector_clipped = np.clip(query_vector_normalized, -1, 1).astype(np.float32)

    # Query Pinecone for relevant sections
    results = index.query(vector=query_vector_clipped.tolist(), top_k=top_k, include_metadata=True)

    # Filter sections if a filter is provided
    relevant_sections = []
    for match in results['matches']:
        if section_filter is None or section_filter in match['metadata'].get('section_name', ''):
            relevant_sections.append(match['metadata']['content'])

    # Combine relevant sections for the GPT prompt
    retrieved_sections = "\n".join(relevant_sections)

    # Create a prompt for GPT based on the user query and the relevant sections
    prompt = f"""
    User query: {user_query}

    Relevant sections: 
    {retrieved_sections}

    Instructions: Please generate a response based *only* on the provided information. You may add a maximum of 1 or 2 additional points if they are essential for clarifying the response, but avoid introducing more than 2 new points.
    """

    # Call GPT to generate a response
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",  # or GPT-4 if you have access
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=150  # Adjust based on your needs
    )

    # Return the generated response
    return response['choices'][0]['message']['content'].strip()

# Example usage
response = query_and_generate_response("Why is my CleanBot vacuum not charging?", section_filter="Troubleshooting")
print("Generated Response:", response)


Generated Response: It seems like the issue with your CleanBot vacuum not charging could be due to an improper alignment with the charging dock or the dock not being plugged in. Ensure that the vacuum is correctly placed on the dock and that the dock is properly connected to a power source. If the problem persists, consider checking the power outlet as well to make sure it is functioning correctly.
