In [37]:
import ray

# Start (or attach to) the local Ray runtime with quiet logging.
# ignore_reinit_error=True keeps this cell safe to re-run in a live kernel:
# without it, a second ray.init() call in the same process raises an error.
ray.init(logging_level="ERROR", ignore_reinit_error=True)


0,1
Python version:,3.10.15
Ray version:,2.37.0


## Extracting text from the PDF files in the data directory

In [38]:
import os
from PyPDF2 import PdfReader
import ray
import ray.data


# Directory that holds the FAQ PDFs. Set the RAG_PDF_DIR environment variable to
# point somewhere else; when it is unset or empty, the original local path is used.
pdf_dir = os.environ.get("RAG_PDF_DIR") or r"C:\Users\Vipul\RAG-PDF-QUERYING\data"

# Function to extract text from a PDF
def extract_text(filepath):
    """Return the text of every page of a PDF as a single string.

    Args:
        filepath: Path of the PDF file, passed straight to ``PdfReader``.

    Returns:
        The text of all pages in page order, with a newline between
        consecutive pages. A page whose extraction yields a falsy value
        contributes an empty string.
    """
    reader = PdfReader(filepath)
    page_texts = [page.extract_text() or '' for page in reader.pages]
    # Join with a newline, not '': otherwise the last line of one page is glued
    # to the first line of the next, and a section heading that opens a page
    # no longer starts its own line for the line-based section splitter.
    return '\n'.join(page_texts)

# Build one {"filename", "content"} record per PDF found in pdf_dir.
data = []
# os.listdir() returns names in arbitrary order; sorting keeps the dataset row
# order (and therefore every later take()/show() preview) stable between runs.
for filename in sorted(os.listdir(pdf_dir)):
    if filename.lower().endswith('.pdf'):
        path = os.path.join(pdf_dir, filename)
        text = extract_text(path)
        data.append({"filename": filename, "content": text})


# Wrap the in-memory records in a Ray Dataset so later steps can use .map().
dataset = ray.data.from_items(data)


# Quick sanity check: print the first two records.
print(dataset.take(2))




2024-10-19 15:23:45,582	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in C:\Users\Vipul\AppData\Local\Temp\ray\session_2024-10-19_15-23-27_640146_1900\logs\ray-data
2024-10-19 15:23:45,583	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> LimitOperator[limit=2]


Running 0: 0.00 row [00:00, ? row/s]

- limit=2 1: 0.00 row [00:00, ? row/s]

[{'filename': 'CleanBot_Robotic_Vacuum_Cleaner_FAQ.pdf', 'content': 'CleanBot Robotic Vacuum Cleaner FAQ \n1. Product Overview \nCleanBot offers a range of robotic vacuum cleaners to suit different cleaning needs. Our \ncurrent lineup includes: \n• CB-100 (Basic) \n• CB-200 (Smart Navigation) \n• CB-300 (Self-Emptying) \nEach model is designed to provide efficient and hassle-free cleaning. \n2. Technical Specifications \nCB-100 (Basic) \nBattery Life: 90 minutes \nSuction Power: 2000Pa \nDustbin Capacity: 0.5L \nNoise Level: 60dB \nWeight: 3.5kg \nCB-200 (Smart Navigation) \nBattery Life: 90 minutes \nSuction Power: 2000Pa \nDustbin Capacity: 0.5L \nNoise Level: 60dB \nWeight: 3.5kg \nCB-300 (Self-Emptying) \nBattery Life: 90 minutes \nSuction Power: 2000Pa \nDustbin Capacity: 0.5L Noise Level: 60dB \nWeight: 3.5kg \n3. Key Features \n• Efficient Cleaning: High suction power and multiple cleaning modes. \n• Smart Navigation: Advanced sensors for better navigation and cleaning. \n• Self

## Splitting data into sections

In [39]:
import re
import ray
import ray.data

# Ray was already initialized in the first cell, so there is nothing to initialize here.


# Section headings, written as regex fragments (the dot after the number is escaped).
# split_sections() uses this list for every file except the TechMobile FAQ.
common_headers = [
    r"1\. Product Overview",
    r"2\. Technical Specifications",
    r"3\. Key Features",
    r"4\. Setup and Installation",
    r"5\. Usage Instructions",
    r"6\. Maintenance and Care",
    r"7\. Troubleshooting",
    r"8\. Warranty Information",
    r"9\. Customer Support",
]

# Headings used only for "TechMobile_Smartphone_FAQ.pdf".
techmobile_headers = [
    r"1\. Product Overview",
    r"2\. Technical Specifications",
    r"3\. Key Features",
    r"4\. Battery Life",
    r"5\. Camera",
    r"6\. Connectivity",
    r"7\. Troubleshooting",
]

# Function to clean page breaks
def clean_page_breaks(text):
    """Collapse each run of consecutive newlines in ``text`` into one newline."""
    newline_run = re.compile(r'\n+')
    return newline_run.sub('\n', text)

# Function to split into sections
def split_sections(row):
    """Split a record's text into sections keyed by the heading that opens them.

    Args:
        row: Mapping with a "content" string and a "filename" string.

    Returns:
        The same ``row`` object, with a new "sections" entry: a dict that maps
        each matched heading (as it appears in the text) to the stripped text
        of the lines up to the next heading. Lines before the first heading
        are dropped. If a heading occurs more than once, its last occurrence
        supplies the content.
    """
    text = clean_page_breaks(row["content"])
    if row["filename"] == "TechMobile_Smartphone_FAQ.pdf":
        headers = techmobile_headers
    else:
        headers = common_headers
    header_pattern = re.compile(rf"({'|'.join(headers)})", re.IGNORECASE)

    # Each entry is (heading, lines that follow it), kept in document order.
    chunks = []
    for line in text.split('\n'):
        found = header_pattern.match(line.strip())
        if found:
            chunks.append((found.group(0), []))
        elif chunks:
            chunks[-1][1].append(line)

    # An empty heading never opens a section, so it is filtered out here.
    row["sections"] = {
        title: '\n'.join(body_lines).strip()
        for title, body_lines in chunks
        if title
    }
    return row



# Attach a "sections" dict to every record.
split_dataset = dataset.map(split_sections)

# Pull one TechMobile record and one record from any other PDF for inspection.
techmobile = split_dataset.filter(lambda row: row["filename"] == "TechMobile_Smartphone_FAQ.pdf").take(1)
common_pdf = split_dataset.filter(lambda row: row["filename"] != "TechMobile_Smartphone_FAQ.pdf").take(1)

# Print every section of both samples under its own banner.
for banner, picked in (
    ("--- TechMobile_Smartphone_FAQ.pdf ---", techmobile),
    ("--- Common PDF Example ---", common_pdf),
):
    print(banner)
    for section, content in picked[0]["sections"].items():
        print(f"--- {section} ---\n{content}\n")


2024-10-19 15:23:53,560	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in C:\Users\Vipul\AppData\Local\Temp\ray\session_2024-10-19_15-23-27_640146_1900\logs\ray-data
2024-10-19 15:23:53,561	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[Map(split_sections)->Filter(<lambda>)] -> LimitOperator[limit=1]


Running 0: 0.00 row [00:00, ? row/s]

- Map(split_sections)->Filter(<lambda>) 1: 0.00 row [00:00, ? row/s]

- limit=1 2: 0.00 row [00:00, ? row/s]

2024-10-19 15:23:55,525	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in C:\Users\Vipul\AppData\Local\Temp\ray\session_2024-10-19_15-23-27_640146_1900\logs\ray-data
2024-10-19 15:23:55,526	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[Map(split_sections)->Filter(<lambda>)] -> LimitOperator[limit=1]


Running 0: 0.00 row [00:00, ? row/s]

- Map(split_sections)->Filter(<lambda>) 1: 0.00 row [00:00, ? row/s]

- limit=1 2: 0.00 row [00:00, ? row/s]

--- TechMobile_Smartphone_FAQ.pdf ---
--- 1. Product Overview ---
TechMobile offers a range of smartphones to suit various needs and preferences. Our 
current lineup includes: 
• TM-S100 (Standard) 
• TM-P200 (Pro) 
• TM-L150 (Lite) 
Each device is designed with cutting-edge technology to provide an exceptional user 
experience.

--- 2. Technical Specifications ---
TM-S100 (Standard) 
Processor: Snapdragon 888 
Memory: 8GB RAM 
Storage: 128GB 
Display: 6.5-inch AMOLED, 1080x2400 pixels 
Camera: 64MP main, 12MP ultra-wide, 5MP macro 
Battery Capacity: 4500mAh 
TM-P200 (Pro) 
Processor: Snapdragon 888 
Memory: 8GB RAM 
Storage: 128GB 
Display: 6.5-inch AMOLED, 1080x2400 pixels 
Camera: 64MP main, 12MP ultra-wide, 5MP macro 
Battery Capacity: 4500mAh 
TM-L150 (Lite) 
Processor: Snapdragon 888 Memory: 8GB RAM 
Storage: 128GB 
Display: 6.5-inch AMOLED, 1080x2400 pixels 
Camera: 64MP main, 12MP ultra-wide, 5MP macro 
Battery Capacity: 4500mAh

--- 3. Key Features ---
• AI-Enhanced Photograph

In [40]:
# Preview the raw extracted records (before section splitting). Called without
# arguments; the execution plan logged below shows a 20-row limit being applied.
dataset.show()

2024-10-19 15:24:07,581	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in C:\Users\Vipul\AppData\Local\Temp\ray\session_2024-10-19_15-23-27_640146_1900\logs\ray-data
2024-10-19 15:24:07,582	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> LimitOperator[limit=20]


Running 0: 0.00 row [00:00, ? row/s]

- limit=20 1: 0.00 row [00:00, ? row/s]

{'filename': 'CleanBot_Robotic_Vacuum_Cleaner_FAQ.pdf', 'content': 'CleanBot Robotic Vacuum Cleaner FAQ \n1. Product Overview \nCleanBot offers a range of robotic vacuum cleaners to suit different cleaning needs. Our \ncurrent lineup includes: \n• CB-100 (Basic) \n• CB-200 (Smart Navigation) \n• CB-300 (Self-Emptying) \nEach model is designed to provide efficient and hassle-free cleaning. \n2. Technical Specifications \nCB-100 (Basic) \nBattery Life: 90 minutes \nSuction Power: 2000Pa \nDustbin Capacity: 0.5L \nNoise Level: 60dB \nWeight: 3.5kg \nCB-200 (Smart Navigation) \nBattery Life: 90 minutes \nSuction Power: 2000Pa \nDustbin Capacity: 0.5L \nNoise Level: 60dB \nWeight: 3.5kg \nCB-300 (Self-Emptying) \nBattery Life: 90 minutes \nSuction Power: 2000Pa \nDustbin Capacity: 0.5L Noise Level: 60dB \nWeight: 3.5kg \n3. Key Features \n• Efficient Cleaning: High suction power and multiple cleaning modes. \n• Smart Navigation: Advanced sensors for better navigation and cleaning. \n• Self-

## Vectorizing the data using sentence transformers

In [42]:
from sentence_transformers import SentenceTransformer

# Initializing the model
# Created once at module level; add_embeddings() reads this global for every row.
# NOTE(review): the Pinecone index later in this notebook is created with
# dimension=384 -- confirm that matches this model's embedding size.
model = SentenceTransformer('all-MiniLM-L6-v2')

def add_embeddings(row):
    """Attach one embedding per section to a record.

    Args:
        row: Mapping that may carry a "sections" dict of heading -> text.

    Returns:
        The same ``row`` object. When "sections" is present and non-empty, an
        "embeddings" dict (heading -> list of floats) is added; otherwise the
        row is returned without an "embeddings" key.
    """
    sections = row.get("sections")
    if sections:
        headings, texts = zip(*sections.items())
        # Encode all section texts of the row in one batched call instead of
        # one model call per section; the result has one vector per text, in order.
        vectors = model.encode(list(texts)).tolist()
        row["embeddings"] = dict(zip(headings, vectors))
    return row

# Add an "embeddings" dict to every record that has sections.
vectorized_dataset = split_dataset.map(add_embeddings)

# Take one record and print its embeddings; the fallback string is printed
# when the record has no "embeddings" key.
sample = vectorized_dataset.take(1)
print(sample[0].get("embeddings", "No embeddings found"))





2024-10-19 15:45:27,979	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in C:\Users\Vipul\AppData\Local\Temp\ray\session_2024-10-19_15-23-27_640146_1900\logs\ray-data
2024-10-19 15:45:27,980	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[Map(split_sections)->Map(add_embeddings)] -> LimitOperator[limit=1]


Running 0: 0.00 row [00:00, ? row/s]

- Map(split_sections)->Map(add_embeddings) 1: 0.00 row [00:00, ? row/s]

- limit=1 2: 0.00 row [00:00, ? row/s]

{'1. Product Overview': [-0.019733421504497528, -0.02898591198027134, 0.04973491653800011, -0.0383463092148304, -0.00798757653683424, -0.011916300281882286, -0.05135061591863632, -0.022328751161694527, -0.05514802783727646, -0.012471417896449566, 0.05268944054841995, -0.020419711247086525, 0.05925546959042549, -0.0383404903113842, 0.019626185297966003, 0.051284193992614746, 0.06684503704309464, 0.06356421858072281, -0.00542270066216588, -0.005997914355248213, -0.059939488768577576, 0.0798417404294014, 0.04974566772580147, -0.009518001228570938, -0.09346228092908859, 0.0047973585315048695, -0.0229222122579813, -0.11738171428442001, -0.015925630927085876, -0.08075936883687973, 0.03634943813085556, -0.03631165996193886, 0.009531721472740173, -0.02887299284338951, 0.05302103981375694, -0.016742542386054993, -0.0445123054087162, 0.008961837738752365, 0.018183106556534767, 0.0022640686947852373, -0.033541660755872726, -0.008550062775611877, -0.03682660683989525, 0.031172599643468857, 0.02498

## Setting up Pinecone index for storing PDF section embeddings

In [45]:
import os
import pinecone
from dotenv import load_dotenv


# Load environment variables (python-dotenv) so the API key can be read below.
load_dotenv()


# The key is read from the PINECONE_API variable; os.getenv returns None if it is unset.
pinecone_api_key = os.getenv('PINECONE_API')


# Client used for all index management calls below.
pc = pinecone.Pinecone(api_key=pinecone_api_key)


# Name of the index that stores one vector per PDF section.
index_name = 'pdf-embeddings-index'

# Create the index only if it does not exist yet, so re-running this cell is safe.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # NOTE(review): must equal the length of the vectors that get upserted -- confirm against the embedding model
        metric='cosine',  # one of 'cosine', 'dotproduct', or 'euclidean'
        spec=pinecone.ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )


# Handle used by the upsert step to write vectors into the index.
index = pc.Index(index_name)


## Upserting the section embeddings to Pinecone

In [54]:

def upsert_to_pinecone(row):
    """Upsert all section embeddings of one record into the Pinecone index.

    Args:
        row: Mapping with a "filename" string and, optionally, an "embeddings"
            dict of section heading -> vector.

    Returns:
        None. Rows with no "embeddings" key or an empty dict are skipped
        (add_embeddings leaves the key off when no sections were found).
    """
    embeddings = row.get("embeddings") or {}

    # Creating a unique ID using filename and section header
    vectors = [
        (f"{row['filename']}_{section}", embedding)
        for section, embedding in embeddings.items()
    ]

    # Send the whole row in one request instead of one request per section.
    # Nothing is sent when the row has no vectors.
    if vectors:
        index.upsert(vectors=vectors)


# Materialize the whole pipeline on the driver. The execution plan logged below
# shows split_sections and add_embeddings both running as part of this call.
rows = vectorized_dataset.take_all()  # Take all rows from the dataset

# Write each record's section vectors to Pinecone, one record at a time.
for row in rows:
    upsert_to_pinecone(row)

print("Upsert process completed!")


2024-10-19 16:08:53,404	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in C:\Users\Vipul\AppData\Local\Temp\ray\session_2024-10-19_15-23-27_640146_1900\logs\ray-data
2024-10-19 16:08:53,405	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[Map(split_sections)->Map(add_embeddings)]


Running 0: 0.00 row [00:00, ? row/s]

- Map(split_sections)->Map(add_embeddings) 1: 0.00 row [00:00, ? row/s]

Upsert process completed!
