In [None]:
pip install PyPDF2 sentence-transformers faiss-cpu langchain pandas




In [None]:
import PyPDF2
import os
import re
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pandas as pd

# Module-level sentence-embedding model, loaded once when this cell runs.
# create_embeddings() and query_embeddings() both call its encode() method, so
# document chunks and user queries are embedded by the same model instance.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

def extract_text_from_pdf(Source_pdf):
    """Extract the text content of every page of a PDF file.

    Args:
        Source_pdf: Location of the PDF to read. It is handed unchanged to
            ``PyPDF2.PdfReader``.

    Returns:
        str: The text of all pages concatenated in page order, with no
        separator inserted between pages. A page whose ``extract_text()``
        returns a falsy value (e.g. ``None``) contributes an empty string.
    """
    # Read the source the caller asked for; previously the argument was
    # ignored and a hard-coded path was always opened instead.
    pdf_reader = PyPDF2.PdfReader(Source_pdf)
    # Join once rather than growing a string with += inside a loop.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)

def chunk_text(text, chunk_size=500):
    """Group the sentences of ``text`` into chunks shorter than ``chunk_size``.

    The text is split into sentences at whitespace that follows ``.``, ``?``
    or ``!``. Consecutive sentences are joined with a single space for as long
    as the joined chunk stays strictly shorter than ``chunk_size`` characters.

    Args:
        text: The text to split.
        chunk_size: Upper bound (exclusive) on the character length of a chunk
            that is built from more than one sentence.

    Returns:
        list[str]: Non-empty chunks in document order. A single sentence that
        is itself ``chunk_size`` characters or longer is kept whole as its own
        chunk rather than being cut. Empty or whitespace-only text yields ``[]``.
    """
    chunks = []
    current_chunk = ""
    for sentence in re.split(r'(?<=[.?!])\s+', text):
        sentence = sentence.strip()
        if not sentence:
            # Blank pieces (e.g. from empty input) must not become chunks.
            continue
        if not current_chunk:
            # Start a new chunk without first flushing an empty one; this is
            # what used to emit "" when the first sentence was too long.
            current_chunk = sentence
        elif len(current_chunk) + 1 + len(sentence) < chunk_size:
            # The +1 accounts for the joining space, so the limit applies to
            # the length of the chunk that is actually produced.
            current_chunk += " " + sentence
        else:
            chunks.append(current_chunk)
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk)
    return chunks


In [None]:
def create_embeddings(chunks):
    """Encode text chunks with the module-level ``embedding_model``.

    Args:
        chunks: The texts to embed; passed as-is to ``embedding_model.encode``
            with the progress bar switched on.

    Returns:
        numpy.ndarray: The encoder's output wrapped in ``np.array``.
    """
    return np.array(embedding_model.encode(chunks, show_progress_bar=True))

def store_embeddings(embeddings):
    """Store embeddings in a new FAISS flat L2 index.

    Args:
        embeddings: Array-like of shape ``(n_vectors, dimension)``.

    Returns:
        A ``faiss.IndexFlatL2`` of that dimension containing the vectors in
        their original order.

    Raises:
        ValueError: If ``embeddings`` is not 2-D (for example the result of
            encoding an empty list of chunks), since no vector dimension can
            be derived from it.
    """
    # FAISS works on C-contiguous float32 matrices; coerce here so float64 or
    # non-contiguous input does not fail inside index.add().
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            "embeddings must be a 2-D array of shape (n_vectors, dimension); "
            f"got shape {vectors.shape}"
        )
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index


In [None]:
def query_embeddings(query, index, chunks, top_k=3):
    """Retrieve the chunks most relevant to a user query.

    Args:
        query: The query text; embedded with the module-level
            ``embedding_model``.
        index: Index object whose ``search(vectors, k)`` returns a
            ``(distances, indices)`` pair, such as the one built by
            ``store_embeddings``.
        chunks: The chunk texts, in the same order as the indexed vectors.
        top_k: Number of nearest neighbours to request from the index.

    Returns:
        list[str]: The matching chunks in the order the index returned them.
        Fewer than ``top_k`` items are returned when the index reports fewer
        valid neighbours.
    """
    query_embedding = np.asarray(embedding_model.encode([query]), dtype=np.float32)
    _, indices = index.search(query_embedding, top_k)
    # FAISS pads missing neighbours with -1 when it finds fewer than top_k
    # results. A bare `idx < len(chunks)` test lets -1 through, and
    # chunks[-1] would then be reported as a hit; require a non-negative index.
    return [chunks[idx] for idx in indices[0] if 0 <= idx < len(chunks)]

def display_results(results):
    """Print a heading followed by each retrieved chunk, numbered from 1.

    Args:
        results: Iterable of chunk texts to print, in the order given.
    """
    print("\nMost Relevant Chunks:")
    rank = 0
    for chunk in results:
        rank += 1
        print(f"\nResult {rank}:\n{chunk}")


In [None]:
def extract_tabular_data(text):
    """Extract simple whitespace-separated numeric tables from plain text.

    A table is a run of one or more consecutive *numeric rows* (lines made up
    only of digits, whitespace and commas, with at least one digit) together
    with a header:

    * the line directly above the run, when that line contains only word
      characters and whitespace; otherwise
    * the first line of the run itself, when the run has at least two lines
      (this covers purely numeric headers such as a row of years).

    The text is scanned line by line. The previous implementation used the
    pattern ``(?:\\w+\\s*)+`` whose nested quantifiers backtrack exponentially
    on ordinary prose, and it raised ``ValueError`` whenever a row had a
    different number of cells than the header.

    Args:
        text: The text to scan.

    Returns:
        list[pandas.DataFrame]: One frame per detected table, in document
        order. Cells are the whitespace-separated tokens as strings. Rows
        shorter than the widest row are padded with missing values; cells
        beyond the header width are placed in extra columns named
        ``column_<position>`` (1-based).
    """
    def is_numeric_row(line):
        # A single character class keeps matching linear in the line length.
        return re.fullmatch(r"[\d\s,]+", line) is not None and re.search(r"\d", line) is not None

    def is_header_line(line):
        return bool(line.strip()) and re.fullmatch(r"[\w\s]+", line) is not None

    lines = text.splitlines()
    tables = []
    pos = 0
    while pos < len(lines):
        if not is_numeric_row(lines[pos]):
            pos += 1
            continue

        # Consume the maximal run of numeric rows starting here.
        run_start = pos
        while pos < len(lines) and is_numeric_row(lines[pos]):
            pos += 1

        if run_start > 0 and is_header_line(lines[run_start - 1]):
            header = lines[run_start - 1].split()
            body = lines[run_start:pos]
        elif pos - run_start >= 2:
            header = lines[run_start].split()
            body = lines[run_start + 1:pos]
        else:
            # A lone numeric line with no usable header is not a table.
            continue

        data = [row.split() for row in body]
        width = max(len(header), max(len(cells) for cells in data))
        # Reconcile ragged rows with the header so the DataFrame constructor
        # cannot fail on a column-count mismatch.
        columns = header + [f"column_{n}" for n in range(len(header) + 1, width + 1)]
        data = [cells + [None] * (width - len(cells)) for cells in data]
        tables.append(pd.DataFrame(data, columns=columns))
    return tables

def display_tables(tables):
    """Print each extracted table under a 1-based ``Table N:`` heading.

    Args:
        tables: Iterable of tables to print, in the order given.
    """
    for number, frame in enumerate(tables, start=1):
        print(f"\nTable {number}:")
        print(frame)


In [None]:
# Driver: extract the PDF text, chunk and index it, then answer queries in a
# loop until the user types 'exit'.
if __name__ == "__main__":
    pdf_path = '/content/Source_pdf'
    print("Extracting text from PDF...")
    # Pass the path defined above; the name `Source_pdf` was never assigned
    # here and raised NameError on a fresh kernel.
    text = extract_text_from_pdf(pdf_path)

    print("Chunking text...")
    # Drop empty chunks so a PDF with no extractable text is detected below
    # instead of being embedded and indexed as an empty string.
    chunks = [chunk for chunk in chunk_text(text) if chunk]

    if not chunks:
        print("No extractable text found in the PDF; nothing to index.")
    else:
        print("Generating embeddings...")
        embeddings = create_embeddings(chunks)
        index = store_embeddings(embeddings)

        while True:
            query = input("\nEnter your query (or type 'exit' to stop): ")
            # Tolerate surrounding whitespace around the exit command.
            if query.strip().lower() == "exit":
                print("Exiting the system.")
                break

            # Queries mentioning "compare" are routed to table extraction;
            # everything else goes through embedding similarity search.
            if "compare" in query.lower():
                print("Comparison query detected! Extracting tables...")
                tables = extract_tabular_data(text)
                display_tables(tables)
            else:
                results = query_embeddings(query, index, chunks)
                display_results(results)


Extracting text from PDF...
Chunking text...
Generating embeddings...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Enter your query (or type 'exit' to stop): hello

Most Relevant Chunks:

Result 1:
GDP by 
Industry (in millions of dollars)
Year 2010 2011 2012 2013 2014 2015
All Industries 26093515 27535971 28663246 29601191 30895407 31397023
Manufacturing 4992521 5581942 5841608 5953299 6047477 5829554
Finance,
Insurance, Real 
Estate, Rental, 
Leasing4522451 4618678 4797313 5031881 5339678 5597018
Arts, 
Entertainment, 
Recreation, 
Accommodation,
and Food Service964032 1015238 1076249 1120496 1189646 1283813
Other 15614511 16320113 16948076 17495515 18318606 18686638Source: U.S.

Result 2:
Writing as an equation and solving, we get 
n = 0.15 x 31,000 = 4650
So the family spends $4650 on transportation yearly.0 5 10 15 20 25 30 35All industriesManufacturingFinance, insurance, real estate, rental, and leasingArts, entertainment, recrea tion, accommodation, and food
servicesOther
Dollars2015 GDP (in trillions of dollars)•The graph below is called a bar graph. •It shows each of the variables inde