<a href="https://colab.research.google.com/github/SruthiMangu133/Sitafal_technologies/blob/main/sithafal_technologies.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m225.3/232.6 kB[0m [31m7.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [4]:
!pip install faiss-cpu #Install the faiss library for CPU usage

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [7]:
import os
import requests
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import faiss
from transformers import pipeline

def download_pdf_from_url(url, save_path):
    """Download a PDF file from a URL."""
    response = requests.get(url)
    if response.status_code == 200:
        with open(save_path, 'wb') as f:
            f.write(response.content)
        print(f"PDF downloaded successfully to {save_path}")
        return save_path
    else:
        print(f"Failed to download PDF. HTTP Status Code: {response.status_code}")
        return None

def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file."""
    text = ""
    reader = PdfReader(pdf_path)
    for page in reader.pages:
        text += page.extract_text() + "\n"
    return text

def chunk_text(text, chunk_size=500):
    """Split text into manageable chunks."""
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

def create_vector_database(chunks, embedding_model='all-MiniLM-L6-v2'):
    """Embed text chunks and store them in a FAISS vector database."""
    model = SentenceTransformer(embedding_model)
    embeddings = model.encode(chunks)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return index, model, embeddings

def query_vector_database(query, chunks, index, model, top_k=5):
    """Retrieve the most relevant chunks for a query."""
    query_embedding = model.encode([query])
    distances, indices = index.search(query_embedding, top_k)
    return [(chunks[i], distances[0][j]) for j, i in enumerate(indices[0])]

def generate_response(retrieved_chunks, query, llm_model='t5-small'):
    """Generate a response using an LLM based on retrieved chunks."""
    summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small", device=-1)
    context = " ".join([chunk for chunk, _ in retrieved_chunks])
    prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer only the relevant information:"
    try:
        response = summarizer(prompt, max_length=150, min_length=50, do_sample=False)
        return response[0]['summary_text']
    except Exception as e:
        print(f"Error generating response: {e}")
        return "Could not generate a response."

def extract_table_from_page(pdf_path, page_num):
    """Extract tabular data from a specific PDF page."""
    reader = PdfReader(pdf_path)
    page = reader.pages[page_num]
    text = page.extract_text()
    lines = text.split("\n")
    table_data = [line.split() for line in lines if line.strip()]
    return table_data

def main():
    # Step 1: URL of the PDF file
    url = "https://www.hunter.cuny.edu/dolciani/pdf_files/workshop-materials/mmc-presentations/tables-charts-and-graphs-with-examples-from.pdf"
    temp_file = "downloaded_file.pdf"

    print("Downloading PDF from URL...")
    pdf_path = download_pdf_from_url(url, temp_file)
    if not pdf_path:
        return

    print("Extracting text from PDF...")
    text = extract_text_from_pdf(pdf_path)

    print("Chunking text...")
    chunks = chunk_text(text)

    print("Creating vector database...")
    index, embedding_model, _ = create_vector_database(chunks)

    while True:
        print("\nOptions:")
        print("1. Ask a question")
        print("2. Perform a comparison query")
        print("3. Extract tabular data from a page")
        print("4. Exit")
        choice = input("Enter your choice: ")

        if choice == "1":
            query = input("Enter your question: ")
            retrieved_chunks = query_vector_database(query, chunks, index, embedding_model)
            print("Retrieved Chunks:")
            for chunk, distance in retrieved_chunks:
                print(f"Chunk: {chunk}\nDistance: {distance}\n")
            response = generate_response(retrieved_chunks, query)
            print("\nResponse:")
            print(response)

        elif choice == "2":
            try:
                n = int(input("How many queries for comparison? "))
                queries = [input(f"Enter query {i + 1}: ") for i in range(n)]
                results = {}
                for query in queries:
                    retrieved_chunks = query_vector_database(query, chunks, index, embedding_model)
                    results[query] = retrieved_chunks
                for query, retrieved_chunks in results.items():
                    print(f"\nQuery: {query}")
                    for chunk, distance in retrieved_chunks:
                        print(f"Chunk: {chunk}\nDistance: {distance}\n")
            except ValueError:
                print("Invalid input. Please enter an integer for the number of queries.")

        elif choice == "3":
            try:
                page_num = int(input("Enter page number (starting from 1): ")) - 1
                table_data = extract_table_from_page(pdf_path, page_num)
                if isinstance(table_data, str):
                    print(table_data)
                else:
                    print("\nTabular Data:")
                    for row in table_data:
                        print(row)
            except ValueError:
                print("Invalid input. Please enter a valid page number.")

        elif choice == "4":
            print("Exiting...")
            break

        else:
            print("Invalid choice. Try again.")

if __name__ == "__main__":
    main()


Downloading PDF from URL...
PDF downloaded successfully to downloaded_file.pdf
Extracting text from PDF...
Chunking text...
Creating vector database...

Options:
1. Ask a question
2. Perform a comparison query
3. Extract tabular data from a page
4. Exit
Enter your choice: 1
Enter your question: What was the total GDP for all industries in 2015?
Retrieved Chunks:
Chunk: of Data
Table of Yearly U.S. GDP by 
Industry (in millions of dollars)
Year 2010 2011 2012 2013 2014 2015
All Industries 26093515 27535971 28663246 29601191 30895407 31397023
Manufacturing 4992521 5581942 5841608 5953299 6047477 5829554
Finance,
Insurance, Real 
Estate, Rental, 
Leasing4522451 4618678 4797313 5031881 5339678 5597018
Arts, 
Entertainment, 
Recreation, 
Accommodation,
and Food Service964032 1015238 1076249 1120496 1189646 1283813
Other 15614511 16320113 16948076 17495515 18318606 
Distance: 0.7068169116973877

Chunk: 18686638Source: U.S. Bureau of Labor Statistics
19%
18%
4%59%2015 U.S. GDP (in millions of

Token indices sequence length is longer than the specified maximum sequence length for this model (694 > 512). Running this sequence through the model will result in indexing errors



Response:
the graph below is called a bar graph . it shows what percent “of the pie” a particular category occupies out of the whole . if total GDP in 2015 is the entire pie, then manufacturing makes up 19% of that pie .

Options:
1. Ask a question
2. Perform a comparison query
3. Extract tabular data from a page
4. Exit
Enter your choice: 2
How many queries for comparison? 2
Enter query 1: gdp of manufacturing sector in 2010
Enter query 2: gdp of manufacturing sector in 2015

Query: gdp of manufacturing sector in 2010
Chunk: 18686638Source: U.S. Bureau of Labor Statistics
19%
18%
4%59%2015 U.S. GDP (in millions of dollars)
Manufacturing
Finance, insurance, real
estate, rental, and
leasing
Arts, entertainment,recreation,
accommodation, and
food services
Other•The chart below is called a pie chart.  It shows what 
percent “of the pie” a particular category occupies 
out of the whole.
•If total GDP in 2015 is the entire pie, then 
manufacturing makes up 19% of that pie and finance makes