<a href="https://colab.research.google.com/github/Pravallika-Padarthi/Sithafal-tasks/blob/main/Task1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pdfplumber # Install pdfplumber library
!pip install sentence_transformers # Install SentenceTransformer library
!pip install faiss-cpu # Install faiss library
!pip install transformers # Install transformers library

Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

# Chat with a PDF: Retrieval and Summarization Pipeline

In [None]:
import pdfplumber
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import pipeline

# Step 1: Extract Text from PDF and Chunk It
def extract_text_with_page_numbers(pdf_path):
    """Read a PDF and return its text page by page.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        A list of ``(page_number, text)`` tuples with 1-based page numbers.
        Pages for which ``extract_text()`` yields a falsy value (no text) are
        left out.
    """
    with pdfplumber.open(pdf_path) as document:
        numbered_texts = (
            (number, page.extract_text())
            for number, page in enumerate(document.pages, start=1)
        )
        # Build the list inside the ``with`` block so every page is read
        # while the PDF is still open.
        return [(number, content) for number, content in numbered_texts if content]

def chunk_text_with_page_numbers(extracted_data, max_len=512):
    """Split each page's text into fixed-size character chunks.

    Args:
        extracted_data: Iterable of ``(page_number, page_text)`` pairs.
        max_len: Maximum number of characters per chunk. Must be positive.

    Returns:
        A list of ``(page_number, chunk)`` tuples, in input order. Every chunk
        has ``max_len`` characters except possibly the last chunk of a page.
        Pages with empty text contribute no chunks.

    Raises:
        ValueError: If ``max_len`` is not a positive number. Without this
            check a negative value would silently drop all text and zero
            would surface as an unrelated ``range()`` error.
    """
    if max_len <= 0:
        raise ValueError(f"max_len must be a positive integer, got {max_len!r}")

    return [
        (page_number, page_text[start:start + max_len])
        for page_number, page_text in extracted_data
        for start in range(0, len(page_text), max_len)
    ]

# Step 2: Generate Embeddings for Each Chunk Using SentenceTransformer
# The 'all-MiniLM-L6-v2' model is loaded once at module level; the same `model`
# instance is used below to encode both the document chunks and the query.
model = SentenceTransformer('all-MiniLM-L6-v2')

def generate_embeddings_with_page_numbers(chunked_data):
    """Embed every chunk and keep each embedding paired with its page number.

    All chunk texts are passed to ``model.encode`` in a single call so the
    encoder can batch them, instead of invoking it once per chunk.

    Args:
        chunked_data: Iterable of ``(page_number, chunk_text)`` pairs.

    Returns:
        A list of ``(page_number, embedding)`` tuples in input order, where
        each embedding is one row of the ``model.encode`` output. An empty
        input yields an empty list without calling the model.
    """
    chunked_data = list(chunked_data)
    if not chunked_data:
        return []

    page_numbers = [page_number for page_number, _ in chunked_data]
    chunk_texts = [chunk_text for _, chunk_text in chunked_data]

    # One batched call; iterating the result yields one embedding per chunk.
    embeddings = model.encode(chunk_texts)
    return list(zip(page_numbers, embeddings))

# Step 3: Store Embeddings in FAISS Index
def store_embeddings_in_faiss(embeddings_with_page_numbers, index_path='faiss_index_with_page_numbers',
                              page_numbers_path="page_numbers.npy"):
    """Build a flat L2 FAISS index from the embeddings and persist it to disk.

    Args:
        embeddings_with_page_numbers: Iterable of ``(page_number, embedding)``
            pairs; all embeddings must have the same length.
        index_path: File the FAISS index is written to.
        page_numbers_path: File the page-number list is saved to with
            ``np.save`` (NumPy appends ``.npy`` if the name lacks it). It is a
            parameter so that indexes written to different ``index_path``
            values do not overwrite each other's page mapping.

    Returns:
        ``(index, page_numbers)``: the in-memory FAISS index and the list of
        page numbers, where position ``i`` corresponds to vector ``i``.

    Raises:
        ValueError: If no embeddings are supplied (an index dimension cannot
            be derived from an empty input).
    """
    # Materialise once: the pairs are traversed twice below.
    pairs = list(embeddings_with_page_numbers)
    if not pairs:
        raise ValueError("Cannot build a FAISS index from an empty list of embeddings.")

    embeddings = np.array([emb for _, emb in pairs]).astype('float32')
    page_numbers = [page for page, _ in pairs]

    index = faiss.IndexFlatL2(embeddings.shape[1])  # Use L2 distance
    index.add(embeddings)  # Add embeddings to the FAISS index
    faiss.write_index(index, index_path)
    np.save(page_numbers_path, page_numbers)  # Save page numbers for mapping to retrieved data
    return index, page_numbers

# Step 4: Query Handling (Search the FAISS Index)
def query_embedding(query):
    """Encode a query with the module-level ``model``.

    The query is wrapped in a one-element list before encoding, so the value
    returned is the encoder's output for a batch containing only this query.
    """
    single_item_batch = [query]
    return model.encode(single_item_batch)

def search_index_with_page_numbers(query_embedding, k=3, index_path='faiss_index_with_page_numbers'):
    """Load the FAISS index from disk and look up the nearest stored vectors.

    Args:
        query_embedding: Query vector(s) passed straight to ``index.search``.
        k: Number of neighbours requested per query.
        index_path: File the index is read from on every call.

    Returns:
        The ``(distances, indices)`` pair produced by ``index.search``.
    """
    stored_index = faiss.read_index(index_path)
    distances, indices = stored_index.search(query_embedding, k)
    return distances, indices

# Step 5: Summarization (Using BART)
# The "facebook/bart-large-cnn" summarization pipeline is created once at module
# level and reused by summarize_retrieved_chunks on every call.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_retrieved_chunks(retrieved_chunks):
    """Summarize the retrieved chunks with the module-level ``summarizer``.

    Args:
        retrieved_chunks: Iterable of text chunks; they are joined with single
            spaces before summarization.

    Returns:
        The summary text, or an empty string when the chunks contain no words
        (the summarizer is not called in that case).
    """
    combined_text = " ".join(retrieved_chunks)
    input_length = len(combined_text.split())  # Number of words in the input text

    # Nothing to summarize: avoid calling the model with max_length=0.
    if input_length == 0:
        return ""

    # Cap the summary at 130, but never ask for more than the input's word count.
    max_summary_length = min(input_length, 130)
    # Keep min_length <= max_length; a fixed 30 would exceed it for short inputs.
    min_summary_length = min(30, max_summary_length)

    summary = summarizer(
        combined_text,
        max_length=max_summary_length,
        min_length=min_summary_length,
        do_sample=False,
        truncation=True,  # Clip inputs longer than the model's maximum input size
    )
    return summary[0]['summary_text']

# Step 6: Putting it All Together
def chat_with_pdf(pdf_path, query):
    """Answer a query about a PDF by retrieving relevant chunks and summarizing them.

    The PDF is extracted, chunked, embedded and indexed on every call; the
    query is then embedded, the nearest chunks are looked up in the FAISS
    index, and their text is summarized.

    Args:
        pdf_path: Path to the PDF file.
        query: Natural-language question about the document.

    Returns:
        The summary text produced from the retrieved chunks.
    """
    # Extract, chunk, and generate embeddings
    extracted_data = extract_text_with_page_numbers(pdf_path)
    chunked_data = chunk_text_with_page_numbers(extracted_data)
    embeddings_with_page_numbers = generate_embeddings_with_page_numbers(chunked_data)

    # Store embeddings in FAISS (called for its side effect of writing the index
    # that search_index_with_page_numbers reads back from disk)
    store_embeddings_in_faiss(embeddings_with_page_numbers)

    # Handle the query
    query_emb = query_embedding(query)
    _, indices = search_index_with_page_numbers(query_emb)

    # Retrieve relevant chunks. FAISS fills missing results with -1 when the
    # index holds fewer than k vectors; a bare `i < len(...)` check would let
    # -1 through and silently pick the last chunk, so bound both sides.
    retrieved_chunks = [
        chunked_data[i][1]
        for i in indices[0]
        if 0 <= i < len(chunked_data)
    ]

    # Generate summary
    return summarize_retrieved_chunks(retrieved_chunks)

# Example query
# The PDF is referenced by a relative file name, so it must already exist in the
# current working directory when this cell runs.
pdf_path = "Tables, Charts, and Graphs with Examples from History, Economics, Education, Psychology, Urban Affairs and Everyday Life - 2017-2018.pdf"
query = "What are the different types of data visualizations?"
# Runs the whole pipeline (extract -> chunk -> embed -> index -> search -> summarize).
summary = chat_with_pdf(pdf_path, query)
print("Summary of Retrieved Content:")
print(summary)

Summary of Retrieved Content:
We use charts and graphs to visualize data. This data can either be generated data, data gathered from an experiment, or data collected from some source. e chart or a bar graph.


In [None]:
# Install necessary libraries
!pip install pdfplumber

import pdfplumber

# Function to extract text from a specific page in the PDF
def extract_page_text(pdf_path, page_number):
    """
    Extract text from a specific page in the PDF.
    Args:
        pdf_path: Path to the PDF file.
        page_number: Page number to extract (0-indexed).
    Returns:
        Extracted text as a string (empty if the page yields no text), or an
        "Error: Page N not found in the PDF." message when page_number is
        outside the document, including negative values.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Check both bounds: a negative index would otherwise pass the upper-bound
        # test and silently return a page counted from the end of the document.
        if 0 <= page_number < len(pdf.pages):
            page = pdf.pages[page_number]
            # Normalise a missing result to "" so the return value is always a string.
            return page.extract_text() or ""
        else:
            return f"Error: Page {page_number + 1} not found in the PDF."

# Upload the PDF file to Google Colab
# NOTE(review): google.colab is a Colab-specific module, so this cell is expected
# to run only inside a Colab runtime - confirm before running elsewhere.
from google.colab import files
uploaded = files.upload()

# Extract the PDF file name (the key returned by files.upload())
# Only the first key is used; this rebinds the module-level pdf_path set earlier.
pdf_path = list(uploaded.keys())[0]

# Extract specific pages
page_2_text = extract_page_text(pdf_path, 1)  # Page 2 (0-indexed)
page_6_text = extract_page_text(pdf_path, 5)  # Page 6 (0-indexed)

# Output Page 2 Content: Unemployment Info
# NOTE(review): in the recorded run nothing was printed under this heading, so
# page 2 appears to contain no extractable text - verify against the PDF.
print("=== Page 2: Unemployment Information ===")
print(page_2_text)

# Output Page 6 Content: Tabular Data
print("\n=== Page 6: Tabular Data ===")
print(page_6_text)



Saving Tables, Charts, and Graphs with Examples from History, Economics, Education, Psychology, Urban Affairs and Everyday Life - 2017-2018.pdf to Tables, Charts, and Graphs with Examples from History, Economics, Education, Psychology, Urban Affairs and Everyday Life - 2017-2018 (1).pdf
=== Page 2: Unemployment Information ===


=== Page 6: Tabular Data ===
Table of Yearly U.S. GDP by
Industry (in millions of dollars)
Source: U.S. Bureau of Labor Statistics
Year 2010 2011 2012 2013 2014 2015
All Industries 26093515 27535971 28663246 29601191 30895407 31397023
Manufacturing 4992521 5581942 5841608 5953299 6047477 5829554
Finance,
Insurance, Real
4522451 4618678 4797313 5031881 5339678 5597018
Estate, Rental,
Leasing
Arts,
Entertainment,
Recreation, 964032 1015238 1076249 1120496 1189646 1283813
Accommodation,
and Food Service
Other 15614511 16320113 16948076 17495515 18318606 18686638
