In [2]:
!pip install fitz



In [1]:
!pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
# NOTE(review): nothing in the visible notebook imports `frontend`. Presumably this
# was added as a workaround for an import error raised by the PyPI `fitz` package;
# confirm, and drop this cell if only PyMuPDF (`pymupdf`) is installed.
!pip install frontend

Collecting frontend
  Downloading frontend-0.0.3-py3-none-any.whl.metadata (847 bytes)
Collecting starlette>=0.12.0 (from frontend)
  Downloading starlette-0.42.0-py3-none-any.whl.metadata (6.0 kB)
Collecting uvicorn>=0.7.1 (from frontend)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting aiofiles (from frontend)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Downloading frontend-0.0.3-py3-none-any.whl (32 kB)
Downloading starlette-0.42.0-py3-none-any.whl (73 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading uvicorn-0.34.0-py3-none-any.whl (62 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.3/62.3 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading aiofiles-24.1.0-py3-none-any.whl (15 kB)
Installing collected packages: uvicorn, aiofiles, starlette, frontend
Successfully installed aiofiles-24.1.0 frontend-0.0.3 starlette-0.4

In [17]:
# Install necessary libraries and create required directories
!pip install --upgrade pymupdf
!pip install pdfplumber
!pip install colorama
!mkdir static

import os
import re

import fitz
import pdfplumber
from colorama import Fore, Style


# --- Helper Functions ---

def extract_page_content(pdf_path, page_num):
    """
    Read the text and the tables of a single PDF page with pdfplumber.

    Args:
        pdf_path: Path of the PDF file to open.
        page_num: Zero-based index of the page to read.

    Returns:
        A ``(text, tables)`` tuple holding the results of ``extract_text()``
        and ``extract_tables()`` for the page. ``(None, None)`` is returned
        when ``page_num`` is outside the document or when any exception is
        raised while reading (the error is printed in red).
    """
    try:
        with pdfplumber.open(pdf_path) as document:
            all_pages = document.pages
            page_in_range = 0 <= page_num < len(all_pages)
            if not page_in_range:
                return None, None
            selected = all_pages[page_num]
            # Text is extracted first, then tables, and both are returned together.
            return selected.extract_text(), selected.extract_tables()
    except Exception as err:
        message = f"Error reading text and tables: {err}"
        print(Fore.RED + message + Style.RESET_ALL)
        return None, None


def extract_images(pdf_path, page_num, img_dir):
    """
    Extract the images embedded in one page of a PDF and save them to disk.

    Args:
        pdf_path: Path of the PDF file to open.
        page_num: Zero-based index of the page to scan.
        img_dir: Directory the image files are written to; created if missing.

    Returns:
        A list with the path of every image file written (empty when the page
        holds no images), ``None`` when ``page_num`` is outside the document,
        or an empty list when an exception occurs (the error is printed in red).
    """
    try:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(img_dir, exist_ok=True)
        images = []
        with fitz.open(pdf_path) as pdf:
            if not (0 <= page_num < len(pdf)):
                return None
            page = pdf[page_num]
            for i, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                base_img = pdf.extract_image(xref)
                img_bytes = base_img["image"]
                # The bytes are written unchanged, so name the file after the
                # format reported by extract_image() instead of always ".png"
                # (a JPEG saved as ".png" is mislabeled). Fall back to "png"
                # only if no extension is reported.
                img_ext = base_img.get("ext") or "png"
                img_filename = os.path.join(
                    img_dir, f"page_{page_num + 1}_img{i + 1}.{img_ext}"
                )
                with open(img_filename, "wb") as img_file:
                    img_file.write(img_bytes)
                images.append(img_filename)
        return images
    except Exception as e:
        print(Fore.RED + f"Error extracting images: {e}" + Style.RESET_ALL)
        return []


def parse_user_query(query):
    """
    Extract the requested page numbers from a free-text query.

    Every occurrence of the word "page" (any letter case, optionally plural)
    must be followed by a number, or by a list of numbers separated by commas,
    "and" or "&".

    Examples:
        "Extract data from page 1 and page 3" -> [0, 2]
        "Page 6"                              -> [5]
        "pages 2, 4 and 5"                    -> [1, 3, 4]

    Args:
        query: The user's request as a string.

    Returns:
        A list of zero-based page indices, in the order they were mentioned.

    Raises:
        ValueError: If the query mentions no page at all, or if a "page"
            keyword is not followed by a number.
    """
    error_message = "Oops! Couldn't understand the page numbers in your query. Please try again."

    # One number, optionally followed by more numbers joined by ",", "and" or "&".
    number_list = r"\d+(?:(?:\s*(?:,|and|&))+\s*\d+)*"
    # After the keyword allow a plural "s" and an optional "#" or ":" before the numbers.
    mention_pattern = r"s?\s*[#:]?\s*(" + number_list + r")"

    # Case-insensitive split so "Page 6" is understood just like "page 6".
    mentions = re.split(r"page", query, flags=re.IGNORECASE)[1:]
    if not mentions:
        # No "page" keyword at all: report it instead of silently returning [].
        raise ValueError(error_message)

    pages = []
    for mention in mentions:
        match = re.match(mention_pattern, mention, flags=re.IGNORECASE)
        if match is None:
            raise ValueError(error_message)
        # Users count pages from 1; the extractors index from 0.
        pages.extend(int(number) - 1 for number in re.findall(r"\d+", match.group(1)))
    return pages


def process_single_page(pdf_path, page_num, img_dir):
    """
    Build a colorized report of the text, tables and images of a single page.

    Args:
        pdf_path: Path of the PDF file to read.
        page_num: Zero-based index of the page.
        img_dir: Directory that extracted images are saved into.

    Returns:
        A string holding the page header followed by the text, table and image
        sections, or a single red "does not exist" line when
        ``extract_page_content`` returns ``(None, None)``.
    """
    text, tables = extract_page_content(pdf_path, page_num)
    # extract_page_content() signals a missing page (or a read failure) with
    # (None, None). Require both to be None so that a page which merely has no
    # text is not reported as missing.
    if text is None and tables is None:
        return f"{Fore.RED}Page {page_num + 1} does not exist in the PDF.{Style.RESET_ALL}\n"

    # Collect the pieces in a list and join once instead of repeated string +=.
    parts = [
        f"\n{Fore.YELLOW}--- Data from Page {page_num + 1} ---{Style.RESET_ALL}\n",
        f"\n{Fore.GREEN}Text Content:{Style.RESET_ALL}\n{text or 'No text found on this page.'}\n",
    ]

    if tables:
        parts.append(f"\n{Fore.CYAN}Tables Found:{Style.RESET_ALL}\n")
        for idx, table in enumerate(tables, 1):
            parts.append(f"\nTable {idx}:\n")
            for row in table:
                # Empty cells may come back as None; show them as blank rather
                # than as the literal text "None".
                cells = ("" if cell is None else str(cell) for cell in row)
                parts.append(" | ".join(cells) + "\n")
    else:
        parts.append(f"\n{Fore.MAGENTA}No tables found on this page.{Style.RESET_ALL}\n")

    # extract_images() may return None or [] — both mean "nothing to list".
    images = extract_images(pdf_path, page_num, img_dir)
    if images:
        parts.append(f"\n{Fore.BLUE}Images Extracted:{Style.RESET_ALL}\n" + "\n".join(images) + "\n")
    else:
        parts.append(f"\n{Fore.MAGENTA}No images found on this page.{Style.RESET_ALL}\n")

    return "".join(parts)


def process_user_query(query, pdf_path, img_dir):
    """
    Turn a user query into the combined report for every requested page.

    Args:
        query: Free-text request that names the pages to extract.
        pdf_path: Path of the PDF file to read.
        img_dir: Directory that extracted images are saved into.

    Returns:
        The per-page reports concatenated in the order the pages were parsed,
        or the parser's error message wrapped in red when the query cannot be
        parsed.
    """
    try:
        requested_pages = parse_user_query(query)
    except ValueError as parse_error:
        return f"{Fore.RED}{parse_error}{Style.RESET_ALL}"

    page_reports = (
        process_single_page(pdf_path, index, img_dir) for index in requested_pages
    )
    return "".join(page_reports)


# --- Main Execution ---

def main(pdf_path="/content/sithafaltask1/sithafal file.pdf", img_dir="/content/output"):
    """
    Run the interactive extractor: greet the user, read a query, print the result.

    Args:
        pdf_path: Path of the PDF file to read. Defaults to the path this
            notebook was originally written against.
        img_dir: Directory that extracted images are saved into.
    """
    print(Fore.BLUE + "Welcome to the PDF Data Extractor!" + Style.RESET_ALL)
    print("You can extract text, tables, and images from specific pages of a PDF.")
    print("For example, try: 'Extract data from page 1 and page 3'")

    # Fail early with a clear message: otherwise a missing file surfaces later
    # as a misleading "Page N does not exist in the PDF." line for every page.
    if not os.path.isfile(pdf_path):
        print(Fore.RED + f"PDF file not found: {pdf_path}" + Style.RESET_ALL)
        return

    user_query = input(Fore.YELLOW + "\nEnter your query: " + Style.RESET_ALL)
    output = process_user_query(user_query, pdf_path, img_dir)

    print(Fore.GREEN + "\n--- Extraction Results ---\n" + Style.RESET_ALL)
    print(output)


if __name__ == "__main__":
    main()


Collecting colorama
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama
Successfully installed colorama-0.4.6
mkdir: cannot create directory ‘static’: File exists
[34mWelcome to the PDF Data Extractor![0m
You can extract text, tables, and images from specific pages of a PDF.
For example, try: 'Extract data from page 1 and page 3'
[33m
Enter your query: [0mpage 6
[32m
--- Extraction Results ---
[0m

[33m--- Data from Page 6 ---[0m

[32mText Content:[0m
Table of Yearly U.S. GDP by
Industry (in millions of dollars)
Source: U.S. Bureau of Labor Statistics
Year 2010 2011 2012 2013 2014 2015
All Industries 26093515 27535971 28663246 29601191 30895407 31397023
Manufacturing 4992521 5581942 5841608 5953299 6047477 5829554
Finance,
Insurance, Real
4522451 4618678 4797313 5031881 5339678 5597018
Estate, Rental,
Leasing
Arts,
Entertainment,
Recreation, 964032 1015238 1076249 11