In [None]:
# Kirtan Panchal
# 2301208011

In [9]:
import fitz  # PyMuPDF for PDF handling
import csv
import os
import pdfplumber

def extract_pdf_data(pdf_path):
    """Extract per-page text, images, tables and hyperlinks from a PDF.

    Text, images and links are read with PyMuPDF (``fitz``); tables are
    read with ``pdfplumber``. Every embedded image is written to the
    current working directory as ``image_<page>_<index>.<ext>`` (page and
    index are 1-based).

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A 4-tuple ``(text_data, images, tables, links)``. Each element is
        a list with one entry per page, in page order:

        * ``text_data`` -- the result of ``page.get_text()``.
        * ``images`` -- list of file paths of the images saved for the page.
        * ``tables`` -- the result of pdfplumber's ``extract_tables()``.
        * ``links`` -- list of the ``'uri'`` values of the page's links
          (links without a ``'uri'`` key are skipped).
    """
    text_data = []
    images = []
    links = []

    doc = fitz.open(pdf_path)
    try:
        for page_number in range(len(doc)):
            page = doc.load_page(page_number)

            # Extract text
            text_data.append(page.get_text())

            # Extract images
            page_images = []
            for img_index, img in enumerate(page.get_images(full=True), start=1):
                xref = img[0]
                base_image = doc.extract_image(xref)
                # The bytes are stored as-is (not converted), so name the
                # file after the format PyMuPDF reports instead of always
                # claiming PNG; fall back to "png" if it is not reported.
                extension = base_image.get("ext", "png")
                image_path = f"image_{page_number + 1}_{img_index}.{extension}"
                with open(image_path, 'wb') as img_file:
                    img_file.write(base_image["image"])
                page_images.append(image_path)
            images.append(page_images)

            # Extract hyperlinks (only entries that carry a URI)
            links.append(
                [link['uri'] for link in page.get_links() if 'uri' in link]
            )
    finally:
        # Release the document even if extraction fails part-way through.
        doc.close()

    # Extract tables using pdfplumber
    with pdfplumber.open(pdf_path) as pdf:
        tables = [plumber_page.extract_tables() for plumber_page in pdf.pages]

    return text_data, images, tables, links

def save_to_csv(text_data, images, tables, links, output_csv):
    """Write the extracted per-page data to a CSV file, one row per page.

    Args:
        text_data: List of page texts; its length determines the row count.
        images: Per-page lists of image paths, joined with ``"; "``.
        tables: Per-page table data, written as ``str(...)`` of the value.
        links: Per-page lists of link URIs, joined with ``"; "``.
        output_csv: Path of the CSV file to create (overwritten if present).

    Empty/falsy per-page entries are written as empty cells.
    """
    header = ["Textual Content", "Images", "Table Content", "Links"]

    def joined(items):
        # Collapse a list of strings into one cell; falsy input -> "".
        if not items:
            return ""
        return "; ".join(items)

    with open(output_csv, 'w', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle)
        out.writerow(header)

        for idx, page_text in enumerate(text_data):
            image_cell = joined(images[idx])
            page_tables = tables[idx]
            table_cell = str(page_tables) if page_tables else ""
            link_cell = joined(links[idx])
            out.writerow([page_text, image_cell, table_cell, link_cell])

# Input PDF and output CSV locations (raw strings, so the Windows
# backslashes are taken literally).
pdf_file_path = r'C:\Users\kirta\Downloads\Skill 4 PDF File.pdf'

print(pdf_file_path)
output_csv = r'C:\Users\kirta\Downloads\output.csv'

# Require a regular file: os.path.exists() is also true for a directory,
# which would pass this check and then fail when opened as a PDF.
if not os.path.isfile(pdf_file_path):
    print("The file path is incorrect or the file does not exist.")
else:
    # Extract data from the PDF
    text_data, images, tables, links = extract_pdf_data(pdf_file_path)

    # Save the extracted data to a CSV file
    save_to_csv(text_data, images, tables, links, output_csv)

    print("Extraction completed and saved to CSV.")


C:\Users\kirta\Downloads\Skill 4 PDF File.pdf
Extraction completed and saved to CSV.
