In [1]:
# The code extracts metadata, outlines, text, links, and comments from a PDF file. It uses PyPDF2 to read the PDF, 
## gathers basic information like author and title, counts pages and form fields, 
## and navigates through the document's structure to list sections and annotations. The results are compiled into a structured summary of the PDF's contents and metadata.

from PyPDF2 import PdfReader

def get_outlines(outlines, reader, level=0):
    sections = []
    for outline in outlines:
        if isinstance(outline, list):
            sections.extend(get_outlines(outline, reader, level + 1))
        else:
            if hasattr(outline, 'page') and outline.page is not None:
                # Resolve the IndirectObject to get the actual PageObject
                page_obj = reader.get_object(outline.page)
                if page_obj:
                    # Assuming '/StructParents' gives the page number, adjust if necessary
                    page_num = page_obj.get('/StructParents')
                    sections.append({"level": level, "title": outline.title, "page": page_num})
    return sections

def extract_metadata_from_pdf(file_path):
    with open(file_path, 'rb') as f:
        reader = PdfReader(f)
        info = reader.metadata
        number_of_pages = len(reader.pages)
        text = ''
        for page in reader.pages:
            text += page.extract_text() + ' '

        form_fields = reader.get_form_text_fields() if reader.get_form_text_fields() is not None else {}
        number_of_forms = len(form_fields)

        outlines = []
        if hasattr(reader, 'outline') and reader.outline:
            outlines = get_outlines(reader.outline, reader)

        links = []
        comments = []
        for page in reader.pages:
            annots = page.get("/Annots")
            if annots:
                if isinstance(annots, list):
                    for annot in annots:
                        annot_obj = annot.get_object()
                        if annot_obj.get("/Subtype") == "/Link":
                            links.append(annot_obj.get("/A").get("/URI"))
                        elif annot_obj.get("/Subtype") == "/Text":
                            comments.append(annot_obj.get("/Contents"))

        creation_date = info.get('CreationDate', 'Unknown')
        modification_date = info.get('ModDate', 'Unknown')

        return {
            'Author': info.get('Author', 'Unknown'),
            'Title': info.get('Title', 'No Title'),
            'Subject': info.get('Subject', 'No Subject'),
            'Creator': info.get('Creator', 'Unknown Creator'),
            'Producer': info.get('Producer', 'Unknown Producer'),
            'CreationDate': creation_date,
            'ModificationDate': modification_date,
            'NumberOfPages': number_of_pages,
            'NumberOfForms': number_of_forms,
            'Outlines': outlines,
            'Links': links,
            'Comments': comments
        }

file_path = 'The-Field-Guide-to-Data-Science.pdf'  # Update this to the path of your PDF
metadata = extract_metadata_from_pdf(file_path)

In [2]:
metadata

{'Author': 'Unknown',
 'Title': 'No Title',
 'Subject': 'No Subject',
 'Creator': 'Unknown Creator',
 'Producer': 'Unknown Producer',
 'CreationDate': 'Unknown',
 'ModificationDate': 'Unknown',
 'NumberOfPages': 110,
 'NumberOfForms': 0,
 'Outlines': [{'level': 0, 'title': 'The Outline of Our Story', 'page': None},
  {'level': 0, 'title': 'Model Validation', 'page': None},
  {'level': 0, 'title': 'Meet your Guides', 'page': None},
  {'level': 0, 'title': 'The Short Version', 'page': None},
  {'level': 0, 'title': 'Start Here for the Basics', 'page': None},
  {'level': 1, 'title': 'What Do We Mean by Data Science?', 'page': None},
  {'level': 1, 'title': 'How Does Data Science Actually Work?', 'page': None},
  {'level': 1,
   'title': 'What Does It Take to Create a Data Science Capability?',
   'page': None},
  {'level': 0, 'title': 'Take off the Training Wheels', 'page': None},
  {'level': 1, 'title': 'Guiding Principles ', 'page': None},
  {'level': 1, 'title': 'The Importance of Reas