Digital Signatures<br>
Metadata Analysis<br>
PDF Structure Analysis<br>
Using PyMuPDF (fitz) for Detailed Analysis<br>
Extracting Font Styles Used with pdfminer.six

<h1>Digital Signatures</h1>
First, let's check if the PDF has any digital signatures using PyPDF2:

In [5]:
!pip install pypdf2




In [7]:
from PyPDF2 import PdfReader

def check_digital_signature(pdf_file):
    """Print whether ``pdf_file`` contains a filled-in digital signature field.

    Every page's ``/Annots`` array is scanned for a ``/Widget`` annotation
    whose field type (``/FT``) is ``/Sig`` and whose value (``/V``) is a
    dictionary.  A ``/Sig`` field without a ``/V`` entry is only an empty
    placeholder waiting to be signed, so it is not reported as a signature.

    Args:
        pdf_file: Path of the PDF file to inspect.

    Returns:
        None. The verdict is printed to stdout.

    Note:
        Only the presence of a signature value is checked; the signature is
        not cryptographically validated.
    """

    def inherited(field, key, max_depth=32):
        # /FT and /V are inheritable field attributes: a widget that is a kid
        # of a form field may carry them on an ancestor reached via /Parent.
        # max_depth guards against malformed files with /Parent cycles.
        for _ in range(max_depth):
            if not hasattr(field, 'keys'):
                return None
            if key in field:
                return field[key]
            if '/Parent' not in field:
                return None
            field = field['/Parent']
        return None

    def is_signed_widget(annot_obj):
        if annot_obj.get('/Subtype') != '/Widget':
            return False
        if inherited(annot_obj, '/FT') != '/Sig':
            return False
        # A signed field stores its signature dictionary under /V.
        return hasattr(inherited(annot_obj, '/V'), 'keys')

    with open(pdf_file, 'rb') as file:
        reader = PdfReader(file)
        # any() stops at the first signed widget and is fully evaluated while
        # the file is still open.
        signature_found = any(
            is_signed_widget(annot.get_object())
            for page in reader.pages
            if '/Annots' in page
            for annot in page['/Annots']
        )

    if signature_found:
        print("PDF is digitally signed.")
    else:
        print("PDF is not digitally signed.")

# Example usage: run the signature check on each sample PDF in turn.
for pdf_file in ("cisco.pdf", "msword.pdf", "compresssed.pdf"):
    check_digital_signature(pdf_file)


PDF is not digitally signed.
PDF is not digitally signed.
PDF is not digitally signed.


<h1>Metadata Analysis</h1>
Let's proceed with the metadata extraction using PyMuPDF (fitz):

In [9]:
import fitz  

def extract_metadata(pdf_file):
    """Return the metadata dictionary PyMuPDF reports for ``pdf_file``.

    Args:
        pdf_file: Path of the PDF file to open.

    Returns:
        The value of the document's ``metadata`` attribute.
    """
    # The context manager closes the document (and its file handle) as soon
    # as the metadata has been read.
    with fitz.open(pdf_file) as document:
        return document.metadata

# Example usage: print the metadata of the original and the compressed PDF.
for pdf_file in ("cisco.pdf", "compresssed.pdf"):
    metadata = extract_metadata(pdf_file)
    print(f"Metadata: {metadata}")


Metadata: {'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': '', 'producer': 'dompdf 1.2.2 + CPDF', 'creationDate': "D:20240803191606+05'30'", 'modDate': "D:20240803191606+05'30'", 'trapped': '', 'encryption': None}
Metadata: {'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': '', 'producer': 'iLovePDF', 'creationDate': "D:20240803191606+05'30'", 'modDate': 'D:20240808112452Z', 'trapped': '', 'encryption': None}


<h1>PDF Structure Analysis</h1>
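Here's the structure analysis using PyMuPDF. It counts the pages, collects the font, size and color of every text span, and saves each embedded image to disk: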

In [5]:
import fitz  # PyMuPDF
import os

def analyze_pdf_structure_and_extract_images_and_colors(pdf_file, output_dir="extracted_images"):
    """Summarise a PDF's fonts and images and save the images to disk.

    Args:
        pdf_file: Path of the PDF file to analyse.
        output_dir: Directory that receives the extracted images; it is
            created when missing.

    Returns:
        A dict with the keys:
            "number_of_pages": the document's page count.
            "fonts": set of font names seen in text spans.
            "images": file names (not full paths) of the images written to
                ``output_dir``, in extraction order.
            "font_colors": one ``(font, size, color)`` tuple per text span,
                in reading order of the extracted blocks.

    Note:
        Image files are named from the page and image index only, so two
        calls that share ``output_dir`` overwrite each other's files.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    # The context manager guarantees the document is closed, also on errors.
    with fitz.open(pdf_file) as document:
        structure = {
            "number_of_pages": document.page_count,
            "fonts": set(),
            "images": [],
            "font_colors": []
        }

        for page_number, page in enumerate(document, start=1):
            # Walk blocks -> lines -> spans to collect font, size and color.
            for block in page.get_text("dict")["blocks"]:
                if block["type"] != 0:  # 0 marks a text block
                    continue
                for line in block["lines"]:
                    for span in line["spans"]:
                        structure["fonts"].add(span["font"])
                        structure["font_colors"].append(
                            (span["font"], span["size"], span["color"])
                        )

            # Save every image referenced by the page.
            for image_number, image in enumerate(page.get_images(full=True), start=1):
                base_image = document.extract_image(image[0])  # image[0] is the xref
                image_name = f"page_{page_number}_image_{image_number}.{base_image['ext']}"
                with open(os.path.join(output_dir, image_name), "wb") as img_file:
                    img_file.write(base_image["image"])
                structure["images"].append(image_name)

    return structure

# Example usage: analyse the original PDF and its compressed copy.
pdf_file, pdf_file2 = "cisco.pdf", "./compresssed.pdf"

structure_info = analyze_pdf_structure_and_extract_images_and_colors(pdf_file)
print(f"Structure Info: {structure_info}")

structure_info2 = analyze_pdf_structure_and_extract_images_and_colors(pdf_file2)
print(f"Structure Info: {structure_info2}")


Structure Info: {'number_of_pages': 1, 'fonts': {'DejaVuSans-Bold', 'Helvetica'}, 'images': ['page_1_image_1.png', 'page_1_image_2.png', 'page_1_image_3.png', 'page_1_image_4.png'], 'font_colors': [('Helvetica', 12.0, 7434095), ('Helvetica', 12.0, 7434095), ('Helvetica', 12.0, 7434095), ('Helvetica', 15.0, 5607403), ('Helvetica', 10.5, 11184810), ('Helvetica', 18.0, 0), ('Helvetica', 12.800000190734863, 0), ('Helvetica', 12.800000190734863, 0), ('Helvetica', 10.5, 0), ('Helvetica', 10.5, 0), ('Helvetica', 10.5, 0), ('Helvetica', 10.5, 0), ('Helvetica', 10.5, 0), ('Helvetica', 10.5, 0), ('Helvetica', 10.5, 0), ('Helvetica', 10.5, 0), ('Helvetica', 10.5, 0), ('Helvetica', 10.5, 0), ('Helvetica', 10.5, 0), ('Helvetica', 10.5, 0), ('Helvetica', 10.5, 0), ('Helvetica', 10.5, 0), ('Helvetica', 10.5, 0), ('Helvetica', 10.5, 0), ('Helvetica', 10.5, 0), ('DejaVuSans-Bold', 9.800000190734863, 0), ('Helvetica', 8.199999809265137, 0), ('DejaVuSans-Bold', 9.800000190734863, 0), ('Helvetica', 8.1999

<h1>Extracting Font Styles Used with pdfminer.six</h1>

In [3]:
import pandas as pd
from pdfminer.high_level import extract_text
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBox, LTTextLine, LTChar

def extract_font_information(pdf_file):
    """Collect font name, size and color for every character in ``pdf_file``.

    Args:
        pdf_file: Path of the PDF file to analyse.

    Returns:
        A list with one dict per ``LTChar``, holding the keys "Page"
        (1-based page number), "Font", "Size", "Color" (the character's
        non-stroking color) and "Text" (the stripped text of the line the
        character belongs to).
    """

    def hashable_color(color):
        # Lists cannot be used as pandas group-by keys; tuples can.
        # NOTE(review): whether ncolor is ever a list depends on the
        # pdfminer.six version - confirm against the installed release.
        return tuple(color) if isinstance(color, list) else color

    font_data = []
    with open(pdf_file, 'rb') as file:
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page_number, page in enumerate(PDFPage.get_pages(file), start=1):
            interpreter.process_page(page)
            for element in device.get_result():
                # A text box contains text lines; a bare text line is its own
                # single line. Treating a line like a box would try to
                # iterate over its characters' children and raise TypeError.
                if isinstance(element, LTTextLine):
                    text_lines = [element]
                elif isinstance(element, LTTextBox):
                    text_lines = [child for child in element if isinstance(child, LTTextLine)]
                else:
                    continue

                for text_line in text_lines:
                    line_text = text_line.get_text().strip()
                    for char in text_line:
                        if not isinstance(char, LTChar):
                            continue  # skip non-character items in the line
                        font_data.append({
                            "Page": page_number,
                            "Font": char.fontname,
                            "Size": char.size,
                            "Color": hashable_color(char.graphicstate.ncolor),
                            "Text": line_text
                        })
    return font_data

# Extract per-character font information for the two PDFs to compare.
# NOTE(review): pdf_file1 and pdf_file2 are the same path, so both extractions
# (and everything derived from them) are identical - confirm which second
# PDF was meant to be compared.
pdf_file1 = "certificates_dataset/SumitMesta-Linux Essentials-certificate.pdf"
pdf_file2 = "certificates_dataset/SumitMesta-Linux Essentials-certificate.pdf"

font_info1 = extract_font_information(pdf_file1)
font_info2 = extract_font_information(pdf_file2)

font_info_df1 = pd.DataFrame(font_info1)
font_info_df2 = pd.DataFrame(font_info2)

# Group by page, text and font attributes, and count occurrences
def summarize_font_info(df):
    """Count how many rows share the same page, text, font, size and color.

    Args:
        df: DataFrame with the columns "Page", "Text", "Font", "Size" and
            "Color" (one row per character), or an empty DataFrame.

    Returns:
        A DataFrame with one row per distinct combination of the five key
        columns plus a "Count" column. An empty input yields an empty frame
        with those six columns instead of raising ``KeyError``.
    """
    group_keys = ["Page", "Text", "Font", "Size", "Color"]
    if df.empty:
        # A PDF without text produces a DataFrame without any columns, which
        # groupby cannot handle.
        return pd.DataFrame(columns=group_keys + ["Count"])
    return df.groupby(group_keys).size().reset_index(name='Count')

# Summarise each PDF's per-character rows.
font_info_df1_summary = summarize_font_info(font_info_df1)
font_info_df2_summary = summarize_font_info(font_info_df2)

# Lift pandas' display limits so the full tables are shown untruncated.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Show both summaries.
# NOTE(review): the headings say "Cisco.pdf" and "Sumit.pdf", but both frames
# were built from the same certificate path - confirm the intended inputs.
print("Summarized Font Information for Cisco.pdf:")
display(font_info_df1_summary)

print("Summarized Font Information for Sumit.pdf:")
display(font_info_df2_summary)

# Stack the two summaries, turning the concat key into a regular "PDF" column.
# NOTE(review): both keys are the same string, so the "PDF" column cannot
# tell the two sources apart.
comparison_df = (
    pd.concat(
        [font_info_df1_summary, font_info_df2_summary],
        keys=['certificates_dataset/SumitMesta-Linux Essentials-certificate.pdf', 'certificates_dataset/SumitMesta-Linux Essentials-certificate.pdf'],
        names=['PDF', 'Index'],
    )
    .reset_index(level='PDF')
    .reset_index(drop=True)
)
print("Comparison of Summarized Font Information:")
display(comparison_df)


Summarized Font Information for Cisco.pdf:


Unnamed: 0,Page,Text,Font,Size,Color,Count
0,1,Can execute the power of the Linux command line,Helvetica,11.2,"(0.075, 0.365, 0.612)",47
1,1,"Congratulations on successfully completing the NDG Linux Essentials course in the Cisco Networking Academy. By completing this course, you are now",Helvetica,11.2,"(0.075, 0.365, 0.612)",147
2,1,DEVELOPMENT CERTIFICATE SHOWS THAT YOU:,Helvetica,12.8,"(0.075, 0.365, 0.612)",39
3,1,Date 29 Jan 2023,Helvetica-Oblique,11.2,"(0.075, 0.365, 0.612)",16
4,1,"Dear Sumit Mesta,",DejaVuSans,12.8,"(0.075, 0.365, 0.612)",10
5,1,"Dear Sumit Mesta,",Helvetica,12.8,"(0.075, 0.365, 0.612)",6
6,1,Development Certificate!,Helvetica,11.2,"(0.075, 0.365, 0.612)",24
7,1,Go to LPI.org to learn more about the Linux Essentials Professional,Helvetica,11.2,"(0.075, 0.365, 0.612)",68
8,1,Have demonstrated the ability to navigate a Linux system,Helvetica,11.2,"(0.075, 0.365, 0.612)",56
9,1,Have the motivation to advance your IT career,Helvetica,11.2,"(0.075, 0.365, 0.612)",45


Summarized Font Information for Sumit.pdf:


Unnamed: 0,Page,Text,Font,Size,Color,Count
0,1,Can execute the power of the Linux command line,Helvetica,11.2,"(0.075, 0.365, 0.612)",47
1,1,"Congratulations on successfully completing the NDG Linux Essentials course in the Cisco Networking Academy. By completing this course, you are now",Helvetica,11.2,"(0.075, 0.365, 0.612)",147
2,1,DEVELOPMENT CERTIFICATE SHOWS THAT YOU:,Helvetica,12.8,"(0.075, 0.365, 0.612)",39
3,1,Date 29 Jan 2023,Helvetica-Oblique,11.2,"(0.075, 0.365, 0.612)",16
4,1,"Dear Sumit Mesta,",DejaVuSans,12.8,"(0.075, 0.365, 0.612)",10
5,1,"Dear Sumit Mesta,",Helvetica,12.8,"(0.075, 0.365, 0.612)",6
6,1,Development Certificate!,Helvetica,11.2,"(0.075, 0.365, 0.612)",24
7,1,Go to LPI.org to learn more about the Linux Essentials Professional,Helvetica,11.2,"(0.075, 0.365, 0.612)",68
8,1,Have demonstrated the ability to navigate a Linux system,Helvetica,11.2,"(0.075, 0.365, 0.612)",56
9,1,Have the motivation to advance your IT career,Helvetica,11.2,"(0.075, 0.365, 0.612)",45


Comparison of Summarized Font Information:


Unnamed: 0,PDF,Page,Text,Font,Size,Color,Count
0,certificates_dataset/SumitMesta-Linux Essentials-certificate.pdf,1,Can execute the power of the Linux command line,Helvetica,11.2,"(0.075, 0.365, 0.612)",47
1,certificates_dataset/SumitMesta-Linux Essentials-certificate.pdf,1,"Congratulations on successfully completing the NDG Linux Essentials course in the Cisco Networking Academy. By completing this course, you are now",Helvetica,11.2,"(0.075, 0.365, 0.612)",147
2,certificates_dataset/SumitMesta-Linux Essentials-certificate.pdf,1,DEVELOPMENT CERTIFICATE SHOWS THAT YOU:,Helvetica,12.8,"(0.075, 0.365, 0.612)",39
3,certificates_dataset/SumitMesta-Linux Essentials-certificate.pdf,1,Date 29 Jan 2023,Helvetica-Oblique,11.2,"(0.075, 0.365, 0.612)",16
4,certificates_dataset/SumitMesta-Linux Essentials-certificate.pdf,1,"Dear Sumit Mesta,",DejaVuSans,12.8,"(0.075, 0.365, 0.612)",10
5,certificates_dataset/SumitMesta-Linux Essentials-certificate.pdf,1,"Dear Sumit Mesta,",Helvetica,12.8,"(0.075, 0.365, 0.612)",6
6,certificates_dataset/SumitMesta-Linux Essentials-certificate.pdf,1,Development Certificate!,Helvetica,11.2,"(0.075, 0.365, 0.612)",24
7,certificates_dataset/SumitMesta-Linux Essentials-certificate.pdf,1,Go to LPI.org to learn more about the Linux Essentials Professional,Helvetica,11.2,"(0.075, 0.365, 0.612)",68
8,certificates_dataset/SumitMesta-Linux Essentials-certificate.pdf,1,Have demonstrated the ability to navigate a Linux system,Helvetica,11.2,"(0.075, 0.365, 0.612)",56
9,certificates_dataset/SumitMesta-Linux Essentials-certificate.pdf,1,Have the motivation to advance your IT career,Helvetica,11.2,"(0.075, 0.365, 0.612)",45
