# PDF parser


In [1]:
%pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m225.3/232.6 kB[0m [31m10.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [6]:
%pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.6-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20250327 (from pdfplumber)
  Downloading pdfminer_six-20250327-py3-none-any.whl.metadata (4.1 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.6-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.2/60.2 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250327-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

# FINAL CLASS TO BE USED

In [2]:
import os
import re
import unicodedata
from PyPDF2 import PdfReader
import pdfplumber


# PDF Processor

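`PDFProcessor` loads every PDF in a folder, extracts the page text and the form-field values, and checks the first page of the first PDF for an embedded image, which it treats as a signature.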

In [7]:
class PDFProcessor:
    """Extract text, form-field values and a signature flag from the PDFs in a folder.

    Attributes:
        folder_path: Folder that is scanned for ``.pdf`` files.
        pdf_paths: Sorted paths of the PDF files found in ``folder_path``.
        text: Concatenated page text of all PDFs (cleaned after ``clean_text``).
        form_fields: Form-field name -> value (as ``str``) or ``None`` when unset.
        signature_found: Result of the last ``detect_signature_as_image`` call.
    """

    def __init__(self, folder_path):
        """Remember the folder and list the PDF files it contains.

        Args:
            folder_path: Path of the folder to scan for PDF files.

        Raises:
            OSError: Propagated from ``os.listdir`` if the folder cannot be read.
        """
        self.folder_path = folder_path
        self.pdf_paths = self.load_pdf_paths()
        self.text = ""
        self.form_fields = {}
        self.signature_found = False

    # 1. Loading PDF file paths from a folder
    def load_pdf_paths(self):
        """Return the paths of all ``.pdf`` files (case-insensitive) in the folder.

        The list is sorted because ``os.listdir`` returns entries in arbitrary
        order and ``detect_signature_as_image`` depends on which PDF comes first.
        """
        return sorted(
            os.path.join(self.folder_path, f)
            for f in os.listdir(self.folder_path)
            if f.lower().endswith(".pdf")
        )

    # 2. Extracting text and form fields from PDF files
    def extract_text_and_fields(self):
        """Read every PDF and store its page text and form-field values.

        Page texts are concatenated (one trailing newline per page) into
        ``self.text``. Form fields from all PDFs are merged into
        ``self.form_fields``; when two PDFs share a field name, the value from
        the later PDF wins. A field without a ``/V`` entry is stored as ``None``.
        """
        text_parts = []
        all_fields = {}

        for pdf_path in self.pdf_paths:
            reader = PdfReader(pdf_path)

            # Extract text; pages without extractable text are skipped.
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    text_parts.append(page_text + "\n")

            # Extract form fields; get_fields() is falsy when there are none.
            fields = reader.get_fields()
            if fields:
                for key, field in fields.items():
                    value = field.get("/V")
                    all_fields[key] = str(value) if value is not None else None

        self.text = "".join(text_parts)
        self.form_fields = all_fields

    # 3. Cleaning the text
    def clean_text(self):
        """Normalize ``self.text`` in place to lowercase, single-spaced text.

        Steps: NFKD-normalize, drop combining marks, replace every character
        outside ``a-zA-Z0-9.,!?%€$-`` with a space, lowercase, replace runs of
        five or more dots with a space, then collapse whitespace.
        """
        text = unicodedata.normalize("NFKD", self.text)
        # NFKD splits "ü" into "u" + a combining mark; drop the mark so the
        # filter below does not turn it into a space in the middle of a word.
        text = "".join(ch for ch in text if not unicodedata.combining(ch))
        text = re.sub(r"[^a-zA-Z0-9.,!?%€$-]", " ", text)
        text = text.lower()
        # Remove dot runs before collapsing whitespace so no double spaces
        # are left behind.
        text = re.sub(r"\.{5,}", " ", text)
        text = re.sub(r"\s+", " ", text).strip()
        self.text = text

    # 4. Extracting and storing client info from the form fields
    def extract_client_info(self):
        """Return a copy of the form fields plus the ``signature_image_found`` flag."""
        client_info = dict(self.form_fields)  # Copy so callers cannot mutate our state
        client_info["signature_image_found"] = self.signature_found
        return client_info

    # 5. Checking for signature
    def detect_signature_as_image(self):
        """Check whether the first page of the first PDF contains any image.

        Any embedded image is treated as a signature. The result is stored in
        ``self.signature_found`` and a status line is printed.

        Returns:
            ``True`` if at least one image was found, otherwise ``False``
            (including when there is no PDF or the PDF has no pages).
        """
        self.signature_found = False

        if self.pdf_paths:
            with pdfplumber.open(self.pdf_paths[0]) as pdf:
                if pdf.pages:
                    self.signature_found = bool(pdf.pages[0].images)

        if self.signature_found:
            print("Signature found.")
        else:
            print("No signature found.")
        return self.signature_found

    def run_pipeline(self):
        """Run extraction, cleaning and signature detection.

        Returns:
            A dict with one key, ``"client_info"``, holding the result of
            ``extract_client_info``.
        """
        self.extract_text_and_fields()
        self.clean_text()
        self.detect_signature_as_image()

        return {
            "client_info": self.extract_client_info()
        }

# TODO (classification):
# - Reject if any form field is None.
# - Reject if all currency checkboxes are '/Off' and 'other_ccy' is not filled in.

# Processing

Here we run the pipeline on the PDFs in the input folder and save the extracted client information as JSON in the output folder.

In [13]:
import json
import os


# Both folders are relative to the current working directory.
input_folder_path = "../downloads/"
output_folder_path = "../data/"

# Run every extraction step on the PDFs found in the input folder.
processor = PDFProcessor(input_folder_path)
data = processor.run_pipeline()
print("\nAll steps completed. Data retrieved:")

# Save the result as pretty-printed JSON, creating the output folder if needed.
os.makedirs(output_folder_path, exist_ok=True)
output_json_path = os.path.join(output_folder_path, "account.pdf.json")
with open(output_json_path, "w") as json_from_pdf:
    pdf_pretty_json = json.dumps(data, indent=2)
    json_from_pdf.write(pdf_pretty_json)

print(data)

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


Signature found.

All steps completed. Data retrieved:
{'client_info': {'account_name': 'Lia Sara Costa', 'account_holder_name': 'Lia Sara', 'account_holder_surname': 'Costa', 'passport_number': 'SV5946944', 'chf': '/Off', 'eur': '/Yes', 'usd': '/Off', 'other_ccy': '', 'building_number': '27', 'postal_code': '7771-876', 'city': 'Porto', 'country': 'Portugal', 'name': 'Lia Sara Costa', 'phone_number': '+351 965 492 552', 'email': 'lia.costa@yahoo.com', 'street_name': 'Avenida Almirante Reis', 'signature_image_found': True}}
