# PDF parser


In [5]:
%pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [45]:
import os
import re
import unicodedata
from PyPDF2 import PdfReader

# 1. Loading PDF file paths from a folder
def load_pdf_paths(pdf_folder):
    return [os.path.join(pdf_folder, f) for f in os.listdir(pdf_folder) if f.lower().endswith(".pdf")]

# 2. Extracting text and form fields from PDF files
def get_pdf_text_and_fields(pdf_folder_path):
    all_text = ""
    all_fields = {}

    pdf_files = load_pdf_paths(pdf_folder_path)

    for pdf_path in pdf_files:
        reader = PdfReader(pdf_path)

        # Extracting text from pages
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                all_text += page_text + "\n"

        # Extracting form fields (user input)
        fields = reader.get_fields()
        if fields:
            for key, field in fields.items():
                field_value = field.get('/V')
                all_fields[key] = str(field_value) if field_value is not None else None


    # print("Extracted Text:\n", all_text)
    print("Form field keys:", all_fields.keys())
    print("\nExtracted Form Fields (User Input):\n", all_fields)
    return all_text, all_fields

# 3. Cleaning the text
def clean_text(text):
    text = unicodedata.normalize("NFKD", text) # needed for special characters DO IT IN OTHER TYPES OF FILE TOO
    text = re.sub(r"[^a-zA-Z0-9.,!?%€$-]", " ", text)
    text = text.lower()
    text = re.sub(r"\s+", " ", text).strip()
    text = re.sub(r"\.{5,}", " ", text)
    return text

# 6. Extracting and storing client info from cleaned text
def extract_client_info(form_fields):

    signature = form_fields.get("Specimen Signature:")
    signature_filled = bool(signature and str(signature).strip() != "" and str(signature) != "/Off")
    print(signature_filled)

    # Just return the form_fields directly as client_info
    print("Extracted client info:", form_fields)
    return form_fields


# 7. Main PDF processing function
def process_pdfs(pdf_folder):
    raw_text, form_fields = get_pdf_text_and_fields(pdf_folder)
    client_info = extract_client_info(form_fields)

    return {
        "client_info": client_info
    }

# 8. Run the pipeline
if __name__ == "__main__":
    folder_path = "/content/client_1"  # <- update this path as needed
    data = process_pdfs(folder_path)
    print("\nPipeline complete. Data summary:")
    print(data)

# TO DO FOR CLASSIFICATION :
# if any field is None -- Reject
# if currency all '/Off' & 'other_ccy' not appearing -- Reject


Form field keys: dict_keys(['account_name', 'account_holder_name', 'account_holder_surname', 'passport_number', 'chf', 'eur', 'usd', 'other_ccy', 'building_number', 'postal_code', 'city', 'country', 'name', 'phone_number', 'email', 'street_name'])

Extracted Form Fields (User Input):
 {'account_name': 'Joona Onni Niskanen', 'account_holder_name': 'Joona Onni', 'account_holder_surname': 'Niskanen', 'passport_number': 'GB7720776', 'chf': '/Off', 'eur': '/Yes', 'usd': '/Off', 'other_ccy': '', 'building_number': '61', 'postal_code': '28356', 'city': 'Oulu', 'country': 'Finland', 'name': 'Joona Onni Niskanen', 'phone_number': '+358 049 614 09 81', 'email': 'joona.niskanen@gmail.com', 'street_name': 'Pyynikintie'}
False
Extracted client info: {'account_name': 'Joona Onni Niskanen', 'account_holder_name': 'Joona Onni', 'account_holder_surname': 'Niskanen', 'passport_number': 'GB7720776', 'chf': '/Off', 'eur': '/Yes', 'usd': '/Off', 'other_ccy': '', 'building_number': '61', 'postal_code': '283

# Checking if PDF was signed or not

In [46]:
%pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.6-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20250327 (from pdfplumber)
  Downloading pdfminer_six-20250327-py3-none-any.whl.metadata (4.1 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.6-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.2/60.2 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250327-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [47]:
import pdfplumber

def detect_signature_in_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            # Getting text from the page
            text = page.extract_text()

            if text and ("specimen signature" in text.lower()):
                # You can also extract the name/signature value next to it, if present
                pattern = r"specimen signature\s*:?\s*([A-Za-z\s]+)"
                match = re.search(pattern, text, re.IGNORECASE)
                if match:
                    print(f"Signature filled with: {match.group(1)}")
                else:
                    print("Specimen signature label found, but no value next to it.")
            else:
                print(f"No signature field found on page {page_num + 1}.")

pdf_path = "/content/client_1/account.pdf"
detect_signature_in_pdf(pdf_path)




Signature field found on page 1.
Signature filled with: BANK JULIUS BAER 


In [52]:
import pdfplumber

def detect_signature_as_image(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        page = pdf.pages[0]

        # Checking if there's an image on the page (signature is image I think)
        images = page.images

        if images:
            print("Signature found on the page.")
        else:
            print("No images found on the page.")

pdf_path = "/content/client_1/account.pdf"
detect_signature_as_image(pdf_path)

# TO DO in classifiation compare signature



Signature found on the page.
