# PDF Text Extraction Notebook

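This notebook extracts the text of each sample PDF, uses regular expressions to pull out key fields (matter name, account number, balance, instalment, arrears, and address), and then opens a Word template and reports how many paragraphs it contains.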

In [23]:
# Import required libraries
import PyPDF2
import re
from pathlib import Path
from docx import Document
import io

In [19]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF file.

    Args:
        pdf_path: Location of the PDF; passed straight to ``open`` in
            binary mode, so a ``str`` or ``pathlib.Path`` both work.

    Returns:
        str: The text of all pages joined in page order with no separator.
        A page whose ``extract_text()`` yields nothing contributes an
        empty string.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Extraction stays inside the `with` block so the file handle is
        # still open while the pages are read.
        # `or ""` guards against a page whose extract_text() returns None,
        # which would otherwise raise TypeError when building the result.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    # Single join instead of repeated `+=` on a growing string.
    return "".join(page_texts)

In [20]:
def _clean_address(raw_address):
    """Strip phone fragments and the stand-number tail from a raw address block."""
    cleaned = raw_address.strip()
    # Remove each phone label together with the rest of its line (and any
    # whitespace immediately before the label), one label at a time.
    for label in ('PER PHONE', 'BUS PHONE', 'CELL PHONE'):
        cleaned = re.sub(rf'\s*{label}.*?(?=\n|$)', '', cleaned, flags=re.IGNORECASE)

    kept_lines = []
    for line in cleaned.split('\n'):
        line = line.strip()
        if not line:
            continue  # skip blank lines
        if 'STAND NUMBER' in line.upper():
            break  # discard "STAND NUMBER" and everything after it
        kept_lines.append(line)
    return '\n'.join(kept_lines)


def extract_data_from_text(full_text):
    """Extract account fields from the text of a PDF using regular expressions.

    Args:
        full_text: Text to search. ``None`` or an empty string yields a
            result in which every field is ``None``.

    Returns:
        dict: Keys ``matter_name``, ``account_number``, ``balance_amount``,
        ``instalment_amount``, ``arrears_amount`` and ``address``. Each value
        is the stripped text captured by the field's pattern, or ``None``
        when the pattern does not match. The address additionally has phone
        fragments removed, blank lines dropped, and is cut at the first line
        containing "STAND NUMBER"; its remaining lines are joined with
        newlines.
    """
    # Patterns for all fields (first capture group is the value)
    patterns = {
        'matter_name': r'(?:\*JNT\s+)?(.*?)\s+CONT PERSON',
        'account_number': r'ACCOUNT #\s*([\d-]+)',
        'balance_amount': r'CURRENT BALANCE\.\.:\s*([\d,.]+)',
        'instalment_amount': r'PAYMENT AMOUNT\.\.\.:\s*([\d,.]+)',
        'arrears_amount': r'PAST DUE AMOUNT\.\.:\s*([\d,.]+)',
        'address': r'CONT PERSON\.:[^\n]*\n(.*?)(?=\s+BRANCH|$)'
    }
    data = dict.fromkeys(patterns)  # every field starts as None

    if not full_text:
        # Nothing to search; avoids a TypeError from re.search on None.
        return data

    for key, pattern in patterns.items():
        is_address = key == 'address'
        # The address spans several lines, so '.' must also match newlines.
        flags = (re.IGNORECASE | re.DOTALL) if is_address else re.IGNORECASE
        match = re.search(pattern, full_text, flags)
        if not match:
            continue
        captured = match.group(1)
        data[key] = _clean_address(captured) if is_address else captured.strip()

    return data

In [21]:
# Folder holding the sample PDFs and the file names to process.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run elsewhere after pointing MAIN_PATH at a local copy of the samples.
MAIN_PATH = Path('/Users/philipseimenis/Documents/work/Van Hulsteyns/populate_word/data/sample_pdfs/')
pdf_list = ['3000013304687.pdf', '3000702544455.pdf', '3000706982902.pdf', 'original.pdf', 'testrunforme.pdf']

for pdf in pdf_list:
    pdf_path = MAIN_PATH / pdf
    full_text = extract_text_from_pdf(pdf_path)

    # Parse the text once per PDF and reuse the result, instead of re-running
    # every regular expression for each individual field.
    extracted = extract_data_from_text(full_text)
    matter_name = extracted['matter_name']
    account_number = extracted['account_number']
    balance_amount = extracted['balance_amount']
    instalment_amount = extracted['instalment_amount']
    arrears_amount = extracted['arrears_amount']
    address = extracted['address']

    # One field per line, followed by a separator between documents.
    for value in (matter_name, account_number, balance_amount,
                  instalment_amount, arrears_amount, address):
        print(value)
    print('<------------------->')



GONYANE-M A/XABA-L I
3-000-013-304-687
991,285.85
8,806.06
329,386.57
2221 BROADACRES DRIVE
FOURWAYS EXT 50 + 59
<------------------->
MR JAMES M THIPE
3-000-702-544-455
173,243.93
2,969.34
56,146.13
62
LONG BEACH STREET
EVATON WEST
<------------------->
NKOSI-P / NKOSI-B
3-000-706-982-902
385,553.10
5,729.11
53,897.99
44
LUNGILE STREET
DIEPKLOOF ZONE 5
SOWETO
<------------------->
NGWENYA-E / NTULI-N
3-000-703-367-417
437,103.34
4,900.97
42,079.06
3801
WHITE ONYX CRESCENT
DAWN PARK EXT 7
<------------------->
MR JERRY R NTSHABELE
3-000-705-885-172
184,478.26
2,658.95
26,048.45
436
MAIN STREET
ROUXVILLE
JOHANNESBURG
<------------------->


In [34]:
# Location of the Word template to inspect
doc_path = '/Users/philipseimenis/Documents/work/Van Hulsteyns/populate_word/data/sample_templates/template.docx'

# Load the template via docx.Document
doc = Document(doc_path)

# Report how many paragraphs the template contains
paragraph_count = len(doc.paragraphs)
print(paragraph_count)

125
