In [None]:
import os
import re
import pdfplumber
import pandas as pd
import camelot
from babel.numbers import parse_decimal


In [None]:
pdf_dir = r"C:\Users\Altersense\Desktop\ERP-RPA\Sample\Other Samples"
# pdf_dir = r"C:\Users\Altersense\Desktop\ERP-RPA\Sample\Main"

# Header pattern like "*05B00041981*" (compiled once, not once per file)
header_pattern = re.compile(r'^\*\d{2}B\d+\*$')

# Footer pattern like "Page 3 of 7" (compiled once, not once per line)
_footer_pattern = re.compile(r'^Page \d+ of \d+$')

# Sentence that announces the order-details table in certified documents.
_CERTIFIED_NOTICE = "This document contains certified products. Please see table below for details."


def _clean_page_body(page_text):
    """Return one page's text without its header/footer marker lines.

    Dropped lines (compared after stripping surrounding whitespace):
      * a line matching ``header_pattern`` (this also switches on header mode),
      * the first line starting with ``'Order '`` seen while in header mode
        (this switches header mode off again),
      * any line of the form ``Page <n> of <m>``.

    Every other line is kept unchanged; the joined result is stripped.
    """
    in_header = False
    kept_lines = []

    for raw_line in page_text.split('\n'):
        stripped = raw_line.strip()

        if header_pattern.match(stripped):
            in_header = True
            continue
        if in_header and stripped.startswith('Order '):
            in_header = False
            continue
        if _footer_pattern.match(stripped):
            continue

        # NOTE(review): lines that sit between the header marker and the
        # 'Order ' line are still kept here; only the two marker lines are
        # dropped. If the whole header block is meant to be removed, that
        # needs an explicit skip while in_header is set - confirm against
        # the sample PDFs before changing it.
        kept_lines.append(raw_line)

    return '\n'.join(kept_lines).strip()


def _collect_order_detail_lines(body_text):
    """Return the non-blank lines of ``body_text`` that belong to the order details.

    Capturing starts after the certified-products notice line (the notice
    itself is skipped) or on a line containing "shipment FOB date" (that line
    is kept). It stops at a line containing "Applicable Certifications". When
    the text has no "Applicable Certifications" at all, capturing stops after
    the last line containing "Total USD" instead.
    """
    lines = body_text.split("\n")

    # Pre-calc last "Total USD" if "Applicable Certifications" not found
    last_total_usd_index = None
    if "Applicable Certifications" not in body_text:
        for i, line in enumerate(lines):
            if "Total USD" in line:
                last_total_usd_index = i

    capture = False
    detail_lines = []
    for i, line in enumerate(lines):
        if _CERTIFIED_NOTICE in line:
            capture = True
            continue

        if not capture and "shipment FOB date" in line:
            capture = True

        if "Applicable Certifications" in line:
            capture = False
            continue

        # Stop capturing once we are past the last "Total USD" line
        if last_total_usd_index is not None and i > last_total_usd_index:
            capture = False

        if capture and line.strip():
            detail_lines.append(line)

    return detail_lines


# Sorted so the processing order - and therefore which PDF's results are left
# in the notebook namespace for the following cells - is reproducible;
# os.listdir() returns entries in arbitrary order.
for filename in sorted(os.listdir(pdf_dir)):
    if not filename.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(pdf_dir, filename)

    all_text = ""               # Raw extracted text from all pages
    all_pages_body = []         # Store processed body text per page
    preserved_layout_text = ""  # Text with preserved layout (spaces, etc.)

    # === PAGE LOOP ===
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Append raw text to all_text (full PDF content, unprocessed)
            page_text = page.extract_text() or ""
            all_text += page_text + "\n"

            preserved_text = page.extract_text(
                layout=True,            # Preserves layout including spaces
                keep_blank_chars=True,  # Keeps blank characters
                x_tolerance=1,          # Smaller tolerance preserves spacing better
                y_tolerance=1
            ) or ""
            preserved_layout_text += preserved_text + "\n"

            # Clean body text for this page
            all_pages_body.append(_clean_page_body(page_text))

    # === Combine cleaned page texts ===
    body_text = "\n".join(all_pages_body)

    # === Camelot table extraction (run ONCE per file) ===
    # Runs after the pdfplumber handle is closed; Camelot opens the file itself.
    tables = camelot.read_pdf(pdf_path, pages='all', flavor='stream')
    camelot_tables = [table.df for table in tables]

    # === Extract order details ===
    order_details_lines = _collect_order_detail_lines(body_text)
    order_details_text = "\n".join(order_details_lines)

    print(f"All Text:\n{all_text}")  # Raw text
    print(f"Body Text:\n{body_text}")  # Cleaned text without headers/footers
    print(f"Order Details:\n{order_details_text}")  # Extracted order details texts
    print(f"Preserved Layout Text:\n{preserved_layout_text}")

## Almost fixed: map quantities to sizes using column midpoint ranges

In [None]:
import re

# Collected per-block results: one {size: quantity} dict per Size/Qty pair.
all_size_qty_mappings = []

# A line starting at "Size", then a newline, optional whitespace (indentation
# and/or blank lines), then a line starting at "Qty". Each group captures from
# its label to the end of that line.
pattern = r"(Size[^\n]+)\n\s*(Qty[^\n]+)"

# Layout-preserving text left in the namespace by the PDF extraction cell.
text = preserved_layout_text

# finditer() yields matches lazily, so this iterator can be consumed only once.
matches_iter = re.compile(pattern).finditer(text)

def tokens_with_cols(line):
    """Return ``(token, start_column)`` for every whitespace-separated token.

    The column is the 0-based index of the token's first character within
    ``line``. An empty or all-whitespace line yields an empty list.
    """
    pairs = []
    for token_match in re.finditer(r"\S+", line):
        pairs.append((token_match.group(), token_match.start()))
    return pairs

# For every Size/Qty line pair: give each size a column range bounded by the
# midpoints to its neighbours, then add each numeric quantity to the size
# whose range contains the quantity's start column.
for block_idx, m in enumerate(matches_iter, 1):
    size_line = m.group(1)
    qty_line  = m.group(2)

    # Both captures begin at their label ("Size" / "Qty"), so token columns
    # are measured from the label rather than from the start of the text line
    # (the regex's \s* swallows the Qty line's indentation). If the two labels
    # are indented differently, the rows would be compared in different
    # coordinate systems. Shift the qty columns by the indentation difference
    # so both rows share the size line's origin. The shift is 0 when the
    # labels are aligned, which leaves all numbers exactly as before.
    size_indent = m.start(1) - (text.rfind("\n", 0, m.start(1)) + 1)
    qty_indent  = m.start(2) - (text.rfind("\n", 0, m.start(2)) + 1)
    qty_shift   = qty_indent - size_indent

    # --- Tokens with positions (row labels themselves are dropped) ---
    size_cols = [(tok, col) for tok, col in tokens_with_cols(size_line) if tok.lower() != "size"]
    qty_cols  = [(tok, col + qty_shift) for tok, col in tokens_with_cols(qty_line) if tok.lower() != "qty"]

    # --- Build midpoint ranges for each size ---
    last_size_idx = len(size_cols) - 1
    size_ranges = []
    for i, (s_tok, s_col) in enumerate(size_cols):
        # First size extends to "left infinity", last size to "right infinity".
        s_start = -9999 if i == 0 else (size_cols[i - 1][1] + s_col) // 2
        s_end = 9999 if i == last_size_idx else (s_col + size_cols[i + 1][1]) // 2
        size_ranges.append((s_tok, s_start, s_end))

    mapping = {s: 0 for s, _, _ in size_ranges}

    # --- DEBUG: raw lines ---
    print(f"\n=== Block {block_idx} ===")
    print(f"size_line: {size_line!r}")
    print(f"qty_line : {qty_line!r}")
    print(f"size_cols: {size_cols}")
    print(f"qty_cols : {qty_cols}")
    print(f"size_ranges (token, start, end): {size_ranges}")

    # --- Assign quantities into ranges ---
    for tok, q_col in qty_cols:
        # '.' and ',' are both treated as thousands separators ("1.320" -> 1320).
        norm = tok.replace('.', '').replace(',', '')
        if not norm.isdigit():
            continue  # non-numeric tokens on the Qty line are ignored
        q_val = int(norm)
        # find which size range this qty belongs to
        for s_tok, s_start, s_end in size_ranges:
            if s_start <= q_col < s_end:
                mapping[s_tok] += q_val
                print(f"  -> Qty {q_val} (col={q_col}) assigned to size {s_tok} (range {s_start}-{s_end})")
                break

    print(f"Final mapping: {mapping}")
    all_size_qty_mappings.append(mapping)

# --- Summary ---
print("\n=== Summary ===")
for summary_idx, block_mapping in enumerate(all_size_qty_mappings, 1):
    print(f"Block {summary_idx}: {block_mapping}")


## Equal size and quantity counts handled (direct one-to-one mapping, midpoint ranges as fallback)

In [79]:
import re

# Collected per-block results: one {size: quantity} dict per Size/Qty pair.
all_size_qty_mappings = []

# A line starting at "Size", then a newline, optional whitespace (indentation
# and/or blank lines), then a line starting at "Qty". Each group captures from
# its label to the end of that line.
pattern = r"(Size[^\n]+)\n\s*(Qty[^\n]+)"

# Layout-preserving text left in the namespace by the PDF extraction cell.
text = preserved_layout_text

# finditer() yields matches lazily, so this iterator can be consumed only once.
matches_iter = re.compile(pattern).finditer(text)

def tokens_with_cols(line):
    """Tokenise ``line`` on whitespace, keeping each token's start column.

    Returns a list of ``(token, column)`` tuples in left-to-right order, where
    ``column`` is the 0-based offset of the token's first character in
    ``line``. A line with no non-whitespace characters gives ``[]``.
    """
    found = []
    for hit in re.finditer(r"\S+", line):
        found.append((hit.group(), hit.start()))
    return found

# For every Size/Qty line pair: when there are exactly as many quantity tokens
# as size tokens, pair them up left to right; otherwise fall back to assigning
# each quantity to the size whose midpoint-bounded column range contains it.
for block_idx, m in enumerate(matches_iter, 1):
    size_line = m.group(1)
    qty_line  = m.group(2)

    # Both captures begin at their label ("Size" / "Qty"), so token columns
    # are measured from the label rather than from the start of the text line
    # (the regex's \s* swallows the Qty line's indentation). If the two labels
    # are indented differently, the positional fallback would compare columns
    # from different coordinate systems. Shift the qty columns by the
    # indentation difference so both rows share the size line's origin. The
    # shift is 0 when the labels are aligned, which leaves all numbers exactly
    # as before.
    size_indent = m.start(1) - (text.rfind("\n", 0, m.start(1)) + 1)
    qty_indent  = m.start(2) - (text.rfind("\n", 0, m.start(2)) + 1)
    qty_shift   = qty_indent - size_indent

    # --- Tokens with positions (row labels themselves are dropped) ---
    size_cols = [(tok, col) for tok, col in tokens_with_cols(size_line) if tok.lower() != "size"]
    qty_cols  = [(tok, col + qty_shift) for tok, col in tokens_with_cols(qty_line) if tok.lower() != "qty"]

    mapping = {s: 0 for s, _ in size_cols}

    # --- DEBUG: raw lines ---
    print(f"\n=== Block {block_idx} ===")
    print(f"size_line: {size_line!r}")
    print(f"qty_line : {qty_line!r}")
    print(f"size_cols: {size_cols}")
    print(f"qty_cols : {qty_cols}")

    # --- First try direct one-to-one mapping if counts match ---
    size_tokens = [s for s, _ in size_cols]
    qty_tokens  = [q for q, _ in qty_cols]

    if len(size_tokens) == len(qty_tokens):
        for s_tok, q_tok in zip(size_tokens, qty_tokens):
            # '.' and ',' are both treated as thousands separators ("1.320" -> 1320);
            # a token that is not purely digits after that counts as 0.
            norm = q_tok.replace('.', '').replace(',', '')
            mapping[s_tok] = int(norm) if norm.isdigit() else 0
            print(f"  -> Direct mapping: {s_tok} = {mapping[s_tok]}")
    else:
        # --- Build midpoint ranges for each size ---
        last_size_idx = len(size_cols) - 1
        size_ranges = []
        for i, (s_tok, s_col) in enumerate(size_cols):
            # First size extends to "left infinity", last size to "right infinity".
            s_start = -9999 if i == 0 else (size_cols[i - 1][1] + s_col) // 2
            s_end = 9999 if i == last_size_idx else (s_col + size_cols[i + 1][1]) // 2
            size_ranges.append((s_tok, s_start, s_end))

        print(f"size_ranges (token, start, end): {size_ranges}")

        # --- Assign quantities into ranges ---
        for tok, q_col in qty_cols:
            norm = tok.replace('.', '').replace(',', '')
            if not norm.isdigit():
                continue  # non-numeric tokens on the Qty line are ignored
            q_val = int(norm)
            # find which size range this qty belongs to
            for s_tok, s_start, s_end in size_ranges:
                if s_start <= q_col < s_end:
                    mapping[s_tok] += q_val
                    print(f"  -> Qty {q_val} (col={q_col}) assigned to size {s_tok} (range {s_start}-{s_end})")
                    break

    print(f"Final mapping: {mapping}")
    all_size_qty_mappings.append(mapping)

# --- Summary ---
print("\n=== Summary ===")
for summary_idx, block_mapping in enumerate(all_size_qty_mappings, 1):
    print(f"Block {summary_idx}: {block_mapping}")



=== Block 1 ===
size_line: 'Size    XS   S     M    L    XL   XXL   3XL  4XL  5XL                     '
qty_line : 'Qty    400  1.320 1.220 840  1.280 240  50   250   20                     '
size_cols: [('XS', 8), ('S', 13), ('M', 19), ('L', 24), ('XL', 29), ('XXL', 34), ('3XL', 40), ('4XL', 45), ('5XL', 50)]
qty_cols : [('400', 7), ('1.320', 12), ('1.220', 18), ('840', 24), ('1.280', 29), ('240', 35), ('50', 40), ('250', 45), ('20', 51)]
  -> Direct mapping: XS = 400
  -> Direct mapping: S = 1320
  -> Direct mapping: M = 1220
  -> Direct mapping: L = 840
  -> Direct mapping: XL = 1280
  -> Direct mapping: XXL = 240
  -> Direct mapping: 3XL = 50
  -> Direct mapping: 4XL = 250
  -> Direct mapping: 5XL = 20
Final mapping: {'XS': 400, 'S': 1320, 'M': 1220, 'L': 840, 'XL': 1280, 'XXL': 240, '3XL': 50, '4XL': 250, '5XL': 20}

=== Block 2 ===
size_line: 'Size    XS   S     M    L    XL   XXL   3XL  4XL                          '
qty_line : 'Qty          250  850   90   430   90   20   10  

EXCEPTIONS