In [1]:
# Parser to structure items and build requested DataFrame
import re
import pandas as pd
from IPython.display import display

# Source document: the purchase-order PDF is re-read from disk here so this cell
# can be run on its own (reproducibility) instead of relying on an earlier cell.
PDF_PATH = r'C:\Users\Pratik Mali\Desktop\tools\OrderProcessingTool\OBU\Purchase order.pdf'

try:
    # Imported inside the try so a missing pdfplumber install is reported through
    # the same RuntimeError as an unreadable file.
    import pdfplumber
    with pdfplumber.open(PDF_PATH) as pdf:
        # extract_text() may return None for a page; treat that as an empty page.
        text = "\n".join(page.extract_text() or '' for page in pdf.pages)
except Exception as e:
    # Chain explicitly so the original failure is recorded as the direct cause.
    raise RuntimeError(f'Could not read PDF at {PDF_PATH}: {e}') from e

# --- Compiled patterns shared by the parsing steps below ---

# A code token: two or more characters drawn from A-Z, 0-9 and '-', which must
# start and end on a letter or digit.
code_token_re = re.compile(r'[A-Z0-9][A-Z0-9\-]*[A-Z0-9]')

# Leading "<digits>-<2 letters><3 digits>" part of a SKU, captured as group "sku".
sku_first_code_re = re.compile(r'^(?P<sku>\d+-[A-Z]{2}\d{3})')

# Trailing "-<1 to 3 capital letters>" suffix on a style code, e.g. -YG, -WG, -RG.
style_tone_re = re.compile(r'-([A-Z]{1,3})$')

# "PO# : <digits>" label; the digits are captured as group 1.
po_re = re.compile(r'PO#\s*:\s*(\d+)')

# A line that begins with "Article code" (any letter case).
article_header_re = re.compile(r'^Article code', flags=re.IGNORECASE)

# A line made of exactly two whitespace-separated integers.
quantity_line_re = re.compile(r'^(\d+)\s+(\d+)$')

# Purchase-order number: first "PO# : <digits>" occurrence in the PDF text.
# Falls back to an empty string when the label is not present.
po_match = po_re.search(text)
if po_match:
    item_po_no = po_match.group(1)
else:
    item_po_no = ''

# Group the non-blank, stripped lines of the PDF text into item blocks.
# Every line matching the "Article code" header opens a new block; lines seen
# before the first header are discarded.
lines = []
for raw_line in text.split('\n'):
    stripped_line = raw_line.strip()
    if stripped_line:
        lines.append(stripped_line)

blocks = []
current = []
for ln in lines:
    starts_new_item = article_header_re.match(ln) is not None
    if starts_new_item and current:
        # Close the block collected so far before starting the next one.
        blocks.append(current)
        current = []
    if starts_new_item or current:
        current.append(ln)

# Flush the final block, if any.
if current:
    blocks.append(current)

# Patterns used only by the item loop; compiled once instead of per iteration.
sku_tone_size_re = re.compile(r'([YWR]G)-(\d+(?:-\d+)*)')
purchase_total_re = re.compile(r'Purchase order Total', flags=re.IGNORECASE)
stamp_split_re = re.compile(r'\b(stamp\b.*)', flags=re.IGNORECASE)
trailing_and_re = re.compile(r'\s*\band\b\s*$', flags=re.IGNORECASE)
# No trailing \b here: '+' is a non-word character, so "\+\b" would only match
# when "VVS+" is immediately followed by a letter, digit or underscore and would
# miss "VVS+" followed by a space, punctuation or the end of the text.
stone_quality_re = re.compile(r'\bVVS\+')

# Turn every item block into one row dict for the output sheet.
items = []
last_block_index = len(blocks) - 1
for b_index, block in enumerate(blocks):
    is_last_block = b_index == last_block_index
    btxt = "\n".join(block)

    # --- Extract codes section ---
    # Scan up to three lines after the header line and stop as soon as at least
    # two code tokens have been collected.
    codes = []
    for ln in block[1:4]:
        codes.extend(code_token_re.findall(ln))
        if len(codes) >= 2:
            break

    # First token is the full SKU. With exactly three tokens the layout is
    # (SKU, item reference, style code); with exactly two it is (SKU, style code).
    # Any other count leaves the reference and style code empty.
    sku_full = codes[0] if codes else ''
    if len(codes) == 3:
        item_ref_no, style_code = codes[1], codes[2]
    elif len(codes) == 2:
        item_ref_no, style_code = '', codes[1]
    else:
        item_ref_no, style_code = '', ''

    # --- Extract ItemSize from SKU ---
    # Look for a tone marker (YG/WG/RG) followed by '-'-separated numbers and
    # take the second number, prefixed with "EU".
    item_size = ''
    tone_match = sku_tone_size_re.search(sku_full)
    if tone_match:
        size_parts = tone_match.group(2).split('-')
        if len(size_parts) >= 2:
            item_size = f"EU{size_parts[1]}"

    # --- Extract Tone and clean StyleCode ---
    tone = ''
    if style_code:
        mt = style_tone_re.search(style_code)
        if mt:
            tone = mt.group(1)[0]  # only the first letter, e.g. 'YG' -> 'Y'
            style_code = style_code[:mt.start()]  # drop the '-YG' style suffix

    # --- Extract quantity info ---
    # The first "<int> <int>" line within five lines of the line starting with
    # "description" supplies the serial number and the ordered quantity.
    sr_no = ''
    order_qty = ''
    desc_idx = next(
        (i for i, ln in enumerate(block) if ln.lower().startswith('description')),
        None,
    )
    if desc_idx is not None:
        for ln in block[desc_idx:desc_idx + 5]:
            qm = quantity_line_re.match(ln)
            if qm:
                sr_no, order_qty = qm.groups()
                break

    # --- Extract description text ---
    # Keep every line that is not a header, a quantity line, a "description"
    # line, or a line consisting solely of one code token (spaces ignored).
    desc_lines = [
        ln for ln in block
        if not (
            article_header_re.match(ln)
            or quantity_line_re.match(ln)
            or ln.lower().startswith('description')
            or code_token_re.fullmatch(ln.replace(' ', ''))
        )
    ]
    full_desc = ' '.join(desc_lines)

    # Trim trailing purchase total in last item
    if is_last_block:
        total_match = purchase_total_re.search(full_desc)
        if total_match:
            full_desc = full_desc[:total_match.start()].strip()

    # Split Customer and Stamp instructions at the first standalone word "stamp".
    split_match = stamp_split_re.search(full_desc)
    if split_match:
        customer_instr = full_desc[:split_match.start()].strip()
        stamp_instr = full_desc[split_match.start():].strip()
    else:
        customer_instr = full_desc
        stamp_instr = ''

    # Drop a dangling "and" left at the end of the customer part by the split.
    customer_instr = trailing_and_re.sub('', customer_instr).strip()

    # --- Certificate detection ---
    # Flagged when the last number found in the SKU is exactly '100'.
    sku_numbers = re.findall(r'\d+', sku_full)
    certificate = 'IGI Certified' if sku_numbers and sku_numbers[-1] == '100' else ''

    # --- SKU Number extraction ---
    # Prefer the "<digits>-<2 letters><3 digits>" prefix; otherwise fall back to
    # the first two '-'-separated parts of the SKU.
    sku_no = ''
    if sku_full:
        m = sku_first_code_re.match(sku_full)
        if m:
            sku_no = m.group('sku')
        else:
            parts = sku_full.split('-')
            if len(parts) >= 2:
                sku_no = parts[0] + '-' + parts[1]

    # --- Append structured data ---
    items.append({
        'SrNo': sr_no,
        'StyleCode': style_code,
        'ItemSize': item_size,
        'OrderQty': order_qty,
        # NOTE(review): fixed at 1 regardless of the parsed quantity - confirm
        # this is intended.
        'OrderItemPcs': 1,
        'Metal': '',
        'Tone': tone,
        'ItemPoNo': item_po_no,
        'ItemRefNo': item_ref_no,
        'StockType': '',
        'MakeType': '',
        'CustomerProductionInstruction': customer_instr,
        'SpecialRemarks': '',
        'DesignProductionInstruction': '',
        'StampInstruction': stamp_instr,
        'OrderGroup': '',
        'Certificate': certificate,
        'SKUNo': sku_no,
        'Basestoneminwt': '',
        'Basestonemaxwt': '',
        'Basemetalminwt': '',
        'Basemetalmaxwt': '',
        'Productiondeliverydate': '',
        'Expecteddeliverydate': '',
        '': '',
        'SetPrice': '',
        'StoneQuality': 'VVS+' if stone_quality_re.search(btxt) else ''
    })

# --- Assemble the rows into a DataFrame and export it ---
# Column order of the output sheet; the empty-named entry is an intentionally
# unnamed column placed between 'Expecteddeliverydate' and 'SetPrice'.
columns_order = [
    'SrNo',
    'StyleCode',
    'ItemSize',
    'OrderQty',
    'OrderItemPcs',
    'Metal',
    'Tone',
    'ItemPoNo',
    'ItemRefNo',
    'StockType',
    'MakeType',
    'CustomerProductionInstruction',
    'SpecialRemarks',
    'DesignProductionInstruction',
    'StampInstruction',
    'OrderGroup',
    'Certificate',
    'SKUNo',
    'Basestoneminwt',
    'Basestonemaxwt',
    'Basemetalminwt',
    'Basemetalmaxwt',
    'Productiondeliverydate',
    'Expecteddeliverydate',
    '',
    'SetPrice',
    'StoneQuality',
]
result_df = pd.DataFrame(items, columns=columns_order)

# Write the sheet without the index column, then show the frame in the notebook.
output_file = 'new_obu_2.xlsx'
result_df.to_excel(output_file, index=False)
print('Structured items saved to', output_file)
display(result_df)


Structured items saved to new_obu_2.xlsx


Unnamed: 0,SrNo,StyleCode,ItemSize,OrderQty,OrderItemPcs,Metal,Tone,ItemPoNo,ItemRefNo,StockType,...,SKUNo,Basestoneminwt,Basestonemaxwt,Basemetalminwt,Basemetalmaxwt,Productiondeliverydate,Expecteddeliverydate,Unnamed: 19,SetPrice,StoneQuality
0,1,RG047376,EU58,1,1,,Y,6299,,,...,9-DD028,,,,,,,,,
1,2,RG056063,EU56,1,1,,Y,6299,,,...,9-DD035,,,,,,,,,
