# In [None]:
import re
from typing import Dict, List, Optional, Tuple

import pandas as pd
import pdfplumber

# -----------------------------
# Helpers to extract and parse
# -----------------------------

def read_pdf_text(pdf_path: str) -> str:
    """Return the text of every page in the PDF at *pdf_path*.

    Pages are read in order with ``pdfplumber``; each page's text is
    followed by a newline. A page whose ``extract_text()`` result is falsy
    contributes only that newline.
    """
    with pdfplumber.open(pdf_path) as document:
        page_chunks = [
            (page.extract_text() or "") + "\n" for page in document.pages
        ]
    return "".join(page_chunks)


def extract_po_number(full_text: str) -> Optional[str]:
    """Return the digits following the first ``PO # :`` label, or ``None``.

    The label is matched case-insensitively; whitespace around ``#`` and
    ``:`` is optional, but the colon itself is required.
    """
    po_match = re.compile(r"PO\s*#\s*:\s*(\d+)", re.IGNORECASE).search(full_text)
    if po_match is None:
        return None
    return po_match.group(1)


def split_items(full_text: str) -> List[str]:
    """Split the document text into one text block per numbered item.

    Every line is stripped first. A line beginning with digits followed by
    a dot opens a new block (closing any block still open). Subsequent lines
    are added to the open block until a line containing ``FOR SHIMAYRA`` is
    seen; that line is kept as the block's last line and later lines are
    ignored until the next numbered line. Lines before the first numbered
    line are ignored. Empty blocks are dropped from the result.
    """
    blocks: List[str] = []
    pending: List[str] = []
    collecting = False

    for raw_line in full_text.splitlines():
        text = raw_line.strip()

        if re.match(r"^\d+\.", text):
            # A numbered line always starts a fresh block.
            if pending:
                blocks.append("\n".join(pending).strip())
            pending = [text]
            collecting = True
        elif collecting:
            pending.append(text)
            if any(marker in text for marker in ("**FOR SHIMAYRA**", "FOR SHIMAYRA")):
                # The marker line closes the current item.
                blocks.append("\n".join(pending).strip())
                pending = []
                collecting = False

    if pending:
        blocks.append("\n".join(pending).strip())
    return list(filter(None, blocks))


def parse_first_line_tokens(line: str) -> List[str]:
    """Tokenise *line* into code-like tokens.

    A token is a run of letters, digits, ``/`` or ``*``, optionally followed
    by hyphen-joined alphanumeric segments (e.g. ``Nov/12/2025``,
    ``ABC-123``). Other characters, including ``.``, separate tokens.
    """
    token_pattern = re.compile(r"[A-Za-z0-9/*]+(?:-[A-Za-z0-9]+)*")
    return [found.group(0) for found in token_pattern.finditer(line)]


def find_item_size_and_qty(line: str) -> Tuple[Optional[str], Optional[str]]:
    """Pick the item size and order quantity out of the numbers in *line*.

    All integer and decimal numbers in the line are collected in order.
    The size is the first decimal number (one containing a ``.``); if the
    line has no decimal number, the first number of any kind is used
    instead. The quantity is the number immediately after the size.

    Returns:
        ``(size, qty)`` as strings exactly as they appear in the line.
        ``(None, None)`` when the line contains no numbers; ``qty`` is
        ``None`` when nothing follows the size.
    """
    numbers = re.findall(r"\d+\.\d+|\d+", line)
    if not numbers:
        return None, None

    # Tokens are either "digits.digits" or plain digits, so a dot marks a decimal.
    size_idx = next((i for i, tok in enumerate(numbers) if "." in tok), 0)

    size_val = numbers[size_idx]
    qty_val = numbers[size_idx + 1] if size_idx + 1 < len(numbers) else None
    return size_val, qty_val


def find_item_ref_no(line: str) -> Optional[str]:
    """Return the first ``<5+ digits>/<digits>`` reference in *line*, else ``None``."""
    ref_match = re.search(r"(\d{5,}/\d+)", line)
    if ref_match is None:
        return None
    return ref_match.group(1)


def find_style_code(line: str) -> Optional[str]:
    """Extract the StyleCode token from a line.

    Primary rule: find a date-like token such as ``Nov/12/2025`` and return
    the token right after it, provided that token mixes letters and digits
    (e.g. ``BR0000094K``) and is not itself a date.

    Fallback: among tokens that mix letters and digits and are at least six
    characters long, return the second-to-last one (or the only one).
    Date-like tokens are excluded here as well, so a date can never be
    reported as the style code.

    Returns ``None`` when no suitable token exists.
    """
    date_like = re.compile(r"^[A-Za-z]{3}/\d{1,2}/\d{4}$")

    def looks_like_code(tok: str) -> bool:
        # A code mixes letters and digits; dates do too, so rule them out explicitly.
        return (
            any(c.isalpha() for c in tok)
            and any(c.isdigit() for c in tok)
            and not date_like.match(tok)
        )

    tokens = parse_first_line_tokens(line)

    # Primary rule: the code directly follows the date column.
    for tok, following in zip(tokens, tokens[1:]):
        if date_like.match(tok) and looks_like_code(following):
            return following

    # Fallback: previous heuristic, restricted to non-date tokens.
    candidate_codes = [t for t in tokens if len(t) >= 6 and looks_like_code(t)]
    if len(candidate_codes) >= 2:
        return candidate_codes[-2]
    return candidate_codes[-1] if candidate_codes else None


def find_sku(line: str) -> Optional[str]:
    """Return the SKU found in *line*, or ``None``.

    First choice is a word made of two or more uppercase letters, optional
    uppercase letters/digits, a hyphen and digits (e.g. ``ABC-123``).
    Failing that, the first token from ``parse_first_line_tokens`` that
    contains a hyphen, a letter and a digit is returned.
    """
    strict = re.search(r"\b([A-Z]{2,}[A-Z0-9]*-\d{1,})\b", line)
    if strict is not None:
        return strict.group(1)

    return next(
        (
            tok
            for tok in parse_first_line_tokens(line)
            if '-' in tok
            and any(ch.isdigit() for ch in tok)
            and any(ch.isalpha() for ch in tok)
        ),
        None,
    )


def find_metal_and_tone(block_text: str) -> Tuple[Optional[str], Optional[str]]:
    """Detect the metal code and tone mentioned in an item block.

    * ``SILV`` (any case) maps to metal ``"AG925"``.
    * Otherwise ``G14...``, ``14K`` or ``GOLD`` (any case) maps to ``"G14"``.

    The tone is the 1-3 uppercase-letter word that directly follows the
    ``SILV`` / ``G14...`` token; this lookup is case-sensitive, and for gold
    it is only attempted after a ``G14...`` token.

    Returns:
        ``(metal_code, tone)``; either element is ``None`` when not found.
    """
    if re.search(r"\bSILV\b", block_text, flags=re.IGNORECASE):
        tone_match = re.search(r"\bSILV\b\s+([A-Z]{1,3})\b", block_text)
        return "AG925", (tone_match.group(1) if tone_match else None)

    if re.search(r"\bG14\w*\b|\b14K\b|\bGOLD\b", block_text, flags=re.IGNORECASE):
        tone_match = re.search(r"\bG14\w*\b\s+([A-Z]{1,3})\b", block_text)
        return "G14", (tone_match.group(1) if tone_match else None)

    return None, None


def find_customer_instruction_from_line(line: str) -> Optional[str]:
    """Return the free-text description that sits between the SKU and the metal.

    The line is split on whitespace. The first token shaped like a SKU
    (two or more uppercase letters, optional uppercase letters/digits, a
    hyphen, digits) marks the start; the words after it are collected until
    a metal token is reached. Metal tokens are ``SILV``, ``14K``, ``GOLD``
    and any ``G14...`` variant (case-insensitive), matching what
    ``find_metal_and_tone`` recognises, so codes such as ``G14Y`` end the
    description just like ``G14W`` does.

    Returns ``None`` when there is no SKU token or nothing follows it.
    """
    toks = line.split()
    sku_idx = next(
        (i for i, t in enumerate(toks) if re.match(r"[A-Z]{2,}[A-Z0-9]*-\d{1,}$", t)),
        None,
    )
    if sku_idx is None:
        return None

    metal_token = re.compile(r"SILV|G14\w*|14K|GOLD", re.IGNORECASE)
    desc_terms: List[str] = []
    for t in toks[sku_idx + 1:]:
        if metal_token.fullmatch(t):
            break
        desc_terms.append(t)
    return " ".join(desc_terms) if desc_terms else None


def extract_design_instructions(block_text: str) -> Optional[str]:
    """Join every ``**...**`` phrase in the block into one space-separated string.

    Each phrase is stripped of surrounding whitespace. Returns ``None`` when
    the block contains no such phrase.
    """
    starred = [
        hit.group(1).strip()
        for hit in re.finditer(r"\*\*([^*]+)\*\*", block_text)
    ]
    if not starred:
        return None
    return " ".join(starred)


def extract_stamp_instruction(block_text: str) -> Optional[str]:
    """Return the text after ``STAMP`` on a ``Special Inst.`` line.

    The match is case-insensitive and the captured text runs up to the next
    comma or end of line, with surrounding whitespace removed. Returns
    ``None`` when no ``Special Inst. ... STAMP ...`` sequence is present.
    """
    stamp_pattern = re.compile(
        r"Special\s+Inst\.[^\n]*?STAMP\s+([^,\n]+)", re.IGNORECASE
    )
    found = stamp_pattern.search(block_text)
    return None if found is None else found.group(1).strip()


def extract_special_remarks(block_text: str) -> Optional[str]:
    """Return the remark text of the first ``Special Inst.`` line in the block.

    The label is recognised case-insensitively at the start of a line
    (ignoring leading whitespace) and is removed by length rather than by a
    case-sensitive split, so ``SPECIAL INST. foo`` yields ``foo`` just like
    ``Special Inst. foo`` does.

    Returns ``None`` when no line starts with the label.
    """
    prefix = "SPECIAL INST."
    for ln in block_text.splitlines():
        stripped = ln.strip()
        # Compare only the leading slice so the cut below is always aligned.
        if stripped[:len(prefix)].upper() == prefix:
            return stripped[len(prefix):].strip()
    return None


def parse_items(full_text: str) -> List[Dict[str, str]]:
    """Turn the full PDF text into one record (dict) per order item.

    The PO number is read once from the whole text and copied to every
    record. The text is split into item blocks with ``split_items``; for
    each block the header line (the first line starting with
    ``<digits>.<whitespace>``, else the block's first line) supplies the
    serial number, size, quantity, reference number, style code, SKU and
    customer instruction, while the whole block supplies metal/tone and the
    design, stamp and special-remark texts.

    Post-processing:
      * A size is rendered as ``"<n> INCH"``, dropping a zero fractional
        part (``7.00`` -> ``7 INCH``).
      * If the style code ends in ``-`` plus uppercase letters, that suffix
        is removed and the tone becomes the suffix's first letter with
        ``V`` mapped to ``W``, replacing any tone found in the block.

    Fields with no value are emitted as empty strings; ``OrderItemPcs`` is
    always ``1``.
    """
    po_number = extract_po_number(full_text) or ""
    rows: List[Dict[str, str]] = []

    for block in split_items(full_text):
        block_lines = block.splitlines()

        # Header line: prefer a "N. ..." line, otherwise the first line of the block.
        header = block_lines[0] if block_lines else ""
        for candidate in block_lines:
            if re.match(r"^\d+\.\s", candidate):
                header = candidate
                break

        serial_match = re.match(r"^(\d+)\.", header.strip())
        serial_no = serial_match.group(1) if serial_match else ""

        size_text, quantity = find_item_size_and_qty(header)
        reference_no = find_item_ref_no(header) or ""
        style = find_style_code(header) or ""
        sku = find_sku(header) or ""
        metal_code, tone_code = find_metal_and_tone(block)
        customer_note = find_customer_instruction_from_line(header) or ""
        design_note = extract_design_instructions(block) or ""
        stamp_note = extract_stamp_instruction(block) or ""
        remarks = extract_special_remarks(block) or ""

        # Normalise the size: whole numbers lose their fraction, then " INCH" is appended.
        if size_text:
            try:
                numeric_size = float(size_text)
            except ValueError:
                size_text = f"{size_text} INCH"
            else:
                if numeric_size.is_integer():
                    size_text = f"{int(numeric_size)} INCH"
                else:
                    size_text = f"{numeric_size} INCH"

        # A trailing "-LETTERS" suffix on the style code carries the tone.
        suffix = re.search(r'-([A-Z]+)$', style) if style else None
        if suffix:
            tone_code = suffix.group(1)[0].replace('V', 'W')
            style = style[:suffix.start()]

        rows.append({
            "SrNo": serial_no,
            "StyleCode": style,
            "ItemSize": size_text or "",
            "OrderQty": quantity or "",
            "OrderItemPcs": 1,
            "Metal": metal_code or "",
            "Tone": tone_code or "",
            "ItemPoNo": po_number,
            "ItemRefNo": reference_no,
            "StockType": "",
            "MakeType": "",
            "CustomerProductionInstruction": customer_note,
            "SpecialRemarks": remarks,
            "DesignProductionInstruction": design_note,
            "StampInstruction": stamp_note,
            "OrderGroup": "",
            "Certificate": "",
            "SKUNo": sku,
            "Basestoneminwt": "",
            "Basestonemaxwt": "",
            "Basemetalminwt": "",
            "Basemetalmaxwt": "",
            "Productiondeliverydate": "",
            "Expecteddeliverydate": "",
            "BlankColumn": "",
            "SetPrice": "",
            "StoneQuality": "",
        })

    return rows


def build_dataframe(records: List[Dict[str, str]]) -> pd.DataFrame:
    """Build a DataFrame from item records using a fixed column order.

    Only the columns listed below appear in the result, in this order.
    """
    column_order = [
        # Item identity and quantities
        "SrNo", "StyleCode", "ItemSize", "OrderQty", "OrderItemPcs",
        # Material and references
        "Metal", "Tone", "ItemPoNo", "ItemRefNo", "StockType", "MakeType",
        # Instruction texts
        "CustomerProductionInstruction", "SpecialRemarks",
        "DesignProductionInstruction", "StampInstruction",
        # Remaining columns
        "OrderGroup", "Certificate", "SKUNo",
        "Basestoneminwt", "Basestonemaxwt",
        "Basemetalminwt", "Basemetalmaxwt",
        "Productiondeliverydate", "Expecteddeliverydate",
        "BlankColumn", "SetPrice", "StoneQuality",
    ]
    frame = pd.DataFrame(records, columns=column_order)
    return frame


# -----------------------------
# Execute for the Ambition PO
# -----------------------------

# Source PDF for this run (absolute, machine-specific path) and the workbook to write.
pdf_file_path = r'C:\Users\Pratik Mali\Desktop\tools\data\Ambition Jewels Pvt. Ltd (AJP)\NL PO# 101533 AMBITION (SHIMAYRA)-EDITING.pdf'
output_excel = 'structured_items_1.xlsx'

# Step 1: pull the raw text out of the PDF.
print(f"Reading PDF from: {pdf_file_path}")
full_text = read_pdf_text(pdf_file_path)

# Step 2: parse the text into item records and arrange them in column order.
records = parse_items(full_text)
df_items = build_dataframe(records)

# Step 3: show the table, then save it (relative path, no index column).
print("\nParsed Items DataFrame:", df_items, sep="\n")

df_items.to_excel(output_excel, index=False)
print(f"\nData successfully saved to '{output_excel}'")
