In [None]:
import os
import re
import tempfile

import pandas as pd
import pdfplumber
from PyPDF2 import PdfReader, PdfWriter


def rotate_pdf_left(input_path):
    """
    Rotate every page of a PDF by 90° and save the result to a temporary file.

    Args:
        input_path: Path of the PDF to read. The source file is not modified.

    Returns:
        str: Path of the newly created temporary ``.pdf`` file. The file is
        created with ``delete=False``, so the caller is responsible for
        removing it when it is no longer needed.

    NOTE(review): PyPDF2 documents ``page.rotate(angle)`` as a *clockwise*
    rotation, so ``rotate(90)`` appears to turn pages to the right even though
    this function is named "left". The angle is deliberately left unchanged
    because the downstream text parsing was presumably tuned against this
    output — confirm the intended direction before changing it.
    """
    reader = PdfReader(input_path)
    writer = PdfWriter()

    for page in reader.pages:
        page.rotate(90)
        writer.add_page(page)

    # Write through the temp file's own handle so it is closed when the block
    # exits, instead of leaking it and reopening the same path a second time.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        writer.write(temp_file)

    return temp_file.name


def extract_raw_text_from_pdf(pdf_path):
    """
    Rotate the PDF, extract its text with pdfplumber, and return the raw text.

    The text of every page that yields any text is concatenated, one page per
    newline-terminated chunk, echoed to stdout, and returned stripped of
    leading/trailing whitespace.

    Args:
        pdf_path: Path of the PDF to process.

    Returns:
        str: The extracted text, or ``""`` if rotation or extraction raised
        (the error is printed rather than propagated).
    """
    rotated_pdf = None
    try:
        rotated_pdf = rotate_pdf_left(pdf_path)

        with pdfplumber.open(rotated_pdf) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]

        # Pages with no extractable text (None or "") are skipped.
        full_text = "".join(f"{text}\n" for text in page_texts if text)

        print("\n📜 Extracted Text from Rotated PDF:")
        print("=" * 60)
        print(full_text)
        print("=" * 60)

        return full_text.strip()

    except Exception as e:
        # Deliberately best-effort: callers treat "" as "nothing extracted".
        print(f"Error reading or processing PDF: {e}")
        return ""

    finally:
        # The rotated copy is created with delete=False, so remove it here;
        # otherwise every call leaves a stray file in the temp directory.
        if rotated_pdf is not None:
            try:
                os.remove(rotated_pdf)
            except OSError as cleanup_err:
                print(f"Warning: could not delete temporary file '{rotated_pdf}': {cleanup_err}")


def clean_item_size(size_raw, prefix):
    """
    Normalise a raw item size string and optionally prepend a prefix.

    - Surrounding whitespace is stripped; a blank size yields "".
    - Decimal values lose redundant trailing zeros (e.g., 7.00 -> 7, 7.50 -> 7.5).
    - A single-digit whole number is zero-padded (e.g., 7 -> 07).
    - A truthy prefix is put in front of the result (e.g., US + 07 -> US07).
    """
    trimmed = size_raw.strip()
    if trimmed == "":
        return ""

    # Only plain "digits.digits" values go through the float round-trip;
    # anything else (letters, whole numbers, ...) is passed through untouched.
    is_decimal = re.match(r'^\d+\.\d+$', trimmed) is not None
    if is_decimal:
        normalised = str(float(trimmed)).rstrip('0').rstrip('.')
    else:
        normalised = trimmed

    # Zero-pad lone digits so sizes line up as two characters.
    if len(normalised) == 1 and re.match(r'^\d+$', normalised):
        normalised = f"0{normalised}"

    if prefix:
        return f"{prefix}{normalised}"
    return normalised


def parse_purchase_order_data(full_text):
    """
    Parse purchase-order line items from extracted text (multi-page supported).

    The PO number is read once from the whole text. Each line item is matched
    by a regex that expects, in order: serial number, order code, quantity,
    style code, vendor style, SKU, metal karat (14K/18K, optional "T"), tone
    (Y/W), item size, one further token, and later a
    "Stamping Instructions:" line.

    This function is interactive: once items are found it calls ``input()``
    for a size prefix (once) and for a priority (once per item, defaulting to
    "REG"). It also prints progress/warning messages.

    Args:
        full_text: Raw text extracted from the purchase-order PDF.

    Returns:
        pandas.DataFrame: One row per matched item, or an empty DataFrame when
        the text is empty/None or no item matches the pattern.
    """
    if not full_text or not full_text.strip():
        print("⚠️ Empty text provided.")
        return pd.DataFrame()

    # Extract PO number
    po_match = re.search(r'PO\s*#\s*[:]*\s*(\d+)', full_text)
    item_po_no = po_match.group(1) if po_match else ""

    # Match objects (rather than findall tuples) are kept so each item's
    # position in the text is known for the item-type lookup below.
    item_matches = list(re.finditer(
        r'(\d+)\.\s*'                          # Sr No.
        r'(\d+/\d+)\s+'                        # Order Code
        r'([\d.]+)\s+'                         # Order Qty
        r'(\S+)\s+'                            # Style Code
        r'(\S+)\s+'                            # Vendor Style
        r'(\S+)\s+'                            # SKU No
        r'(18K[T]?|14K[T]?)\s+'                # Metal KT
        r'([YW])\s+'                           # Tone
        r'([\d.]+)\s+'                         # Item Size
        r'([A-Z0-9]+)[\s\S]*?Stamping Instructions:\s*([^\n]+)',  # Next token + stamping
        full_text
    ))

    if not item_matches:
        print("⚠️ No item blocks found.")
        return pd.DataFrame()

    # Ask prefix once for the whole batch
    size_prefix = input("Enter prefix for item size (e.g., 'US'): ").strip()

    data = []
    for i, item_match in enumerate(item_matches, start=1):
        (
            _sr_no, order_code, order_qty, style_code,
            vendor_style, sku_no, metal_kt, tone,
            item_size, _next_token, stamping_instr
        ) = item_match.groups()

        # Ask for priority for each item
        priority = input(f"Enter Priority for item {style_code} (default REG): ").strip().upper() or "REG"

        # Clean & format size
        formatted_size = clean_item_size(item_size, size_prefix)

        # Metal formatting
        metal = f"G{metal_kt.replace('KT', '').replace('K', '')}{tone}"
        tone_full = "Yellow Gold" if tone == "Y" else "White Gold"

        # Determine item type (Bracelet/Earring/Ring) from this item's own
        # slice of text: from its style code up to where the next item starts
        # (i is 1-based, so item_matches[i] is the following item). Searching
        # from the first occurrence of the style code to the end of the
        # document would mislabel repeated style codes and let an item without
        # a keyword inherit a later item's type.
        block_start = item_match.start(4)
        block_end = item_matches[i].start() if i < len(item_matches) else len(full_text)
        desc_match = re.search(
            r'(BRACELET|EARRING|RING)', full_text[block_start:block_end], re.IGNORECASE
        )
        desc = desc_match.group(1).capitalize() if desc_match else "Item"
        desc_full = f"{metal_kt} {tone_full} {desc} 1.00 CTW"

        # Special Remarks (includes size only if valid)
        size_part = f",SZ-{formatted_size}" if formatted_size else ""
        special_remarks = (
            f"BRILLIANT EARTH CRAFT,{order_code}, {style_code},{vendor_style}, "
            f"{sku_no}{size_part}, {metal_kt} {tone_full.upper()},COC CERTIFIED RE-CYCLE GOLD"
        )

        design_prod_instr = "White Rodium" if tone == "W" else "No Rodium"

        data.append({
            "SrNO": i,
            "StyleCode": style_code,
            "ItemSize": formatted_size,
            "OrderQty": order_qty,
            "OrderItemPcs": 1,
            "Metal": metal,
            "Tone": tone,
            "ItemPoNo": item_po_no,
            "ItemRefNo": "",
            "StockType": "",
            "Priority": priority,
            "MakeType": "",
            "CustomerProductionInstruction": desc_full,
            "SpecialRemarks": special_remarks,
            "DesignProductionInstruction": design_prod_instr,
            "StampInstruction": stamping_instr.strip(),
            "OrderGroup": "BRILLIANT EARTH CRAFT",
            "Certificate": "",
            "SKUNo": sku_no,
            "Basestoneminwt": "",
            "Basestonemaxwt": "",
            "Basemetalminwt": "",
            "Basemetalmaxwt": "",
            "Productiondeliverydate": "",
            "Expecteddeliverydate": "",
            "SetPrice": "",
            "StoneQuality": ""
        })

    print(f"\n✅ {len(data)} items successfully parsed.")
    return pd.DataFrame(data)


# ================== SCRIPT ENTRY POINT ==================
# Pipeline: extract text from the PDF -> parse it into a DataFrame -> save as Excel.
if __name__ == "__main__":
    pdf_file_path = r"C:\Users\Pratik Mali\Desktop\tools\OrderProcessingTool\craft\with_size_HKD NO#803688-466525-Shimayra.pdf"

    print(f"\n📂 Reading and processing PDF: {pdf_file_path}")

    full_text = extract_raw_text_from_pdf(pdf_file_path)

    if not full_text:
        # Extraction failed or the PDF contained no readable text.
        print("❌ No text extracted from the PDF.")
    else:
        df = parse_purchase_order_data(full_text)

        if df.empty:
            print("⚠️ No structured data could be extracted.")
        else:
            # Show the parsed table, then persist it next to the script.
            print("\n✅ Final Structured Data:")
            print("=" * 80)
            print(df)
            print("=" * 80)
            output_path = "final_OUTPUT_CRAFT.xlsx"
            df.to_excel(output_path, index=False)
            print(f"\n💾 Data successfully saved to '{output_path}'")
