In [3]:
!pip install pandas openpyxl xlrd

Defaulting to user installation because normal site-packages is not writeable
Collecting xlrd
  Using cached xlrd-2.0.2-py2.py3-none-any.whl.metadata (3.5 kB)
Using cached xlrd-2.0.2-py2.py3-none-any.whl (96 kB)
Installing collected packages: xlrd
Successfully installed xlrd-2.0.2


In [40]:
import pandas as pd

df = pd.read_excel(r'C:\Users\Admin\Desktop\Aanya Inc\PO 6273.xls', skiprows=9)

selected_columns = ['Serial No', 'Style No', 'Description', 
                    'Diamonds', 'Qty', 'Sizes']

# Explicit copy
df_selected = df[selected_columns].copy()

# Rename columns
df_selected.rename(columns={
    'Style No': 'StyleCode',
    'Description': 'MetalR',
    'Diamonds': 'CustomerProductionInstruction',
    'Qty': 'OrderQty',
    'Sizes': 'ItemSize'
}, inplace=True)

# Remove rows where StyleCode is NaN
df_selected.dropna(subset=['StyleCode'], inplace=True)

df_selected['OrderQty'] = df_selected['OrderQty'].fillna(0).astype(int)

# Convert "Size-X" to "UPXX" and replace NaN with blank
def convert_size(size):
    if pd.isna(size):
        return ''
    size_str = str(size)
    if 'Size-' in size_str:
        num = int(size_str.split('-')[1])
        return f'UP{num:02d}'
    return size_str

df_selected['ItemSize'] = df_selected['ItemSize'].apply(convert_size)

# Move ItemSize column after StyleCode
itemsize = df_selected.pop('ItemSize')
df_selected.insert(df_selected.columns.get_loc('StyleCode') + 1, 'ItemSize', itemsize)

# Move OrderQty column after StyleCode
orderqty = df_selected.pop('OrderQty')
df_selected.insert(df_selected.columns.get_loc('ItemSize') + 1, 'OrderQty', orderqty)

# Remove newline characters from MetalR
df_selected['MetalR'] = df_selected['MetalR'].astype(str).str.replace('\n', ' ', regex=False).str.strip()

# Create Metal column with short codes
def metal_code(metal_str):
    metal_str = metal_str.upper()
    code = ''
    # Karat
    if '14KT' in metal_str:
        code += '14'
    elif '18KT' in metal_str:
        code += '18'
    elif '10KT' in metal_str:
        code += '10'
    else:
        code += 'XX'  # fallback

    # Metal type
    if 'WHITE GOLD' in metal_str:
        code = 'G' + code + 'W'
    elif 'YELLOW GOLD' in metal_str:
        code = 'G' + code + 'Y'
    elif 'PINK GOLD' in metal_str:
        code = 'G' + code + 'P'
    elif 'PLATINUM' in metal_str:
        code = 'PC950' + code
    else:
        code = 'G' + code + 'X'  # unknown type
    return code

# Insert Metal column before MetalR
df_selected.insert(df_selected.columns.get_loc('MetalR'), 'Metal', df_selected['MetalR'].apply(metal_code))

# Create Tone column (last letter of Metal) after Metal
df_selected.insert(df_selected.columns.get_loc('Metal') + 1, 'Tone', df_selected['Metal'].astype(str).str[-1])

# Insert ItemPoNo. column after Tone with value from G5
item_po_no = pd.read_excel(
    r'C:\Users\Admin\Desktop\Aanya Inc\PO 6273.xls',
    header=None,
    usecols="G",
    nrows=5
).iloc[4, 0]  # G5 cell

df_selected.insert(
    df_selected.columns.get_loc('Tone') + 1,
    'ItemPoNo.',
    item_po_no
)

# Insert ItemRefNo, StockType, MakeType after ItemPoNo.
additional_cols = ['ItemRefNo', 'StockType', 'MakeType']
pos = df_selected.columns.get_loc('ItemPoNo.') + 1
for col in additional_cols:
    df_selected.insert(pos, col, '')
    pos += 1

# Insert SpecialRemarks column after CustomerProductionInstruction
df_selected.insert(
    df_selected.columns.get_loc('CustomerProductionInstruction') + 1, 
    'SpecialRemarks', 
    'Need Hallmark "A" and Trademark on Every piece'
)

#DesignProductionInstruction
df_selected.insert(
    df_selected.columns.get_loc('SpecialRemarks') + 1, 
    'DesignProductionInstruction', 
    value=''
)

# InsertStampInstruction column 
df_selected.insert(
    df_selected.columns.get_loc('DesignProductionInstruction') + 1, 
    'StampInstruction', 
    '"A" on one side and metal KT on other side of the ring'
)

# Add multiple new columns after StampInstruction
new_columns = [
    'OrderGroup', 'Certificate', 'SKUNo', 'Basestoneminwt', 'Basestonemaxwt',
    'Basemetalminwt', 'Basemetalmaxwt', 'Productiondeliverydate',
    'Expecteddeliverydate', 'Blank_Column', 'SetPrice', 'StoneQuality'
]
pos = df_selected.columns.get_loc('StampInstruction') + 1
for col in new_columns:
    df_selected.insert(pos, col, '')
    pos += 1

df_selected.insert(
    df_selected.columns.get_loc('OrderQty') + 1, 
    'OrderItemPcs', 
    value=''
)

df_selected.drop(columns=['MetalR'], inplace=True) 
df_selected.to_csv(r'C:\Users\Admin\Desktop\Anaya_PO_Cleaned.csv', index=False)
#df_selected.head(20)


In [10]:
#FINAL CODE
import pandas as pd

# --- Step 1: Read Excel ---
df = pd.read_excel(r'C:\Users\Admin\Desktop\Aanya Inc\PO 6273.xls', skiprows=9)

# --- Step 2: Select and copy required columns ---
selected_columns = ['Serial No', 'Style No', 'Description', 'Diamonds', 'Qty', 'Sizes']
df_selected = df[selected_columns].copy()

# --- Step 3: Rename columns ---
df_selected.rename(columns={
    'Serial No': 'SrNo',
    'Style No': 'StyleCode',
    'Description': 'MetalR',
    'Diamonds': 'CustomerProductionInstruction',
    'Qty': 'OrderQty',
    'Sizes': 'ItemSize'
}, inplace=True)

# --- Step 4: Clean StyleCode ---
df_selected.dropna(subset=['StyleCode'], inplace=True)
# Keep only part before first '-' and clean spaces or underscores
df_selected['StyleCode'] = (
    df_selected['StyleCode']
    .astype(str)
    .str.split('-').str[0]
    .str.replace(r'[_\s]+', '', regex=True)
    .str.strip()
)

# --- Step 5: Clean and transform ItemSize ---
def convert_size(size):
    if pd.isna(size):
        return ''
    size_str = str(size).strip().upper()
    # Convert "Size-X" → "UPXX"
    if 'SIZE-' in size_str:
        num = size_str.split('-')[-1]
        return f'UP{int(num):02d}'
    return size_str

df_selected['ItemSize'] = df_selected['ItemSize'].apply(convert_size)

# Move ItemSize after StyleCode
itemsize = df_selected.pop('ItemSize')
df_selected.insert(df_selected.columns.get_loc('StyleCode') + 1, 'ItemSize', itemsize)

# Move OrderQty after ItemSize
orderqty = df_selected.pop('OrderQty')
df_selected.insert(df_selected.columns.get_loc('ItemSize') + 1, 'OrderQty', orderqty)

# --- Step 6: Clean MetalR ---
df_selected['MetalR'] = (
    df_selected['MetalR']
    .astype(str)
    .str.replace('\n', ' ', regex=False)
    .str.strip()
)

# --- Step 7: Create Metal column ---
def metal_code(metal_str):
    metal_str = str(metal_str).upper()
    # Platinum or PT handling
    if 'PLATINUM' in metal_str or 'PT' in metal_str:
        return 'PC95'
    # Default gold mappings
    if '14KT' in metal_str:
        karat = '14'
    elif '18KT' in metal_str:
        karat = '18'
    elif '10KT' in metal_str:
        karat = '10'
    else:
        karat = 'XX'
    # Tone detection
    if 'WHITE' in metal_str:
        tone = 'W'
    elif 'YELLOW' in metal_str:
        tone = 'Y'
    elif 'PINK' in metal_str or 'ROSE' in metal_str:
        tone = 'P'
    else:
        tone = 'X'
    return f'G{karat}{tone}'

df_selected.insert(df_selected.columns.get_loc('MetalR'), 'Metal', df_selected['MetalR'].apply(metal_code))

# --- Step 8: Create Tone column ---
df_selected.insert(df_selected.columns.get_loc('Metal') + 1, 'Tone', df_selected['Metal'].astype(str).str[-1])

# --- Step 9: Extract ItemPoNo from Excel cell G5 ---
item_po_no = pd.read_excel(
    r'C:\Users\Admin\Desktop\Aanya Inc\PO 6273.xls',
    header=None,
    usecols="G",
    nrows=5
).iloc[4, 0]  # G5 cell

df_selected.insert(df_selected.columns.get_loc('Tone') + 1, 'ItemPoNo', item_po_no)

# --- Step 10: Add ItemRefNo, StockType, MakeType ---
additional_cols = ['ItemRefNo', 'StockType', 'MakeType']
pos = df_selected.columns.get_loc('ItemPoNo') + 1
for col in additional_cols:
    df_selected.insert(pos, col, '')
    pos += 1

# --- Step 11: Add SpecialRemarks ---
df_selected.insert(
    df_selected.columns.get_loc('CustomerProductionInstruction') + 1, 
    'SpecialRemarks', 
    'Need Hallmark "A" and Trademark on Every piece'
)

# --- Step 12: Add DesignProductionInstruction ---
df_selected.insert(
    df_selected.columns.get_loc('SpecialRemarks') + 1, 
    'DesignProductionInstruction', 
    ''
)

# --- Step 13: Add StampInstruction ---
df_selected.insert(
    df_selected.columns.get_loc('DesignProductionInstruction') + 1, 
    'StampInstruction', 
    '"A" on one side and metal KT on other side of the ring'
)

# --- Step 14: Add extra columns after StampInstruction ---
new_columns = [
    'OrderGroup', 'Certificate', 'SKUNo', 'Basestoneminwt', 'Basestonemaxwt',
    'Basemetalminwt', 'Basemetalmaxwt', 'Productiondeliverydate',
    'Expecteddeliverydate', 'Blank_Column', 'SetPrice', 'StoneQuality'
]
pos = df_selected.columns.get_loc('StampInstruction') + 1
for col in new_columns:
    df_selected.insert(pos, col, '')
    pos += 1

# --- Step 15: Add OrderItemPcs ---
df_selected.insert(df_selected.columns.get_loc('OrderQty') + 1, 'OrderItemPcs', value=1)

# --- Step 16: Cleanup ---
df_selected.drop(columns=['MetalR'], inplace=True)

# --- Step 17: Export to CSV ---
df_selected.to_csv(r'C:\Users\Admin\Desktop\Anaya_PO_Cleaned_updated.csv', index=False)


# Optional: Preview top rows
df_selected.head(20)


Unnamed: 0,SrNo,StyleCode,ItemSize,OrderQty,OrderItemPcs,Metal,Tone,ItemPoNo,ItemRefNo,StockType,...,SKUNo,Basestoneminwt,Basestonemaxwt,Basemetalminwt,Basemetalmaxwt,Productiondeliverydate,Expecteddeliverydate,Blank_Column,SetPrice,StoneQuality
0,1,QR0350H,UP07,330.0,1,G14W,W,6273,,,...,,,,,,,,,,
1,2,QR0350H,UP06,10.0,1,G14W,W,6273,,,...,,,,,,,,,,
2,3,QR0350H,UP08,10.0,1,G14W,W,6273,,,...,,,,,,,,,,
3,4,QE0350K,,120.0,1,G14W,W,6273,,,...,,,,,,,,,,


In [None]:
#PDF TO TEXT CONVERTER
import pandas as pd  # #PDF TO TEXT PO1
import re
import pdfplumber

def extract_data_from_pdf(pdf_path):
    """
    Extract data from PDF file and return structured data
    """
    data = []
    
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Extract text from all pages
            full_text = ""
            for page in pdf.pages:
                full_text += page.extract_text() + "\n"
            
            print("Extracted text from PDF:")
            print("=" * 50)
            print(full_text)
            print("=" * 50)
            
            # Process the extracted text
            lines = full_text.split('\n')
            current_item = {}
            
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                    
                # Extract SrNo (looking for lines starting with numbers)
                sr_no_match = re.match(r'^\s*(\d+)\s*', line)
                if sr_no_match:
                    if current_item and 'SrNo' in current_item:  # Save previous item if exists
                        data.append(current_item)
                    current_item = {'SrNo': sr_no_match.group(1), 'OrderQty': 2}
                    
                # Extract Article code (patterns like 9-DD106-YG-25, 9-DD106-YG-7)
                article_match = re.search(r'([A-Z0-9\-]+YG\-?\d*\.?\d+)', line)
                if article_match and 'ArticleCode' not in current_item:
                    current_item['ArticleCode'] = article_match.group(1)
                    
                # Extract StyleCode from Your reference (text after RSBR2074-01, RSBR2074-03, etc.)
                style_match = re.search(r'RSBR2074\-(\d+)\s+([A-Z0-9]+)', line)
                if style_match:
                    current_item['StyleCode'] = style_match.group(2)
                    
                # Extract ItemSize from description (looking for numbers like 0.25, 0.07)
                # Multiple patterns to catch different formats
                size_patterns = [
                    r'pendant\s+(\d+\.\d+)',  # pendant 0.25
                    r'(\d+\.\d+)\s*ct',       # 0.25 ct
                    r'YG[-\s]*(\d+\.\d+)',    # YG-0.25 or YG 0.25
                ]
                
                for pattern in size_patterns:
                    size_match = re.search(pattern, line, re.IGNORECASE)
                    if size_match and 'ItemSize' not in current_item:
                        current_item['ItemSize'] = size_match.group(1)
                        break
            
            # Add the last item
            if current_item and 'SrNo' in current_item:
                data.append(current_item)
                
    except Exception as e:
        print(f"Error reading PDF file: {e}")
        return []
    
    return data

def create_excel_dataframe(data):
    """
    Create DataFrame with specified columns
    """
    if not data:
        print("No data extracted from PDF")
        return pd.DataFrame()
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    # Reorder columns as requested
    columns_order = ['SrNo', 'StyleCode', 'ItemSize', 'OrderQty']
    
    # Only include columns that exist in the data
    final_columns = [col for col in columns_order if col in df.columns]
    
    # Add missing columns with empty values
    for col in columns_order:
        if col not in df.columns:
            df[col] = ""
    
    return df[columns_order]

# Main execution
if __name__ == "__main__":
    # Specify your PDF file path here
   
    pdf_file_path = r'C:\Users\Admin\Desktop\Aanya Inc\PO#6393.pdf'  # Change this to your actual PDF path
    
    # Extract data from PDF
    print(f"Reading PDF from: {pdf_file_path}")
    extracted_data = extract_data_from_pdf(pdf_file_path)
    
    if extracted_data:
        # Create DataFrame
        df = create_excel_dataframe(extracted_data)
        
        # Display the results
        print("\nExtracted Data:")
        print("=" * 50)
        print(df)
        
        # Save to Excel file
        output_file = 'extracted_data.xlsx'
        df.to_excel(output_file, index=False)
        print(f"\nData successfully saved to '{output_file}'")
        
        # Display basic statistics
        print(f"\nExtraction Summary:")
        print(f"Total records extracted: {len(df)}")
        print(f"Columns: {list(df.columns)}")
        
    else:
        print("No data was extracted from the PDF file.")
    
    # Display the DataFrame in notebook
    df

In [8]:
#FOR PDF
# Data processing for PO #6393 - Create DataFrame with all required columns
import pandas as pd
import re

def process_po_6393_data():
    """
    Process the extracted text from PO #6393 and create DataFrame with specified columns
    """
    
    # The extracted text from the PDF
    extracted_text = """Aanya Inc
PO # 6393
48 W 48th Street, PO Date 08/05/2025
# 506
Ship Date 09/05/2025
New York, NY 10036
Gold Lock $3,300
Phone: 917-512-6933
Silver Lock $0.00
Email: cs@aanyainc.com
Terms
Purchase Order
Vendor Remarks
SHIMAYRA JEWELLERY 14K ,A
PLOT NO: 62, SEEPZ
ANDHERI (E), MUMBAI:400096,India
saurabh.dalal@jasanigroup.com
Sr
Item No Image Description Qty Price Amount
No
1.0 CT TW Round Shaped Diamond Band Size-7
1 QR0350H-I1/7 70.00 $0.00 $0.00
Total 70.00 $0.00
Notes
Need Hallmark & Trademark on every piece.
All PO's must be confirmed within 24-48 hours of receipt.
1/1"""
    
    # Initialize data list
    data = []
    
    # Extract using regex
    po_number = re.search(r'PO # (\d+)', extracted_text)
    po_number = po_number.group(1) if po_number else ""
    
    metal_match = re.search(r'SHIMAYRA JEWELLERY ([\w\s]+),', extracted_text)
    metal = metal_match.group(1).strip() if metal_match else ""
    
    desc_match = re.search(r'(\d+\.\d+ CT TW [^\n]+Size-(\d+))', extracted_text)
    if desc_match:
        customer_instruction = desc_match.group(1).replace(f"Size-{desc_match.group(2)}", f"SZ-{desc_match.group(2)}")
        item_size = f"UP{desc_match.group(2).zfill(2)}"
    else:
        customer_instruction = ""
        item_size = ""
    
    style_qty_match = re.search(r'(\d+)\s+([A-Z0-9-]+(?:/[A-Z0-9]+)?)\s+(\d+\.\d+)', extracted_text)
    if style_qty_match:
        sr_no = style_qty_match.group(1)
        style_code = style_qty_match.group(2)
        order_qty = style_qty_match.group(3).split('.')[0]
    else:
        sr_no = ""
        style_code = ""
        order_qty = ""
    
    special_remarks_match = re.search(r'Need Hallmark & Trademark on every piece\.', extracted_text)
    special_remarks = special_remarks_match.group(0) if special_remarks_match else ""

    # Ask for user input for Tone
    tone = input("Enter Tone (Y for Yellow, W for White, P for Pink): ").strip().upper()

    # Determine Metal Code based on Tone + Karat or Platinum
    def determine_metal_code(metal_text, tone_input):
        metal_text = metal_text.upper()
        if "PT" in metal_text or "PLATINUM" in metal_text:
            return "PC95"
        elif "14K" in metal_text:
            return f"G14{tone_input}"
        elif "18K" in metal_text:
            return f"G18{tone_input}"
        else:
            return f"GXX{tone_input}"  # fallback if not found
    
    metal_code = determine_metal_code(metal, tone)

    # Build row data
    item_data = {
        'SrNo': sr_no,
        'StyleCode': style_code.split('-')[0] if '-' in style_code else style_code,  # Only base code like QR0350H
        'ItemSize': item_size,
        'OrderQty': order_qty,
        'OrderItemPcs': 1,
        'Metal': metal_code,
        'Tone': tone,
        'ItemPoNo': po_number,
        'ItemRefNo': "",
        'StockType': "",
        'MakeType': "",
        'CustomerProductionInstruction': customer_instruction,
        'SpecialRemarks': special_remarks,
        'DesignProductionInstruction': "",
        'StampInstruction': "'A' on one side and metal KT on other side of the ring",
        'OrderGroup': "",
        'Certificate': "",
        'SKUNo': "",
        'Basestoneminwt': "",
        'Basestonemaxwt': "",
        'Basemetalminwt': "",
        'Basemetalmaxwt': "",
        'Productiondeliverydate': "",
        'Expecteddeliverydate': "",
        'Blank_Column': "",
        'SetPrice': "",
        'StoneQuality': ""
    }

    data.append(item_data)
    return data


# === Run Processing ===
processed_data = process_po_6393_data()
df = pd.DataFrame(processed_data)

# === Save Output ===
output_filename = 'PO_6393_anayapdf_final.xlsx'
df.to_excel(output_filename, index=False)

# === Display Results ===
print("\n✅ Processed DataFrame:")
print(df.to_string(index=False))
print(f"\n💾 Data successfully saved to '{output_filename}'")


Enter Tone (Y for Yellow, W for White, P for Pink):  Y



✅ Processed DataFrame:
SrNo StyleCode ItemSize OrderQty  OrderItemPcs Metal Tone ItemPoNo ItemRefNo StockType MakeType            CustomerProductionInstruction                            SpecialRemarks DesignProductionInstruction                                       StampInstruction OrderGroup Certificate SKUNo Basestoneminwt Basestonemaxwt Basemetalminwt Basemetalmaxwt Productiondeliverydate Expecteddeliverydate Blank_Column SetPrice StoneQuality
   1   QR0350H     UP07       70             1  G14Y    Y     6393                              1.0 CT TW Round Shaped Diamond Band SZ-7 Need Hallmark & Trademark on every piece.                             'A' on one side and metal KT on other side of the ring                                                                                                                                                                        

💾 Data successfully saved to 'PO_6393_anayapdf_final.xlsx'
