# Complete code with Automation

In [31]:
import re
import pdfplumber
import os
import pandas as pd


# Code 1: Process invoices with specific format
def process_invoice_pdf(pdf_path):
    data = {
        "Chassis Number": "Not found",
        "Color": "Not found",
        "Place of Supply": "Not found",
        "From": "Not found",
        "Invoice No.": "Not found",
        "Invoice Date": "Not found",
        "Owner": "Not found",
        "Ex-showroom Price": "₹6,41,500",  # Fixed value
        "Total Invoice Amount": "Not found",
        "Discount": "Not found",
        "Variant Description": "Not found",
        "Engine Number": "Not found",
    }

    
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            lines = page.extract_text().split("\n")  # Extract and split text into lines

            for line in lines:
                # Debugging: Uncomment to print each line
                
                #print(f"Line: {line}")

                # 1. Chassis Number
                if "Vehicle ID" in line:
                    match = re.search(r"Vehicle ID\s*:\s*([A-Z0-9]+)", line)
                    if match:
                        data["Chassis Number"] = match.group(1)

                # 2. Engine Number
                if data["Chassis Number"] != "Not found" and "Superior" in line:
                    chassis_match = re.search(data["Chassis Number"], line)
                    if chassis_match:
                        engine_match = re.search(rf"{data['Chassis Number']}\s+([A-Z0-9\s]+)\s+Superior", line)
                        if engine_match:
                            data["Engine Number"] = engine_match.group(1).strip()

                # 3. Color
                if "Superior" in line:
                    match = re.search(r"(Superior\s\w+-\d+U)", line, re.IGNORECASE)
                    if match:
                        data["Color"] = match.group(1)

                # 4. Place of Supply
                if "Place of Supply" in line:
                    match = re.search(r"Place of Supply\s*:\s*([\w\s]+\(\d+\))", line)
                    if match:
                        data["Place of Supply"] = match.group(1)

                # 5. From (Dealer Information)
                if "For" in line and "PVT LTD" in line:
                    match = re.search(r"For\s+([\w\s\.\-]+PVT LTD)", line, re.IGNORECASE)
                    if match:
                        data["From"] = match.group(1).strip()

                # 6. Invoice No.
                if "Invoice No." in line:
                    # Look for "Invoice No." and capture it
                    match = re.search(r"Invoice No\.\s*[:\-]?\s*([\w\/\-]+)", line)
                    if match:
                        data["Invoice No."] = match.group(1)

                # 7. Invoice Date
                if "Invoice Date" in line:
                    match = re.search(r"Invoice Date\s*:\s*([\d/]+\s+\d{2}:\d{2}\s+[APM]{2})", line)
                    if match:
                        data["Invoice Date"] = match.group(1)

                # 8. Owner
                if "Sold To" in line:
                    # This will match the "Sold To" line and capture everything after "Sold To :"
                    match = re.search(r"Sold To\s*:\s*(.+)", line, re.IGNORECASE)
                    if match:
                        data["Owner"] = match.group(1).strip()
    

                # 9. Total Invoice Amount
                if "Total Invoice Amount" in line:
                    match = re.search(r"Total Invoice Amount\s*:\s*([\d,]+\.\d{2})", line)
                    if match:
                        data["Total Invoice Amount"] = match.group(1)

                # 10. Variant Description
                if "PRICE OF ONE" in line:
                    match = re.search(r"PRICE OF ONE\s+([\w\s]+)-", line)
                    if match:
                        data["Variant Description"] = match.group(1).strip()

    # Calculate Discount
    if data["Total Invoice Amount"] != "Not found":
        ex_showroom_price = 641500.00
        total_invoice = float(data["Total Invoice Amount"].replace(",", ""))
        data["Discount"] = f"₹{ex_showroom_price - total_invoice:.2f}"

    return data

# Code 2: Process insurance policy PDFs with specific company
def process_insurance_pdf_shreyash(pdf_path):
    extracted_details = {
        "Chassis Number": None,
        "Insurance Company": "Go Digit General Insurance Ltd",
        "Policy Number": None,
        "INSURED’S DECLARED VALUE (IDV)": None,
        "Insurance Premium": None,
        "Start Date": None,
        "End Date": None
    }


    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages):
            lines = page.extract_text().split("\n")  # Extract and split text into lines
            #print(f"--- Page {page_number + 1} ---")  # Debugging: Print page number
            
            for i, line in enumerate(lines):
                #print(f"Line {i}: {line.strip()}")  # Debugging: Print each line

                # 1.Chassis Number 
                if "Vehicle Identification No." in line:
                    match = re.search(r"Vehicle Identification No\.\s+([A-Z0-9]+)", line)
                    if match:
                        extracted_details["Chassis Number"] = match.group(1)
                       # print(f"Extracted Chassis Number: {extracted_details['Chassis Number']}")  # Debugging
               
                # 2. Policy Number
                if "Policy No" in line:
                    match = re.search(r"Policy No\s+(\w+)", line)
                    if match:
                        extracted_details["Policy Number"] = match.group(1)
                      #  print(f"Extracted Policy Number: {extracted_details['Policy Number']}")  # Debugging

                # 3. INSURED’S DECLARED VALUE (IDV)
                if "Total IDV ₹" in line:
                    match = re.search(r"Total IDV ₹\s*([\d,]+)", line)
                    if match:
                        extracted_details["INSURED’S DECLARED VALUE (IDV)"] = match.group(1).replace(",", "")
                      #  print(f"Extracted IDV: {extracted_details['INSURED’S DECLARED VALUE (IDV)']}")  # Debugging

                # 4. Insurance Premium
                if "Gross Premium Paid ₹" in line:
                    match = re.search(r"Gross Premium Paid ₹\s*([\d,]+)", line)
                    if match:
                        extracted_details["Insurance Premium"] = match.group(1).replace(",", "")
                      #  print(f"Extracted Insurance Premium: {extracted_details['Insurance Premium']}")  # Debugging

                # 5. Start Date
                if "Period of Insurance" in line:
                    match = re.search(r"Period of Insurance\s+(\d{2}-[A-Z]{3}-\d{4})", line)
                    if match:
                        extracted_details["Start Date"] = match.group(1)
                      #  print(f"Extracted Start Date: {extracted_details['Start Date']}")  # Debugging

                # 6. End Date
                if "to" in line and "Period of Insurance" in line:
                    match = re.search(r"to\s+(\d{2}-[A-Z]{3}-\d{4})", line)
                    if match:
                        extracted_details["End Date"] = match.group(1)
                      #  print(f"Extracted End Date: {extracted_details['End Date']}")  # Debugging

                # # 7. Insurance Company
                # if "T h i s i s n o t a p a r t o f t h e p o l i c y d o c u m e n t" in line:
                #     if i + 1 < len(lines):  # Check if next line exists
                #         extracted_details["Insurance Company"] = lines[i + 1].strip()
                #        # print(f"Extracted Insurance Company: {extracted_details['Insurance Company']}")  # Debugging

                

    return extracted_details




# Code 3: Process insurance PDFs with SBI-specific format
def process_insurance_pdf_sbi(pdf_path):
    extracted_details= {
        "Chassis Number": None,
        "Insurance Company": "SBI General Insurance Company Limited",
        "Policy Number": None,
        "INSURED’S DECLARED VALUE (IDV)": None,
        "Insurance Premium": None,
        "Start Date": None,
        "End Date": None
        
    }


    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages):
            lines = page.extract_text().split("\n")  # Extract and split text into lines
            #print(f"--- Page {page_number + 1} ---")  # Debugging: Print page number
            
            for i, line in enumerate(lines):
                #print(f"Line {i}: {line.strip()}")  # Debugging: Print each line

                # 1. Chassis Number 
                if "Chassis Number" in line:
                    match = re.search(r"Chassis Number\s+([A-Z0-9]+)", line)
                    if match:
                        extracted_details["Chassis Number"] = match.group(1)
                       # print(f"Extracted Chassis Number: {extracted_details['Chassis Number']}")  # Debugging
                
                # 2. Policy Number
                if "Your Policy No. :" in line:
                    match = re.search(r"Your Policy No\. :(\d+)", line)
                    if match:
                        extracted_details["Policy Number"] = match.group(1)
                        #print(f"Extracted Policy Number: {extracted_details['Policy Number']}")  # Debugging

                # 3. INSURED’S DECLARED VALUE (IDV)
                if "INSURED’S DECLARED VALUE (IDV)" in line:
                    if i + 4 < len(lines):  # Check if 4th line after exists
                        idv_line = lines[i + 4].strip()
                        match = re.search(r"(\d{1,3}(,\d{3})*(\.\d{2})?)", idv_line)
                        if match:
                            extracted_details["INSURED’S DECLARED VALUE (IDV)"] = match.group(1).split(".")[0]
                        #    print(f"Extracted IDV: {extracted_details['INSURED’S DECLARED VALUE (IDV)']}")  # Debugging

                # 4. Insurance Premium
                if "Total Premium Collected" in line:
                    match = re.search(r"Total Premium Collected\s+([\d,]+\.\d{2})", line)
                    if match:
                        extracted_details["Insurance Premium"] = match.group(1)
                      #  print(f"Extracted Insurance Premium: {extracted_details['Insurance Premium']}")  # Debugging

                # 5. Start Date
                if "Period of Insurance : From :" in line:
                    match = re.search(r"From\s*:(\d{2}/\d{2}/\d{4})", line)
                    if match:
                        extracted_details["Start Date"] = match.group(1)
                      #  print(f"Extracted Start Date: {extracted_details['Start Date']}")  # Debugging

                # 6. End Date
                if "Period of Insurance : From :" in line:
                    match = re.search(r"Midnight of:\s*(\d{2}/\d{2}/\d{4})", line)
                    if match:
                        extracted_details["End Date"] = match.group(1)
                      #  print(f"Extracted End Date: {extracted_details['End Date']}")  # Debugging
    
                # # 7.Insurance Company
                # if "The Postal Address of your SBI General Branch that will service you in future is:" in line:
                #     if i + 1 < len(lines):  # Check if next line exists
                #         extracted_details["Insurance Company"] = lines[i + 1].strip()
                #         #print(f"Extracted Insurance Company: {extracted_details['Insurance Company']}")  # Debugging



    return extracted_details


def process_pdf_based_on_type(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        first_page_text = pdf.pages[0].extract_text()

    if "TAX / VEHICLE & CHARGES INVOICE" in first_page_text:
        print("Processing Invoice PDF...")
        return process_invoice_pdf(pdf_path)
    elif "SHREYASH AUTOMOTIVES PRIVATE LIMIT" in first_page_text:
        print("Processing Insurance PDF (Shreyash)...")
        return process_insurance_pdf_shreyash(pdf_path)
    elif "The Postal Address of your SBI General Branch" in first_page_text:
        print("Processing Insurance PDF (SBI)...")
        return process_insurance_pdf_sbi(pdf_path)
    else:
        print("Unknown PDF format.")
        return None


def merge_dicts_based_on_key(existing_data, new_data, key):
    """
    Merges two dictionaries based on the value of a specific key. If the key value matches,
    it combines the data into a single dictionary, keeping non-null values from both.
    
    Args:
    - existing_data (dict): Existing dictionary to compare with.
    - new_data (dict): New dictionary to merge.
    - key (str): The key to compare values for merging.
    
    Returns:
    - dict: Merged dictionary.
    """
    for k, v in new_data.items():
        if pd.notna(v) and (k not in existing_data or pd.isna(existing_data[k])):
            existing_data[k] = v
    return existing_data

def save_to_excel(data_list, excel_path):
    """
    Save data to an Excel file, merging all information into a single row if the 'Chassis Number' matches.
    
    Args:
    - data_list (list of dict): List of dictionaries containing data to save.
    - excel_path (str): Path to the Excel file.
    """
    # Convert the new data to a DataFrame
    new_data = pd.DataFrame(data_list)

    # Ensure 'Chassis Number' is treated as a unique key
    if 'Chassis Number' not in new_data.columns:
        raise ValueError("The key column 'Chassis Number' is missing in the data.")

    # Check if the Excel file exists
    if os.path.exists(excel_path):
        # Load the existing data
        existing_data = pd.read_excel(excel_path)

        # Ensure both DataFrames have the same columns
        combined_columns = set(existing_data.columns).union(set(new_data.columns))
        existing_data = existing_data.reindex(columns=combined_columns)
        new_data = new_data.reindex(columns=combined_columns)

        # Merge the data row by row
        for _, new_row in new_data.iterrows():
            if new_row['Chassis Number'] in existing_data['Chassis Number'].values:
                # Find the index of the matching row
                idx = existing_data.loc[existing_data['Chassis Number'] == new_row['Chassis Number']].index[0]

                # Update the existing row with new non-null values
                for col in new_data.columns:
                    if pd.notna(new_row[col]):
                        existing_data.at[idx, col] = new_row[col]
            else:
                # Append new row if 'Chassis Number' is not found
                existing_data = pd.concat([existing_data, pd.DataFrame([new_row])], ignore_index=True)
    else:
        # If the file does not exist, use the new data as the initial DataFrame
        existing_data = new_data

    # Save the updated data back to Excel
    try:
        existing_data.to_excel(excel_path, index=False)
        print(f"Data saved successfully to {excel_path}")
    except Exception as e:
        print(f"Failed to save data to Excel: {e}")

# Function to process PDFs in a folder and write all data to Excel
def process_pdfs_in_folder(folder_path, excel_path):
    files = os.listdir(folder_path)
    pdf_files = [f for f in files if f.lower().endswith('.pdf')]

    all_extracted_data = []  # List to store data from all PDFs

    for pdf_file in pdf_files:
        pdf_path = os.path.join(folder_path, pdf_file)
        try:
            extracted_data = process_pdf_based_on_type(pdf_path)  # Assume this function extracts data as a dict
            if extracted_data:
                print("\nExtracted Data:")
                for key, value in extracted_data.items():
                    print(f"{key}: {value}")

                # Check for existing data with the same value of the first key
                first_key = list(extracted_data.keys())[0]  # Get the first key
                existing_entry = next(
                    (row for row in all_extracted_data if row.get(first_key) == extracted_data[first_key]), None
                )

                if existing_entry:
                    # Merge the new data with the existing entry
                    merge_dicts_based_on_key(existing_entry, extracted_data, first_key)
                else:
                    # Append as a new entry if no matching key value is found
                    all_extracted_data.append(extracted_data)
                
                print("-" * 50)
            else:
                print(f"No data extracted from {pdf_path}.")
        except Exception as e:
            print(f"Error processing {pdf_path}: {e}")

    # Save all extracted data to the Excel file
    if all_extracted_data:
        save_to_excel(all_extracted_data, excel_path)
        #print(all_extracted_data)
    else:
        print("No data extracted from any PDFs.")

if __name__ == "__main__":
    folder_path = "C:\\Users\\welcome\\Documents\\Carrum"  # Replace with your folder path
    excel_path = "C:\\Users\\welcome\\Documents\\Carrum\\extracted_data_carrum.xlsx"  # Path to save the Excel file
    process_pdfs_in_folder(folder_path, excel_path)


Processing Invoice PDF...

Extracted Data:
Chassis Number: MA3JMTB1SRFB19421
Color: Superior White-26U
Place of Supply: TELANGANA(36)
From: VARUN MOTORS PVT LTD
Invoice No.: 12/VSL/24002577
Invoice Date: 28/08/2024 04:34 PM
Owner: M/S. CARRUM MOBILITY SOLUTIONS PVT LTD
Ex-showroom Price: ₹6,41,500
Total Invoice Amount: 5,58,500.00
Discount: ₹83000.00
Variant Description: MARUTI TOUR H3 CNG 1L 5MT
Engine Number: K10CN C597750
--------------------------------------------------
Processing Insurance PDF (SBI)...

Extracted Data:
Chassis Number: MA3JMTB1SRFB19421
Insurance Company: SBI General Insurance Company Limited
Policy Number: 0000000040526381
INSURED’S DECLARED VALUE (IDV): 530,575
Insurance Premium: 22,194.00
Start Date: 28/08/2024
End Date: 27/08/2025
--------------------------------------------------
Processing Invoice PDF...

Extracted Data:
Chassis Number: MA3JMTB1SRFB21380
Color: Superior White-26U
Place of Supply: HARYANA(06)
From: SHREYASH AUTOMOTIVES PVT LTD
Invoice No.: 2/

# Complete Code with Automation(Screenshots)

In [32]:
import pytesseract
from PIL import Image
import re
import os
import pandas as pd

# Specify the path to the Tesseract executable
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def extract_text_from_image(image_path):
    """
    Extract text from an image using Tesseract OCR.
    
    Args:
        image_path (str): Path to the image file.
        
    Returns:
        str: Extracted text from the image.
    """
    try:
        image = Image.open(image_path)
        text = pytesseract.image_to_string(image)
        return text
    except Exception as e:
        return f"Error: {e}"

def extract_registration_date(text):
    """
    Extract the last date format from a line containing 'Date Of Registration'.
    
    Args:
        text (str): Input text containing 'Date Of Registration'.
        
    Returns:
        str: Extracted registration date or 'Not Found'.
    """
    # Split the text into lines
    lines = text.splitlines()
    for line in lines:
        if "Date Of Registration" in line:
            # Debugging: Print the matching line
            #print(f"Debug Line: {line}")
            # Regex to capture the last date-like pattern in the line
            pattern = r"(\d{1,2}/\d{1,2}/\d{4})$"
            match = re.search(pattern, line)
            if match:
                return match.group(1).strip()

    # Fallback: Search all lines for any date format
    fallback_pattern = r"(\d{1,2}/\d{1,2}/\d{4})"
    all_dates = re.findall(fallback_pattern, text)
    if all_dates:
        return all_dates[-1]  # Return the last found date
    return "Not Found"

def extract_field_values(text):
    """
    Extract specific field values from the OCR text using patterns.
    
    Args:
        text (str): Extracted text from the image.
        
    Returns:
        dict: A dictionary containing extracted field values.
    """
    try:
        patterns = {
            "Chassis Number": r"Enter Search Element:\*\s*\|\s*([A-Z0-9]+)",
            "Registration Number": r"Registration No:?[\|]?\s*([A-Z0-9]+)\s*Fuel Type:",
            "Mfg. Year": r"Mfg\. Year:\s*\|?\s*([0-9/]+)",
        }
        extracted_values = {}
        for field, pattern in patterns.items():
            match = re.search(pattern, text)
            extracted_values[field] = match.group(1) if match else "Not Found"

        # Extract Registration Date using a dedicated function
        extracted_values["Registration Date"] = extract_registration_date(text)
        return extracted_values
    except Exception as e:
        return {"Error": str(e)}

def save_to_excel(data_list, excel_path):
    """
    Save data to an Excel file, merging all information into a single row if the 'Chassis Number' matches.
    
    Args:
        data_list (list of dict): List of dictionaries containing data to save.
        excel_path (str): Path to the Excel file.
    """
    new_data = pd.DataFrame(data_list)

    if 'Chassis Number' not in new_data.columns:
        raise ValueError("The key column 'Chassis Number' is missing in the data.")

    if os.path.exists(excel_path):
        existing_data = pd.read_excel(excel_path)

        combined_columns = set(existing_data.columns).union(set(new_data.columns))
        existing_data = existing_data.reindex(columns=combined_columns)
        new_data = new_data.reindex(columns=combined_columns)

        for _, new_row in new_data.iterrows():
            if new_row['Chassis Number'] in existing_data['Chassis Number'].values:
                idx = existing_data.loc[existing_data['Chassis Number'] == new_row['Chassis Number']].index[0]
                for col in new_data.columns:
                    if pd.notna(new_row[col]):
                        existing_data.at[idx, col] = new_row[col]
            else:
                existing_data = pd.concat([existing_data, pd.DataFrame([new_row])], ignore_index=True)
    else:
        existing_data = new_data

    try:
        existing_data.to_excel(excel_path, index=False)
        print(f"Data saved successfully to {excel_path}")
    except Exception as e:
        print(f"Failed to save data to Excel: {e}")

def process_folder(folder_path, excel_path):
    """
    Process all images in a given folder and extract field values.
    
    Args:
        folder_path (str): Path to the folder containing images.
        excel_path (str): Path to the Excel file for saving data.
    """
    valid_extensions = {".jpg", ".jpeg", ".png", ".tiff", ".bmp", ".gif"}
    data_list = []

    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        if os.path.isfile(file_path) and os.path.splitext(file_name.lower())[1] in valid_extensions:
            print(f"\nProcessing: {file_name}")
            extracted_text = extract_text_from_image(file_path)
            print(f"Extracted OCR Text:\n{extracted_text}")  # Debugging: Print the OCR text
            field_values = extract_field_values(extracted_text)
            data_list.append(field_values)
        else:
            continue
            print(f"Skipping non-image file: {file_name}")

    if data_list:
        save_to_excel(data_list, excel_path)

# Path to the folder containing images
folder_path = "C:\\Users\\welcome\\Documents\\Carrum\\"
excel_path = r"C:\Users\welcome\Documents\Carrum\extracted_data_carrum.xlsx"

# Process the folder and save the data to Excel
process_folder(folder_path, excel_path)
Mth. Yr. of Mfg: 09/09/2024



Processing: MA3JMTB1SRFB21380 RC.png
Extracted OCR Text:
Vehicle Registration Search

Select Search Element:* | CHASSIS NO. “
Enter Search Element:* | MA3JMTB1SRFB21380
585890

585890 +>

Captcha :*

Registration No:|TG0O7T3914 Fuel Type:|CNG PETROL
Owner Name:|CARRUM MOBILITY SOLU Vehicle Color:|SUPERIOR WHITE,
Vehicle Class:IMOTOR CAB Maker's Name:|MARUTI SUZUKI INDIA LTD.,
Mfg. Year:|06/06/2024 Maker's Class|MARUT! TOUR H3 CNG 1L 5MT BSVI-PH2}
Engine No:|K10CNC6XXXXX Date Of Registration:|20/07/2024
Chassis No:IMA3JMTB1SRFBXXXXX Financier: SUNDARAM FINANCE LIMITED
Prev Registration No:|TO724DLXXXXX Registration AuthorityRTA RANGAREDDY
[Harvester Chassis No: Status/ACTIVE



Processing: MA3JMTB1SRFB21391 RC.png
Extracted OCR Text:
Vehicle Registration Search

Select Search Element:* | CHASSIS NO. “
Enter Search Element:* | MA3JMTB1SRFB21391
585890

585890 +>

Captcha :*

Registration No:|TG0O7T3923 Fuel Type:|CNG PETROL
Owner Name:|CARRUM MOBILITY SOLU Vehicle Color:|SUPERIOR WHITE,

  existing_data.at[idx, col] = new_row[col]
  existing_data.at[idx, col] = new_row[col]
  existing_data.at[idx, col] = new_row[col]


# for Telangana Screenshot

In [42]:
import pytesseract
from PIL import Image
import os
import re

# Specify the path to the Tesseract executable
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# List of valid strings to check in the first line of text
VALID_STRINGS = [
    "M-Wallet", "(>) M-wallet", "()M-Wallet", "(>) wallet", "(>) M-wallet",
    "()M-Wallet", "(_)M-Wallet", "()M-Wallet", "()M-Wallet", "(>) M-wallet",
    "()M-Wallet", "()M-Wallet", "() wallet"
]

def extract_text_from_image(image_path):
    """
    Extract text from an image using Tesseract OCR.
    
    Args:
        image_path (str): Path to the image file.
        
    Returns:
        str: Extracted text from the image.
    """
    try:
        image = Image.open(image_path)
        text = pytesseract.image_to_string(image)
        return text
    except Exception as e:
        return f"Error: {e}"

def extract_field_values(text):
    """
    Extract specific field values from the OCR text using refined patterns.
    
    Args:
        text (str): Extracted text from the image.
        
    Returns:
        dict: A dictionary containing extracted field values.
    """
    try:
        # Define refined patterns for each field
        patterns = {
            "Chassis Number": r"Chassis Number[:\s]*([A-Z0-9]+)",
            "Registration Number": r"Reg[n|a]?[.,]?\s*Number[:\s]*(TG[A-Z0-9]+)",
            "Mfg. Year": r"(Mth[.\s]*.*?Mfg.*?(\d{2}/\d{2}/\d{4}))",
            "Registration Date": r"Color.*?\n.*?(\d{2}/\d{2}/\d{4})"
        }

        # Extract values
        extracted_values = {}
        for field, pattern in patterns.items():
            match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
            if field == "Mfg. Year" and match:
                # Extract the date from the matched line
                extracted_values[field] = match.group(2) if match else "Not Found"
            elif field == "Registration Date" and match:
                # Extract only the first valid date after "Color"
                date_match = re.search(r"\d{2}/\d{2}/\d{4}", match.group(1))
                extracted_values[field] = date_match.group(0) if date_match else "Not Found"
            else:
                extracted_values[field] = match.group(1) if match else "Not Found"
        
        return extracted_values
    except Exception as e:
        return {"Error": str(e)}

def process_folder(folder_path):
    """
    Process all images in a folder and print extracted details if the first line matches a valid string.
    
    Args:
        folder_path (str): Path to the folder containing images.
    """
    valid_extensions = {".jpg", ".jpeg"}
    data_list = []

    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        if os.path.isfile(file_path) and os.path.splitext(file_name.lower())[1] in valid_extensions:
            print(f"\nProcessing: {file_name}")
            extracted_text = extract_text_from_image(file_path)
            
            # Check if the first line matches a valid string
            first_line = extracted_text.splitlines()[0].strip() if extracted_text else ""
            if first_line in VALID_STRINGS:
                field_values = extract_field_values(extracted_text)
                field_values["File Name"] = file_name  # Add file name for reference
                data_list.append(field_values)
            else:
                print(f"Skipping {file_name}: First line does not match valid strings.")
        else:
            print(f"Skipping non-image file: {file_name}")

    # Print extracted data
    for data in data_list:
        print(data)

# Path to the folder containing images
folder_path = "C:\\Users\\welcome\\Documents\\Carrum\\"  # Update with the folder path containing images

# Process the folder and print data
process_folder(folder_path)


Skipping non-image file: Boooook.xlsx
Skipping non-image file: Downloaded_Files
Skipping non-image file: extracted_data_carrum.xlsx
Skipping non-image file: MA3JMTB1SRFB19421 Invoice.pdf
Skipping non-image file: MA3JMTB1SRFB19421 Policy.pdf
Skipping non-image file: MA3JMTB1SRFB21380 Invoice.pdf
Skipping non-image file: MA3JMTB1SRFB21380 Policy.pdf
Skipping non-image file: MA3JMTB1SRFB21380 RC.png
Skipping non-image file: MA3JMTB1SRFB21391 Invoice.pdf
Skipping non-image file: MA3JMTB1SRFB21391 Policy.pdf
Skipping non-image file: MA3JMTB1SRFB21391 RC.png
Skipping non-image file: MA3JMTB1SRFB21432 Invoice.pdf
Skipping non-image file: MA3JMTB1SRFB21432 Policy.pdf
Skipping non-image file: MA3JMTB1SRFB21765 Invoice.pdf
Skipping non-image file: MA3JMTB1SRFB21765 Policy.pdf
Skipping non-image file: MA3JMTB1SRFB21765 RC.png
Skipping non-image file: MA3JMTB1SRFB21810 Invoice.pdf
Skipping non-image file: MA3JMTB1SRFB21810 Policy.pdf
Skipping non-image file: MA3JMTB1SRFB21810 RC.png
Skipping non-i

# Complete code (with Invoice and Policy) only print

In [55]:
import re
import pdfplumber

# Code 1: Process invoices with specific format
def process_invoice_pdf(pdf_path):
    data = {
        "Chassis Number": "Not found",
        "Color": "Not found",
        "Place of Supply": "Not found",
        "From": "Not found",
        "Invoice No.": "Not found",
        "Invoice Date": "Not found",
        "Owner": "Not found",
        "Ex-showroom Price": "₹6,41,500",  # Fixed value
        "Total Invoice Amount": "Not found",
        "Discount": "Not found",
        "Variant Description": "Not found",
        "Engine Number": "Not found",
    }

    
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            lines = page.extract_text().split("\n")  # Extract and split text into lines

            for line in lines:
                # Debugging: Uncomment to print each line
                
                #print(f"Line: {line}")

                # 1. Chassis Number
                if "Vehicle ID" in line:
                    match = re.search(r"Vehicle ID\s*:\s*([A-Z0-9]+)", line)
                    if match:
                        data["Chassis Number"] = match.group(1)

                # 2. Engine Number
                if data["Chassis Number"] != "Not found" and "Superior" in line:
                    chassis_match = re.search(data["Chassis Number"], line)
                    if chassis_match:
                        engine_match = re.search(rf"{data['Chassis Number']}\s+([A-Z0-9\s]+)\s+Superior", line)
                        if engine_match:
                            data["Engine Number"] = engine_match.group(1).strip()

                # 3. Color
                if "Superior" in line:
                    match = re.search(r"(Superior\s\w+-\d+U)", line, re.IGNORECASE)
                    if match:
                        data["Color"] = match.group(1)

                # 4. Place of Supply
                if "Place of Supply" in line:
                    match = re.search(r"Place of Supply\s*:\s*([\w\s]+\(\d+\))", line)
                    if match:
                        data["Place of Supply"] = match.group(1)

                # 5. From (Dealer Information)
                if "For" in line and "PVT LTD" in line:
                    match = re.search(r"For\s+([\w\s\.\-]+PVT LTD)", line, re.IGNORECASE)
                    if match:
                        data["From"] = match.group(1).strip()

                # 6. Invoice No.
                if "Invoice No." in line:
                    # Look for "Invoice No." and capture it
                    match = re.search(r"Invoice No\.\s*[:\-]?\s*([\w\/\-]+)", line)
                    if match:
                        data["Invoice No."] = match.group(1)

                # 7. Invoice Date
                if "Invoice Date" in line:
                    match = re.search(r"Invoice Date\s*:\s*([\d/]+\s+\d{2}:\d{2}\s+[APM]{2})", line)
                    if match:
                        data["Invoice Date"] = match.group(1)

                # 8. Owner
                if "Sold To" in line:
                    # This will match the "Sold To" line and capture everything after "Sold To :"
                    match = re.search(r"Sold To\s*:\s*(.+)", line, re.IGNORECASE)
                    if match:
                        data["Owner"] = match.group(1).strip()
    

                # 9. Total Invoice Amount
                if "Total Invoice Amount" in line:
                    match = re.search(r"Total Invoice Amount\s*:\s*([\d,]+\.\d{2})", line)
                    if match:
                        data["Total Invoice Amount"] = match.group(1)

                # 10. Variant Description
                if "PRICE OF ONE" in line:
                    match = re.search(r"PRICE OF ONE\s+([\w\s]+)-", line)
                    if match:
                        data["Variant Description"] = match.group(1).strip()

    # Calculate Discount
    if data["Total Invoice Amount"] != "Not found":
        ex_showroom_price = 641500.00
        total_invoice = float(data["Total Invoice Amount"].replace(",", ""))
        data["Discount"] = f"₹{ex_showroom_price - total_invoice:.2f}"

    return data

# Code 2: Process insurance policy PDFs with specific company
def process_insurance_pdf_shreyash(pdf_path):
    extracted_details = {
        "Insurance Company": None,
        "Policy Number": None,
        "INSURED’S DECLARED VALUE (IDV)": None,
        "Insurance Premium": None,
        "Start Date": None,
        "End Date": None,
        "Chassis Number": None
    }


    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages):
            lines = page.extract_text().split("\n")  # Extract and split text into lines
            #print(f"--- Page {page_number + 1} ---")  # Debugging: Print page number
            
            for i, line in enumerate(lines):
                #print(f"Line {i}: {line.strip()}")  # Debugging: Print each line

                # 1. Insurance Company
                if "T h i s i s n o t a p a r t o f t h e p o l i c y d o c u m e n t" in line:
                    if i + 1 < len(lines):  # Check if next line exists
                        extracted_details["Insurance Company"] = lines[i + 1].strip()
                       # print(f"Extracted Insurance Company: {extracted_details['Insurance Company']}")  # Debugging

                # 2. Policy Number
                if "Policy No" in line:
                    match = re.search(r"Policy No\s+(\w+)", line)
                    if match:
                        extracted_details["Policy Number"] = match.group(1)
                      #  print(f"Extracted Policy Number: {extracted_details['Policy Number']}")  # Debugging

                # 3. INSURED’S DECLARED VALUE (IDV)
                if "Total IDV ₹" in line:
                    match = re.search(r"Total IDV ₹\s*([\d,]+)", line)
                    if match:
                        extracted_details["INSURED’S DECLARED VALUE (IDV)"] = match.group(1).replace(",", "")
                      #  print(f"Extracted IDV: {extracted_details['INSURED’S DECLARED VALUE (IDV)']}")  # Debugging

                # 4. Insurance Premium
                if "Gross Premium Paid ₹" in line:
                    match = re.search(r"Gross Premium Paid ₹\s*([\d,]+)", line)
                    if match:
                        extracted_details["Insurance Premium"] = match.group(1).replace(",", "")
                      #  print(f"Extracted Insurance Premium: {extracted_details['Insurance Premium']}")  # Debugging

                # 5. Start Date
                if "Period of Insurance" in line:
                    match = re.search(r"Period of Insurance\s+(\d{2}-[A-Z]{3}-\d{4})", line)
                    if match:
                        extracted_details["Start Date"] = match.group(1)
                      #  print(f"Extracted Start Date: {extracted_details['Start Date']}")  # Debugging

                # 6. End Date
                if "to" in line and "Period of Insurance" in line:
                    match = re.search(r"to\s+(\d{2}-[A-Z]{3}-\d{4})", line)
                    if match:
                        extracted_details["End Date"] = match.group(1)
                      #  print(f"Extracted End Date: {extracted_details['End Date']}")  # Debugging

                # 7. Chassis Number
                if "Vehicle Identification No." in line:
                    match = re.search(r"Vehicle Identification No\.\s+([A-Z0-9]+)", line)
                    if match:
                        extracted_details["Chassis Number"] = match.group(1)
                       # print(f"Extracted Chassis Number: {extracted_details['Chassis Number']}")  # Debugging

    return extracted_details




# Code 3: Process insurance PDFs with SBI-specific format
def process_insurance_pdf_sbi(pdf_path):
    extracted_details= {
        "Insurance Company": None,
        "Policy Number": None,
        "INSURED’S DECLARED VALUE (IDV)": None,
        "Insurance Premium": None,
        "Start Date": None,
        "End Date": None,
        "Chassis Number": None
    }


    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages):
            lines = page.extract_text().split("\n")  # Extract and split text into lines
            #print(f"--- Page {page_number + 1} ---")  # Debugging: Print page number
            
            for i, line in enumerate(lines):
                #print(f"Line {i}: {line.strip()}")  # Debugging: Print each line

                # 1. Insurance Company
                if "The Postal Address of your SBI General Branch that will service you in future is:" in line:
                    if i + 1 < len(lines):  # Check if next line exists
                        extracted_details["Insurance Company"] = lines[i + 1].strip()
                        #print(f"Extracted Insurance Company: {extracted_details['Insurance Company']}")  # Debugging

                # 2. Policy Number
                if "Your Policy No. :" in line:
                    match = re.search(r"Your Policy No\. :(\d+)", line)
                    if match:
                        extracted_details["Policy Number"] = match.group(1)
                        #print(f"Extracted Policy Number: {extracted_details['Policy Number']}")  # Debugging

                # 3. INSURED’S DECLARED VALUE (IDV)
                if "INSURED’S DECLARED VALUE (IDV)" in line:
                    if i + 4 < len(lines):  # Check if 4th line after exists
                        idv_line = lines[i + 4].strip()
                        match = re.search(r"(\d{1,3}(,\d{3})*(\.\d{2})?)", idv_line)
                        if match:
                            extracted_details["INSURED’S DECLARED VALUE (IDV)"] = match.group(1).split(".")[0]
                        #    print(f"Extracted IDV: {extracted_details['INSURED’S DECLARED VALUE (IDV)']}")  # Debugging

                # 4. Insurance Premium
                if "Total Premium Collected" in line:
                    match = re.search(r"Total Premium Collected\s+([\d,]+\.\d{2})", line)
                    if match:
                        extracted_details["Insurance Premium"] = match.group(1)
                      #  print(f"Extracted Insurance Premium: {extracted_details['Insurance Premium']}")  # Debugging

                # 5. Start Date
                if "Period of Insurance : From :" in line:
                    match = re.search(r"From\s*:(\d{2}/\d{2}/\d{4})", line)
                    if match:
                        extracted_details["Start Date"] = match.group(1)
                      #  print(f"Extracted Start Date: {extracted_details['Start Date']}")  # Debugging

                # 6. End Date
                if "Period of Insurance : From :" in line:
                    match = re.search(r"Midnight of:\s*(\d{2}/\d{2}/\d{4})", line)
                    if match:
                        extracted_details["End Date"] = match.group(1)
                      #  print(f"Extracted End Date: {extracted_details['End Date']}")  # Debugging
                # 7.Chassis Number 
                if "Chassis Number" in line:
                    match = re.search(r"Chassis Number\s+([A-Z0-9]+)", line)
                    if match:
                        extracted_details["Chassis Number"] = match.group(1)
                       # print(f"Extracted Chassis Number: {extracted_details['Chassis Number']}")  # Debugging


    return extracted_details

# Main function to process the PDF
import os
import pdfplumber

# Main function to process the PDF
def process_pdf_based_on_type(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        first_page_text = pdf.pages[0].extract_text()

    if "TAX / VEHICLE & CHARGES INVOICE" in first_page_text:
        print("Processing Invoice PDF...")
        return process_invoice_pdf(pdf_path)
    elif "SHREYASH AUTOMOTIVES PRIVATE LIMIT" in first_page_text:
        print("Processing Insurance PDF (Shreyash)...")
        return process_insurance_pdf_shreyash(pdf_path)
    elif "The Postal Address of your SBI General Branch" in first_page_text:
        print("Processing Insurance PDF (SBI)...")
        return process_insurance_pdf_sbi(pdf_path)
    else:
        print("Unknown PDF format.")
        return None

# Function to process all PDFs in a folder
def process_pdfs_in_folder(folder_path):
    # List all files in the folder
    files = os.listdir(folder_path)

    # Filter only PDF files
    pdf_files = [f for f in files if f.lower().endswith('.pdf')]

    # Process each PDF
    for pdf_file in pdf_files:
        pdf_path = os.path.join(folder_path, pdf_file)
        try:
            extracted_data = process_pdf_based_on_type(pdf_path)
            if extracted_data:
                print("\nExtracted Data:")
                for key, value in extracted_data.items():
                    print(f"{key}: {value}")
                print("-" * 50)
            else:
                print(f"No data extracted from {pdf_path}.")
        except Exception as e:
            print(f"Error processing {pdf_path}: {e}")

# Main function to specify folder location and process PDFs
if __name__ == "__main__":
    folder_path = "C:\\Users\\welcome\\Documents\\Carrum\\"  # Replace with your folder path
    process_pdfs_in_folder(folder_path)


Unknown PDF format.
No data extracted from C:\Users\welcome\Documents\Carrum\MA3JMTB1SRFB18866 Invoice & Policy.pdf.
Unknown PDF format.
No data extracted from C:\Users\welcome\Documents\Carrum\MA3JMTB1SRFB18876 Invoice & Policy.pdf.
Unknown PDF format.
No data extracted from C:\Users\welcome\Documents\Carrum\MA3JMTB1SRFB18918 Invoice & Policy.pdf.
Processing Invoice PDF...

Extracted Data:
Chassis Number: MA3JMTB1SRFB19421
Color: Superior White-26U
Place of Supply: TELANGANA(36)
From: VARUN MOTORS PVT LTD
Invoice No.: 12/VSL/24002577
Invoice Date: 28/08/2024 04:34 PM
Owner: M/S. CARRUM MOBILITY SOLUTIONS PVT LTD
Ex-showroom Price: ₹6,41,500
Total Invoice Amount: 5,58,500.00
Discount: ₹83000.00
Variant Description: MARUTI TOUR H3 CNG 1L 5MT
Engine Number: K10CN C597750
--------------------------------------------------
Processing Insurance PDF (SBI)...

Extracted Data:
Insurance Company: SBI General Insurance Company Limited
Policy Number: 0000000040526381
INSURED’S DECLARED VALUE (IDV

# Go Digit General Insurance Ltd.

In [3]:
import re
import pdfplumber

def extract_pdf_details_for_digit(pdf_path):
    extracted_details = {
        "Insurance Company": "Go Digit General Insurance Ltd.",
        "Policy Number": None,
        "INSURED’S DECLARED VALUE (IDV)": None,
        "Insurance Premium": None,
        "Start Date": None,
        "End Date": None,
        "Chassis Number": None
    }

    # Open the PDF
    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages):
            lines = page.extract_text().split("\n")  # Extract and split text into lines
            #print(f"--- Page {page_number + 1} ---")  # Debugging: Print page number
            
            for i, line in enumerate(lines):
                #print(f"Line {i}: {line.strip()}")  # Debugging: Print each line

                # # 1. Insurance Company
                # if "For" in line and "Ltd" in line:
                #     match = re.search(r"For\s+([\w\s\.\-])", line, re.IGNORECASE)
                #     if match:
                #          extracted_details["From"] = match.group(1).strip()

                # 2. Policy Number
                if "Policy No" in line:
                    match = re.search(r"Policy No\s+(\w+)", line)
                    if match:
                        extracted_details["Policy Number"] = match.group(1)
                      #  print(f"Extracted Policy Number: {extracted_details['Policy Number']}")  # Debugging

                # 3. INSURED’S DECLARED VALUE (IDV)
                if "Total IDV ₹" in line:
                    match = re.search(r"Total IDV ₹\s*([\d,]+)", line)
                    if match:
                        extracted_details["INSURED’S DECLARED VALUE (IDV)"] = match.group(1).replace(",", "")
                      #  print(f"Extracted IDV: {extracted_details['INSURED’S DECLARED VALUE (IDV)']}")  # Debugging

                # 4. Insurance Premium
                if "Gross Premium Paid ₹" in line:
                    match = re.search(r"Gross Premium Paid ₹\s*([\d,]+)", line)
                    if match:
                        extracted_details["Insurance Premium"] = match.group(1).replace(",", "")
                      #  print(f"Extracted Insurance Premium: {extracted_details['Insurance Premium']}")  # Debugging

                # 5. Start Date
                if "Period of Insurance" in line:
                    match = re.search(r"Period of Insurance\s+(\d{2}-[A-Z]{3}-\d{4})", line)
                    if match:
                        extracted_details["Start Date"] = match.group(1)
                      #  print(f"Extracted Start Date: {extracted_details['Start Date']}")  # Debugging

                # 6. End Date
                if "to" in line and "Period of Insurance" in line:
                    match = re.search(r"to\s+(\d{2}-[A-Z]{3}-\d{4})", line)
                    if match:
                        extracted_details["End Date"] = match.group(1)
                      #  print(f"Extracted End Date: {extracted_details['End Date']}")  # Debugging

                # 7. Chassis Number
                if "Vehicle Identification No." in line:
                    match = re.search(r"Vehicle Identification No\.\s+([A-Z0-9]+)", line)
                    if match:
                        extracted_details["Chassis Number"] = match.group(1)
                        #print(f"Extracted Chassis Number: {extracted_details['Chassis Number']}")  # Debugging

    return extracted_details
 

# Path to PDF file
pdf_path = "C:\\Users\\welcome\\Documents\\Carrum\\MA3JMTB1SRGB33556 Policy.pdf"

# Extract details
details = extract_pdf_details_for_digit(pdf_path)
print("\nFinal Extracted Details:")

for key, value in details.items():
    print(f"{key}: {value}")



Final Extracted Details:
Insurance Company: Go Digit General Insurance Ltd.
Policy Number: D158999607
INSURED’S DECLARED VALUE (IDV): 520600
Insurance Premium: 20950
Start Date: 31-JUL-2024
End Date: 30-JUL-2025
Chassis Number: MA3JMTB1SRGB33556


# SBI General Insurance Company Limited

In [123]:
import re
import pdfplumber

def extract_pdf_details_with_pdfplumber(pdf_path):
    extracted_details = {
        "Insurance Company": None,
        "Policy Number": None,
        "INSURED’S DECLARED VALUE (IDV)": None,
        "Insurance Premium": None,
        "Start Date": None,
        "End Date": None,
        "Chassis Number" : None
    }

    # Open the PDF
    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages):
            lines = page.extract_text().split("\n")  # Extract and split text into lines
            #print(f"--- Page {page_number + 1} ---")  # Debugging: Print page number
            
            for i, line in enumerate(lines):
                #print(f"Line {i}: {line.strip()}")  # Debugging: Print each line

                # 1. Insurance Company
                if "The Postal Address of your SBI General Branch that will service you in future is:" in line:
                    if i + 1 < len(lines):  # Check if next line exists
                        extracted_details["Insurance Company"] = lines[i + 1].strip()
                        #print(f"Extracted Insurance Company: {extracted_details['Insurance Company']}")  # Debugging

                # 2. Policy Number
                if "Your Policy No. :" in line:
                    match = re.search(r"Your Policy No\. :(\d+)", line)
                    if match:
                        extracted_details["Policy Number"] = match.group(1)
                        #print(f"Extracted Policy Number: {extracted_details['Policy Number']}")  # Debugging

                # 3. INSURED’S DECLARED VALUE (IDV)
                if "INSURED’S DECLARED VALUE (IDV)" in line:
                    if i + 4 < len(lines):  # Check if 4th line after exists
                        idv_line = lines[i + 4].strip()
                        match = re.search(r"(\d{1,3}(,\d{3})*(\.\d{2})?)", idv_line)
                        if match:
                            extracted_details["INSURED’S DECLARED VALUE (IDV)"] = match.group(1).split(".")[0]
                        #    print(f"Extracted IDV: {extracted_details['INSURED’S DECLARED VALUE (IDV)']}")  # Debugging

                # 4. Insurance Premium
                if "Total Premium Collected" in line:
                    match = re.search(r"Total Premium Collected\s+([\d,]+\.\d{2})", line)
                    if match:
                        extracted_details["Insurance Premium"] = match.group(1)
                      #  print(f"Extracted Insurance Premium: {extracted_details['Insurance Premium']}")  # Debugging

                # 5. Start Date
                if "Period of Insurance : From :" in line:
                    match = re.search(r"From\s*:(\d{2}/\d{2}/\d{4})", line)
                    if match:
                        extracted_details["Start Date"] = match.group(1)
                      #  print(f"Extracted Start Date: {extracted_details['Start Date']}")  # Debugging

                # 6. End Date
                if "Period of Insurance : From :" in line:
                    match = re.search(r"Midnight of:\s*(\d{2}/\d{2}/\d{4})", line)
                    if match:
                        extracted_details["End Date"] = match.group(1)
                      #  print(f"Extracted End Date: {extracted_details['End Date']}")  # Debugging
                # 7.Chassis Number 
                if "Chassis Number" in line:
                    match = re.search(r"Chassis Number\s+([A-Z0-9]+)", line)
                    if match:
                        extracted_details["Chassis Number"] = match.group(1)
                       # print(f"Extracted Chassis Number: {extracted_details['Chassis Number']}")  # Debugging


    return extracted_details

# Path to PDF file
pdf_path = "C:\\Users\\welcome\\Documents\\Carrum\\MA3JMTB1SRGB30324 Policy.pdf"

# Extract details
details = extract_pdf_details_with_pdfplumber(pdf_path)
print("\nFinal Extracted Details:")
for key, value in details.items():
    print(f"{key}: {value}")



Final Extracted Details:
Insurance Company: SBI General Insurance Company Limited
Policy Number: 0000000040527221
INSURED’S DECLARED VALUE (IDV): 530,575
Insurance Premium: 22,194.00
Start Date: 28/08/2024
End Date: 27/08/2025
Chassis Number: MA3JMTB1SRGB30324


# INVOICE

In [96]:
import re
import pdfplumber

def process_pdf_line_by_line(pdf_path):
    data = {
        "Chassis Number": "Not found",
        "Color": "Not found",
        "Place of Supply": "Not found",
        "From": "Not found",
        "Invoice No.": "Not found",
        "Invoice Date": "Not found",
        "Owner": "Not found",
        "Ex-showroom Price": "₹6,41,500",  # Fixed value
        "Total Invoice Amount": "Not found",
        "Discount": "Not found",
        "Variant Description": "Not found",
        "Engine Number": "Not found",
    }

    # Open the PDF
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            lines = page.extract_text().split("\n")  # Extract and split text into lines

            for line in lines:
                # Debugging: Uncomment to print each line
                
                #print(f"Line: {line}")

                # 1. Chassis Number
                if "Vehicle ID" in line:
                    match = re.search(r"Vehicle ID\s*:\s*([A-Z0-9]+)", line)
                    if match:
                        data["Chassis Number"] = match.group(1)

                # 2. Engine Number
                if data["Chassis Number"] != "Not found" and "Superior" in line:
                    chassis_match = re.search(data["Chassis Number"], line)
                    if chassis_match:
                        engine_match = re.search(rf"{data['Chassis Number']}\s+([A-Z0-9\s]+)\s+Superior", line)
                        if engine_match:
                            data["Engine Number"] = engine_match.group(1).strip()

                # 3. Color
                if "Superior" in line:
                    match = re.search(r"(Superior\s\w+-\d+U)", line, re.IGNORECASE)
                    if match:
                        data["Color"] = match.group(1)

                # 4. Place of Supply
                if "Place of Supply" in line:
                    match = re.search(r"Place of Supply\s*:\s*([\w\s\(\)\d]+)", line)
                    if match:
                        data["Place of Supply"] = match.group(1)

                # 5. From (Dealer Information)
                if "For" in line and "PVT LTD" in line:
                    match = re.search(r"For\s+([\w\s\.\-]+PVT LTD)", line, re.IGNORECASE)
                    if match:
                        data["From"] = match.group(1).strip()

                # 6. Invoice No.
                if "Invoice No." in line:
                    # Look for "Invoice No." and capture it
                    match = re.search(r"Invoice No\.\s*[:\-]?\s*([\w\/\-]+)", line)
                    if match:
                        data["Invoice No."] = match.group(1)

                # 7. Invoice Date
                if "Invoice Date" in line:
                    match = re.search(r"Invoice Date\s*:\s*([\d/]+\s+\d{2}:\d{2}\s+[APM]{2})", line)
                    if match:
                        data["Invoice Date"] = match.group(1)

                # 8. Owner
                if "Sold To" in line:
                    # This will match the "Sold To" line and capture everything after "Sold To :"
                    match = re.search(r"Sold To\s*:\s*(.+)", line, re.IGNORECASE)
                    if match:
                        data["Owner"] = match.group(1).strip()
    

                # 9. Total Invoice Amount
                if "Total Invoice Amount" in line:
                    match = re.search(r"Total Invoice Amount\s*:\s*([\d,]+\.\d{2})", line)
                    if match:
                        data["Total Invoice Amount"] = match.group(1)

                # 10. Variant Description
                if "PRICE OF ONE" in line:
                    match = re.search(r"PRICE OF ONE\s+([\w\s]+)-", line)
                    if match:
                        data["Variant Description"] = match.group(1).strip()

    # Calculate Discount
    if data["Total Invoice Amount"] != "Not found":
        ex_showroom_price = 641500.00
        total_invoice = float(data["Total Invoice Amount"].replace(",", ""))
        data["Discount"] = f"₹{ex_showroom_price - total_invoice:.2f}"

    return data


# Path to PDF file
pdf_path = "C:\\Users\\welcome\\Documents\\Carrum\\MA3JMTB1SRGB36609 Invoice.pdf"

# Extract and process the data
extracted_data = process_pdf_line_by_line(pdf_path)

# Print results
for key, value in extracted_data.items():
    print(f"{key}: {value}")


Chassis Number: MA3JMTB1SRGB36609
Color: Superior White-26U
Place of Supply: TELANGANA(36) Order No
From: VARUN MOTORS PVT LTD
Invoice No.: 12/VSL/24002534
Invoice Date: 27/08/2024 10:47 PM
Owner: M/S. CARRUM MOBILITY SOLUTIONS PVT LTD
Ex-showroom Price: ₹6,41,500
Total Invoice Amount: 5,58,500.00
Discount: ₹83000.00
Variant Description: MARUTI TOUR H3 CNG 1L 5MT
Engine Number: K10CN C624099


In [10]:
"C:\\Users\\welcome\\Documents\\Carrum\\MA3JMTB1SRFB19421 Invoice.pdf"

'C:\\Users\\welcome\\Documents\\Carrum\\MA3JMTB1SRFB19421 Invoice.pdf'

# for Print text Available in Pdf

In [40]:
import re
import pdfplumber

# Path to PDF file
pdf_path = "C:\\Users\\welcome\\Documents\\Carrum\\MA3JMTB1SRFB19421 Invoice.pdf"

# Open the PDF
with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        lines = page.extract_text().split("\n")  # Extract and split text into lines

        for line in lines:
            # Debugging: Uncomment to print each line
            print(f"Line: {line}")


Line: ORIGINAL FOR RECIPIENT/DUPLICATE FOR TRANSPORTER/TRIPLICATE FOR SUPPLIER
Line: TAX / VEHICLE & CHARGES INVOICE
Line: Sold To : M/S. CARRUM MOBILITY SOLUTIONS PVT LTD
Line: Guardian of : REP BY: SADAMUDDIN CHOUHAN
Line: Address : FNO: 504, ANURAG TOWERS, 100 FEET RD,
Line: AYYAPPA SOCIETY,SIDDI VINAYAK NAGAR
Line: MADHAPUR, HYD HYDERABAD
Line: Pin:500081,(M):6304982820 TELANGANA (36)
Line: Customer ID : 2457565262 PAN No :AALCC8489R
Line: Financed By : STATE BANK OF INDIA, MID CORPORAT BRANCH,
Line: GURUGRAM
Line: Invoice No. : 12/VSL/24002577
Line: Customer Aadhar No. :
Line: Invoice Date : 28/08/2024 04:34 PM
Line: Place of Supply : TELANGANA(36) Order No. : SOB24002691
Line: Vehicle ID : MA3JMTB1SRFB19421
Line: Order Date : 30/07/2024
Line: Customer Mobile No. : 6304982820
Line: Key No. : 2906
Line: Booking Dealer :
Line: Delivery Dealer : 4603
Line: Customer GST No. : 36AALCC8489R1ZE
Line: Dealer GST No. : 36AABCV2471Q1ZT
Line: Dealer PAN No. : AABCV2471Q
Line: IRN : 00cb4b642

# Extract data in all Images available in folder

In [14]:
import pytesseract
from PIL import Image
import re
import os

# Specify the path to the Tesseract executable
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def extract_text_from_image(image_path):
    """
    Extract text from an image using Tesseract OCR.
    
    Args:
        image_path (str): Path to the image file.
        
    Returns:
        str: Extracted text from the image.
    """
    try:
        # Open the image using PIL
        image = Image.open(image_path)
        
        # Use Tesseract to extract text
        text = pytesseract.image_to_string(image)
        return text
    except Exception as e:
        return f"Error: {e}"

def extract_field_values(text):
    """
    Extract specific field values from the OCR text using patterns.
    
    Args:
        text (str): Extracted text from the image.
        
    Returns:
        dict: A dictionary containing extracted field values.
    """
    try:
        # Patterns for extracting fields
        patterns = {
            "Chassis Number": r"Enter Search Element:\*\s*\|\s*([A-Z0-9]+)",
            "Registration Number": r"Registration No:?[\|]?\s*([A-Z0-9]+)\s*Fuel Type:",
            "Mfg. Year": r"Mfg\. Year:\s*\|?\s*([0-9/]+)",
        }
        
        # Extract values using the patterns
        extracted_values = {}
        for field, pattern in patterns.items():
            match = re.search(pattern, text)
            if match:
                extracted_values[field] = match.group(1)
            else:
                extracted_values[field] = "Not Found"
        
        # Extract 'Date Of Registration' field separately using custom logic
        extracted_values["Registration Date"] = extract_last_date(text)
        
        return extracted_values
    except Exception as e:
        return {"Error": str(e)}

def extract_last_date(text):
    """
    Extract the last date format from a line containing 'Date Of Registration'.
    
    Args:
        text (str): Input text containing the "Date Of Registration".
        
    Returns:
        str: Extracted registration date or 'Not Found'.
    """
    # Split the text into lines
    lines = text.splitlines()
    for line in lines:
        if "Date Of Registration" in line:
            print(f"Debug Line: {line}")  # Debugging: Print the line
            # Regex to capture the last date-like pattern in the line
            pattern = r"(\d{1,2}/\d{1,2}/\d{4})$"
            match = re.search(pattern, line)
            if match:
                return match.group(1).strip()
    return "Not Found"


def process_folder(folder_path):
    """
    Process all images in a given folder and extract field values.
    
    Args:
        folder_path (str): Path to the folder containing images.
    """
    # Supported image file extensions
    valid_extensions = {".jpg", ".jpeg", ".png", ".tiff", ".bmp", ".gif"}
    
    # Loop through all files in the folder
    for file_name in os.listdir(folder_path):
        # Construct the full path to the file
        file_path = os.path.join(folder_path, file_name)
        
        # Check if the file is an image based on the extension
        if os.path.isfile(file_path) and os.path.splitext(file_name.lower())[1] in valid_extensions:
            print(f"\nProcessing: {file_name}")
            
            # Extract text from the image
            extracted_text = extract_text_from_image(file_path)
            
            # Extract field values from the text
            field_values = extract_field_values(extracted_text)
            
            # Print the extracted values
            print("Extracted Field Values:\n", field_values)
            #print("Extracted Text:\n", extracted_text)
        else:
            continue
            # print(f"Skipping non-image file: {file_name}")

# Path to the folder containing images
folder_path = r"C:\Users\welcome\Documents\Carrum\Screenshot and pdf in screenshot"

# Process all images in the folder
process_folder(folder_path)



Processing: WhatsApp Image 2024-11-29 at 14.58.16_849056a5.jpg
Debug Line: Engine No:iK10CNC5XXXXX Date Of Registration:02/09/2024
Extracted Field Values:
 {'Chassis Number': 'MA3JMTB1SRFB18876', 'Registration Number': 'TGO7T5651', 'Mfg. Year': '06/06/2024', 'Registration Date': '02/09/2024'}

Processing: WhatsApp Image 2024-11-29 at 14.58.52_a0282b93.jpg
Debug Line: Engine No:IK10CNCSXXXXX Date Of Registration:02/09/2024
Extracted Field Values:
 {'Chassis Number': 'MA3JMTB1SRFB18866', 'Registration Number': 'TGO7T5655', 'Mfg. Year': '06/06/2024', 'Registration Date': '02/09/2024'}

Processing: WhatsApp Image 2024-11-29 at 15.00.06_4ec87ba9.jpg
Debug Line: Engine No:IK10CNC5XXXXX Date Of Registration:02/09/2024
Extracted Field Values:
 {'Chassis Number': 'MA3JMTB1SRFB18918', 'Registration Number': 'TGO7T5657', 'Mfg. Year': '06/06/2024', 'Registration Date': '02/09/2024'}

Processing: WhatsApp Image 2024-11-30 at 13.10.58_7068a1cc.jpg
Debug Line: Engine No:|K10CNC6XXXXX Date Of Registr

In [11]:
import re

def extract_registration_date(text):
    """
    Extract the last date format from a line containing 'Date Of Registration'.
    
    Args:
        text (str): Input text containing the "Date Of Registration".
        
    Returns:
        str: Extracted registration date or 'Not Found'.
    """
    # Ensure the line contains 'Date Of Registration'
    if "Date Of Registration" in text:
        # Regex to capture the last date-like pattern in the line
        pattern = r"Date Of Registration.*?([0-9]{1,2}/[0-9]{1,2}/[0-9]{4})$"
        
        # Search for the pattern
        match = re.search(pattern, text)
        if match:
            # Return the matched date
            return match.group(1).strip()
    
    # Return 'Not Found' if no date pattern is detected
    return "Not Found"

# Test cases
texts = [
    "Engine No:|K10CNC6XXXXX Date Of Registration:/3 1/08/2024",  # OCR noise
    "Engine No:IK10CNC5XXXXX Date Of Registration:02/09/2024",    # Valid date
    "Engine No:|K10CNC5XXXXX Date Of Registration:0/3 1/08/2024", # OCR noise before the valid date
    "Engine No:|K10CNC5XXXXX Date Of Registration:"               # Missing date
]

# Apply the function to each test case and print the results
for text in texts:
    print(f"Input: {text}")
    print(f"Extracted Date: {extract_registration_date(text)}\n")


Input: Engine No:|K10CNC6XXXXX Date Of Registration:/3 1/08/2024
Extracted Date: 1/08/2024

Input: Engine No:IK10CNC5XXXXX Date Of Registration:02/09/2024
Extracted Date: 02/09/2024

Input: Engine No:|K10CNC5XXXXX Date Of Registration:0/3 1/08/2024
Extracted Date: 1/08/2024

Input: Engine No:|K10CNC5XXXXX Date Of Registration:
Extracted Date: Not Found



# Extract Data in Images 

In [10]:
import pytesseract
from PIL import Image
import re

# Specify the path to the Tesseract executable
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def extract_text_from_image(image_path):
    """
    Extract text from an image using Tesseract OCR.
    
    Args:
        image_path (str): Path to the image file.
        
    Returns:
        str: Extracted text from the image.
    """
    try:
        # Open the image using PIL
        image = Image.open(image_path)
        
        # Use Tesseract to extract text
        text = pytesseract.image_to_string(image)
        return text
    except Exception as e:
        return f"Error: {e}"

def extract_field_values(text):
    """
    Extract specific field values from the OCR text using patterns.
    
    Args:
        text (str): Extracted text from the image.
        
    Returns:
        dict: A dictionary containing extracted field values.
    """
    try:
        # Patterns for extracting fields
        patterns = {
            "Chassis Number": r"Enter Search Element:\*\s*\|\s*([A-Z0-9]+)",
            "Registration Number": r"Registration No:\s*([A-Z0-9]+)\s*Fuel Type:",
            "Mfg. Year": r"Mfg\. Year:\|([0-9/]+)\s*",
            "Registration Date": r"Date Of Registration:([0-9/]+)"
        }
        
        # Extract values using the patterns
        extracted_values = {}
        for field, pattern in patterns.items():
            match = re.search(pattern, text)
            if match:
                extracted_values[field] = match.group(1)
            else:
                extracted_values[field] = "Not Found"
        
        return extracted_values
    except Exception as e:
        return {"Error": str(e)}

# Path to the uploaded image
image_path = r"C:\Users\welcome\Documents\Carrum\WhatsApp Image 2024-11-30 at 12.49.19_c8f14133.jpg"

# Extract text
extracted_text = extract_text_from_image(image_path)

# Extract field values from the text
field_values = extract_field_values(extracted_text)

#Print extracted values
print("Extracted Text:\n", extracted_text)
print("\nExtracted Field Values:\n", field_values)


Extracted Text:
 (>) M-wallet

Duorre gebds dare wD

OF REGISTRAION

Regn, Number : TG07T8371
_CARRUM MOBILITY
Regd. Owner “SOLU
_F NO 504 ANURAG TWRS
Address “100 FT RD, BEST
_MARUTI TOUR H3 CNG
Maker's Class “IL SMT BSVI-PH2
Vehicle Class : Motor Cab
Mth. Ye. of Mfg + 10/10/2024
Fucl Used : CNG PETROL

Chassis Number: MA3JMTB1SRKB85779

Engine Number K10CNC702804
Cubuie Capacity

Wheel Base

Seating Capacity +5

Unladen Weight

Color : SUPERIOR WHITE

0/11/2024
9/11/2026
8/11/2036

Date of Registration
Regn. Valid Upto
Tax

Hypothecated To

sult

Registering Authority

Signature of the Owner RTMRARGARERDS

INSURANCE

PTAILS

Vehicle Number BnGonEs374

SHARE THIS CARD DELETE THIS CARD



Extracted Field Values:
 {'Chassis Number': 'Not Found', 'Registration Number': 'Not Found', 'Mfg. Year': 'Not Found', 'Registration Date': 'Not Found'}


# Images In PDF

In [1]:
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io

# Specify the path to the Tesseract executable
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def extract_images_from_pdf(pdf_path):
    """
    Extract images from a PDF file and return a list of image objects.
    
    Args:
        pdf_path (str): Path to the PDF file.
        
    Returns:
        list: A list of PIL Image objects.
    """
    doc = fitz.open(pdf_path)
    images = []
    
    # Loop through each page
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        
        # Get the images on the page
        image_list = page.get_images(full=True)
        
        for img_index, img in enumerate(image_list):
            xref = img[0]
            
            # Extract the image in a format that can be used by PIL
            base_image = doc.extract_image(xref)
            image_data = base_image["image"]
            
            # Convert the image data to a PIL Image
            image = Image.open(io.BytesIO(image_data))
            images.append(image)
    
    return images

def extract_text_from_image(image):
    """
    Extract text from an image using Tesseract OCR.
    
    Args:
        image (PIL.Image): The image to process.
        
    Returns:
        str: The extracted text from the image.
    """
    return pytesseract.image_to_string(image)

def extract_text_from_pdf(pdf_path):
    """
    Extracts all text from the images within a PDF file.
    
    Args:
        pdf_path (str): Path to the PDF file.
        
    Returns:
        str: The extracted text from all images.
    """
    images = extract_images_from_pdf(pdf_path)
    extracted_text = ""
    
    # Process each image and extract text
    for image in images:
        extracted_text += extract_text_from_image(image)
    
    return extracted_text

# Example usage
pdf_path = r"C:\Users\welcome\Documents\Carrum\Screenshot and pdf in screenshot\MA3JMTB1SRFB18876 Invoice & Policy.pdf"  # Path to your PDF file
extracted_text = extract_text_from_pdf(pdf_path)

# Print extracted text from images
print(extracted_text)


GST No. 36AABCV2471Q1ZT VARUN MOTORS PVT. LTD .marutiid$ suzur

MARUTI AUTHORISED DEALERS

BEGUMPET : 1-10-177,
"VARUN TOWERS", HYDERABAD -16. | Road No.2, HYDERABAD - 34.

Ph : 4460 7676, Fax : 6632 7676
varun.hyd.sal1 @ marutidealers.com

BANJARA HILLS : 8-2-120/76/115,] KUKATPALLY : Opp. Chermas,

Hydernagar, HYDERABAD - 72

Ph: 4488 7676. Telefax : 2360 7676 | Ph : 4458 7676, 2389 7676 :
varun.hyd.sal2@ marutidealers.com | varun.hyd.sal3@ marutidealers.com | varun-hyd.sal4@marutidealers.com

VANASTHALIPURAM : Plot No. VI & Vil, A&B,

Ward No. 4, Block No. 10, Autonagar,
HYDERABAD - 70. Ph: 2402 7676

GACHIBOWLI: Sy. No. 115/P,
Beside Infotech, Gachibowli,
HYDERABAD. Ph: 4949 7676,

varun.hyd.sal5 @ marutidealers.com

VARUN
eH NAT POR R R .
TAX / VEHICLE & CHARGES INVOICE io] poh h eal oat i]
Sold To : M/S. ORIX LEASING AND FINANCIAL SERVICES INDIALTD bial IGOR DER
Guardian of bs 5 = Barts etd ane
a Oe ea oO 5
Address > 8-2-293/82/A/974 JUBILEE HILLS ROAD NO 49 Serene Orta
HYDERABAD

In [54]:
!pip install --upgrade pandas openpyxl


Collecting pandas
  Using cached pandas-2.2.3-cp311-cp311-win_amd64.whl.metadata (19 kB)
Using cached pandas-2.2.3-cp311-cp311-win_amd64.whl (11.6 MB)
Installing collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 2.1.4
    Uninstalling pandas-2.1.4:
      Successfully uninstalled pandas-2.1.4
  Rolling back uninstall of pandas
  Moving to c:\users\welcome\anaconda3\lib\site-packages\pandas-2.1.4.dist-info\
   from C:\Users\welcome\anaconda3\Lib\site-packages\~andas-2.1.4.dist-info
  Moving to c:\users\welcome\anaconda3\lib\site-packages\pandas\__init__.py
   from C:\Users\welcome\AppData\Local\Temp\pip-uninstall-2ge125o3\__init__.py
  Moving to c:\users\welcome\anaconda3\lib\site-packages\pandas\__pycache__\
   from C:\Users\welcome\anaconda3\Lib\site-packages\pandas\~_pycache__
  Moving to c:\users\welcome\anaconda3\lib\site-packages\pandas\_config\
   from C:\Users\welcome\anaconda3\Lib\site-packages\pandas\~config
  Moving to c:\users\w

ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\welcome\\anaconda3\\Lib\\site-packages\\pandas.libs\\msvcp140-0f2ea95580b32bcfc81c235d5751ce78.dll'
Consider using the `--user` option or check the permissions.

