In [None]:
import os
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        One string containing the text of all pages, in page order.
    """
    pdf_document = fitz.open(pdf_path)
    try:
        # Collect the per-page text first and join once at the end instead of
        # growing a string with repeated "+=".
        page_texts = [
            pdf_document.load_page(page_num).get_text("text")
            for page_num in range(len(pdf_document))
        ]
    finally:
        # Release the document even when a page fails to load or parse.
        pdf_document.close()

    return "".join(page_texts)

def save_text_to_file(text, file_path):
    """Write *text* to *file_path* as UTF-8, replacing any existing file."""
    with open(file_path, "w", encoding="utf-8") as sink:
        sink.write(text)

def process_multiple_pdfs(pdf_folder, output_folder):
    """Extract the text of every PDF in *pdf_folder* into ``.txt`` files.

    For each PDF, the text is extracted with ``extract_text_from_pdf`` and
    written with ``save_text_to_file`` to ``<output_folder>/<stem>.txt``.
    A progress line is printed per file.

    Args:
        pdf_folder: Directory that is scanned (non-recursively) for PDFs.
        output_folder: Directory receiving the ``.txt`` files; created if
            it does not exist yet.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so the processing order does not depend on the file system.
    for pdf_filename in sorted(os.listdir(pdf_folder)):
        # Compare case-insensitively so "INVOICE.PDF" is not silently skipped.
        if not pdf_filename.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(pdf_folder, pdf_filename)
        extracted_text = extract_text_from_pdf(pdf_path)

        stem = os.path.splitext(pdf_filename)[0]
        output_txt_path = os.path.join(output_folder, f"{stem}.txt")
        save_text_to_file(extracted_text, output_txt_path)

        print(f"Text successfully extracted from {pdf_filename} and saved to {output_txt_path}")

# Usage example: these statements run as soon as this cell/module executes.
# NOTE(review): both folders are hard-coded absolute paths on one machine;
# adjust them before running elsewhere.
pdf_folder = "/Users/rishit/Desktop/Zolvit/Jan to Mar"
output_folder = "/Users/rishit/Desktop/Zolvit/Extracted_Texts"

# Process all PDFs in the folder, writing one .txt file per PDF into output_folder
process_multiple_pdfs(pdf_folder, output_folder)

In [None]:
import re

# Function to extract relevant details from the text
import re

# Function to extract relevant details from the text
def extract_relevant_details(text):
    """Pull the invoice header, item, tax, total and bank fields out of *text*.

    Every field is located with a regular expression. Fields that are not
    found fall back to a default ("" for most, "N/A" or "0.00" where noted
    below). The line items come from the module-level ``extract_items``.

    Args:
        text: Full text of one invoice.

    Returns:
        dict mapping field names (e.g. "GSTIN", "Invoice #", "Items",
        "Total") to the extracted values. "Items" is a list of dicts; "CGST"
        and "SGST" are lists of "rate%: ₹amount" strings, or "N/A" when no
        such line exists; everything else is a string.
    """
    details = {}

    def safe_extract(pattern, text, group=1, default=""):
        """Return the requested group of the first match, or *default*."""
        match = re.search(pattern, text)
        if match:
            # Several patterns use greedy classes containing \s, so the raw
            # capture can end in spaces/newlines; strip them so later parsing
            # (e.g. date validation) sees a clean value.
            return match.group(group).strip()
        return default

    # Seller identification
    details["GSTIN"] = safe_extract(r"GSTIN\s+([A-Z0-9]+)", text)
    details["Mobile"] = safe_extract(r"Mobile\s+(\+?\d[\d\s\-]+)", text)
    details["Email"] = safe_extract(
        r"Email\s+([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})", text
    )

    # Invoice header; the date patterns rely on the label that follows them
    details["Invoice #"] = safe_extract(r"Invoice #:\s+(\S+)", text)
    details["Invoice Date"] = safe_extract(r"Invoice Date:\s+([0-9A-Za-z\s]+)\s+Due Date:", text)
    details["Due Date"] = safe_extract(r"Due Date:\s+([0-9A-Za-z\s]+)\s+Customer Details:", text)

    # Customer block: first line after the label, cut at a newline or "Ph:"
    details["Customer Details"] = safe_extract(r"Customer Details:?\s*(.+?)(?:\n|Ph:)", text)

    # The phone number is optional on the invoice
    details["Customer Phone"] = safe_extract(r"Ph:\s+(\d+)", text, default="N/A")

    details["Place of Supply"] = safe_extract(r"Place of Supply:\s+([0-9A-Za-z\s-]+)", text)

    # Line items
    items = extract_items(text)
    details["Items"] = items

    # Tax and summary details
    details["Taxable Amount"] = safe_extract(r"Taxable Amount\n₹([0-9.,]+)", text)
    details["IGST"] = safe_extract(r"IGST\s*[0-9.]+%\n₹([0-9.,]+)", text, default="N/A")

    # An invoice can carry several CGST/SGST lines (one per rate)
    cgst_matches = re.findall(r"CGST\s*([0-9.]+)%\n₹([0-9.,]+)", text)
    sgst_matches = re.findall(r"SGST\s*([0-9.]+)%\n₹([0-9.,]+)", text)

    details["CGST"] = [f"{rate}%: ₹{amount}" for rate, amount in cgst_matches] if cgst_matches else "N/A"
    details["SGST"] = [f"{rate}%: ₹{amount}" for rate, amount in sgst_matches] if sgst_matches else "N/A"

    # Round-off may be negative; default to "0.00" when the line is absent
    details["Round Off"] = safe_extract(r"Round Off\n([+-]?[0-9.,]+)", text, default="0.00")

    # Totals
    details["Total"] = safe_extract(r"Total\n₹([0-9.,]+)", text)
    details["Total Discount"] = safe_extract(r"Total Discount\n₹([0-9.,]+)", text, default="N/A")
    # Capture "<items> / <qty>" as one group so the quantity is not dropped
    # (returning only group 1 of a two-group pattern lost it).
    details["Total Items / Qty"] = safe_extract(r"Total Items / Qty : ([0-9]+ / [0-9.]+)", text)

    # Bank details
    details["Bank"] = safe_extract(r"Bank:\s+(.+)", text, default="N/A")
    details["IFSC Code"] = safe_extract(r"IFSC Code:\s+([A-Za-z0-9]+)", text, default="N/A")
    details["Account #"] = safe_extract(r"Account #:\s+(\d+)", text, default="N/A")
    details["Branch"] = safe_extract(r"Branch:\s+(.+)", text, default="N/A")

    # Total amount in words
    details["Total Amount (in words)"] = safe_extract(
        r"Total amount \(in words\):\s+(.+)", text
    )

    return details

# Log missing fields for debugging purposes
def log_missing_fields(details, pdf_filename):
    """Print one line naming every field of *details* whose value is falsy.

    Nothing is printed when all fields hold a truthy value.
    """
    absent = []
    for field_name in details:
        if not details[field_name]:
            absent.append(field_name)

    if not absent:
        return

    print(f"Missing fields in {pdf_filename}: {', '.join(absent)}")

# Function to extract items from the invoice
def extract_items(text):
    """Parse the line items of an invoice from its one-value-per-line text.

    The text is first narrowed to the region between the item table header
    and the first summary keyword. Each item is then read as: an all-digit
    item number, one or more description lines, a rate (optionally followed
    by a line containing "(" that is appended to the rate), and finally
    quantity, taxable value, tax amount and amount.

    Returns:
        A list of dicts with the keys "Item #", "Item", "Rate / Item", "Qty",
        "Taxable Value", "Tax Amount" and "Amount" (all strings).
    """
    # Drop everything up to and including the table header, when present
    header = re.search(r"Item\s+Rate / Item\s+Qty\s+Taxable Value\s+Tax Amount\s+Amount", text)
    if header:
        text = text[header.end():]

    # Drop everything from the first summary keyword onwards
    footer = re.search(r"Taxable Amount|CGST|SGST|IGST|Round Off|Total", text)
    if footer:
        text = text[:footer.start()]

    lines = [stripped for stripped in (raw.strip() for raw in text.split("\n")) if stripped]
    line_count = len(lines)
    value_fields = ("Qty", "Taxable Value", "Tax Amount", "Amount")

    items = []
    pos = 0
    while pos < line_count:
        # Only an all-digit line can open a new item
        if not re.match(r"^\d+$", lines[pos]):
            pos += 1
            continue

        item = {"Item #": lines[pos]}
        pos += 1

        # The description runs until the first purely numeric line (the rate)
        description = []
        while pos < line_count and not re.match(r"^[0-9,.]+$", lines[pos]):
            description.append(lines[pos])
            pos += 1
        item["Item"] = " ".join(description).strip()

        # Need the rate plus four value lines; otherwise drop this item and
        # let the outer loop re-examine the current line
        if pos + 4 >= line_count:
            continue

        rate = lines[pos]
        first_value = pos + 1
        if "(" in lines[pos + 1]:
            # The rate continues on the next line, shifting the values by one
            first_value = pos + 2
            if first_value + 3 >= line_count:
                # Too few lines left for the shifted layout: skip the rate line
                pos += 1
                continue
            rate += " " + lines[pos + 1]

        item["Rate / Item"] = rate
        for offset, field in enumerate(value_fields):
            item[field] = lines[first_value + offset]

        pos = first_value + len(value_fields)
        items.append(item)

    return items


In [None]:
import os
import csv

# Your existing extraction functions here
# (The extract_relevant_details and extract_items functions from before)

def process_text_files_and_extract_data(text_folder, output_csv_file):
    """Extract invoice details from every ``.txt`` file in a folder into a CSV.

    Each text file is read as UTF-8, parsed with ``extract_relevant_details``,
    checked with ``log_missing_fields``, and the collected results are handed
    to ``save_results_to_csv``.

    Args:
        text_folder: Directory containing the ``.txt`` files.
        output_csv_file: Path of the CSV file to write.
    """
    collected = []

    txt_names = (name for name in os.listdir(text_folder) if name.endswith(".txt"))
    for name in txt_names:
        with open(os.path.join(text_folder, name), "r", encoding="utf-8") as handle:
            content = handle.read()

        parsed = extract_relevant_details(content)

        # Report fields that came back empty, for debugging
        log_missing_fields(parsed, name)

        collected.append({"filename": name, "extracted_details": parsed})

    # Written once, after all files have been processed
    save_results_to_csv(collected, output_csv_file)

def save_results_to_csv(output_results, output_csv_file):
    """Write the extraction results to *output_csv_file* as UTF-8 CSV.

    Args:
        output_results: Iterable of dicts with the keys "filename" and
            "extracted_details" (a dict of field name -> value).
        output_csv_file: Path of the CSV file to create or overwrite.

    The first column is "Filename"; the remaining columns follow the fixed
    field order below. Fields missing from a result are written as "".
    Non-string values (such as the item list) are written in the form the
    csv module gives them.
    """
    detail_columns = (
        "GSTIN", "Mobile", "Email", "Invoice #", "Invoice Date", "Due Date",
        "Customer Details", "Customer Phone", "Place of Supply", "Items", "Taxable Amount",
        "CGST", "SGST", "IGST", "Round Off", "Total", "Total Discount", "Total Items / Qty",
        "Bank", "IFSC Code", "Account #", "Branch", "Total Amount (in words)",
    )

    with open(output_csv_file, mode='w', newline='', encoding="utf-8") as sink:
        out = csv.writer(sink)

        # Header row
        out.writerow(["Filename", *detail_columns])

        # One row per processed file, in the same column order as the header
        for entry in output_results:
            fields = entry["extracted_details"]
            out.writerow(
                [entry["filename"], *(fields.get(column, "") for column in detail_columns)]
            )

# Example usage: these statements run as soon as this cell/module executes.
# NOTE(review): text_folder is a hard-coded absolute path; the CSV is written
# relative to the current working directory.
text_folder = "/Users/rishit/Desktop/Zolvit/Extracted_Texts"
output_csv_file = "extracted_invoice_data.csv"

# Process text files and extract data to CSV
process_text_files_and_extract_data(text_folder, output_csv_file)

print(f"Data extraction completed. Results saved to {output_csv_file}")

In [None]:
import re
import pandas as pd
from datetime import datetime

# Validate GSTIN format
def validate_gstin(gstin):
    """Return True when *gstin* matches the 15-character GSTIN layout.

    Non-string input (for example the NaN pandas produces for an empty CSV
    cell, or None) is reported as invalid instead of raising TypeError.
    Surrounding whitespace is ignored.
    """
    if not isinstance(gstin, str):
        return False
    gstin_pattern = r"^[0-9]{2}[A-Z]{5}[0-9]{4}[A-Z]{1}[1-9A-Z]{1}Z[0-9A-Z]{1}$"
    return re.match(gstin_pattern, gstin.strip()) is not None

# Validate email format
def validate_email(email):
    """Return True when *email* looks like ``local@domain.tld``.

    Non-string input (for example the NaN pandas produces for an empty CSV
    cell, or None) is reported as invalid instead of raising TypeError.
    Surrounding whitespace is ignored.
    """
    if not isinstance(email, str):
        return False
    email_pattern = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
    return re.match(email_pattern, email.strip()) is not None

# Validate dates
def validate_dates(invoice_date, due_date):
    """Return True when both dates parse and the invoice date is not after the due date.

    Both values must be strings in "%d %b %Y" form (e.g. "05 Jan 2024");
    surrounding whitespace is ignored. Anything else, including the NaN pandas
    produces for an empty cell, yields False rather than raising.
    """
    if not isinstance(invoice_date, str) or not isinstance(due_date, str):
        return False
    try:
        issued = datetime.strptime(invoice_date.strip(), "%d %b %Y")
        due = datetime.strptime(due_date.strip(), "%d %b %Y")
    except ValueError:
        return False
    return issued <= due

# Cross-field validation for item amounts and total
def cross_field_validation(row):
    """Check that the item amounts plus round-off add up to the invoice total.

    Args:
        row: Mapping (dict or pandas Series) with the keys "Items", "Total"
            and "Round Off". "Items" is either a list of item dicts or the
            string form of such a list, as found in the CSV.

    Returns:
        True when ``sum(item amounts) + round_off`` is within 0.02 of the
        total; False when it is not, or when any value is missing or cannot
        be parsed.
    """
    import ast  # local import: only this function needs it

    def to_number(value):
        """Convert "1,234.50"-style strings or plain numbers to float."""
        if isinstance(value, str):
            return float(value.replace(",", ""))
        return float(value)

    try:
        items = row["Items"]
        if isinstance(items, str):
            # The CSV stores the item list as its Python literal form. Parse it
            # with literal_eval so file content is never executed as code.
            items = ast.literal_eval(items)

        total = to_number(row["Total"])
        round_off = to_number(row["Round Off"]) if row["Round Off"] else 0.0
        total_item_amounts = sum(to_number(item["Amount"]) for item in items)

        # Allow a small tolerance for rounding differences
        return abs(total_item_amounts + round_off - total) <= 0.02
    except (ValueError, SyntaxError, TypeError, KeyError, IndexError, AttributeError):
        return False

# Validate total with round-off and tax values
def validate_total(taxable_amount, cgst, sgst, igst, round_off, total):
    """Check that taxable amount + taxes + round-off equals the invoice total.

    Args:
        taxable_amount, round_off, total: Number, or numeric string that may
            contain thousands separators.
        cgst, sgst, igst: One of
            - "N/A", "", None or NaN (counted as 0),
            - a number or numeric string,
            - a list of numbers or "rate%: ₹amount" strings,
            - the string form of such a list, as stored in the CSV.

    Returns:
        True when the computed total is within 0.02 of *total*; False when it
        is not, or when any value cannot be parsed.
    """
    try:
        calculated_total = (
            _parse_amount(taxable_amount)
            + _sum_tax_entries(cgst)
            + _sum_tax_entries(sgst)
            + _sum_tax_entries(igst)
            + _parse_amount(round_off)
        )
        return abs(calculated_total - _parse_amount(total)) <= 0.02
    except (ValueError, SyntaxError, TypeError):
        return False


def _parse_amount(value):
    """Convert a number or a "1,234.50"-style string to float."""
    if isinstance(value, str):
        return float(value.replace(",", ""))
    return float(value)


def _sum_tax_entries(value):
    """Return the total tax amount described by *value* (0.0 when absent)."""
    import ast  # local imports: only this helper needs them
    import math

    if value is None:
        return 0.0

    if isinstance(value, str):
        text = value.strip()
        if text in ("", "N/A"):
            # "N/A" marks a tax that does not apply to this invoice
            return 0.0
        # A leading "[" means the CSV string form of a list; parse it with
        # literal_eval so file content is never executed as code.
        value = ast.literal_eval(text) if text.startswith("[") else [text]

    if isinstance(value, (list, tuple)):
        total = 0.0
        for entry in value:
            if isinstance(entry, str):
                # Entries look like "9%: ₹90.00"; the amount follows the ₹
                entry = entry.split("₹")[-1].replace(",", "")
            total += float(entry)
        return total

    number = float(value)
    # pandas represents an empty cell as NaN; treat it as "no tax"
    return 0.0 if math.isnan(number) else number

# Trust determination based on validation checks
def determine_trust(row):
    """Summarise the validation outcome of one invoice row as a trust label.

    The GSTIN, email and date checks only add a message. A failed cross-field
    check additionally lowers the label from "High Trust" to "Low Trust".

    Returns:
        "High Trust" when every check passes, otherwise
        "<trust level> (<comma-separated error messages>)".
    """
    # Field-level checks, evaluated in this order
    field_checks = (
        (validate_gstin(row["GSTIN"]), "Invalid GSTIN"),
        (validate_email(row["Email"]), "Invalid Email"),
        (
            validate_dates(row["Invoice Date"], row["Due Date"]),
            "Invalid Date Range (Invoice Date after Due Date)",
        ),
    )
    problems = [message for passed, message in field_checks if not passed]

    # Only the total-consistency check affects the trust level itself
    consistent = cross_field_validation(row)
    label = "High Trust" if consistent else "Low Trust"
    if not consistent:
        problems.append("Cross-field Inconsistency")

    if not problems:
        return label
    return f"{label} ({', '.join(problems)})"

# Function to print validation summary for each field
def print_validation_summary(df):
    """Print, per field, how many rows have a value and how many look correct.

    Args:
        df: DataFrame with the columns "GSTIN", "Email", "Invoice Date",
            "Due Date", "Total" and "Items".

    A cell counts as holding a value only when it is not NaN/None, not blank
    and not an empty list (or its "[]" string form); bool(NaN) and bool("[]")
    are both True and would otherwise be counted as correct. The format
    validators are only called on cells that hold a value.
    """
    def has_value(value):
        """True when the cell holds real content."""
        if isinstance(value, str):
            return value.strip() not in ("", "[]")
        if isinstance(value, (list, tuple, dict)):
            return bool(value)
        return bool(pd.notna(value))

    total_records = len(df)
    fields = {
        "GSTIN": lambda row: has_value(row["GSTIN"]) and validate_gstin(row["GSTIN"]),
        "Email": lambda row: has_value(row["Email"]) and validate_email(row["Email"]),
        "Invoice Date": lambda row: has_value(row["Invoice Date"]),
        "Due Date": lambda row: (
            has_value(row["Invoice Date"])
            and has_value(row["Due Date"])
            and validate_dates(row["Invoice Date"], row["Due Date"])
        ),
        "Total": lambda row: has_value(row["Total"]),
        "Items": lambda row: has_value(row["Items"]),
    }

    for field, validator in fields.items():
        total_present = df[field].notna().sum()
        # Count row by row so the result is a plain integer even for an empty frame
        total_correct = sum(1 for _, row in df.iterrows() if validator(row))

        print(f"{field}:")
        print(f"  Total present: {total_present}/{total_records}")
        print(f"  Correctly extracted: {total_correct}/{total_present} ({(total_correct / total_present * 100) if total_present > 0 else 0:.2f}%)\n")

# Load the CSV data into a pandas DataFrame
# (same file name as the `output_csv_file` used by the extraction step)
df = pd.read_csv("extracted_invoice_data.csv")

# Apply the trust determination logic row by row; the label lands in a new
# "Trust Level" column
df["Trust Level"] = df.apply(determine_trust, axis=1)

# Print validation summary for analysis
print_validation_summary(df)

# Save the updated CSV with the trust level
df.to_csv("extracted_invoice_data_with_cross_field_validation.csv", index=False)

print("Data with cross-field validation saved to extracted_invoice_data_with_cross_field_validation.csv")
# Dump the complete table to stdout for inspection
print(df.to_string(index=False))

In [None]:
def extract_relevant_details_ocr(text):
    """Pull invoice fields out of OCR text, using looser patterns.

    Matching is case-insensitive and tolerates missing colons and variable
    whitespace after labels. The line items come from the module-level
    ``extract_items``.

    Args:
        text: OCR text of one invoice.

    Returns:
        dict mapping field names to extracted values. "Items" is a list of
        dicts; "CGST" and "SGST" are lists of "rate%: ₹amount" strings, or
        "N/A" when no such line exists; everything else is a stripped string
        ("" or the stated default when not found).
    """
    details = {}

    def safe_extract(pattern, text, group=1, default=""):
        """Return the stripped group of the first case-insensitive match, or *default*."""
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return match.group(group).strip()
        return default

    # Seller identification and invoice header
    details["GSTIN"] = safe_extract(r"GSTIN\s*([A-Z0-9]+)", text)
    details["Mobile"] = safe_extract(r"Mobile\s*(\+?\d[\d\s\-]+)", text)
    details["Email"] = safe_extract(
        r"Email\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})", text
    )
    details["Invoice #"] = safe_extract(r"Invoice #[:\s]+(\S+)", text)
    details["Invoice Date"] = safe_extract(r"Invoice Date[:\s]+([0-9A-Za-z\s]+)", text)
    details["Due Date"] = safe_extract(r"Due Date[:\s]+([0-9A-Za-z\s]+)", text)
    details["Customer Details"] = safe_extract(r"Customer Details[:\s]*([A-Za-z\s]+)", text)
    details["Place of Supply"] = safe_extract(r"Place of Supply[:\s]+([0-9A-Za-z\s-]+)", text)

    # Line items
    items = extract_items(text)
    details["Items"] = items

    # Tax and summary details
    details["Taxable Amount"] = safe_extract(r"Taxable Amount\s*([0-9,.\s]+)", text)
    details["IGST"] = safe_extract(r"IGST\s*[0-9.]+%\s*([0-9,.\s]+)", text, default="N/A")

    # An invoice can carry several CGST/SGST lines (one per rate)
    cgst_matches = re.findall(r"CGST\s*([0-9.]+)%\s*([0-9,.\s]+)", text)
    sgst_matches = re.findall(r"SGST\s*([0-9.]+)%\s*([0-9,.\s]+)", text)

    details["CGST"] = [f"{rate}%: ₹{amount}" for rate, amount in cgst_matches] if cgst_matches else "N/A"
    details["SGST"] = [f"{rate}%: ₹{amount}" for rate, amount in sgst_matches] if sgst_matches else "N/A"

    # Round-off may be negative; default to "0.00" when the line is absent
    details["Round Off"] = safe_extract(r"Round Off\s*([+-]?[0-9,.\s]+)", text, default="0.00")

    # Require the captured value to start with a digit. A capture class that
    # also accepts whitespace lets "Total " in "Total Items" or
    # "Total Discount" match with an empty value.
    details["Total"] = safe_extract(r"Total\s*([0-9][0-9,.\s]*)", text)
    details["Total Discount"] = safe_extract(r"Total Discount\s*([0-9,.\s]+)", text, default="N/A")
    # The parentheses are literal text on the invoice and must be escaped;
    # unescaped they form a capture group that returns "in words".
    details["Total Amount (in words)"] = safe_extract(r"Total amount \(in words\)[:\s]+(.+?)(?:\s+Amount Paid|$)", text)

    # Capture "<items> / <qty>" as one group so the quantity is not dropped
    details["Total Items / Qty"] = safe_extract(r"Total Items / Qty\s*:\s*([0-9]+\s*/\s*[0-9.]+)", text)

    # Bank details
    details["Bank"] = safe_extract(r"Bank[:\s]+(.+?)(?:IFSC|Account #|Branch|$)", text)
    details["IFSC Code"] = safe_extract(r"IFSC Code[:\s]+([A-Z0-9]+)", text)
    details["Account #"] = safe_extract(r"Account #[:\s]+(\d+)", text)
    details["Branch"] = safe_extract(r"Branch[:\s]+(.+?)(?:Authorized|$)", text)

    return details


# Function to extract items from the invoice
def extract_items(text):
    """Find invoice line items in *text* with a single regular expression.

    Each match supplies, in order: item number, description, rate, quantity,
    taxable value, tax amount, tax rate (in parentheses) and line amount.

    Returns:
        A list of dicts with the keys "Item #", "Item", "Rate / Item", "Qty",
        "Taxable Value", "Tax Amount" (formatted "<rate>: ₹<amount>") and
        "Amount".
    """
    item_pattern = re.compile(
        r"(\d+)\s+([A-Za-z\s\-0-9]+?)\s+([0-9,]+(?:\.[0-9]{2})?)\s+([0-9A-Za-z\s]+)\s+([0-9,]+(?:\.[0-9]{2})?)\s+([0-9,]+(?:\.[0-9]{2})?)\s*\((\d+%?)\)\s+([0-9,]+(?:\.[0-9]{2})?)",
        re.IGNORECASE
    )

    parsed = []
    for found in item_pattern.finditer(text):
        # All eight groups are mandatory in the pattern, so none can be None
        number, name, rate, qty, taxable, tax_value, tax_rate, amount = (
            part.strip() for part in found.groups()
        )
        parsed.append({
            "Item #": number,
            "Item": name,
            "Rate / Item": rate,
            "Qty": qty,
            "Taxable Value": taxable,
            "Tax Amount": f"{tax_rate}: ₹{tax_value}",
            "Amount": amount,
        })

    return parsed

In [None]:
import pandas as pd
import os
from paddleocr import PaddleOCR
from pdf2image import convert_from_path
import re

# Initialize the OCR engine once at module level; extract_text_with_ocr
# below reads this shared `ocr` instance.
ocr = PaddleOCR(use_angle_cls=True, lang='en')

def extract_text_with_ocr(pdf_path):
    """Return the OCR text of every page of a PDF.

    Each page is rendered to an image with ``convert_from_path``, saved as a
    temporary PNG and passed to the module-level ``ocr`` engine. The
    recognised strings of one result group are joined with trailing spaces
    and followed by a newline.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        The recognised text of all pages as one string.
    """
    import tempfile  # local import: only this function needs it

    images = convert_from_path(pdf_path)
    chunks = []

    # Keep the page images in a private scratch directory. It is removed even
    # when OCR fails, and the fixed "page_N.png" names cannot collide with
    # files in the working directory.
    with tempfile.TemporaryDirectory() as scratch_dir:
        for page_num, image in enumerate(images):
            image_path = os.path.join(scratch_dir, f"page_{page_num}.png")
            image.save(image_path, "PNG")

            # Run OCR on the saved image
            ocr_result = ocr.ocr(image_path, cls=True)

            for line in ocr_result or []:
                if line is None:
                    # NOTE(review): PaddleOCR appears to return None for an
                    # image without detected text; skip it rather than crash.
                    continue
                # word_info[1][0] is the recognised string of one detection
                chunks.extend(word_info[1][0] + " " for word_info in line)
                chunks.append("\n")

    return "".join(chunks)

def process_low_trust_pdfs(csv_path, pdf_folder, output_csv_path):
    """Re-extract low-trust invoices with OCR and fill in their missing fields.

    Rows whose "Trust Level" contains "Low Trust" are looked up in
    *pdf_folder* (same base name as "Filename", with a ``.pdf`` extension).
    For each PDF found, the OCR text is parsed with
    ``extract_relevant_details_ocr`` and every field that is missing in the
    CSV row is replaced by a non-empty OCR value. The trust level is then
    recomputed with ``determine_trust``. The whole table is written to
    *output_csv_path*.

    Args:
        csv_path: CSV containing the "Filename" and "Trust Level" columns.
        pdf_folder: Directory holding the original PDFs.
        output_csv_path: Destination of the updated CSV.
    """
    def is_missing(value):
        """True for NaN/None, blank strings, "N/A" and empty containers."""
        if isinstance(value, str):
            return value.strip() in ("", "N/A")
        if isinstance(value, (list, tuple, dict)):
            return not value
        # pandas reads empty CSV cells as NaN, which is truthy and would
        # never be recognised by a plain "not value" test.
        return value is None or bool(pd.isna(value))

    df = pd.read_csv(csv_path)

    # Keep only the low-trust rows
    low_trust_rows = df[df["Trust Level"].str.contains("Low Trust", na=False)]

    for index, row in low_trust_rows.iterrows():
        pdf_filename = row["Filename"]
        # Swap only the extension; str.replace would also rewrite a ".txt"
        # occurring in the middle of the name.
        pdf_path = os.path.join(pdf_folder, os.path.splitext(pdf_filename)[0] + ".pdf")

        if not os.path.exists(pdf_path):
            print(f"PDF file {pdf_path} not found. Skipping...")
            continue

        print(f"Processing {pdf_filename} with OCR...")

        # Perform OCR and extract details
        ocr_text = extract_text_with_ocr(pdf_path)
        ocr_details = extract_relevant_details_ocr(ocr_text)

        # Fill only the fields that are missing, and only with real OCR values
        for key, ocr_value in ocr_details.items():
            if key not in df.columns:
                continue
            if not is_missing(row[key]) or is_missing(ocr_value):
                continue
            if isinstance(ocr_value, (list, dict)):
                # Store containers in the string form the CSV already uses
                ocr_value = str(ocr_value)
            if df[key].dtype != object:
                # Let a numeric column accept the extracted text value
                df[key] = df[key].astype(object)
            df.at[index, key] = ocr_value

        # Recalculate the trust level after OCR extraction
        df.at[index, "Trust Level"] = determine_trust(df.loc[index])

    # Save the updated DataFrame back to a CSV file
    df.to_csv(output_csv_path, index=False)
    print(f"Updated data with OCR corrections saved to {output_csv_path}")

# Define paths; these statements run as soon as this cell/module executes.
# NOTE(review): pdf_folder is a placeholder and must be replaced with the
# directory that holds the original PDFs before running.
csv_path = "extracted_invoice_data_with_cross_field_validation.csv"
pdf_folder = "/path/to/your/pdf/folder"
output_csv_path = "extracted_invoice_data_with_ocr.csv"

# Process low-trust PDFs using OCR
process_low_trust_pdfs(csv_path, pdf_folder, output_csv_path)