In [1]:
!pip install PyPDF2 pytesseract pillow
!pip install easyocr






## Preprocessing

In [2]:
import io
import os
import re
import tempfile

import cv2
import easyocr
import numpy as np
import PyPDF2
from PIL import Image, ImageEnhance, ImageOps, ImageFilter

def preprocess_image(img_path):
    """
    Preprocess the image for better OCR results.
    Steps: Grayscale -> Resize -> Adaptive Threshold -> Sharpen -> Auto Contrast

    Args:
        img_path: Path of the image file to load.

    Returns:
        numpy array holding the processed single-channel image.

    Raises:
        FileNotFoundError: if ``img_path`` does not point to an existing file.
        ValueError: if the file exists but OpenCV could not decode it.
    """
    # cv2.imread does not raise on a bad path; it returns None, which would only
    # surface later as an opaque error inside cv2.resize. Fail early instead.
    if not os.path.isfile(img_path):
        raise FileNotFoundError(f"Image file not found: {img_path}")

    # Load with OpenCV
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        raise ValueError(f"Could not decode image file: {img_path}")

    # Resize for better OCR accuracy
    img = cv2.resize(img, None, fx=2, fy=2, interpolation=cv2.INTER_LINEAR)

    # Apply adaptive thresholding
    img = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                cv2.THRESH_BINARY, 31, 15)

    # Convert back to PIL Image for further enhancements
    img = Image.fromarray(img)
    img = ImageEnhance.Sharpness(img).enhance(2.0)  # Sharpen
    img = ImageOps.autocontrast(img)  # Auto Contrast

    return np.array(img)  # Return as numpy array for OCR

In [42]:
def extract_key_data(results):
    """
    Extract structured identity-document fields from OCR text lines.

    Two passes are made over ``results``. The first pass classifies the
    document (Aadhaar, PAN or passport) from the shape of its ID number,
    reassembles Aadhaar numbers that are split across lines, picks up the
    passport holder's name and collects every date it finds. The second pass
    looks for the holder's name (Aadhaar / PAN) and the father's name, using
    the document type and DOB line position found in the first pass.

    Args:
        results: Sequence of text lines (strings) in reading order.

    Returns:
        dict with the keys "Document Type" ('aadhaar', 'pan', 'passport' or
        None), "Aadhaar Number", "PAN Number", "Passport Number", "Dates"
        (list of date strings), "Name" and "Father's Name". Fields that were
        not found are empty strings (or an empty list for "Dates").
    """
    doc_type = None
    aadhaar_number = ""
    pan_number = ""
    passport_number = ""
    dates = []
    name = ""
    father_name = ""
    # Initialised up front: a "Given Names" line can be seen before (or without)
    # any surname line, and the name assembly below must not hit an unbound local.
    surname = ""

    dob_line_index = -1
    aadhaar_parts = []

    for i, text in enumerate(results):
        text = text.strip()

        # Full Aadhaar number in one line
        if re.match(r'^\d{4}\s\d{4}\s\d{4}$', text):
            doc_type = 'aadhaar'
            aadhaar_number = text.replace(' ', '')
            continue

        # Partial Aadhaar numbers: accumulate 4- or 8-digit groups until 12 digits
        if re.match(r'^\d{4}(\s\d{4})?$', text):
            aadhaar_parts.append(text.replace(' ', ''))
            joined_parts = ''.join(aadhaar_parts)
            if len(joined_parts) == 12:
                doc_type = 'aadhaar'
                aadhaar_number = joined_parts
                aadhaar_parts = []
            elif len(joined_parts) > 12:
                # Overshoot means unrelated digit groups were mixed in; start over.
                aadhaar_parts = []

        # Detect PAN number
        elif re.match(r'^[A-Z]{5}\d{4}[A-Z]$', text):
            doc_type = 'pan'
            pan_number = text

        # Passport number detection
        if re.match(r'^[A-Z]\d{7}$', text):
            doc_type = 'passport'
            passport_number = text

        # Passport name detection (only for lines after the passport number)
        elif doc_type == 'passport':
            # Look for surname: either a labelled line or one of a few
            # hard-coded surnames appearing anywhere in the line.
            if 'Surname' in text or any(word in text for word in ['RAI', 'SINGH', 'KUMAR']):
                surname = text.split('/')[-1].strip()

            # Look for given names: the value is expected on the next line
            elif 'Given Names' in text or 'GIVEN NAMES' in text:
                if i + 1 < len(results):
                    given_name = results[i + 1].strip()
                    if given_name and re.match(r'^[A-Z\s]+$', given_name):
                        if surname:
                            name = f"{given_name} {surname}"
                        else:
                            name = given_name

        # Date extraction for passport
        if doc_type == 'passport':
            date_match = re.search(r'\d{2}/\d{2}/\d{4}', text)
            if date_match:
                date = date_match.group(0)
                if date not in dates:
                    dates.append(date)
                    print(f"Identified Passport Date: {date}")

            # Look for specific passport date markers
            if 'Date of Birth' in text or 'DOB' in text:
                dob_line_index = i
                if i + 1 < len(results):
                    next_line = results[i + 1].strip()
                    date_match = re.search(r'\d{2}/\d{2}/\d{4}', next_line)
                    if date_match and date_match.group(0) not in dates:
                        dates.insert(0, date_match.group(0))  # Ensure DOB is first

            # Issue and expiry dates are read from the line after their label
            for marker in ('Date of Issue', 'Date of Expiry'):
                if marker in text and i + 1 < len(results):
                    next_line = results[i + 1].strip()
                    date_match = re.search(r'\d{2}/\d{2}/\d{4}', next_line)
                    if date_match and date_match.group(0) not in dates:
                        dates.append(date_match.group(0))

        # Generic date on the current line (any document type)
        date_match = re.search(r'\d{2}[/\-\.]\d{2}[/\-\.]\d{4}', text)
        if date_match:
            date = date_match.group(0)
            if date not in dates:
                dates.append(date)
                print(f"Identified Date from line: {date}")

        # Check for DOB markers and surrounding lines
        if any(marker in text.upper() for marker in ['DOB', 'जन्म', 'DATE OF BIRTH', 'BIRTH']):
            dob_line_index = i
            # Check current line again with more flexible pattern
            date_match = re.search(r'.*?(\d{2}[/\-\.]\d{2}[/\-\.]\d{4})', text)

            if date_match:
                date = date_match.group(1)
                if date not in dates:
                    dates.append(date)
                    print(f"Identified Date from DOB line: {date}")
            else:
                # Check surrounding lines (2 lines before and 2 lines after)
                for offset in [-2, -1, 1, 2]:
                    check_index = i + offset
                    if 0 <= check_index < len(results):
                        check_line = results[check_index].strip()
                        # More flexible date pattern to handle special characters
                        date_match = re.search(r'.*?(\d{2}[/\-\.]\d{2}[/\-\.]\d{4})', check_line)
                        if date_match:
                            date = date_match.group(1)
                            if date not in dates:
                                dates.append(date)
                                print(f"Identified Date from offset line {offset}: {date}")
                                break  # Stop after finding the first valid date

    # Second pass: name and father's name detection
    for i, text in enumerate(results):
        text = text.strip()

        # Skip lines that are too short or contain anything but letters, spaces, dots
        if len(text) < 2 or re.search(r'[^A-Za-z\s\.]', text):
            continue

        # Name detection
        if not name:
            if doc_type == 'aadhaar' and dob_line_index != -1:
                # Aadhaar name detection based on "Government" or "India"
                if 'government' in text.lower() or 'india' in text.lower():
                    # Check two indices below
                    if i + 2 < len(results):
                        potential_name = results[i + 2].strip()
                        if (re.match(r'^[A-Za-z\s\.]+$', potential_name) and
                            len(potential_name.split()) >= 2 and
                            not any(word in potential_name.lower() for word in ['government', 'india', 'department'])):
                            name = potential_name
                            print(f"Identified Name (Aadhaar): {name}")
                # Fallback: first plausible multi-word line above the DOB line
                elif (i < dob_line_index and
                      re.match(r'^[A-Za-z\s\.]+$', text) and
                      len(text.split()) >= 2 and
                      not any(word in text.lower() for word in ['government', 'india', 'department'])):
                    name = text
                    print(f"Identified Name (Aadhaar): {name}")

            elif doc_type == 'pan':
                # PAN name detection: the line following a "name" label
                prev_line = results[i-1].strip().lower() if i > 0 else ""
                if ('name' in prev_line and 'father' not in prev_line and
                    re.match(r'^[A-Za-z\s\.]+$', text) and
                    len(text.split()) >= 2):
                    name = text
                    print(f"Identified Name (PAN): {name}")

        # Father's name detection: the line following a "father" label
        if not father_name:
            prev_line = results[i-1].strip().lower() if i > 0 else ""
            if (('father' in prev_line or 'पिता' in prev_line) and
                re.match(r'^[A-Za-z\s\.]+$', text) and
                len(text.split()) >= 2):
                father_name = text
                print(f"Identified Father's Name: {father_name}")

    # Return extracted details
    return {
        "Document Type": doc_type,
        "Aadhaar Number": aadhaar_number,
        "PAN Number": pan_number,
        "Passport Number": passport_number,
        "Dates": dates,
        "Name": name,
        "Father's Name": father_name
    }

                
        
    # Keep existing fallback for Aadhaar name
    

    



In [7]:
def extract_text_from_image(image_path, languages=["en"], use_gpu=True):
    """
    Run EasyOCR over one image file and return raw and structured results.

    Args:
        image_path: Path of the image; it is handed to ``preprocess_image``.
        languages: Language codes passed to ``easyocr.Reader``.
        use_gpu: Forwarded to ``easyocr.Reader`` as its ``gpu`` argument.

    Returns:
        dict with "Structured Data" (the result of ``extract_key_data`` on the
        OCR output) and "Raw OCR" (the OCR output itself).
    """
    # The reader is built before preprocessing, then fed the cleaned-up image.
    ocr_engine = easyocr.Reader(languages, gpu=use_gpu)
    cleaned_image = preprocess_image(image_path)

    # detail=0 is requested so the same result can be fed to extract_key_data.
    text_lines = ocr_engine.readtext(cleaned_image, detail=0)

    return {
        "Structured Data": extract_key_data(text_lines),
        "Raw OCR": text_lines,
    }

In [49]:
def extract_text_from_pdf(pdf_path, languages=["en"], use_gpu=True):
    """
    Extracts text from a PDF, including embedded text and images.

    For every page the embedded text layer is read with PyPDF2, and every
    image XObject found in the page resources is written to a temporary PNG
    and run through ``extract_text_from_image``. All collected text is then
    split into lines and passed to ``extract_key_data``.

    Args:
        pdf_path: Path of the PDF file.
        languages: Language codes forwarded to the OCR step for embedded images.
        use_gpu: GPU flag forwarded to the OCR step for embedded images.

    Returns:
        dict with "Structured Data" (result of ``extract_key_data``) and
        "Raw Text" (all collected text, stripped).
    """
    text_chunks = []

    # The temporary directory keeps the scratch image out of the working
    # directory and guarantees it is removed, even if OCR raises.
    with open(pdf_path, 'rb') as file, tempfile.TemporaryDirectory() as tmp_dir:
        temp_image_path = os.path.join(tmp_dir, "temp_image.png")
        pdf_reader = PyPDF2.PdfReader(file)
        for page in pdf_reader.pages:
            # Extract text using PyPDF2
            page_text = page.extract_text()
            if page_text:
                text_chunks.append(page_text + "\n")

            # Extract images for OCR
            if '/XObject' in page.get('/Resources', {}):
                xObject = page['/Resources']['/XObject'].get_object()
                for obj in xObject:
                    if xObject[obj]['/Subtype'] == '/Image':
                        data = xObject[obj].get_data()
                        # NOTE(review): this assumes the stream bytes are a format
                        # Pillow can identify — confirm for the PDFs in use.
                        img = Image.open(io.BytesIO(data))
                        img.save(temp_image_path)  # Save for processing

                        # Run OCR on the image, honouring the caller's settings
                        ocr_results = extract_text_from_image(
                            temp_image_path, languages=languages, use_gpu=use_gpu
                        )
                        text_chunks.append("\n".join(ocr_results["Raw OCR"]) + "\n")

    text = "".join(text_chunks)

    # Extract structured data
    structured_data = extract_key_data(text.splitlines())

    return {
        "Structured Data": structured_data,
        "Raw Text": text.strip()
    }


import json

def process_file(file_path, use_gpu=True, output_json_path="extracted_data.json"):
    """
    Route a file to the PDF or image pipeline based on its extension.

    The structured part of the result is written to ``output_json_path`` as
    UTF-8 JSON, and both the structured data and the raw text are printed.

    Args:
        file_path: Path of the PDF or image to process.
        use_gpu: Forwarded to the extraction function.
        output_json_path: Destination of the JSON dump.

    Returns:
        The dict produced by the selected extraction function.

    Raises:
        ValueError: if the extension is neither ``.pdf`` nor a listed image type.
    """
    image_suffixes = (".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".webp")
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    is_pdf = suffix == ".pdf"
    if not is_pdf and suffix not in image_suffixes:
        raise ValueError("Unsupported file format. Please provide a PDF or an image file.")

    if is_pdf:
        print("Processing as PDF...")
        result = extract_text_from_pdf(file_path, use_gpu=use_gpu)
    else:
        print("Processing as Image...")
        result = extract_text_from_image(file_path, use_gpu=use_gpu)

    structured = result["Structured Data"]

    # Persist the structured fields first, then echo everything to the cell output.
    with open(output_json_path, 'w', encoding='utf-8') as json_file:
        json.dump(structured, json_file, indent=4, ensure_ascii=False)

    print(f"\nStructured data saved to: {output_json_path}")

    print("\nStructured Data:")
    print(json.dumps(structured, indent=4, ensure_ascii=False))
    print("\nRaw OCR Output:")
    # The image pipeline returns "Raw OCR" (a list); the PDF pipeline "Raw Text".
    if "Raw OCR" in result:
        raw_output = "\n".join(result["Raw OCR"])
    else:
        raw_output = result["Raw Text"]
    print(raw_output)

    return result

# Example run: process one file and report an unsupported format instead of
# letting the ValueError propagate as a traceback.
file_path = "pansam.jpg"  # Replace with your file path (PDF or Image)
try:
    output = process_file(file_path, use_gpu=True)
except ValueError as err:
    print(err)

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Processing as Image...
Identified Date from line: 12/08/1998

Structured data saved to: extracted_data.json

Structured Data:
{
    "Document Type": "pan",
    "Aadhaar Number": "",
    "PAN Number": "GAZPM3729R",
    "Passport Number": "",
    "Dates": [
        "12/08/1998"
    ],
    "Name": "",
    "Father's Name": ""
}

Raw OCR Output:
3T1a57
fatrT
TRT
TTR
INCOXE TAX DEPARTMEAT
GOVT OF INDIA
verrut Au H1M &6
Permanent Account Number Card
GAZPM3729R
TA/ Nama
MUSKAN KHAN
Fat #1 74
Fother $ Namo
AYUB KHAN
j9cs2i20
7 #1Tii
Dalo of Binth
12/08/1998
(ran / Signaturo
