In [42]:
!pip install pytesseract



In [43]:
!pip install faker



In [44]:
import os
import random
import string
from faker import Faker
from PIL import Image, ImageDraw, ImageFont
import cv2
import numpy as np
import pandas as pd

In [45]:
fake = Faker()
output_dir = "synthetic_dataset"
os.makedirs(output_dir, exist_ok=True)

In [46]:
def generate_pan():
    letters = string.ascii_uppercase
    return ''.join(random.choices(letters, k=5)) + \
           ''.join(random.choices(string.digits, k=4)) + \
           random.choice(letters)

In [47]:
def add_noise(img):
    img_np = np.array(img)
    noise = np.random.randint(0, 20, img_np.shape, dtype='uint8')
    img_np = cv2.add(img_np, noise)
    img_np = cv2.GaussianBlur(img_np, (3, 3), 0)
    return Image.fromarray(img_np)


In [48]:
def create_document(doc_id):
    name = fake.name()
    pan = generate_pan()
    salary = random.randint(20000, 150000)
    loan_amount = random.randint(50000, 1200000)

    img = Image.new("RGB", (800, 600), "white")
    draw = ImageDraw.Draw(img)

    try:
        font = ImageFont.truetype("arial.ttf", 28)
    except:
        font = ImageFont.load_default()

    draw.text((50, 50), f"Name: {name}", fill="black", font=font)
    draw.text((50, 100), f"PAN: {pan}", fill="black", font=font)
    draw.text((50, 150), f"Salary: {salary:,}", fill="black", font=font)
    draw.text((50, 200), f"Loan Amount: {loan_amount:,}", fill="black", font=font)

    img_noisy = add_noise(img)

    img_path = os.path.join(output_dir, f"doc_{doc_id}.png")
    img_noisy.save(img_path)

    with open(os.path.join(output_dir, f"doc_{doc_id}.txt"), "w") as f:
        f.write(f"Name: {name}\nPAN: {pan}\nSalary: {salary}\nLoanAmount: {loan_amount}")

In [49]:
for i in range(100):
    create_document(i)

print(f"Generated 100 synthetic documents in '{output_dir}'")


Generated 100 synthetic documents in 'synthetic_dataset'


In [50]:
import pytesseract
import regex as re
import os
import sys

In [51]:
def preprocess_image(image_path):
    image = cv2.imread(image_path)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    gray = cv2.resize(gray, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
    denoised = cv2.fastNlMeansDenoising(gray, h=20)
    thresh = cv2.adaptiveThreshold(
        denoised, 255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 15, 9
    )
    return thresh

In [52]:
def extract_text(image):
    custom_config = r'--oem 3 --psm 6'
    return pytesseract.image_to_string(image, config=custom_config)

In [58]:
def extract_fields(text):
    fields = {}

    clean_text = text.replace("\n", " ").replace("  ", " ")

    clean_text = clean_text.replace("PAR ", "PAN ").replace("PAH ", "PAN ")
    clean_text = clean_text.replace("Salan", "Salary").replace("Selary", "Salary")
    clean_text = clean_text.replace("Lonn", "Loan").replace("Loen", "Loan")

    name_match = re.search(r'Name[:\s]+([A-Z][a-z]+(?:\s[A-Z][a-z]+)*)', clean_text)
    if not name_match:
        name_match = re.search(r'^([A-Z][a-z]+(?:\s[A-Z][a-z]+)*)\s+PAN', clean_text)

    pan_match = re.search(r'\b[A-Z]{5}\d{4}[A-Z]\b', clean_text)

    salary_match = re.search(r'Salary[:\s]*([\d,]+)', clean_text, re.IGNORECASE)

    loan_amt_match = re.search(r'Loan Amount[:\s]*([\d,]+)', clean_text, re.IGNORECASE)

    if name_match:
        fields['Name'] = name_match.group(1)
    if pan_match:
        fields['PAN'] = pan_match.group(0)
    if salary_match:
        fields['Salary'] = float(salary_match.group(1).replace(',', ''))
    if loan_amt_match:
        fields['LoanAmount'] = float(loan_amt_match.group(1).replace(',', ''))

    return fields


In [59]:
import math

def check_eligibility(fields):
    if 'Salary' not in fields or 'LoanAmount' not in fields:
        return 'Manual Review'

    monthly_income = fields['Salary']
    loan_amt = fields['LoanAmount']

    annual_interest_rate = 0.10
    tenure_years = 5
    max_emi_ratio = 0.4

    monthly_rate = annual_interest_rate / 12
    months = tenure_years * 12

    emi = (loan_amt * monthly_rate * (1 + monthly_rate)**months) / \
          ((1 + monthly_rate)**months - 1)

    if emi <= monthly_income * max_emi_ratio:
        return 'Eligible'
    else:
        return 'Not Eligible'


In [60]:
results=[]
def process_document(image_path):
    print(f"\nProcessing: {image_path}")
    image = preprocess_image(image_path)
    text = extract_text(image)
    fields = extract_fields(text)
    status = check_eligibility(fields)
    return fields, status

In [61]:
if __name__ == "__main__":
    folder = "synthetic_dataset1"
    if not os.path.isdir(folder):
        print(f"Error: '{folder}' is not a valid folder.")
        sys.exit(1)

    for file in os.listdir(folder):
        if file.lower().endswith(('.jpg', '.jpeg', '.png')):
            extracted_fields, eligibility_status = process_document(os.path.join(folder, file))
            if extracted_fields:
                results.append(extracted_fields)


Processing: synthetic_dataset1/doc_13.png

Processing: synthetic_dataset1/doc_24.png

Processing: synthetic_dataset1/doc_96.png

Processing: synthetic_dataset1/doc_66.png

Processing: synthetic_dataset1/doc_33.png

Processing: synthetic_dataset1/doc_36.png

Processing: synthetic_dataset1/doc_29.png

Processing: synthetic_dataset1/doc_73.png

Processing: synthetic_dataset1/doc_27.png

Processing: synthetic_dataset1/doc_12.png

Processing: synthetic_dataset1/doc_19.png

Processing: synthetic_dataset1/doc_79.png

Processing: synthetic_dataset1/doc_62.png

Processing: synthetic_dataset1/doc_84.png

Processing: synthetic_dataset1/doc_22.png

Processing: synthetic_dataset1/doc_21.png

Processing: synthetic_dataset1/doc_31.png

Processing: synthetic_dataset1/doc_50.png

Processing: synthetic_dataset1/doc_75.png

Processing: synthetic_dataset1/doc_8.png

Processing: synthetic_dataset1/doc_18.png

Processing: synthetic_dataset1/doc_57.png

Processing: synthetic_dataset1/doc_9.png

Processing: 

In [62]:
df = pd.DataFrame(results)

In [63]:
df.to_excel("results.xlsx")