In [1]:
!pip install pytesseract

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [2]:
!pip install faker

Collecting faker
  Downloading faker-37.5.3-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.5.3-py3-none-any.whl (1.9 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.9 MB[0m [31m5.0 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━[0m [32m1.2/1.9 MB[0m [31m18.2 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.9/1.9 MB[0m [31m23.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.5.3


In [3]:
import os
import torch
from torchvision.utils import save_image
from torchvision import transforms
from PIL import Image
import requests
from io import BytesIO

In [4]:
os.makedirs("synthetic_dataset_gan", exist_ok=True)

In [6]:
model = torch.hub.load('facebookresearch/pytorch_GAN_zoo:hub', 'DCGAN', pretrained=True)

Using cache found in /root/.cache/torch/hub/facebookresearch_pytorch_GAN_zoo_hub


Average network found !


In [9]:
def generate_synthetic_docs(num_images=5, out_dir="synthetic_dataset_gan"):
    for i in range(num_images):
        z = torch.randn(1, 120, 1, 1)
        with torch.no_grad():
            fake_img = model.test(z)
        fake_img = (fake_img + 1) / 2
        save_path = os.path.join(out_dir, f"doc_gan_{i+1}.png")
        save_image(fake_img, save_path)
        print(f"Saved: {save_path}")

In [10]:
generate_synthetic_docs(num_images=10)

Saved: synthetic_dataset_gan/doc_gan_1.png
Saved: synthetic_dataset_gan/doc_gan_2.png
Saved: synthetic_dataset_gan/doc_gan_3.png
Saved: synthetic_dataset_gan/doc_gan_4.png
Saved: synthetic_dataset_gan/doc_gan_5.png
Saved: synthetic_dataset_gan/doc_gan_6.png
Saved: synthetic_dataset_gan/doc_gan_7.png
Saved: synthetic_dataset_gan/doc_gan_8.png
Saved: synthetic_dataset_gan/doc_gan_9.png
Saved: synthetic_dataset_gan/doc_gan_10.png


In [17]:
import pytesseract
import regex as re
import os
import sys
import cv2

In [18]:
def preprocess_image(image_path):
    image = cv2.imread(image_path)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    gray = cv2.resize(gray, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
    denoised = cv2.fastNlMeansDenoising(gray, h=20)
    thresh = cv2.adaptiveThreshold(
        denoised, 255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 15, 9
    )
    return thresh

In [19]:
def extract_text(image):
    custom_config = r'--oem 3 --psm 6'
    return pytesseract.image_to_string(image, config=custom_config)

In [21]:
def extract_fields(text):
    fields = {}

    clean_text = text.replace("\n", " ").replace("  ", " ")

    clean_text = clean_text.replace("PAR ", "PAN ").replace("PAH ", "PAN ")
    clean_text = clean_text.replace("Salan", "Salary").replace("Selary", "Salary")
    clean_text = clean_text.replace("Lonn", "Loan").replace("Loen", "Loan")

    name_match = re.search(r'Name[:\s]+([A-Z][a-z]+(?:\s[A-Z][a-z]+)*)', clean_text)
    if not name_match:
        name_match = re.search(r'^([A-Z][a-z]+(?:\s[A-Z][a-z]+)*)\s+PAN', clean_text)

    pan_match = re.search(r'\b[A-Z]{5}\d{4}[A-Z]\b', clean_text)

    salary_match = re.search(r'Salary[:\s]*([\d,]+)', clean_text, re.IGNORECASE)

    loan_amt_match = re.search(r'Loan Amount[:\s]*([\d,]+)', clean_text, re.IGNORECASE)

    if name_match:
        fields['Name'] = name_match.group(1)
    if pan_match:
        fields['PAN'] = pan_match.group(0)
    if salary_match:
        fields['Salary'] = float(salary_match.group(1).replace(',', ''))
    if loan_amt_match:
        fields['LoanAmount'] = float(loan_amt_match.group(1).replace(',', ''))

    return fields


In [22]:
import math

def check_eligibility(fields):
    if 'Salary' not in fields or 'LoanAmount' not in fields:
        return 'Manual Review'

    monthly_income = fields['Salary']
    loan_amt = fields['LoanAmount']

    annual_interest_rate = 0.10
    tenure_years = 5
    max_emi_ratio = 0.4

    monthly_rate = annual_interest_rate / 12
    months = tenure_years * 12

    emi = (loan_amt * monthly_rate * (1 + monthly_rate)**months) / \
          ((1 + monthly_rate)**months - 1)

    if emi <= monthly_income * max_emi_ratio:
        return 'Eligible'
    else:
        return 'Not Eligible'


In [23]:
results=[]
def process_document(image_path):
    print(f"\nProcessing: {image_path}")
    image = preprocess_image(image_path)
    text = extract_text(image)
    fields = extract_fields(text)
    status = check_eligibility(fields)
    return fields, status

In [25]:
if __name__ == "__main__":
    folder = "synthetic_dataset_gan"
    if not os.path.isdir(folder):
        print(f"Error: '{folder}' is not a valid folder.")
        sys.exit(1)

    for file in os.listdir(folder):
        if file.lower().endswith(('.jpg', '.jpeg', '.png')):
            extracted_fields, eligibility_status = process_document(os.path.join(folder, file))
            if extracted_fields:
                results.append(extracted_fields)


Processing: synthetic_dataset_gan/doc_gan_5.png

Processing: synthetic_dataset_gan/doc_gan_6.png

Processing: synthetic_dataset_gan/doc_gan_9.png

Processing: synthetic_dataset_gan/doc_gan_7.png

Processing: synthetic_dataset_gan/doc_gan_1.png

Processing: synthetic_dataset_gan/doc_gan_3.png

Processing: synthetic_dataset_gan/doc_gan_2.png

Processing: synthetic_dataset_gan/doc_gan_4.png

Processing: synthetic_dataset_gan/doc_gan_8.png

Processing: synthetic_dataset_gan/doc_gan_10.png


In [27]:
import pandas as pd

In [31]:
df = pd.DataFrame(results)

In [29]:
df.to_excel("results.xlsx")