# Setup and Installation:

In [12]:
!pip install pytesseract



# Image Preprocessing:

In [1]:
import os
import shutil

import cv2
import numpy as np
import pytesseract
from PIL import Image

In [2]:
# Path of the ID-card photo used throughout the preprocessing cells.
IMAGE_PATH = 'jordan-id-card-template_3_cp.jpg'

image = cv2.imread(IMAGE_PATH)

# cv2.imread reports a missing/unreadable file by returning None instead of
# raising, so fail here with a clear message rather than with an obscure
# error in a later cell.
if image is None:
    raise FileNotFoundError(f"Could not read image: {IMAGE_PATH!r}")

# Preview in a native OpenCV window; waitKey(0) blocks until a key is pressed,
# after which every OpenCV window is closed.
cv2.imshow('Original Image', image)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [3]:
# Tunable parameters for the edge-detection stage.
blur_kernel = (5, 5)
canny_low, canny_high = 50, 150

# Grayscale -> Gaussian blur -> Canny edge map; `edged` feeds the contour
# search in the next cell.
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
blurred = cv2.GaussianBlur(gray, blur_kernel, 0)
edged = cv2.Canny(blurred, canny_low, canny_high)

In [4]:
# Collect every contour in the edge map and visit them from largest area down.
contours, _ = cv2.findContours(edged.copy(), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
contours = sorted(contours, key=cv2.contourArea, reverse=True)

# Reset explicitly so that a failed search is detectable and a value left
# over from an earlier run of this cell is never silently reused.
screenCnt = None

for c in contours:
    # Approximate the closed contour with a tolerance of 2% of its perimeter.
    peri = cv2.arcLength(c, True)
    approx = cv2.approxPolyDP(c, 0.02 * peri, True)

    # The first (i.e. largest) contour that simplifies to four vertices is
    # taken as the card outline.
    if len(approx) == 4:
        screenCnt = approx
        break

if screenCnt is None:
    raise ValueError(
        "No four-point contour was found in the edge map; "
        "cannot locate the card outline."
    )

In [6]:
!pip install imutils

Collecting imutils
  Downloading imutils-0.5.4.tar.gz (17 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: imutils
  Building wheel for imutils (setup.py): started
  Building wheel for imutils (setup.py): finished with status 'done'
  Created wheel for imutils: filename=imutils-0.5.4-py3-none-any.whl size=25854 sha256=3e4c068201c070cc307b3c722ab3298a1e3fbac95c91f70770b66d674e6b0c50
  Stored in directory: c:\users\user\appdata\local\pip\cache\wheels\31\d0\2c\87ce38f6052879e5b7b18f0f8b4a10ad2a9d210e908d449f16
Successfully built imutils
Installing collected packages: imutils
Successfully installed imutils-0.5.4


In [7]:
from imutils.perspective import four_point_transform

# Reshape the detected contour into four rows of two values, then apply
# imutils' four-point perspective transform to the original image.
card_corners = screenCnt.reshape(4, 2)
warped = four_point_transform(image, card_corners)

In [9]:
# Grayscale version of the warped card.
warped_gray = cv2.cvtColor(warped, cv2.COLOR_BGR2GRAY)

# Binarise with a Gaussian-weighted adaptive threshold
# (11-pixel neighbourhood, constant 2 subtracted from the weighted mean).
thresh = cv2.adaptiveThreshold(
    warped_gray,
    maxValue=255,
    adaptiveMethod=cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
    thresholdType=cv2.THRESH_BINARY,
    blockSize=11,
    C=2,
)

In [10]:
# Show the perspective-corrected image in an OpenCV window, block until a
# key is pressed, then close all OpenCV windows.
window_title = 'Warped Image'
cv2.imshow(window_title, warped)
cv2.waitKey(0)
cv2.destroyAllWindows()

# Text Recognition with Tesseract OCR:

In [14]:
# Locate the Tesseract executable without assuming one machine's layout:
#   1. an explicit TESSERACT_CMD environment variable, if set;
#   2. otherwise whatever `tesseract` is found on PATH;
#   3. otherwise the Windows install location this notebook originally used.
pytesseract.pytesseract.tesseract_cmd = (
    os.environ.get('TESSERACT_CMD')
    or shutil.which('tesseract')
    or r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe'
)

In [16]:
import pytesseract
from PIL import Image


# OpenCV loads images in BGR channel order while PIL interprets a 3-channel
# array as RGB, so swap the channels before wrapping the array; otherwise the
# red and blue channels are exchanged in the image handed to Tesseract.
pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

# NOTE(review): OCR is run on the original photo; the `warped` / `thresh`
# images computed in the preprocessing cells are not used here — confirm
# whether that is intended.
extracted_text = pytesseract.image_to_string(pil_image)

print(extracted_text)

The Hashemite Kingdom of Jordan
Ministry of Interior - Civil Status & Passport Dept.
IDCard

Aeedig! 2Sa,8) asta!
Silently Abad Jig Yi 5510 - a
ce

2 ABU

Sally Obl ye Opes)

Sy

0g00




In [17]:
# Tesseract page-segmentation mode 4: treat the page as a single column of
# text of variable sizes.
custom_config = r'--psm 4'

# Re-run OCR on the same PIL image with the custom configuration.
extracted_text = pytesseract.image_to_string(image=pil_image, config=custom_config)

In [18]:
# Display the raw string as the cell result (repr form), which keeps newline
# escapes and stray OCR characters visible.
extracted_text

'The Hashemite Kingdom of Jordan Aeadigt Aba,¥) aster\nMinistry of Interior > Civil Status & Passport Dept. ij tasty AGN Jig) By bt0 Bate\n1DCard Maia Baie)\n\nSells Ob! 96 Ope!\n\noge0\n\n'

In [19]:
import re

# Illustrative patterns for labelled ID fields: each one captures the value
# that follows its "Label:" prefix.
name_pattern = r"Name:\s*(\w+\s\w+)"
id_pattern = r"ID:\s*(\d+)"
dob_pattern = r"DOB:\s*(\d{2}/\d{2}/\d{4})"

# Look for each field in the OCR output (None when a field is absent).
name_match = re.search(name_pattern, extracted_text)
id_match = re.search(id_pattern, extracted_text)
dob_match = re.search(dob_pattern, extracted_text)

# Report only the fields that were actually found, in a fixed order.
for label, match in (("Name", name_match), ("ID", id_match), ("DOB", dob_match)):
    if match is not None:
        print(f"{label}: {match.group(1)}")


# Data Structuring:

In [20]:
import pandas as pd

# Sample OCR outputs standing in for text extracted from several card images.
extracted_texts = [
    "Name: John Doe\nID: 12345678\nDOB: 01/01/1990",
    "Name: Jane Smith\nID: 87654321\nDOB: 02/02/1992"
]

# One regex per field; each captures the value following its label.
name_pattern = r"Name:\s*(\w+\s\w+)"
id_pattern = r"ID:\s*(\d+)"
dob_pattern = r"DOB:\s*(\d{2}/\d{2}/\d{4})"

# Output column -> pattern that fills it (insertion order fixes column order).
field_patterns = {
    'Name': name_pattern,
    'ID_Number': id_pattern,
    'Date_of_Birth': dob_pattern,
}

# Column-oriented accumulator: one list of values per output column.
data = {column: [] for column in field_patterns}

for text in extracted_texts:
    matches = {
        column: re.search(pattern, text)
        for column, pattern in field_patterns.items()
    }

    # Keep a record only when every field was found, so the columns stay
    # aligned row by row.
    if all(matches.values()):
        for column, match in matches.items():
            data[column].append(match.group(1))

# Assemble the table and show it.
df = pd.DataFrame(data)

print(df)


         Name ID_Number Date_of_Birth
0    John Doe  12345678    01/01/1990
1  Jane Smith  87654321    02/02/1992


In [21]:
# Parse the day/month/year strings into datetimes; values that do not match
# the format are coerced to NaT and those rows are then dropped.
df = (
    df
    .assign(
        Date_of_Birth=pd.to_datetime(
            df['Date_of_Birth'], format='%d/%m/%Y', errors='coerce'
        )
    )
    .dropna(subset=['Date_of_Birth'])
)

print(df)


         Name ID_Number Date_of_Birth
0    John Doe  12345678    1990-01-01
1  Jane Smith  87654321    1992-02-02


In [22]:
# Persist the cleaned table in both formats, omitting the index column.
df.to_csv(path_or_buf='extracted_data.csv', index=False)
df.to_excel(excel_writer='extracted_data.xlsx', index=False)


# Testing and Validation:

In [25]:
test_images = ['images (1).jpeg', 'images (2).jpeg', 'images 3.jpeg']  # List of test image paths
extracted_data = []

for image_path in test_images:
    # Load the image for THIS iteration. (Previously a fixed template file was
    # read on every pass, so all test images produced the same result.)
    image = cv2.imread(image_path)

    # cv2.imread returns None for a missing/unreadable file; report which
    # path failed instead of crashing later inside PIL.
    if image is None:
        raise FileNotFoundError(f"Could not read test image: {image_path!r}")

    # Extract text. OpenCV arrays are BGR while PIL assumes RGB, so convert
    # before wrapping the array.
    pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    extracted_text = pytesseract.image_to_string(pil_image, config=custom_config)

    # Parse the extracted text with the field patterns defined earlier.
    name = re.search(name_pattern, extracted_text)
    id_number = re.search(id_pattern, extracted_text)
    dob = re.search(dob_pattern, extracted_text)

    # One record per image; a field that was not found is stored as ''.
    extracted_data.append({
        'Name': name.group(1) if name else '',
        'ID_Number': id_number.group(1) if id_number else '',
        'Date_of_Birth': dob.group(1) if dob else ''
    })

# Create DataFrame (one row per test image, in the order of `test_images`)
df_extracted = pd.DataFrame(extracted_data)
