In [None]:
!pip install onnxruntime
!pip install imgocr

Collecting onnxruntime
  Downloading onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (13.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m89.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected pack

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.0-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [18]:
from datasets import load_dataset

dataset_path = "/content/drive/MyDrive/HongKong_Dataset/train-00000-of-00002.parquet"

dataset = load_dataset("parquet", data_files=dataset_path)['train'].select(range(3))

print(dataset)

Dataset({
    features: ['image', 'ocr'],
    num_rows: 3
})


In [43]:
import cv2
import numpy as np
from PIL import Image, ImageEnhance
from imgocr import ImgOcr
import re
import json

# Initialize ImgOcr
m = ImgOcr(use_gpu=False, is_efficiency_mode=True)  # Using efficiency mode

def enhance_image(image):
    """Enhance image to improve OCR accuracy."""
    # Convert PIL image to OpenCV format
    image = np.array(image)

    # Increase image contrast using ImageEnhance
    pil_image = Image.fromarray(image)
    enhancer = ImageEnhance.Contrast(pil_image)
    image = enhancer.enhance(2)  # Adjust the factor for better contrast

    # Convert back to numpy array
    image = np.array(image)

    # Sharpen the image using a kernel for better text distinction
    kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])  # Sharpen kernel
    image = cv2.filter2D(image, -1, kernel)

    # Convert back to PIL image
    return Image.fromarray(image)


def extract_personal_number(ocr_result):
    """Extract personal number in the format 'XXXX XXXX XXXX'."""
    # Combine OCR result text into one string
    ocr_text = " ".join([i['text'] for i in ocr_result])

    # Remove any dots and the digit following the dot
    ocr_text_cleaned = re.sub(r"\.\d", "", ocr_text)  # Remove dots and digits after them

    # Use regular expression to find a 12-digit personal number, handling cases like overflow or dots
    match = re.search(r"(\d{4})[\s]?(\d{4})[\s]?(\d{4})", ocr_text_cleaned)

    if match:
        # Format the matched personal number in the desired format
        personal_number = f"{match.group(1)} {match.group(2)} {match.group(3)}"
        return personal_number
    return None

def clean_ocr_response(ocr_result):
    """Clean the OCR response by removing unwanted labels and getting Chinese name."""
    # List of unwanted labels or keywords to remove (e.g., Date of Birth, Date of Issue)
    unwanted_labels = [
        "Date of Birth", "DateofBirth", "Date of Issue", "DateofIssue",
        "出生日期", "签发日期", "Date", "Issue", "SAMPLE"
    ]

    # Filter out text entries containing unwanted labels
    cleaned_text = [i['text'] for i in ocr_result if not any(label in i['text'] for label in unwanted_labels)]

    # Extract all Chinese text
    chinese_text = [text for text in cleaned_text if re.search('[\u4e00-\u9fff]', text)]  # Match Chinese characters

    # Find the shortest Chinese text assuming it is the name
    chinese_full_name = min(chinese_text, key=len, default=None)

    return chinese_full_name

def extract_document_number(ocr_result):
    """Extract document number from OCR result based on specified format."""
    pattern = r"[A-Za-z][0-9]{6}\([0-9]\)$"

    last_text = ocr_result[-1]['text'].strip()

    if re.match(pattern, last_text):
        return {"document Number": last_text}
    elif re.match(r"[A-Za-z][0-9]{6}", last_text):
        return {"document Number": last_text}

    return {"document Number": None}  # Return null if no match found

def extract_date_of_birth(ocr_result):
    """Extract the date of birth in 'dd-mm-yyyy' format from OCR result."""
    # Combine OCR result text into one string
    ocr_text = " ".join([i['text'] for i in ocr_result])

    # Use regular expression to find dates in dd-mm-yyyy format
    match = re.search(r"(\d{2}-\d{2}-\d{4})", ocr_text)

    if match:
        # Return the date as 'dateOfBirth' in JSON format
        return match.group(1)
    return None

def extract_gender(ocr_result, date_of_birth):
    """Extract gender based on the date of birth and the line below it."""
    ocr_text = " ".join([i['text'] for i in ocr_result])

    # Find the position of the date of birth in the text
    position = ocr_text.find(date_of_birth)
    if position != -1:
        # Extract the next line after the date of birth
        following_text = ocr_text[position + len(date_of_birth):].strip()

        # Check if the gender indicators are present in the following line
        if re.search(r"男|M", following_text):
            return "Male"
        elif re.search(r"女|F", following_text):
            return "Female"

    # Return None (null) if no gender found on the line below the date of birth
    return None

def extract_dob_symbol(ocr_result):
    """Extract dobSymbol of format ***XX (with three asterisks followed by two alphabets)."""
    # Combine OCR result text into one string
    ocr_text = " ".join([i['text'] for i in ocr_result])

    # Use regular expression to find a pattern like "***AZ"
    match = re.search(r"\*\*\*([A-Za-z]{2})", ocr_text)

    if match:
        # Return dobSymbol in JSON format
        return {"dobSymbol": match.group(0)}
    return {"dobSymbol": None}  # Return null if no match found

def extract_issuing_date(ocr_result):
    """Extract issuing date in the format (MM-YY) from OCR result."""
    # Combine OCR result text into one string
    ocr_text = " ".join([i['text'] for i in ocr_result])

    # Use regular expression to find dates in (MM-YY) format
    match = re.search(r"\(\d{2}-\d{2}\)", ocr_text)

    if match:
        # Return the issuing date as 'issuing Date' in JSON format
        return {"issuing Date": match.group(0)[1:-1]}  # Remove parentheses
    return {"issuing Date": None}  # Return null if no match found


def extract_english_name(ocr_result):
    """Extract English full name, first name, and last name."""
    # Combine all OCR result text into a single string
    ocr_text = " ".join([i['text'] for i in ocr_result])

    # Remove unwanted parts like "AMPLE SAMPLE" and "SAMPLE" from the OCR text
    ocr_text = ocr_text.replace("AMPLE SAMPLE", "").replace("SAMPLE", "").strip()

    # Look for the pattern with a comma, which typically separates the surname and given name
    # Assuming that the full name is in the format "Surname, Givenname"
    match = re.search(r"([A-Za-z]+),([A-Za-z]+(?:\s[A-Za-z]+)*)", ocr_text)

    if match:
        # Extract surname (before comma) and given name (after comma)
        surname = match.group(1)
        given_name = match.group(2)

        # Clean up spaces, if any, around names
        surname = surname.strip()
        given_name = given_name.strip()

        # Return the JSON with English full name, first name, and last name
        return {
            "englishFullName": f"{surname},{given_name}",
            "englishGivenName": f"{surname},{given_name}",
            "firstName": given_name,
            "englishSurname": surname
        }

    return {
        "englishFullName": None,
        "englishGivenName": None,
        "firstName": None,
        "englishSurname": None
    }

def extract_chinese_surname(chinese_full_name):
    """Extract the Chinese surname from the full name."""
    if chinese_full_name:
        # Return the first character as surname
        surname = chinese_full_name[0]
        return {"chineseSurname": surname}
    return {"chineseSurname": None}

# Load image from drive
image_path = "/content/drive/MyDrive/HongKong_Dataset/new_hkid_front.png"  # Replace with your image path
image = Image.open(image_path)

# Enhance the image
enhanced_image = enhance_image(image)

# Perform OCR
result = m.ocr(enhanced_image)

# Print OCR results
print(f"🔹 OCR Results for {image_path}:")
for i in result:
    print(i['text'])

# Extract document number from the OCR result
document_number_json = extract_document_number(result)

# Print the document number JSON response (null if not found)
print("🔹 Document Number extracted:")
print(document_number_json)

# Extract Chinese Full Name
chinese_full_name = clean_ocr_response(result)

# Return the Chinese Full Name as JSON, ensure Chinese characters are not escaped
chinese_given_name_json = json.dumps({"chineseGivenName": chinese_full_name if chinese_full_name else None}, ensure_ascii=False)

chinese_full_name_json = json.dumps({"chineseFullName": chinese_full_name if chinese_full_name else None}, ensure_ascii=False)

# Print the Chinese Full Name response
print("🔹 Chinese Full Name extracted:")
print(chinese_full_name_json)

print("🔹 Chinese Given Name extracted:")
print(chinese_given_name_json)

# Extract personal number from the OCR result
personal_number = extract_personal_number(result)

# Return the personal number
print("🔹 Personal Number extracted:")
print(personal_number)

# Extract DOB (Date of Birth)
dob = extract_date_of_birth(result)

# Print the DOB
print("🔹 Date of Birth extracted:")
print(dob)

# Extract gender (assuming next line contains gender info)
gender = extract_gender(result, dob)

# Print gender
print("🔹 Gender extracted:")
print(gender)

# Extract dobSymbol
dob_symbol_json = extract_dob_symbol(result)

# Print dobSymbol response
print("🔹 dobSymbol extracted:")
print(dob_symbol_json)

# Extract issuing date
issuing_date_json = extract_issuing_date(result)

# Print issuing date response
print("🔹 Issuing Date extracted:")
print(issuing_date_json)

# Extract English name components
english_name_json = extract_english_name(result)

# Print English name JSON response
print("🔹 English Full Name extracted:")
print(json.dumps(english_name_json, ensure_ascii=False))

# Extract Chinese surname (if applicable)
chinese_surname_json = extract_chinese_surname(chinese_full_name)

# Print Chinese surname JSON response
print("🔹 Chinese Surname extracted:")
print(json.dumps(chinese_surname_json, ensure_ascii=False))


🔹 OCR Results for /content/drive/MyDrive/HongKong_Dataset/new_hkid_front.png:
香港永久性居民身份證
HONG KONG PERMANENTIDENTITY CARD
m
樂永晴
Z683365
LoK,wing
AMPLE SAMPLE 
Ching
2867.3
3057 2532
出生日期DateofBirth
03-06-1985
***AZ
签发日期 Date of Issue 
SAMPLE
(06-96)
26-11-18
Z683365(5)
🔹 Document Number extracted:
{'document Number': 'Z683365(5)'}
🔹 Chinese Full Name extracted:
{"chineseFullName": "樂永晴"}
🔹 Chinese Given Name extracted:
{"chineseGivenName": "樂永晴"}
🔹 Personal Number extracted:
2867 3057 2532
🔹 Date of Birth extracted:
03-06-1985
🔹 Gender extracted:
Male
🔹 dobSymbol extracted:
{'dobSymbol': '***AZ'}
🔹 Issuing Date extracted:
{'issuing Date': '06-96'}
🔹 English Full Name extracted:
{"englishFullName": "LoK,wing", "englishGivenName": "LoK,wing", "firstName": "wing", "englishSurname": "LoK"}
🔹 Chinese Surname extracted:
{"chineseSurname": "樂"}


In [44]:
import json

def generate_final_json(document_number, chinese_full_name, chinese_given_name, english_name, chinese_surname, gender, dob, dob_symbol, issuing_date, personal_number):
    """Generate the final JSON output in the desired format."""
    final_output = {
        "documentType": "national_identity_card",
        "issuingCountry": "HKG",
        "extractedOcrData": {
            **extract_document_number(result),
            "chineseFullName": chinese_full_name,
            "chineseGivenName": chinese_given_name,
            "englishFullName": english_name.get("englishFullName"),
            "englishGivenName": english_name.get("englishGivenName"),
            "firstName": english_name.get("firstName"),
            "englishSurname": english_name.get("englishSurname"),
            "chineseSurname": chinese_surname,
            "gender": gender,
            "dateofBirth": dob,
            "dateOfExpiry": None,  # Always keep null
            "dobSymbol": dob_symbol.get("dobSymbol"),
            "issuing Date": issuing_date.get("issuing Date"),
            "personalNumber": personal_number
        }
    }
    return json.dumps(final_output, ensure_ascii=False, indent=4)

final_json = generate_final_json(
    document_number=document_number_json,
    chinese_full_name=chinese_full_name,
    chinese_given_name=chinese_full_name,
    english_name=english_name_json,
    chinese_surname=chinese_surname_json.get("chineseSurname"),
    gender=gender,
    dob=dob,
    dob_symbol=dob_symbol_json,
    issuing_date=issuing_date_json,
    personal_number=personal_number
)


print(final_json)


{
    "documentType": "national_identity_card",
    "issuingCountry": "HKG",
    "extractedOcrData": {
        "document Number": "Z683365(5)",
        "chineseFullName": "樂永晴",
        "chineseGivenName": "樂永晴",
        "englishFullName": "LoK,wing",
        "englishGivenName": "LoK,wing",
        "firstName": "wing",
        "englishSurname": "LoK",
        "chineseSurname": "樂",
        "gender": "Male",
        "dateofBirth": "03-06-1985",
        "dateOfExpiry": null,
        "dobSymbol": "***AZ",
        "issuing Date": "06-96",
        "personalNumber": "2867 3057 2532"
    }
}
