In [None]:
import numpy as np
# from paddleocr import PaddleOCR
from collections import Counter
from typing import Dict

# Header/title strings for the FRONT side. They are handed to Extractor2018.execute()
# as `useless_items`; every OCR line whose similarity to one of them exceeds
# COMPARISON_TRHESHOLD is dropped before extraction.
# NOTE(review): some entries are misspelled ("IDENITY", "DIDENTITE") — presumably on
# purpose, to mirror what the OCR engine actually returns; confirm before "fixing" them.
# NOTE: the trailing "" entry never matches anything, because
# StringComparator.compare_strings returns 0.0 as soon as one argument is empty.
FRONT_USELESS_ITEMS = [
    "REPUBLIQUE DU CAMEROUN",
    "NATIONAL IDENITY CARD",
    "REPUBLIC OF CAMEROON",
    "CARTE NATIONALE DIDENTITE",
    "CARTE NATIONALE D'IDENTITE",
    ""
]

# Field labels for the FRONT side, handed to Extractor2018.execute() as `bounds`.
# Each label is matched against the OCR lines; the lines found between two matched
# labels become the value of the first one. The list order is the order in which
# the extractor walks the matched labels.
FRONT_BOUNDS = [
    "NOM/SURNAME",
    "PRENOMS/GIVEN NAMES",
    "DATE DE NAISSANCE/DATEOF BIRTH",
    "LIEU DE NAISSANCE/PLACE OF BIRTH",
    "SEXE/SEX",
    "TAILLE/HEIGHT",
    "PROFESSION/OCCUPATION",
    "SIGNATURE",
]

# Strings to discard on the BACK side (same mechanism as FRONT_USELESS_ITEMS).
# NOTE(review): the "repair" markers below come from the original author; they
# presumably flag entries that still need to be corrected — TODO confirm.
BACK_USELESS_ITEMS = [
    "AUTORITE/AUTHORITY",
    "DATEOFISSUE", #repair
    "IDENTIFICATION POS", #repair
    "DATE OFEXPIRY",# repair
    "URIOURIDENTIFIER",# repair
    "CAM動UN",
    "CAMERO",
    ""
]

# Field labels for the BACK side (same mechanism as FRONT_BOUNDS).
# NOTE(review): spellings such as "POSTEDIDENTIFICATCN" look like raw OCR output
# rather than the printed label — confirm before changing.
BACK_BOUNDS = [
    "PERE/FATHER",
    "MERE/MOTHER",
    "S.P./S.M.",
    "DATEDEDELIVRANCE/",
    "POSTEDIDENTIFICATCN",
    "ADRESSE/ADDRESS",
    "DATED'EXPIRATION/",
    "IDENTIFIANT UNIQUE",
]

# Minimum similarity (strictly greater than) for an OCR line to count as a match
# for a label or a useless item. The name is misspelled ("TRHESHOLD") but
# Extractor2018 refers to it under exactly this spelling, so it must not be
# renamed here alone.
COMPARISON_TRHESHOLD = 0.8

class Extractor2018:
    """Extract labelled fields from the OCR output of one side of an identity card.

    The OCR output is a dict of parallel lists (one entry per detected text
    line): ``textline_orientation_angles``, ``rec_texts``, ``rec_scores``,
    ``rec_polys`` and ``rec_boxes``. The extractor filters those lines in place
    and then uses a list of known labels ("bounds") to cut the remaining lines
    into ``label -> value`` pairs.
    """

    # Keys of the OCR dict that hold one entry per text line and therefore have
    # to be filtered together to stay aligned.
    _PARALLEL_KEYS = (
        "textline_orientation_angles",
        "rec_texts",
        "rec_scores",
        "rec_polys",
        "rec_boxes",
    )

    # A line is dropped when its score is below this fraction of the mean score.
    _SCORE_RATIO = 0.6

    def __init__(self, ocr=None):
        """Create an extractor.

        Args:
            ocr: Optional OCR engine exposing ``predict(image_path)``. Only
                needed by ``_execute_orc``; extraction from an already computed
                OCR result works without it.
        """
        self.result = None
        self.ocr_result = None
        self.ocr = ocr
        self.COMPARISON_TRHESHOLD = COMPARISON_TRHESHOLD
        self.BOUNDS = []
        self.USELESS_ITEMS = []
        self.front_dict = ["name", "given_names", "birth_date", "birth_place"]

    def execute(self, ocr_result: Dict, bounds, useless_items):
        """Run the full filter + extraction pipeline.

        Args:
            ocr_result: OCR output to process, either the dict itself or a
                sequence whose first element is that dict. When falsy (``None``,
                ``""``, empty), the previously stored ``self.ocr_result`` is
                used instead, which keeps older callers working.
            bounds: Field labels delimiting the values to extract.
            useless_items: Strings whose matching OCR lines are discarded.

        Returns:
            Dict mapping each matched label text to the extracted value text.

        Raises:
            ValueError: If neither ``ocr_result`` nor ``self.ocr_result``
                provides an OCR result.

        Note:
            The OCR dict is filtered in place.
        """
        self.BOUNDS = bounds
        self.USELESS_ITEMS = useless_items

        source = ocr_result if ocr_result else self.ocr_result
        if not source:
            raise ValueError(
                "No OCR result available. Pass one to execute() or set `ocr_result` first."
            )
        # A stored result is a list of per-image dicts; a dict can also be given directly.
        ocr_data = source if isinstance(source, dict) else source[0]

        self._remove_useless_text(ocr_data)
        self._remove_items_with_minoritary_text_orientation(ocr_data)
        self._remove_elements_with_score_less_than_average(ocr_data)
        self._extract_text(ocr_data)

        return self._return_result()

    def _execute_orc(self, image_path):
        """Run the configured OCR engine on ``image_path`` and store its output.

        Raises:
            RuntimeError: If no OCR engine has been configured.
        """
        engine = getattr(self, "ocr", None)
        if engine is None:
            raise RuntimeError(
                "No OCR engine configured. Pass one to Extractor2018(ocr=...) or set `ocr`."
            )
        self.ocr_result = engine.predict(image_path)

    def _filter_by_indices(self, ocr_result, indices):
        """Keep, in every parallel list, only the entries at ``indices`` (in that order)."""
        for key in self._PARALLEL_KEYS:
            ocr_result[key] = [ocr_result[key][i] for i in indices]

    def _remove_items_with_minoritary_text_orientation(self, ocr_result):
        """Drop lines whose orientation angle differs from the most common one."""
        angles = ocr_result["textline_orientation_angles"]
        if len(angles) == 0:
            # Nothing detected: there is no majority angle to compute.
            return
        major_angle = Counter(angles).most_common(1)[0][0]
        indices = [i for i, v in enumerate(angles) if v == major_angle]
        self._filter_by_indices(ocr_result, indices)

    def _remove_useless_text(self, ocr_result):
        """Drop lines that match one of ``self.USELESS_ITEMS`` above the threshold.

        The surviving lines keep their original relative order, which the
        extraction step relies on.
        """
        indices = [
            i
            for i, text in enumerate(ocr_result["rec_texts"])
            if not any(
                StringComparator.compare_strings(text, element) > self.COMPARISON_TRHESHOLD
                for element in self.USELESS_ITEMS
            )
        ]
        self._filter_by_indices(ocr_result, indices)

    def _remove_elements_with_score_less_than_average(self, ocr_result):
        """Drop lines scoring below ``_SCORE_RATIO`` times the mean recognition score."""
        scores = ocr_result["rec_scores"]
        if len(scores) == 0:
            # Avoid np.mean on an empty sequence (nan + RuntimeWarning).
            return
        minimum_score = np.mean(scores) * self._SCORE_RATIO
        indices = [i for i, score in enumerate(scores) if score >= minimum_score]
        self._filter_by_indices(ocr_result, indices)

    def _extract_text(self, ocr_result):
        """Cut the OCR lines into ``label -> value`` entries stored in ``self.result``.

        For each label in ``self.BOUNDS`` the best matching line above the
        threshold is located. Matched labels are then walked in ``BOUNDS`` order:

        * a lone label takes every line up to the next matched label;
        * a run of labels on consecutive lines (a "batch" of size n) is assumed
          to be followed by its n values in the same order, so each label takes
          the line n positions after itself.

        The last matched label only closes the previous field; unless it is part
        of a batch it gets no entry of its own.
        """
        texts = ocr_result["rec_texts"]
        scores = ocr_result["rec_scores"]
        boxes = ocr_result["rec_boxes"]
        polys = ocr_result["rec_polys"]

        self.result = dict()

        # Best matching line index for every label (None when nothing matches).
        bounds_indices = []
        for bound in self.BOUNDS:
            indice, score = None, None
            for i, text in enumerate(texts):
                similarity_score = StringComparator.compare_strings(text, bound)
                if similarity_score > self.COMPARISON_TRHESHOLD and (
                    score is None or similarity_score > score
                ):
                    score = similarity_score
                    indice = i
            bounds_indices.append(indice)

        cleaned_bounds_indices = [i for i in bounds_indices if i is not None]

        idx = 0
        while idx < len(cleaned_bounds_indices) - 1:
            bound_idx = cleaned_bounds_indices[idx]

            # Collect the labels sitting on directly consecutive lines.
            bounds_batch = [bound_idx]
            next_pos = idx + 1
            while (
                next_pos < len(cleaned_bounds_indices)
                and cleaned_bounds_indices[next_pos] == bounds_batch[-1] + 1
            ):
                bounds_batch.append(cleaned_bounds_indices[next_pos])
                next_pos += 1

            if len(bounds_batch) < 2:
                # Lone label: its value is everything up to the next matched label.
                start = bound_idx + 1
                stop = cleaned_bounds_indices[idx + 1]
                value_scores = scores[start:stop]
                self.result[texts[bound_idx]] = {
                    "text": " ".join(texts[start:stop]),
                    # An empty span (labels adjacent or out of order) scores 0.0
                    # instead of nan, which is not valid JSON.
                    "score": float(np.mean(value_scores)) if len(value_scores) else 0.0,
                    "box(es)": boxes[start:stop],
                    "poly(s)": polys[start:stop],
                }
            else:
                for _bound_idx in bounds_batch:
                    idx_offset = _bound_idx + len(bounds_batch)
                    if idx_offset >= len(texts):
                        # The value line was not detected (or was filtered out).
                        continue
                    self.result[texts[_bound_idx]] = {
                        "text": texts[idx_offset],
                        "score": float(scores[idx_offset]),
                        "box(es)": boxes[idx_offset],
                        "poly(s)": polys[idx_offset],
                    }
            idx += len(bounds_batch)

    def _return_result(self, include_all_details=False):
        """Return the extraction result.

        Args:
            include_all_details: When True, return the full entries (text,
                score, boxes, polys); otherwise only ``label -> text``.

        Raises:
            ValueError: If no extraction has been run yet.
        """
        if self.result is None:
            raise ValueError("No result to return. Please execute the OCR first.")
        if include_all_details:
            return self.result
        else:
            return {key: value["text"] for key, value in self.result.items()}

class StringComparator:
    """Position-wise similarity between two strings."""

    @staticmethod
    def compare_strings(string1: str, string2: str) -> float:
        """Return the share of positions at which both strings carry the same character.

        Both strings are lower-cased and stripped of space characters first.
        The count of equal characters at equal positions is divided by the
        length of the longer normalised string.

        Returns:
            0.0 if either input is empty/None, 1.0 if both normalise to the
            empty string, otherwise a ratio in [0.0, 1.0].
        """
        if not (string1 and string2):
            return 0.0

        left = string1.replace(" ", "").lower()
        right = string2.replace(" ", "").lower()

        longest = max(len(left), len(right))
        if longest == 0:
            # Both inputs consisted of spaces only.
            return 1.0

        hits = 0
        for char_left, char_right in zip(left, right):
            if char_left == char_right:
                hits += 1
        return hits / longest

In [None]:
import os
import json

# Where the per-image OCR dumps (JSON) are read from, and where the extracted
# fields are written to (one output file per input file, same name).
input_dir = "output/images/old"
output_dir = "output/extraction/old"
os.makedirs(output_dir, exist_ok=True)

for filename in os.listdir(input_dir):
    # Anything that is not a JSON dump is ignored.
    if not filename.endswith(".json"):
        continue

    input_path = os.path.join(input_dir, filename)
    with open(input_path, "r", encoding="utf-8") as f:
        ocr_result_dict = json.load(f)

    # execute() is given an empty placeholder below, so the extractor works on
    # the `ocr_result` attribute set here (a one-element list).
    extractor = Extractor2018()
    extractor.ocr_result = [ocr_result_dict]

    # The card side is recognised from the file name; everything that is not
    # a "front" file is treated as a back side.
    is_front = "front" in filename.lower()
    bounds = FRONT_BOUNDS if is_front else BACK_BOUNDS
    useless_items = FRONT_USELESS_ITEMS if is_front else BACK_USELESS_ITEMS

    result = extractor.execute("", bounds, useless_items)

    output_path = os.path.join(output_dir, filename)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)