In [1]:
import re
import random   
import io
import time
from shutil import move

from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams

from pdf2image import convert_from_path
from PIL import ImageDraw, ImageFont

from navec import Navec
from slovnet import NER

import extractors
from utils import createDirIfNotExist, createDir, get_files
from words_handler import parse_obj, highlightObjects, DPI_SCALE, STANDART_DPI
from object_handlers import handleNumbers, handlePersons, handleDates, handleMoney, handleOrgs, handlePassport

In [2]:
def saveToXML(objects, docName, images, xml, maxPageNum, processedText, text_lines, drawnBoxes, xmlsBoxes, border_colors):
  """Draw the recognised fields on the page images and append them to the XML.

  The raw objects are de-duplicated, passed through the ``handle*`` functions
  from ``object_handlers``, and the concatenated results are then, page by
  page, outlined and labelled on ``images[pageNum]`` and written to ``xml`` as
  one element per object. Each annotated page is saved as
  ``results/<docName>_<pageNum>.jpg``.

  Args:
    objects: iterable of dicts; each must provide ``'text_value'`` and a
      hashable ``'bbox'`` (both are used as the de-duplication key).
    docName: document name used as the prefix of the saved JPEG files.
    images: sequence of PIL images, indexed by page number. ``images[0]``
      supplies the page size handed to the handlers.
    xml: writable text stream that receives one line per object.
    maxPageNum: number of pages (starting at 0) to draw and save.
    processedText, text_lines, drawnBoxes, xmlsBoxes: forwarded unchanged to
      ``handlePersons``.
    border_colors: mapping from field type to the colour used for the outline
      and the label text.
  """
  font = ImageFont.truetype("Arsenal-Regular.otf", 20)

  # Drop repeated detections (same text and same bbox), keeping the first one.
  seen_keys = set()
  unique_objects = []
  for candidate in objects:
    key = (candidate['text_value'], candidate['bbox'])
    if key in seen_keys:
      continue
    seen_keys.add(key)
    unique_objects.append(candidate)
  objects = unique_objects

  # The handlers only receive the size of the first page.
  first_page_width, first_page_height = images[0].size[0], images[0].size[1]

  nums_objs = handleNumbers(objects, first_page_height)
  persons_objs = handlePersons(objects, first_page_height, processedText, text_lines, drawnBoxes, xmlsBoxes, 0, border_colors)
  dates_objs = handleDates(objects, first_page_height)
  money_objs = handleMoney(objects, first_page_height)
  orgs_objs = handleOrgs(objects, first_page_height, first_page_width)
  passport_objs = handlePassport(objects, first_page_width)

  objects = persons_objs + dates_objs + nums_objs + money_objs + orgs_objs + passport_objs

  for pageNum in range(0, maxPageNum):
    page_image = images[pageNum]
    im_height = page_image.size[1]
    image_drawer = ImageDraw.Draw(page_image)

    for obj in objects:
      if obj["pageNum"] != pageNum:
        continue

      color = border_colors[obj['field_type']]

      # Flip the y axis into image coordinates.
      # NOTE(review): the `pageNum * 10000` term presumably undoes a per-page
      # offset applied where the boxes are produced -- confirm in words_handler.
      bbox = (obj["bbox"][0], int(im_height - obj["bbox"][1] - pageNum * 10000),
          obj["bbox"][2], int(im_height - obj["bbox"][3] - pageNum * 10000))

      # Random vertical jitter on one edge of the outline.
      jittered_y = bbox[3] + random.randint(2, 10)

      # Recent Pillow releases raise ValueError when y1 < y0, so order the y
      # coordinates explicitly before drawing.
      image_drawer.rectangle((bbox[0], min(bbox[1], jittered_y), bbox[2], max(bbox[1], jittered_y)),
                  outline=color, width=3)

      # White 20px strip that serves as the background of the label.
      image_drawer.rectangle((bbox[0], bbox[3]-20, bbox[2], bbox[3]), fill="white")

      image_drawer.text((bbox[0], bbox[3]-20), obj['field_type'] + " | " + obj["text_value"],
                font = font, fill=color)

      # Quotes and angle brackets are dropped; '&' is escaped so that the
      # attribute value stays well-formed XML.
      value = obj["text_value"].replace('"', '').replace('<', '').replace('>', '').replace('&', '&amp;')

      # NOTE(review): `left` is taken from bbox[2] and `top` from bbox[1]
      # while width/height are differences -- confirm this is what the
      # consumer of the XML expects.
      xml.write('  <{field_type} value="{value}" confidence="100" page="{page}" left="{x}" top="{y}" width="{w}" height="{h}"/>' \
            .format(field_type=obj['field_type'], value=value,
                x=bbox[2], y=bbox[1], w=bbox[2]-bbox[0], h=bbox[1]-bbox[3], page=pageNum)
            + '\n')

    page_image.save("results/" + docName + "_" + str(pageNum) + ".jpg", "JPEG")
    

In [3]:
if __name__ == '__main__':
  # Polls PDF_PATH forever: every PDF found is converted to images, its text
  # is run through the extractors, the results are written to xmls/ and
  # results/, and the PDF is then moved to HANDLED_PATH.
  MODEL_NAME = 'model'
  PDF_PATH = "input/"
  HANDLED_PATH = "handled/"
  SLEEP_TIME = 5  # seconds between two scans of PDF_PATH

  # Outline / label colour per field type.
  border_colors = {
    'Number': 'red',
    'Date': 'deepskyblue',
    'Org': 'blue',
    'Money': 'green',
    'Debtor': 'orange',
    'Person': 'black',
    'Judge': 'yellow',
    'DebtorDate': 'purple',
    'LetterNumber': 'pink',
    'LetterDate': 'gray', 
    'PassportNum': 'red',
    'PassportSeries': 'red'
  }

  # (field type, extractor) pairs, applied to every page in exactly this order.
  # Each extractor receives the page text and the NER markup.
  page_extractors = [
    ('Number', lambda text, markup: extractors.extractNums(text)),
    ('Date', lambda text, markup: extractors.extractDates(text)),
    # extractOrgs returns (orgs, persons); only the orgs are used here,
    # persons come from extractPersons below.
    ('Org', lambda text, markup: extractors.extractOrgs(markup)[0]),
    ('Person', lambda text, markup: extractors.extractPersons(text, markup)),
    ('Debtor', lambda text, markup: extractors.extractDebtors(text)),
    ('Money', lambda text, markup: extractors.extractMoney(text)),
    ('PassportSeries', lambda text, markup: extractors.extractPassportSeries(text)),
    ('PassportNum', lambda text, markup: extractors.extractPassportNums(text)),
  ]

  la_params = LAParams()
  la_params.line_margin = 1.6
  la_params.boxes_flow = 0.5

  createDirIfNotExist(PDF_PATH)
  createDirIfNotExist(HANDLED_PATH)
  createDir("results/", ".jpg")
  createDir("xmls/", ".xml")

  navec = Navec.load('vocab.tar')
  ner = NER.load(MODEL_NAME + '.tar')
  ner.navec(navec)

  while True:
    for doc_name in get_files(PDF_PATH, ".pdf"):
      base_name = doc_name.replace('.pdf', '')
      images = convert_from_path(PDF_PATH + doc_name, dpi = STANDART_DPI * DPI_SCALE)

      # Per-document state. processedText / text_lines are reset here so that
      # a PDF without pages neither raises NameError nor reuses the values
      # left over from the previous document.
      drawnBoxes = []
      xmlsBoxes = []
      maxPageNum = 0
      processedText = ""
      text_lines = []

      # Context managers guarantee both files are closed even when processing
      # is interrupted or an extractor raises.
      with open(PDF_PATH + doc_name, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)

        with io.open("xmls/" + base_name + ".xml", "w", encoding="utf-8") as xml:
          xml.write('<?xml version="1.0" encoding="UTF-8"?>' + '\n')
          xml.write('<idcard>' + '\n')

          for pageNum, page in enumerate(PDFPage.create_pages(document)):
            actName = doc_name.split(".")[0] + "_" + str(pageNum) 
            print(actName)

            text_lines = []

            rsr_mgr = PDFResourceManager()
            device = PDFPageAggregator(rsr_mgr, laparams=la_params)

            interpreter = PDFPageInterpreter(rsr_mgr, device)
            interpreter.process_page(page)

            layout = device.get_result()
            rawText = parse_obj(layout._objs, text_lines)

            # Runs of two or more underscores are replaced by a single space.
            processedText = re.sub(r"__+", r" ", rawText)

            # The first page without text ends processing of this document;
            # later pages are neither analysed nor saved.
            if len(processedText) == 0:
              break

            markup = ner(processedText)

            for field_type, extract in page_extractors:
              extracted = extract(processedText, markup)
              highlightObjects(extracted, field_type, text_lines, drawnBoxes, xmlsBoxes, pageNum, border_colors)

            maxPageNum += 1

          # Note: processedText and text_lines hold the values of the last
          # page that was read.
          saveToXML(xmlsBoxes, base_name, images, xml, maxPageNum, processedText, text_lines, drawnBoxes, xmlsBoxes, border_colors)
          xml.write('</idcard>' + '\n')

      # Both files are closed at this point, so the PDF can be moved safely.
      move(PDF_PATH + doc_name, HANDLED_PATH + doc_name)

    time.sleep(SLEEP_TIME)

1_0
(1503, 2101, 1624, 2186)
(1616, 2101, 1787, 2186)
2_0
(1411, 2123, 1527, 2208)
(1516, 2123, 1678, 2208)
3_0
(2181, 2147, 2306, 2232)
(326, 2089, 505, 2174)
4_0
(389, 1418, 513, 1503)
(699, 1418, 868, 1504)
6_0
(1203, 1444, 1353, 1529)
(1356, 1444, 1530, 1529)
7_0
(907, 2102, 1033, 2187)
(1202, 2102, 1382, 2187)
9_0
(1372, 2075, 1494, 2160)
(1559, 2075, 1730, 2160)
Scan2021-12-08_145113_0
(1836, 1366, 1976, 1455)
(2033, 1366, 2232, 1455)
Scan2021-12-08_145113_006_0
(1056, 1081, 1185, 1171)
(1178, 1081, 1361, 1171)
Scan2021-12-08_145113_007_0
Scan2021-12-08_145113_012_0
(1194, 1988, 1337, 2077)
(1329, 1988, 1529, 2077)
Scan2021-12-08_145249_0
(1043, 1579, 1166, 1664)
(1170, 1579, 1342, 1664)
Scan2021-12-08_145249_002_0
(2083, 2439, 2223, 2532)
(317, 2373, 514, 2466)
Scan2021-12-08_145249_003_0
(1709, 1996, 1830, 2081)
(1880, 1996, 2054, 2081)
Scan2021-12-08_145249_005_0
(1554, 2485, 1679, 2571)
(1699, 2485, 1874, 2571)
Scan2021-12-08_145249_007_0
(773, 2499, 916, 2584)
(1000, 2499, 1

KeyboardInterrupt: 