In [1]:
import os
import re
import random   
import io
import time
from shutil import move

import pdfminer
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams

from pdf2image import convert_from_path
from PIL import Image, ImageDraw, ImageFont

from navec import Navec
from slovnet import NER
from ipymarkup import show_span_ascii_markup as show_markup

import extractors
from utils import checkSimilarity

In [2]:
# Receiver lookup tables.
# NOTE(review): neither name is referenced by any code visible in this notebook.
# Presumably the key is a name fragment and the value the full organisation name — TODO confirm.
# NOTE(review): the value opens a « quote but never closes it with » — confirm this is intended.
RECEIVER_ORGS = {'Коллек':'ООО «АйДи Коллект'}
RECEIVER_PERSONS = []

In [3]:
def get_files(dirpath, ext):
  """Return the sorted names of regular files directly inside `dirpath` with extension `ext`.

  `ext` is compared against `os.path.splitext(name)[1]`, so it must include the
  leading dot (".pdf") and the comparison is case-sensitive. Sub-directories are
  skipped even when their name ends with `ext`. Only bare file names are returned,
  not full paths.
  """
  matching = []
  for entry in os.listdir(dirpath):
    _, suffix = os.path.splitext(entry)
    if suffix == ext and os.path.isfile(os.path.join(dirpath, entry)):
      matching.append(entry)
  return sorted(matching)


def createDir(dirpath, ext):
  """Make sure `dirpath` exists and contains no leftover files with extension `ext`.

  A missing directory is created (single level, via `os.mkdir`). An existing one
  is kept, but every regular file in it whose extension equals `ext` is deleted;
  other files and sub-directories are left untouched.
  """
  if not os.path.exists(dirpath):
    os.mkdir(dirpath)
    return

  for stale_name in get_files(dirpath, ext):
    os.remove(os.path.join(dirpath, stale_name))
    
def createDirIfNotExist(dirpath):
  """Create directory `dirpath` unless something already exists at that path.

  Missing parent directories are created as well. If the path already exists
  (as a directory or as anything else) the call is a no-op, as before.
  """
  if not os.path.exists(dirpath):
    # makedirs + exist_ok: handles nested paths and tolerates another process
    # creating the directory between the check above and this call.
    os.makedirs(dirpath, exist_ok=True)

In [4]:
def _strip_to_chars(line):
  """Drop every non-LTChar child of `line` IN PLACE and return the line's text afterwards."""
  line._objs = [elem for elem in line._objs if isinstance(elem, pdfminer.layout.LTChar)]
  return line.get_text()


def draw_words(first_line_index, last_line_index, words, text_lines,
        drawn_boxes, xmls_boxes, pageNum, field_type, border_color):
  """Register bounding boxes for the phrase `words` spanning the given text lines.

  The phrase starts on `text_lines[first_line_index]` and ends on
  `text_lines[last_line_index]`. Every touched line is reduced to its LTChar
  children (the line objects are mutated), and character positions found in the
  line text are used as indices into that glyph list.

  * Same first/last line: one box from the first glyph of `words[0]` to the last
    glyph of `words[-1]`.
  * Several lines: one box from `words[0]` to the end of the first line, one from
    the start of the last line to the end of `words[-1]`, and one full-width box
    for each line in between. All boxes carry the whole phrase as their text.

  Boxes are registered through `drawElement`, which appends to `drawn_boxes` /
  `xmls_boxes`.

  Returns:
    True when the (first) box was newly registered; False when it was a duplicate
    or the last word could not be located on a single-line match.

  Raises:
    ValueError: if `words[0]` is not in the first line, or `words[-1]` is not in
      the last line of a multi-line match.
  """
  phrase = ' '.join(words)
  last_word = words[-1]

  first_line = text_lines[first_line_index]
  first_line_text = _strip_to_chars(first_line)
  first_chars = first_line._objs
  x_left = first_line_text.index(words[0])

  # NOTE(review): text offsets are used as glyph indices; this assumes every LTChar
  # contributes exactly one character of text — confirm for ligature glyphs.

  if first_line_index == last_line_index:
    if len(words) == 1:
      x_right = x_left + len(last_word) - 1
    else:
      # Start looking for the last word after the space the preceding words need.
      offset = x_left + sum(len(word) for word in words[:-1])
      last_start = first_line_text.find(last_word, offset)
      if last_start == -1:
        # The previous check compared the already-adjusted index with -1, so a
        # failed search slipped through and produced a box at a bogus position.
        return False
      x_right = last_start + len(last_word) - 1

    if x_right < x_left:
      # Degenerate phrase (empty word): nothing sensible to draw.
      return False

    return drawElement(
        first_chars[x_left].bbox[0] - 2,
        first_chars[x_left].bbox[1] - 3,
        first_chars[x_right].bbox[2] + 2,
        first_chars[x_right].bbox[3] + 3,
        phrase,
        drawn_boxes, xmls_boxes, pageNum,
        field_type=field_type, border_color=border_color
    )

  # Multi-line phrase: the first line is boxed from the first word to the line end.
  if not drawElement(
      first_chars[x_left].bbox[0] - 2,
      first_chars[x_left].bbox[1] - 3,
      first_chars[-1].bbox[2] + 2,
      first_chars[-1].bbox[3] + 3,
      phrase,
      drawn_boxes, xmls_boxes, pageNum,
      field_type=field_type, border_color=border_color):
    # Already registered earlier; do not add the remaining parts again.
    return False

  # Last line: from the line start to the end of the last word.
  last_line = text_lines[last_line_index]
  last_line_text = _strip_to_chars(last_line)
  last_chars = last_line._objs
  x_right = last_line_text.index(last_word) + len(last_word) - 1

  drawElement(
      last_chars[0].bbox[0] - 2,
      last_chars[0].bbox[1] - 3,
      last_chars[x_right].bbox[2] + 2,
      last_chars[x_right].bbox[3] + 3,
      phrase,
      drawn_boxes, xmls_boxes, pageNum,
      field_type=field_type, border_color=border_color)

  # Lines strictly between first and last are boxed across their full width.
  for middle_line_index in range(first_line_index + 1, last_line_index):
    middle_line = text_lines[middle_line_index]
    _strip_to_chars(middle_line)
    middle_chars = middle_line._objs
    if not middle_chars:
      # A line without glyphs has no extent to draw.
      continue

    drawElement(
        middle_chars[0].bbox[0] - 2,
        middle_chars[0].bbox[1] - 3,
        middle_chars[-1].bbox[2] + 2,
        middle_chars[-1].bbox[3] + 3,
        phrase,
        drawn_boxes, xmls_boxes, pageNum,
        field_type=field_type, border_color=border_color)

  return True
  

In [5]:
def highLightWords(words, text_lines, drawn_boxes, xmls_boxes, pageNum,
           field_type=None, border_color="green", after_word=None):
  """Locate the phrase `words` in `text_lines` and register boxes for the first new hit.

  Lines are scanned in order. When `after_word` is given, lines are skipped until
  one containing it has been seen (that line itself is eligible). A candidate
  starts on a line containing `words[0]`; the remaining words are then matched
  left to right on that line and on the following lines, as long as each line
  contributes at least one more word. Once every word is matched, `draw_words`
  is asked to register the boxes.

  Every inspected line is reduced in place to its LTChar children.

  Returns:
    True as soon as `draw_words` succeeds for a candidate. False when no
    candidate succeeds, or when a candidate runs past the last line.
  """
  anchor_pending = after_word is not None
  line_count = len(text_lines)

  for start_index, start_line in enumerate(text_lines):
    start_line._objs = [item for item in start_line._objs
                        if isinstance(item, pdfminer.layout.LTChar)]
    start_text = start_line.get_text()

    if anchor_pending:
      if after_word not in start_text:
        continue
      anchor_pending = False

    if words[0] not in start_text:
      continue

    matched = []
    current_index = start_index

    while True:
      if current_index == line_count:
        # Ran off the end of the page while the phrase was still incomplete.
        return False

      current_line = text_lines[current_index]
      current_line._objs = [item for item in current_line._objs
                            if isinstance(item, pdfminer.layout.LTChar)]
      current_text = current_line.get_text()

      matched_before = len(matched)
      cursor = 0
      for candidate in words[matched_before:]:
        # Each word must appear at or after the position of the previous one.
        cursor = current_text.find(candidate, cursor)
        if cursor == -1:
          break
        matched.append(candidate)

      if len(matched) == len(words):
        if draw_words(start_index, current_index,
                      words, text_lines,
                      drawn_boxes, xmls_boxes, pageNum,
                      field_type=field_type, border_color=border_color):
          return True
        # Duplicate or undrawable hit: try the next starting line.
        break

      if len(matched) == matched_before:
        # This line added nothing, so the phrase is broken here.
        break

      current_index += 1

  return False


In [6]:
def drawElement(x0, y0, x1, y1, text_value, drawn_boxes, xmls_boxes, pageNum,
        field_type=None, border_color="green"):
  """Register one scaled bounding box unless the same (box, text) pair is already known.

  The coordinates are multiplied by the module-level `DPI_SCALE`, padded by 5 and
  truncated to int. Both y values are additionally shifted down by
  `10000 * pageNum`, so identical positions on different pages yield different boxes.

  A new pair is appended to `drawn_boxes`; when `field_type` is given, a record
  with the keys field_type / text_value / bbox / pageNum / border_color is also
  appended to `xmls_boxes`.

  Returns:
    True if the box was new, False if it was a duplicate (nothing is appended).
  """
  page_shift = 10000 * pageNum
  scaled_x0 = int(DPI_SCALE * x0 - 5)
  scaled_y0 = int(DPI_SCALE * y0 - 5) - page_shift
  scaled_x1 = int(DPI_SCALE * x1 + 5)
  scaled_y1 = int(DPI_SCALE * y1 + 5) - page_shift
  bbox = (scaled_x0, scaled_y0, scaled_x1, scaled_y1)

  entry = (bbox, text_value)
  if entry in drawn_boxes:
    return False

  drawn_boxes.append(entry)
  if field_type is not None:
    record = {'field_type': field_type, 'text_value': text_value,
              'bbox': bbox, "pageNum": pageNum, "border_color": border_color}
    xmls_boxes.append(record)
  return True


def parse_obj(lt_objs, text_lines_to_handle):
  """Collect text lines from horizontal text boxes and extend the global `rawText`.

  Only `LTTextBoxHorizontal` items of `lt_objs` are considered. Their children
  are visited from the highest `y1` to the lowest; lines whose text has more than
  5 characters are appended to `text_lines_to_handle` and their text is added to
  `rawText`. A trailing newline or space is replaced by a single space; any other
  final character is kept and followed by a space.

  Requires the module-level `rawText` to exist before the call.
  """
  global rawText

  for layout_item in lt_objs:
    if not isinstance(layout_item, pdfminer.layout.LTTextBoxHorizontal):
      continue

    top_down = sorted(layout_item._objs, key=lambda obj: obj.y1, reverse=True)
    for line in top_down:
      text = line.get_text()
      if len(text) <= 5:
        continue

      text_lines_to_handle.append(line)
      body, tail = text[:-1], text[-1:]
      if tail == "\n" or tail == " ":
        rawText += body + " "
      else:
        rawText += body + tail + " "


# Markup docs

In [7]:
def highlightObjects(objects, field_type):
  """Highlight every phrase in `objects` (each a list of words) as `field_type`.

  Works on the current page through the notebook globals `text_lines`,
  `drawnBoxes`, `xmlsBoxes` and `pageNum`; the box colour is taken from
  `border_colors[field_type]`.
  """
  for phrase_words in objects:
    highLightWords(
        phrase_words, text_lines, drawnBoxes, xmlsBoxes, pageNum,
        field_type=field_type,
        border_color=border_colors[field_type],
    )

def findAndHighlight(string, field_type):
  """Highlight every literal occurrence of `string` in the global `processedText`.

  `string` is matched as plain text, not as a regular expression: callers pass
  extracted names such as "Иванов И.И.", where an unescaped "." would match any
  character and characters like "(" or "[" could raise `re.error`.
  Each occurrence is split on single spaces and handed to `highlightObjects`.
  An empty `string` is ignored.
  """
  if not string:
    return

  occurrences = re.findall(re.escape(string), processedText)
  new_highlights = [occurrence.split(' ') for occurrence in occurrences]
  highlightObjects(new_highlights, field_type)
  

In [8]:
def handleNumbers(objects, im_height):
  """Pick the letter number among the 'Number' objects and relabel it.

  Candidates are 'Number' objects whose text is longer than 6 characters and
  whose `bbox[3]` is positive and above 80 % of `im_height`. The candidate with
  the largest `bbox[3]` (the earliest one on ties) gets its field type changed to
  'LetterNumber' in place.

  Returns:
    A list holding that single object, or an empty list when nothing qualifies.
  """
  candidates = [
      obj for obj in objects
      if obj['field_type'] == 'Number'
      and obj["bbox"][3] > 0
      and len(obj["text_value"]) > 6
      and obj["bbox"][3] > im_height * 0.8
  ]
  if not candidates:
    return []

  # max() keeps the first of equally placed candidates, like a stable descending sort.
  letter_number = max(candidates, key=lambda obj: obj['bbox'][3])
  letter_number['field_type'] = 'LetterNumber'
  return [letter_number]

def handleMoney(objects, im_height):
  """Return all 'Money' objects ordered by descending `bbox[3]`.

  `im_height` is accepted for signature parity with the other handlers and is
  not used. Objects with equal `bbox[3]` keep their input order.
  """
  money_objs = [obj for obj in objects if obj['field_type'] == 'Money']
  money_objs.sort(key=lambda obj: obj['bbox'][3], reverse=True)
  return money_objs

 
def handleDates(objects, im_height):
  """Classify 'Date' objects as debtor date / letter date and return the chosen ones.

  Only dates with a positive `bbox[3]` take part; they are processed by
  descending `bbox[3]`. Objects are relabelled in place.

  1. A date whose `bbox[3]` lies strictly between `bbox[1] - 20` and
     `bbox[3] + 40` of any 'Debtor' object becomes 'DebtorDate'.
  2. If a 'LetterNumber' object exists, the first date whose `bbox[3]` lies
     strictly between its `bbox[1] - 100` and `bbox[3] + 40` becomes
     'LetterDate'. Otherwise the top-most date becomes 'LetterDate' if it is
     still a plain 'Date'.

  `im_height` is not used.

  Returns:
    Up to two objects: the first 'DebtorDate' (if any) followed by the first
    'LetterDate' (if any).
  """
  def by_edge_desc(items):
    return sorted(items, key=lambda obj: obj['bbox'][3], reverse=True)

  date_candidates = by_edge_desc(
      [obj for obj in objects if obj['field_type'] == 'Date' and obj["bbox"][3] > 0])
  debtor_boxes = by_edge_desc(
      [obj for obj in objects if obj['field_type'] == 'Debtor'])

  for date_obj in date_candidates:
    edge = date_obj["bbox"][3]
    if any(debtor["bbox"][1] - 20 < edge < debtor["bbox"][3] + 40
           for debtor in debtor_boxes):
      date_obj['field_type'] = 'DebtorDate'

  selected = []

  debtor_dated = [obj for obj in date_candidates if obj['field_type'] == 'DebtorDate']
  if debtor_dated:
    selected.append(debtor_dated[0])

  letter_numbers = [obj for obj in objects if obj['field_type'] == 'LetterNumber']
  if letter_numbers:
    anchor = letter_numbers[0]
    # NOTE(review): this pass also considers dates already marked 'DebtorDate';
    # such a date is relabelled and then appears twice in the result — confirm intent.
    for date_obj in date_candidates:
      if anchor["bbox"][1] - 100 < date_obj["bbox"][3] < anchor["bbox"][3] + 40:
        date_obj['field_type'] = 'LetterDate'
        break
  elif date_candidates and date_candidates[0]['field_type'] == 'Date':
    date_candidates[0]['field_type'] = 'LetterDate'

  letter_dated = [obj for obj in date_candidates if obj['field_type'] == 'LetterDate']
  if letter_dated:
    selected.append(letter_dated[0])

  return selected


def handlePersons(objects, im_height):
  """Decide which detected persons are the judge and the debtor.

  Candidate "top" persons are 'Person' objects that contain at least three
  capital Cyrillic letters, contain a space or a dot, do not mention 'мировой',
  and have `bbox[3]` above 60 % of `im_height`. The two with the largest
  `bbox[3]` are kept, together with the single top-most 'Debtor' object.

  * With a debtor present, the first top person becomes 'Judge' unless
    `checkSimilarity` says it matches the debtor's text.
  * Without a debtor, the first top person becomes 'Judge' and the second
    (if any) becomes 'Debtor'.

  When a debtor name is known it is highlighted via `findAndHighlight`, and every
  remaining 'Person' object similar to it is relabelled 'Debtor' and highlighted
  too. Objects are modified in place.

  Returns:
    The relabelled top persons followed by the top debtor (if any).
  """
  def is_top_person(obj):
    if obj['field_type'] != 'Person':
      return False
    text = obj['text_value']
    if len(re.findall(r"[А-Я]", text)) < 3:
      return False
    if " " not in text and "." not in text:
      return False
    if 'мировой' in text.lower():
      return False
    return obj["bbox"][3] > im_height * 0.6

  def edge(obj):
    return obj["bbox"][3]

  top_person_objs = sorted(filter(is_top_person, objects), key=edge, reverse=True)[:2]
  top_debtors_objs = sorted(
      (obj for obj in objects if obj['field_type'] == 'Debtor'),
      key=edge, reverse=True)[:1]

  if top_person_objs:
    leading_person = top_person_objs[0]
    if top_debtors_objs:
      top_person = leading_person["text_value"]
      debtor_person = top_debtors_objs[0]["text_value"]
      if not checkSimilarity(top_person, debtor_person):
        leading_person['field_type'] = 'Judge'
    else:
      leading_person['field_type'] = 'Judge'
      if len(top_person_objs) > 1:
        top_person_objs[1]['field_type'] = 'Debtor'

  # Only objects that received a role are reported.
  top_person_objs = [obj for obj in top_person_objs if obj['field_type'] != 'Person']
  top_debtors_objs = [obj for obj in top_debtors_objs if obj['field_type'] != 'Person']

  if top_debtors_objs:
    debtor_name = top_debtors_objs[0]["text_value"]
  elif len(top_person_objs) > 1:
    debtor_name = top_person_objs[1]["text_value"]
  else:
    debtor_name = ''

  if debtor_name != '':
    findAndHighlight(debtor_name, 'Debtor')
    remaining_persons = [obj for obj in objects if obj['field_type'] == 'Person']
    for person_obj in remaining_persons:
      person_name = person_obj["text_value"]
      if checkSimilarity(person_name, debtor_name):
        person_obj['field_type'] = 'Debtor'
        findAndHighlight(person_name, 'Debtor')

  return top_person_objs + top_debtors_objs


def handleOrgs(objects, im_height, im_width):
  """Return the organisations found in the upper part of the page.

  Keeps 'Org' objects whose text contains a space or an opening «, and whose
  `bbox[3]` is above 60 % of `im_height`, ordered by descending `bbox[3]`.
  `im_width` is not used.
  """
  def is_top_org(obj):
    if obj['field_type'] != "Org":
      return False
    text = obj["text_value"]
    if " " not in text and "«" not in text:
      return False
    return obj["bbox"][3] > im_height * 0.6

  top_orgs_objs = [obj for obj in objects if is_top_org(obj)]
  top_orgs_objs.sort(key=lambda obj: obj["bbox"][3], reverse=True)
  return top_orgs_objs


def saveToXML(objects, docName, images, xml, maxPageNum):
  """Post-process the detected objects, draw them on the page images and write XML rows.

  Args:
    objects: records as produced by `drawElement` (keys field_type, text_value,
      bbox, pageNum, border_color).
    docName: base name used for the saved images ("results/<docName>_<page>.jpg").
    images: one PIL image per page; the size of the first one is used by the
      handle* classifiers.
    xml: an open text stream; one element line is written per drawn object.
    maxPageNum: number of pages (starting at 0) to render and save.

  Exact duplicates (same text and bbox) are dropped, the handle* functions pick
  and relabel the objects to report, and only those are drawn and written.
  Colours come from the module-level `border_colors`.
  """
  font = ImageFont.truetype("Arsenal-Regular.otf", 20)

  # Keep the first occurrence of every (text, bbox) pair.
  seen = set()
  unique_objects = []
  for obj in objects:
    key = (obj['text_value'], obj['bbox'])
    if key not in seen:
      seen.add(key)
      unique_objects.append(obj)
  objects = unique_objects

  first_page_width, first_page_height = images[0].size
  nums_objs = handleNumbers(objects, first_page_height)
  persons_objs = handlePersons(objects, first_page_height)
  dates_objs = handleDates(objects, first_page_height)
  money_objs = handleMoney(objects, first_page_height)
  orgs_objs = handleOrgs(objects, first_page_height, first_page_width)

  objects = persons_objs + dates_objs + nums_objs + money_objs + orgs_objs

  for pageNum in range(0, maxPageNum):
    page_image = images[pageNum]
    im_height = page_image.size[1]
    image_drawer = ImageDraw.Draw(page_image)

    for obj in objects:
      if obj["pageNum"] != pageNum:
        continue

      # Undo the per-page shift applied by drawElement and flip the y axis so that
      # the origin is the top-left corner of the image.
      bbox = (obj["bbox"][0], int(im_height - obj["bbox"][1] - pageNum * 10000),
          obj["bbox"][2], int(im_height - obj["bbox"][3] - pageNum * 10000))
      color = border_colors[obj['field_type']]

      # Random vertical offset on one edge of the outline (as before).
      outline_edge = bbox[3] + random.randint(2, 10)

      # Pillow documents rectangle coordinates as x1 >= x0 and y1 >= y0, so the
      # corners are ordered explicitly instead of passing them flipped.
      image_drawer.rectangle(
          (min(bbox[0], bbox[2]), min(bbox[1], outline_edge),
           max(bbox[0], bbox[2]), max(bbox[1], outline_edge)),
          outline=color, width=3)

      # White strip of height 20 above the box as background for the label.
      image_drawer.rectangle(
          (min(bbox[0], bbox[2]), bbox[3] - 20, max(bbox[0], bbox[2]), bbox[3]),
          fill="white")

      image_drawer.text((bbox[0], bbox[3]-20), obj['field_type'] + " | " + obj["text_value"],
                font = font, fill=color)

      # '&' must be escaped inside an attribute or the file is not well-formed XML;
      # quotes and angle brackets are dropped as before.
      value = (obj["text_value"].replace('&', '&amp;')
               .replace('"', '').replace('<', '').replace('>', ''))

      # NOTE(review): `left` is filled with bbox[2] and `top` with bbox[1], which are
      # the right and lower image edges — confirm what the consumer of this XML expects.
      xml.write('  <{field_type} value="{value}" confidence="100" page="{page}" left="{x}" top="{y}" width="{w}" height="{h}"/>' \
            .format(field_type=obj['field_type'], value=value,
                x=bbox[2], y=bbox[1], w=bbox[2]-bbox[0], h=bbox[1]-bbox[3], page=pageNum)
            + '\n')

    page_image.save("results/" + docName + "_" + str(pageNum) + ".jpg", "JPEG")
    

In [9]:
if __name__ == '__main__':
  # ---- Configuration -------------------------------------------------------
  MODEL_NAME = 'model'
  DPI_SCALE = 4.17        # PDF points -> image pixels; also read by drawElement
  STANDART_DPI = 72
  PDF_PATH = "input/"
  HANDLED_PATH = "handled/"
  SLEEP_TIME = 5

  # Outline/label colour per field type; read by highlightObjects and saveToXML.
  border_colors = {
    'Number': 'red',
    'Date': 'deepskyblue',
    'Org': 'blue',
    'Money': 'green',
    'Debtor': 'orange',
    'Person': 'black',
    'Judge': 'yellow',
    'DebtorDate': 'purple',
    'LetterNumber': 'pink',
    'LetterDate': 'gray'
  }

  la_params = LAParams()
  la_params.line_margin = 1.6
  la_params.boxes_flow = 0.5

  # ---- Working directories (old results are wiped on every start) ----------
  createDirIfNotExist(PDF_PATH)
  createDirIfNotExist(HANDLED_PATH)
  createDir("results/", ".jpg")
  createDir("xmls/", ".xml")

  # ---- NER model -----------------------------------------------------------
  navec = Navec.load('vocab.tar')
  ner = NER.load(MODEL_NAME + '.tar')
  ner.navec(navec)

  while True:

    for doc_name in get_files(PDF_PATH, ".pdf"):
      images = convert_from_path(PDF_PATH + doc_name, dpi = STANDART_DPI * DPI_SCALE)

      # Context managers close the PDF and the XML file even when a page fails,
      # which matters in a long-lived kernel / endless loop.
      with open(PDF_PATH + doc_name, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)

        with io.open("xmls/" + doc_name.replace('.pdf', '') + ".xml", "w", encoding="utf-8") as xml:
          xml.write('<?xml version="1.0" encoding="UTF-8"?>' + '\n')
          xml.write('<idcard>' + '\n')

          # Per-document state shared with the helper functions through globals.
          drawnBoxes = []
          xmlsBoxes = []
          maxPageNum = 0

          for pageNum, page in enumerate(PDFPage.create_pages(document)):
            actName = doc_name.split(".")[0] + "_" + str(pageNum)
            print(actName)

            # Per-page state; parse_obj appends to rawText via `global`.
            rawText = ""
            text_lines = []

            rsr_mgr = PDFResourceManager()
            device = PDFPageAggregator(rsr_mgr, laparams=la_params)

            interpreter = PDFPageInterpreter(rsr_mgr, device)
            interpreter.process_page(page)

            layout = device.get_result()
            parse_obj(layout._objs, text_lines)

            # Runs of underscores (blank form fields) become a single space.
            processedText = re.sub(r"__+", r" ", rawText)

            # The first page without text ends processing of this document;
            # later pages are not looked at.
            if len(processedText) == 0:
              break

            markup = ner(processedText)

            extracted_nums = extractors.extractNums(processedText)
            highlightObjects(extracted_nums, 'Number')

            extracted_dates = extractors.extractDates(processedText)
            highlightObjects(extracted_dates, 'Date')

            extracted_orgs, extracted_persons = extractors.extractOrgs(markup)
            highlightObjects(extracted_orgs, 'Org')
            # highlightObjects(extracted_persons, 'Person')

            extracted_persons = extractors.extractPersons(processedText, markup)
            highlightObjects(extracted_persons, 'Person')

            extracted_debtors = extractors.extractDebtors(processedText)
            highlightObjects(extracted_debtors, 'Debtor')

            extracted_money = extractors.extractMoney(processedText)
            highlightObjects(extracted_money, 'Money')

            # extracted_addresses = extractors.extractAddresses(text_lines)
            # print(extracted_addresses)

            maxPageNum += 1

          saveToXML(xmlsBoxes, doc_name.replace('.pdf', ''), images, xml, maxPageNum)
          xml.write('</idcard>' + '\n')

      # NOTE(review): moving handled files is disabled, so every pass of the outer
      # loop would process the same PDFs again.
      # move(PDF_PATH + doc_name, HANDLED_PATH + doc_name)

    # NOTE(review): this sleeps for roughly 115 days, i.e. the loop effectively runs
    # once — presumably a stop-gap while `move` is disabled; confirm the intended interval.
    time.sleep(SLEEP_TIME + 10000000)

1_0
2_0
3_0
4_0
6_0
7_0
9_0
Scan2021-12-08_145113_0
Scan2021-12-08_145113_006_0
Scan2021-12-08_145113_007_0
Scan2021-12-08_145113_012_0
Scan2021-12-08_145249_0
Scan2021-12-08_145249_002_0
Scan2021-12-08_145249_003_0
Scan2021-12-08_145249_005_0
Scan2021-12-08_145249_007_0
Scan2021-12-08_145249_010_0
Scan2021-12-08_145249_012_0
Scan2021-12-08_145249_013_0
Scan2021-12-08_145249_015_0
Scan2021-12-08_145249_016_0
Scan2021-12-08_145249_018_0
Scan2021-12-08_145249_020_0
Scan2021-12-08_145249_021_0
Scan2021-12-08_145249_023_0
Scan2021-12-08_145249_025_0
Scan2021-12-08_145249_026_0
Scan2021-12-08_145249_027_0
Scan2021-12-08_145420_0
Scan2021-12-08_145420_1
Scan2021-12-08_145420_001_0
Scan2021-12-08_145420_001_1
Scan2021-12-08_145420_002_0
Scan2021-12-08_145420_002_1
Scan2021-12-08_145420_006_0
Scan2021-12-08_145420_006_1


KeyboardInterrupt: 