In [1]:
import re
import json
import html_to_json
import sys, fitz
import spacy # flair / nltk (NER)

import en_core_web_sm

from time import time
from markdown import markdown as markdown_to_html

In [2]:
# Load the spaCy pipeline once for the whole notebook; extract() calls `nlp(...)`
# on the extracted text and reads the entities from `.ents`.
nlp = en_core_web_sm.load()

In [34]:
def _join_wrapped_line(match):
    """Return the replacement text for one line-break match in ``read_text``."""
    if match.group(3) is not None:
        # Hyphenated break ("exam-\nple"): drop the hyphen and the newline but
        # keep the character after it so the word is re-joined ("example").
        # The previous template referenced groups 1 and 2, which are unset for
        # this branch, so the break AND the following character were replaced
        # by a lone space ("exam le").
        return match.group(4)

    # Soft wrap (a whitespace character right before the newline): keep both
    # neighbouring characters and collapse the break into a single space.
    return match.group(1) + ' ' + match.group(2)


def read_text(file: str = ''):
    """Read the plain text of every page of a PDF and flag journal-like files.

    For each page the text returned by ``page.get_text()`` has its wrapped
    lines re-joined (see ``_join_wrapped_line``); the pages are then
    concatenated in order with no separator between them.

    Parameters
    ----------
    file : str
        Path passed unchanged to ``fitz.open``.

    Returns
    -------
    tuple
        ``(isJournal, content)`` where ``isJournal`` is ``True`` when at least
        one page contains ``Abstract``, ``Abstrak``, ``ABSTRACT`` or
        ``ABSTRAK``, and ``content`` is the concatenated page text.
    """
    isJournal = False
    pages = []

    # Open the document exactly once and close it when done; the earlier
    # version opened the file twice and never released either handle.
    with fitz.open(file) as doc:
        for page in doc:
            # get_text() already returns a str, so no encode/decode round trip.
            text = page.get_text()

            result = re.sub(
                r'([\w\W])\s\n([\w\W])|(\-)\n([\w\W])', _join_wrapped_line, text
            )

            if re.search(r'\s?(Abstract|Abstrak|ABSTRACT|ABSTRAK)\s?', result):
                isJournal = True

            pages.append(result)

    return ( isJournal, ''.join(pages) )

# Roman-numeral sub-patterns shared by the clean-up rules below. Both can match
# the empty string, so a rule built on them also fires when no numeral is
# present (e.g. any ".\n" followed by a capital letter is joined by rule 1).
_ROMAN_LOWER = r'(m{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3}))'
_ROMAN_UPPER = r'(M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3}))'

# (pattern, replacement) pairs applied to journal text strictly in this order;
# every rule runs with re.MULTILINE.
_JOURNAL_RULES = (
    # 1. "<NUM>.\nTitle"  ->  "<NUM>. Title"   (upper-case roman or digits)
    (r'(' + _ROMAN_UPPER + r'|\d+)\.\n([A-Z])', r'\g<1>. \g<6>'),
    # 2. "<num>)\nTitle"  ->  "<num>) Title"   (lower-case roman or digits)
    (r'(' + _ROMAN_LOWER + r'|\d+)\)\n([A-Z])', r'\g<1>) \g<6>'),
    # 3. "[label]\nnext line\n"  ->  "[label](next line)\n"
    (r'\[(.*)\]\s?\n(.*?)\n', '[\\g<1>](\\g<2>)\n'),
    # 4. Start a new line before an inline "<numeral>. " marker.
    (r'\s(' + _ROMAN_LOWER + r'|' + _ROMAN_UPPER + r'|\d+)\.\s', '\n\\g<1>. '),
    # 5. Remove runs of two or more dots.
    (r'\.{2,}', ''),
    # 6. A line holding a single letter is glued to the following letter.
    (r'^([a-zA-Z])\n([a-zA-Z])', r'\g<1>\g<2>'),
    # 7. A line holding a single digit becomes "<digit>. " + following text.
    (r'^([0-9])\n([a-zA-Z])', r'\g<1>. \g<2>'),
)


def extract(file: str = ''):
    """Extract text from a PDF, run NER on it and print both.

    ``read_text(file)`` supplies ``(journal, content)``. When ``journal`` is
    true the text is passed through ``_JOURNAL_RULES`` and then filtered line
    by line: a line is kept only if it starts with an ASCII letter, a digit or
    ``[``, is at least 8 characters long, and contains no run of three or more
    whitespace characters. Kept lines are separated by a blank line, and a
    paragraph break that follows at least 30 characters and precedes a
    lower-case letter is collapsed into a space. Otherwise the text is used
    as returned by ``read_text``.

    The resulting text is fed to the module-level ``nlp`` pipeline and the
    text plus a list of ``{'label', 'text'}`` dicts (one per entity in
    ``.ents``) is printed. The function returns ``None``.

    Parameters
    ----------
    file : str
        Path forwarded to ``read_text``.
    """
    ( journal, content ) = read_text(file)

    if journal:
        for pattern, replacement in _JOURNAL_RULES:
            content = re.sub(pattern, replacement, content, count=0, flags=re.MULTILINE)

        kept = []

        for line in content.split('\n'):
            # Three or more consecutive whitespace characters mark a "formula" line.
            hasFormula = re.search(r'\s{2}\s+', line)
            startsLikeText = re.search(r'^([0-9a-zA-Z]|\[)', line)

            # len(line) >= 8 is the integer form of the former "> 7.5" check.
            if startsLikeText and len(line) >= 8 and hasFormula is None:
                kept.append(line + '\n\n')

        # Re-join a paragraph whose continuation starts with a lower-case letter.
        result = re.sub(r'([\w\W]{30})\n{2}([a-z])', r'\g<1> \g<2>', ''.join(kept))

    else:
        result = content

    nlp_result = nlp(result)
    nlp_entities = [
        {'label': item.label_, 'text': item.text}
        for item in nlp_result.ents
    ]

    print(result, nlp_entities)




In [35]:
# Smoke test on the simple sample PDF: prints the extracted text followed by the entity list.
extract('./test/pdf/01-simple.pdf')

 
 PDF Test File  
Congratulations, your computer is equipped with a PDF (Portable Document Format) reader!  You should be able to view any of the PDF documents and forms available on our site.  PDF forms are indicated by these icons:   or  .    
Yukon Department of Education Box 2703 Whitehorse,Yukon Canada Y1A 2C6  
Please visit our website at:  http://www.education.gov.yk.ca/
   
 [{'label': 'ORG', 'text': 'PDF Test File'}, {'label': 'ORG', 'text': 'PDF'}, {'label': 'PERSON', 'text': 'Document Format'}, {'label': 'ORG', 'text': 'PDF'}, {'label': 'ORG', 'text': 'PDF'}, {'label': 'ORG', 'text': 'Yukon Department of Education'}, {'label': 'CARDINAL', 'text': '2703'}, {'label': 'ORG', 'text': 'Yukon Canada'}, {'label': 'CARDINAL', 'text': '2C6'}]


In [None]:
# Same check on the text-plus-image sample PDF (cell has no recorded output yet).
extract('./test/pdf/02-text-image.pdf')

In [None]:
# Same check on the invoice sample PDF (cell has no recorded output yet).
extract('./test/pdf/03-invoice.pdf')

In [None]:
# Same check on the journal sample PDF; presumably the case meant to exercise the
# journal clean-up branch of extract() - confirm once the cell has been run.
extract('./test/pdf/04-journal.pdf')

In [None]:
# Same check on the complex-layout sample PDF (cell has no recorded output yet).
extract('./test/pdf/05-complex.pdf')