In [18]:
import re
import json
import base64
import sys, fitz
import pytesseract

from pdf2image import convert_from_bytes
from PIL import Image
from time import time
from io import BytesIO
from textsearch import TextSearch

In [19]:
def read_text(base64_string: str = ''):
    """Extract text from a base64-encoded PDF or image.

    Three strategies are tried in order; the first one that produces
    non-empty text wins:

    1. ``'PDF'``       - text layer read with PyMuPDF (``fitz``).
    2. ``'PDF_Image'`` - pages rasterised with ``convert_from_bytes`` and
       OCR'd with ``pytesseract``.
    3. ``'Image'``     - payload opened with ``PIL.Image`` and OCR'd.

    Each strategy is deliberately best-effort: any ordinary error just
    moves on to the next one. Text is only accepted from a strategy that
    ran to completion, so a failure halfway through no longer blocks the
    remaining fallbacks.

    Args:
        base64_string: Base64-encoded file contents (``str`` or ``bytes``).

    Returns:
        ``(content_type, content)`` on success, otherwise
        ``('Failed', 'Cannot read uploaded document')``.
    """
    failure = ('Failed', 'Cannot read uploaded document')
    ocr_dpi = 500  # rasterisation resolution handed to convert_from_bytes

    # Decode once; every strategy below works on the same raw bytes.
    try:
        raw = base64.b64decode(base64_string)
    except (ValueError, TypeError):
        # binascii.Error is a ValueError; TypeError covers non str/bytes input.
        return failure

    if not raw:
        return failure

    # 1) Try to extract with PDF reader
    try:
        with fitz.open(stream=raw, filetype="pdf") as doc:
            text = ''.join(page.get_text() for page in doc)
        if text != '':
            return ('PDF', text)
    except Exception:
        # Not a readable PDF; fall through to the OCR strategies.
        pass

    # 2) Try to extract with PDF to Image OCR
    try:
        pages = convert_from_bytes(raw, ocr_dpi)
        text = ''.join(pytesseract.image_to_string(page) for page in pages)
        if text != '':
            return ('PDF_Image', text)
    except Exception:
        # Payload could not be rasterised as a PDF; try it as a plain image.
        pass

    # 3) Try to extract with Image OCR
    try:
        with Image.open(BytesIO(raw)) as image:
            text = pytesseract.image_to_string(image)
        if text != '':
            return ('Image', text)
    except Exception:
        # Not an image either; report failure below.
        pass

    return failure

def clean_up_text(text: str = ''):
    """Normalise text for keyword matching.

    Every character that is neither a word character nor whitespace is
    replaced by a space, every run of whitespace (including a single tab,
    carriage return or form feed) is collapsed to one space, and the result
    is lower-cased and stripped.

    Args:
        text: Raw text to normalise.

    Returns:
        The normalised, lower-case, single-space-separated string.
    """
    without_punctuation = re.sub(r'[^\w\s]', ' ', text)
    # `\s+` (rather than `\s\s+`) also normalises a lone tab / CR / form feed,
    # so that splitting the result on ' ' yields clean tokens.
    return re.sub(r'\s+', ' ', without_punctuation).lower().strip()

def extract(base64_string: str = '', search: dict = None):
    """Read a base64-encoded document and look up keywords in its text.

    The document text is obtained with ``read_text`` and normalised with
    ``clean_up_text``. Its words are loaded into a case-insensitive
    ``TextSearch`` index, and every query in ``search`` is normalised the
    same way and run against that index.

    Args:
        base64_string: Base64-encoded file contents.
        search: Mapping of property name -> query string. ``None`` (the
            default) is treated as an empty mapping.

    Returns:
        The result dict, which is also printed as indented JSON:
        ``{'error': bool, 'message': str, 'data': dict | None}``.
        ``data`` is filled only when at least one query produced matches.
    """
    (content_type, content_text) = read_text(base64_string)

    # NOTE(review): when the document is readable but no keyword matches,
    # `message` keeps the raw extracted text - confirm this is intended.
    result = {
        'error': True,
        'message': content_text,
        'data': None
    }

    if content_type != 'Failed':

        content = clean_up_text(content_text)
        text_search = TextSearch(case="ignore", returns="match")
        keywords = {}

        # Index every non-empty word of the document in a single call.
        words = [word for word in content.split(' ') if word]
        if words:
            text_search.add(words)

        for prop, query in (search or {}).items():
            cleaned_query = clean_up_text(query)
            matches = text_search.findall(cleaned_query)

            if not matches:
                continue

            # `start` is -1 when the individual words were found but the
            # whole phrase does not occur contiguously in the text; report
            # -1 for `end` as well instead of a meaningless offset.
            start = content.find(cleaned_query)
            length = len(' '.join(matches))
            end = (start + length) if start >= 0 else -1

            keywords[prop] = {
                'position': {
                    'start': start,
                    'end': end,
                    'length': length,
                },
                'query': query,
                'property': prop,
                'similar': matches
            }

        text_found = len(keywords)

        if text_found > 0:
            result['error'] = False
            result['message'] = 'Found similar string for {} keywords'.format(text_found)
            result['data'] = {
                'type': content_type,
                'results': {
                    'similar': text_found,
                    'keywords': keywords
                },
                'text': content,
            }

    print(json.dumps(result, indent = 2))
    return result

In [21]:
# Base64-encode the sample file, then search its extracted text for two keywords.
with open('./test/images/01-handwritten.png', 'rb') as file:
    encoded_image = base64.b64encode(file.read())

# The file is fully read above, so the search runs after it has been closed.
extract(
    encoded_image,
    {'sample': 'HandWritten', 'sample2': 'good'},
)

{
  "error": false,
  "message": "Found similar string for 1 keywords",
  "data": {
    "type": "Image",
    "results": {
      "similar": 1,
      "keywords": {
        "sample": {
          "position": {
            "start": 10,
            "end": 21,
            "length": 11
          },
          "query": "HandWritten",
          "property": "sample",
          "similar": [
            "handwritten"
          ]
        }
      }
    },
    "text": "this is a handwritten example write as qooal as you can"
  }
}


In [23]:
# Base64-encode the sample file, then search its extracted text for two keywords.
with open('./test/images/02-receipt.png', 'rb') as file:
    encoded_image = base64.b64encode(file.read())

# The file is fully read above, so the search runs after it has been closed.
extract(
    encoded_image,
    {'sample': 'G2 RETRACT BOLD', 'sample2': 'Reg 1/4.69'},
)

{
  "error": false,
  "message": "Found similar string for 2 keywords",
  "data": {
    "type": "Image",
    "results": {
      "similar": 2,
      "keywords": {
        "sample": {
          "position": {
            "start": 179,
            "end": 194,
            "length": 15
          },
          "query": "G2 RETRACT BOLD",
          "property": "sample",
          "similar": [
            "g2",
            "retract",
            "bold"
          ]
        },
        "sample2": {
          "position": {
            "start": 222,
            "end": 232,
            "length": 10
          },
          "query": "Reg 1/4.69",
          "property": "sample2",
          "similar": [
            "reg",
            "1",
            "4",
            "69"
          ]
        }
      }
    },
    "text": "store 05666 3515 del mar hts rd san diego ca 92130 858 792 7040 register 4 transaction 571140 cashier 56661020 8 20 17 5 45pm wellnesst with plenti plenti card 31xxxxxxxxxx4553 1 g2 retrac

In [298]:
# Base64-encode the sample file, then search its extracted text for two keywords.
with open('./test/images/03-numeric.png', 'rb') as file:
    encoded_image = base64.b64encode(file.read())

# The file is fully read above, so the search runs after it has been closed.
extract(
    encoded_image,
    {'sample': '01234567890', 'sample2': 'STUFF'},
)

{
  "error": false,
  "message": "Found similar string for 2 keywords",
  "data": {
    "type": "Image",
    "results": {
      "similar": 2,
      "keywords": {
        "sample": {
          "position": {
            "start": 19,
            "end": 30,
            "length": 11
          },
          "query": "01234567890",
          "property": "sample",
          "similar": [
            "01234567890"
          ]
        },
        "sample2": {
          "position": {
            "start": 13,
            "end": 18,
            "length": 5
          },
          "query": "STUFF",
          "property": "sample2",
          "similar": [
            "stuff"
          ]
        }
      }
    },
    "text": "explain that stuff 01234567890"
  }
}


In [299]:
# Base64-encode the sample file, then search its extracted text for two keywords.
with open('./test/images/04-paragraph.png', 'rb') as file:
    encoded_image = base64.b64encode(file.read())

# The file is fully read above, so the search runs after it has been closed.
extract(
    encoded_image,
    {'sample': 'Infinite.', 'sample2': 'Very Unlikely'},
)

{
  "error": false,
  "message": "Found similar string for 2 keywords",
  "data": {
    "type": "Image",
    "results": {
      "similar": 2,
      "keywords": {
        "sample": {
          "position": {
            "start": 172,
            "end": 180,
            "length": 8
          },
          "query": "Infinite.",
          "property": "sample",
          "similar": [
            "infinite"
          ]
        },
        "sample2": {
          "position": {
            "start": 308,
            "end": 321,
            "length": 13
          },
          "query": "Very Unlikely",
          "property": "sample2",
          "similar": [
            "very",
            "unlikely"
          ]
        }
      }
    },
    "text": "problem now we are creating an ocr for handwritten bengali text the main problem arises due to the fact that we are doing it for handwritten text so our sample set is very infinite also different samples have different characteristics the handwriting sampl

In [300]:
# Base64-encode the sample file, then search its extracted text for two keywords.
with open('./test/images/05-gradient.png', 'rb') as file:
    encoded_image = base64.b64encode(file.read())

# The file is fully read above, so the search runs after it has been closed.
extract(
    encoded_image,
    {'sample': 'Q 5850', 'sample2': '1-ID 5850'},
)

{
  "error": false,
  "message": "Found similar string for 2 keywords",
  "data": {
    "type": "Image",
    "results": {
      "similar": 2,
      "keywords": {
        "sample": {
          "position": {
            "start": 88,
            "end": 94,
            "length": 6
          },
          "query": "Q 5850",
          "property": "sample",
          "similar": [
            "q",
            "5850"
          ]
        },
        "sample2": {
          "position": {
            "start": 150,
            "end": 159,
            "length": 9
          },
          "query": "1-ID 5850",
          "property": "sample2",
          "similar": [
            "1",
            "id",
            "5850"
          ]
        }
      }
    },
    "text": "having just recently reviewed the ati radeo we were keen to get our hands on the cheape q 5850 though both cards were announced week until the radeon 1 id 5850 could be sh have finally dug up a production model from"
  }
}


In [301]:
# Base64-encode the sample file, then search its extracted text for two keywords.
with open('./test/images/06-scan-document.pdf', 'rb') as file:
    encoded_image = base64.b64encode(file.read())

# The file is fully read above, so the search runs after it has been closed.
extract(
    encoded_image,
    {'sample': 'Dr. P.N. Cundall,', 'sample2': 'P.J. CROSS'},
)

{
  "error": false,
  "message": "Found similar string for 2 keywords",
  "data": {
    "type": "PDF_Image",
    "results": {
      "similar": 2,
      "keywords": {
        "sample": {
          "position": {
            "start": 144,
            "end": 158,
            "length": 14
          },
          "query": "Dr. P.N. Cundall,",
          "property": "sample",
          "similar": [
            "dr",
            "p",
            "n",
            "cundall"
          ]
        },
        "sample2": {
          "position": {
            "start": 995,
            "end": 1004,
            "length": 9
          },
          "query": "P.J. CROSS",
          "property": "sample2",
          "similar": [
            "p",
            "j",
            "cross"
          ]
        }
      }
    },
    "text": "the slerexe company limited sapors lane boole dorset bh 25 8 er telephone boole 945 13 51617 telex 123456 our ref 350 pjc eac 18th january 1972 dr p n cundall mining surveys ltd holroy

In [302]:
# Base64-encode the sample file, then search its extracted text for two keywords.
with open('./test/images/07-scan-monochrome.pdf', 'rb') as file:
    encoded_image = base64.b64encode(file.read())

# The file is fully read above, so the search runs after it has been closed.
extract(
    encoded_image,
    {'sample': '"Fi-5900C"', 'sample2': 'Minus 6%'},
)

{
  "error": false,
  "message": "Found similar string for 2 keywords",
  "data": {
    "type": "PDF_Image",
    "results": {
      "similar": 2,
      "keywords": {
        "sample": {
          "position": {
            "start": 717,
            "end": 725,
            "length": 8
          },
          "query": "\"Fi-5900C\"",
          "property": "sample",
          "similar": [
            "fi",
            "5900c"
          ]
        },
        "sample2": {
          "position": {
            "start": 1481,
            "end": 1488,
            "length": 7
          },
          "query": "Minus 6%",
          "property": "sample2",
          "similar": [
            "minus",
            "6"
          ]
        }
      }
    },
    "text": "pfu business report new customer s development and increasing the sale of product my country economy at this season keeps escaping from odoba of business though holds a crude oil high so on unstable element that continues still and recovering g

In [303]:
# Base64-encode the sample file, then search its extracted text for two keywords.
with open('./test/pdf/01-simple.pdf', 'rb') as file:
    encoded_image = base64.b64encode(file.read())

# The file is fully read above, so the search runs after it has been closed.
extract(
    encoded_image,
    {'sample': 'Box 2703 Whitehorse', 'sample2': 'http://www.education.gov.yk.ca'},
)

{
  "error": false,
  "message": "Found similar string for 2 keywords",
  "data": {
    "type": "PDF",
    "results": {
      "similar": 2,
      "keywords": {
        "sample": {
          "position": {
            "start": 255,
            "end": 274,
            "length": 19
          },
          "query": "Box 2703 Whitehorse",
          "property": "sample",
          "similar": [
            "box",
            "2703",
            "whitehorse"
          ]
        },
        "sample2": {
          "position": {
            "start": 324,
            "end": 352,
            "length": 28
          },
          "query": "http://www.education.gov.yk.ca",
          "property": "sample2",
          "similar": [
            "http",
            "www",
            "education",
            "gov",
            "yk",
            "ca"
          ]
        }
      }
    },
    "text": "pdf test file congratulations your computer is equipped with a pdf portable document format reader you should be a