# Document Extraction

Extract and normalize any information that available inside document, image, or scanned document. The information include extension, mime type, text content, colors, angle rotation, and file size.

#### Import any used dependencies

In [64]:
import re
import os
import cv2
import textract
import tempfile
import mimetypes
import pytesseract
import numpy

from PIL import Image, ImageStat
from colorthief import ColorThief
from deskew import determine_skew as DetermineSkew
from skimage.color import rgb2gray as Grayscale
from skimage.transform import rotate as Rotate
from pdf2image import convert_from_path as ConvertToImage

#### Straighten Image

Utility function to rotate or tilt the image from unsuitable to proper rotation, to help OCR (Optical Character Recognition) engine to read the text.

In [65]:
def straighten(filepath: str = '', extension: str = '.jpg'):
    image     = cv2.imread(filepath, cv2.IMREAD_COLOR)
    grayscale = Grayscale(image)
    angle     = DetermineSkew(grayscale)
    rotated   = Rotate(image, angle, resize = True) * 255
    result    = rotated.astype(numpy.uint8)

    temporary_filename = tempfile.gettempdir() + extension

    cv2.imwrite(temporary_filename, result)

    return (temporary_filename, angle)

#### Color Extraction

Utility function to extract any possible color and type of image (grayscale, colorize, or black white).

In [66]:
def colors(filepath, thumb_size=40, error_tolerant=22, adjust_color_bias=True):
    color = ColorThief(filepath)

    grayscale = False
    colorize = False
    black_white = False

    image = Image.open(filepath)
    bands = image.getbands()

    if bands == ('R', 'G', 'B') or bands == ('R', 'G', 'B', 'A'):
        thumb = image.resize((thumb_size, thumb_size))
        factor, bias = 0, [0, 0, 0]

        if adjust_color_bias:
            bias = ImageStat.Stat(thumb).mean[:3]
            bias = [b - sum(bias)/3 for b in bias]

        for pixel in thumb.getdata():
            mu = sum(pixel)/3
            factor += sum(
                (pixel[i] - mu - bias[i]) *
                (pixel[i] - mu - bias[i]) for i in [0, 1, 2]
            )

        # MSE = Mean Square Error
        MSE = float(factor)/(thumb_size*thumb_size)

        if MSE <= error_tolerant:
            grayscale = True
        else:
            colorize = True

    elif len(bands) == 1:
        black_white = True
    else:
        pass

    dominant_color = []

    for palette in color.get_palette(color_count=6):
        palette_list = []

        for pixel in palette:
            palette_list.append(str(pixel))
        
        dominant_color.append('rgb({})'.format(','.join(palette_list)))

    return {
        'grayscale': grayscale,
        'colorize': colorize,
        'black_white': black_white,
        'dominant': dominant_color
    }

#### Extraction Function

A main function to extract any information using OCR engine and some of utilities above.

Supported Extension .csv .doc .docx .epub .gif .jpg .jpeg .json .html .htm .odt .pdf .png .pptx .rtf .tiff .tif .txt .xlsx .xls

In [83]:

def extract(filepath: str = ''):    

    name, extension = os.path.splitext(filepath)
    mime = mimetypes.guess_type(filepath)
    mime = mime[0] # root of mime type
    angle = 0
    encoding = 'utf-8'
    colorize = None

    try:
        colorize = colors(filepath=filepath)
    except:
        pass

    content = textract.process(filename=filepath, encoding=encoding, extension=extension).strip()

    if len(content) == 0:
        try:
            pages = ConvertToImage(filepath)
            page_string = ''

            for _, page_image in enumerate(pages):
                page_string += str(pytesseract.image_to_string(page_image))

            if len(page_string) > 0:
                mime = 'application/pdf'
                extension = '.pdf'
                content = page_string
        except:
            pass

    if len(content) == 0:
        (straighted, angle) = straighten(filepath=filepath, extension=extension)
        content = textract.process(filename=straighted, encoding=encoding, extension=extension).strip()

        os.remove(straighted)

    size = os.stat(filepath).st_size

    try:
        content = content.decode(encoding)
    except:
        pass

    try:
        content = str(content, encoding)
    except:
        pass

    regex_multiple_dots = r'\.{2,}'
    regex_splitted_text = r'^([0-9])\n([a-zA-Z])'
    regex_splitted_alpha = r'^([a-zA-Z])\n\n?([a-zA-Z])'
    regex_splitted_romanian = r'\s((m{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3}))|(M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3}))|\d+)\.\s'
    regex_splitted_references = r'\[(.*?)\]\s?\n(.*?)\n'
    regex_lowercase_romanian = r'((m{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3}))|\d+)\)\n([A-Z])'
    regex_uppercase_romanian = r'((M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3}))|\d+)\.\n([A-Z])'

    content = re.sub(
        regex_multiple_dots, '', re.sub(
            regex_splitted_text, '\g<1>. \g<2>', re.sub(
                regex_splitted_alpha, '\g<1>\g<2>', re.sub(
                    regex_splitted_romanian, '\n\g<1>. ', re.sub(
                        regex_splitted_references, '[\\g<1>](\\g<2>)\n', re.sub(
                            regex_lowercase_romanian, '\g<1>) \g<6>', re.sub( 
                                regex_uppercase_romanian, '\g<1>. \g<6>', content
                            , 0, re.MULTILINE)
                        , 0, re.MULTILINE)
                    , 0, re.MULTILINE)
                , 0, re.MULTILINE)
            , 0, re.MULTILINE)
        , 0, re.MULTILINE)
    , 0, re.MULTILINE)

    bracket = ''

    for line in content.split('\n'):
        line_has_formula = re.search(r'\s{2}\s+', line)

        if re.search(r'^([0-9a-zA-Z]|\[)', line) and len(line) > 7.5 and line_has_formula == None:
            bracket += line + '\n\n'

    content = re.sub(r'([\w\W]{10})\n{2}([\w\W])', '\g<1> \g<2>', bracket).strip()

    return {
        'format': extension,
        'mime'  : mime,
        'color' : colorize,
        'angle' : angle,
        'size'  : size,
        'text'  : content
    }

#### Handwritten Image

<img alt="Image" style="width:400px;" src="https://github.com/TheOwlEngine/Research/blob/main/test/images/01-handwritten.png?raw=true"/>

In [84]:
extract(filepath='./test/images/01-handwritten.png')

{'format': '.png',
 'mime': 'image/png',
 'color': {'grayscale': False,
  'colorize': True,
  'black_white': False,
  'dominant': ['rgb(167,167,167)',
   'rgb(34,34,34)',
   'rgb(84,84,84)',
   'rgb(109,109,109)',
   'rgb(92,92,92)',
   'rgb(124,124,124)']},
 'angle': 0,
 'size': 261736,
 'text': 'This is a handwritten Write as qooal as you can.'}

#### Receipt Image

<img alt="Image" style="width:400px;" src="https://github.com/TheOwlEngine/Research/blob/main/test/images/02-receipt.png?raw=true"/>

In [85]:
extract(filepath='./test/images/02-receipt.png')

{'format': '.png',
 'mime': 'image/png',
 'color': {'grayscale': True,
  'colorize': False,
  'black_white': False,
  'dominant': ['rgb(84,84,76)',
   'rgb(140,140,132)',
   'rgb(52,36,36)',
   'rgb(196,188,188)',
   'rgb(20,12,12)',
   'rgb(68,44,20)']},
 'angle': 0,
 'size': 48514,
 'text': 'Store #05666 3515 DEL MAR HTS,RD SAN DIEGO, CA 92130 Register #4 Transaction #571140 Cashier #56661020 8/20/17 5:45PM wellness+ with Plenti Plenti Card#: 31XXXXXXXXXX4553 1 G2 RETRACT BOLD BLK 2PK 1.99 T SALE 1/1.99, Reg 1/4.69 Discount 2.70- 1 Items Subtotal 1.99 xMASTER* 2.14 MASTER card * #XXXXXXXXXXXX548S App #AA APPROVAL AUTO Ref # 05639E Entry Method: Chip'}

#### Alpha Numeric Image

<img alt="Image" style="width:400px;" src="https://github.com/TheOwlEngine/Research/blob/main/test/images/03-numeric.png?raw=true"/>

In [86]:
extract(filepath='./test/images/03-numeric.png')

{'format': '.png',
 'mime': 'image/png',
 'color': {'grayscale': False,
  'colorize': False,
  'black_white': True,
  'dominant': ['rgb(7,7,7)',
   'rgb(145,145,145)',
   'rgb(251,251,251)',
   'rgb(109,109,109)',
   'rgb(196,196,196)',
   'rgb(124,124,124)']},
 'angle': 0,
 'size': 6552,
 'text': '01234567890'}

#### Paragraph Image

<img alt="Image" style="width:400px;" src="https://github.com/TheOwlEngine/Research/blob/main/test/images/04-paragraph.png?raw=true"/>

In [87]:
extract(filepath='./test/images/04-paragraph.png')

{'format': '.png',
 'mime': 'image/png',
 'color': {'grayscale': True,
  'colorize': False,
  'black_white': False,
  'dominant': ['rgb(214,214,214)',
   'rgb(79,79,79)',
   'rgb(14,14,14)',
   'rgb(120,120,120)',
   'rgb(60,60,60)',
   'rgb(100,100,100)']},
 'angle': 0,
 'size': 70313,
 'text': 'Now we are creating an OCR for handwritten Bengali text. The main problem arises due to the fact that we are doing it for handwritten text. So our sample set is very infinite. Also different samples have different characteristics. The handwriting samples are collected from different persons, hence it is very unlikely that they will follow a similar pattern.'}

#### Gradient Image

<img alt="Image" style="width:400px;" src="https://github.com/TheOwlEngine/Research/blob/main/test/images/05-gradient.png?raw=true"/>

In [88]:
extract(filepath='./test/images/05-gradient.png')

{'format': '.png',
 'mime': 'image/png',
 'color': {'grayscale': False,
  'colorize': True,
  'black_white': False,
  'dominant': ['rgb(221,221,212)',
   'rgb(49,56,81)',
   'rgb(121,69,46)',
   'rgb(121,155,189)',
   'rgb(171,127,81)',
   'rgb(84,148,204)']},
 'angle': 0,
 'size': 149082,
 'text': 'Having just recently reviewed the ATI Radeo: we were Keen to get our hands on the cheape 5850. Though both cards were announced | week until the Radeon 1-ID 5850 could be sh« have finally dug up a production model from'}

#### PDF File (Scanned)

In [89]:
extract(filepath='./test/images/06-scan-document.pdf')

{'format': '.pdf',
 'mime': 'application/pdf',
 'color': None,
 'angle': 0,
 'size': 21530,
 'text': 'THE SLEREXE COMPANY LIMITED SAPORS LANE - BOOLE - DORSET - BH 25 8ER TELEPHONE BOOLE (945 13) 51617 - TELEX 123456 Our Ref. 350/PJC/EAC 18th January, Dr. P.N. Cundall, Mining Surveys Ltd., Holroyd Road, Reading,\n\nDear Pete, Permit me to introduce you to the facility of facsimile transmission. In facsimile a photocell is caused to perform a raster scan over the subject copy. The variations of print density on the document cause the photocell to generate an analogous electrical video signal. This signal is used to modulate a carrier, which is transmitted to a remote destination over a radio or cable communications link. At the remote terminal, demodulation reconstructs the video signal, which is used to modulate the density of print produced by a printing device. This device is scanning in a raster scan synchronised with that at the transmitting terminal. As a result, a facsimile copy 

#### Skewed Book

<img alt="Image" style="width:400px;" src="https://github.com/TheOwlEngine/Research/blob/main/test/images/08-book-skew.jpg?raw=true"/>

In [90]:
extract(filepath='./test/images/08-book-skew.jpg')

{'format': '.jpg',
 'mime': 'image/jpeg',
 'color': {'grayscale': False,
  'colorize': True,
  'black_white': False,
  'dominant': ['rgb(39,53,93)',
   'rgb(211,197,187)',
   'rgb(199,155,116)',
   'rgb(145,77,74)',
   'rgb(167,122,70)',
   'rgb(124,127,158)']},
 'angle': 0,
 'size': 3685140,
 'text': 't) Tim P | enyusun NAMA :Deiia juniyAnti “Nn { FEELAS :XI| MIPA 1 BELAJAR PRAKTIS PENDIDIKAN PANCASILA DAN KEWARGANEG MATA PELAJARAN WAJIB Untuk SMA/MA Kelas XI Semester 1 Dilengkapi dengan: Aktivitas\n\nUji Kompetensi Penilaian Diri Penilaian Harian Remedial dan Pengayaan Penilaian Tengah Semester Penilaian Akhir Semester yA W iicsat Fears GUN (4'}

#### Rotated Book

<img alt="Image" style="width:400px;" src="https://github.com/TheOwlEngine/Research/blob/main/test/images/09-book-rotated.jpg?raw=true"/>

In [91]:
extract(filepath='./test/images/09-book-rotated.jpg')

{'format': '.jpg',
 'mime': 'image/jpeg',
 'color': {'grayscale': False,
  'colorize': True,
  'black_white': False,
  'dominant': ['rgb(43,53,94)',
   'rgb(213,199,188)',
   'rgb(202,156,117)',
   'rgb(143,76,75)',
   'rgb(124,127,158)',
   'rgb(166,119,71)']},
 'angle': -27.0,
 'size': 3799030,
 'text': 'Tim Penyusun NAMA :Deia yunyanti -N EELAS -X! MIPA | BELAJAR PRAKTIS PENDIDIKAN PANCASILA DAN KEWARGANEGARAAN MATA PELAJARAN WAJIB Intuk. SMA/MA Kelas XI semester | Dilengkapi dengan: ; Aktivitas [ Uji Kompetensi i Penilaian Diri Penilaian Harian Remedial dan Pengayaan Penilaian Tengah Semester Penilaian Akhir Semester'}

#### Straighted Book

<img alt="Image" style="width:400px;" src="https://github.com/TheOwlEngine/Research/blob/main/test/images/10-book-straight.jpg?raw=true"/>

In [92]:
extract(filepath='./test/images/10-book-straight.jpg')

{'format': '.jpg',
 'mime': 'image/jpeg',
 'color': {'grayscale': False,
  'colorize': True,
  'black_white': False,
  'dominant': ['rgb(210,197,188)',
   'rgb(44,49,90)',
   'rgb(204,157,116)',
   'rgb(71,179,222)',
   'rgb(140,76,75)',
   'rgb(168,124,70)']},
 'angle': 0,
 'size': 3783865,
 'text': 'Tim Penyusun NAMA :Deia \\uiyanti-n EELAS : XI MIPA | BELAJAR PRAKTIS PENDIDIKAN PANCASILA DAN KEWARGANEGARAAN — MATA PELAJARAN WAVIB Untuk SMA/MA Kelas XI Semester 1 Dilengkapi dengan: Aktivitas\n\nPenilaian Diri Penilaian Harian Remedial dan Pengayaan Penilaian Tengah Semester Penilaian Akhir Semester dooa Thins\n\nA REE LL ETN'}

#### PDF File (simple)

In [93]:
extract(filepath='./test/pdf/01-simple.pdf')

{'format': '.pdf',
 'mime': 'application/pdf',
 'color': None,
 'angle': 0,
 'size': 20597,
 'text': 'PDF Test File Congratulations, your computer is equipped with a PDF (Portable Document Format) reader! You should be able to view any of the PDF documents and forms available on our site. PDF forms are indicated by these icons: Box 2703\n\nWhitehorse,Yukon Please visit our website at: http://www.education.gov.yk.ca/'}

#### PDF File (text & image)

In [94]:
extract(filepath='./test/pdf/02-text-image.pdf')

{'format': '.pdf',
 'mime': 'application/pdf',
 'color': None,
 'angle': 0,
 'size': 69432,
 'text': 'Welcome to Smallpdf Ready to take document management to the next level? Digital Documents—All In One Place With the new Smallpdf experience, you can freely upload, organize, and share digital documents. When you enable the ‘Storage’ option, we’ll also store all processed files here. Enhance Documents in One Click When you right-click on a file, we’ll present you with an array of options to convert, compress, or modify it. Access Files Anytime, Anywhere You can access files stored on Smallpdf from your computer, phone, or tablet. We’ll also sync files from the Smallpdf Mobile App to our online portal Collaborate With Others Forget mundane administrative tasks. With Smallpdf, you can request e-signatures, send large files, or even enable the Smallpdf G Suite App for your entire organization.'}

#### PDF File (invoice)

In [95]:
extract(filepath='./test/pdf/03-invoice.pdf')

{'format': '.pdf',
 'mime': 'application/pdf',
 'color': None,
 'angle': 0,
 'size': 43627,
 'text': 'DEMO - Sliced Invoices Suite 5A-1204 123 Somewhere Street Your City AZ 12345 admin@slicedinvoices.com Invoice Number INV-3337\n\nOrder Number Invoice Date January 25, 2016 Due Date\n\nJanuary 31, 2016 Total Due\n\nWeb Design This is a sample description ANZ Bank\n\nACC # 1234 1234 BSB # 4321 432 Test Business 123 Somewhere St Melbourne, VIC 3000 test@test.com Rate/Price\n\nSub Total Sub Total\n\nPayment is due within 30 days from date of invoice. Late payment is subject to fees of 5% per month. Thanks for choosing DEMO - Sliced Invoices | admin@slicedinvoices.com Page 1/1'}

#### XLS File

In [96]:
extract(filepath='./test/xls/01-tests.xls')

{'format': '.xls',
 'mime': 'application/vnd.ms-excel',
 'color': None,
 'angle': 0,
 'size': 16384,
 'text': 'MC What is 2+2? 4.0 correct 3.0 incorrect MA What C datatypes are 8 bits? (assume i386) int float double char TF Bagpipes are awesome. true ESS How have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years? ORD Rank the following in their order of operation. Parentheses Exponents Division Addition FIB The student activities fee is 95.0 dollars for students enrolled in 19.0 units or more, MAT Match the lower-case greek letter with its capital form. λ Λ α γ Γ φ Φ http://www.cmu.edu/blackboard Question Format Abbreviations Abbreviation Question Type MC Multiple Choice MA Multiple Answer TF True/False ESS Essay\n\nORD Ordering MAT Matching FIB Fill in the Blank FIL File response NUM Numeric Response SR Short response OP Opinion\n\nFIB_PLUS Multiple Fill in the Blank JUMBLED_SENTENCE Jumbled Sentence QUIZ_BOWL Quiz Bowl http://www.c

#### DOC File

In [97]:
extract(filepath='./test/document/sample-doc-file.doc')

{'format': '.doc',
 'mime': 'application/msword',
 'color': None,
 'angle': 0,
 'size': 4767232,
 'text': '[pic](Curabitur bibendum ante urna, sed blandit libero  egestas  id.  Pellentesque) rhoncus elit in lacus ultrices fringilla. Nam  ac  metus  eu  turpis  mattis rutrum.  Mauris  mattis  sem  ex,  facilisis  molestie  sapien  luctus  non. Vestibulum tincidunt urna at odio suscipit, vel congue felis  cursus.  Etiam tellus magna, egestas ac suscipit in, laoreet quis felis. Proin non orci  id dui tincidunt egestas. Vestibulum eleifend, ligula a scelerisque vehicula,  risus  justo  ultricies ligula, et interdum lorem ex eget  ex.  Duis  dignissim  lacus  vitae  velit laoreet, vitae placerat velit aliquet. Etiam eget mollis nulla, ac  vehicula mi. Etiam non sollicitudin velit, imperdiet commodo mi.  Fusce  quis  tellus tellus. Donec dictum euismod risus non tempus. Duis quis pellentesque  nunc. Praesent elementum condimentum mollis. Phasellus dapibus quam a hendrerit placerat. Sed ultri

#### DOCX File

In [98]:
extract(filepath='./test/document/sample-docx-file.docx')

{'format': '.docx',
 'mime': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
 'color': None,
 'angle': 0,
 'size': 3909414,
 'text': 'Curabitur bibendum ante urna, sed blandit libero egestas id. Pellentesque rhoncus elit in lacus ultrices fringilla. Nam ac metus eu turpis mattis rutrum. Mauris mattis sem ex, facilisis molestie sapien luctus non. Vestibulum tincidunt urna at odio suscipit, vel congue felis cursus. Etiam tellus magna, egestas ac suscipit in, laoreet quis felis. Proin non orci id dui tincidunt egestas. Vestibulum eleifend, ligula a scelerisque vehicula, risus justo ultricies ligula, et interdum lorem ex eget ex. Duis dignissim lacus vitae velit laoreet, vitae placerat velit aliquet. Etiam eget mollis nulla, ac vehicula mi. Etiam non sollicitudin velit, imperdiet commodo mi. Fusce quis tellus tellus. Donec dictum euismod risus non tempus. Duis quis pellentesque nunc. Praesent elementum condimentum mollis. Phasellus dapibus quam a hendrerit placer