In [None]:
import re
import os
import cv2
import textract
import tempfile
import mimetypes
import pytesseract
import numpy as np

from PIL import Image, ImageStat
from colorthief import ColorThief
from deskew import determine_skew
from skimage.color import rgb2gray
from skimage.transform import rotate
from pdf2image import convert_from_path

In [None]:
def straighten(filepath: str = '', extension: str = '.jpg'):
    """Deskew the image at *filepath* and save the result to a temporary file.

    Args:
        filepath: Path of the image to read with OpenCV.
        extension: File suffix (including the leading dot) for the output
            file; OpenCV selects the output format from it.

    Returns:
        Path of the newly written, deskewed image.  The caller owns the file
        and is responsible for deleting it.

    Raises:
        ValueError: If OpenCV cannot read *filepath* as an image.
        OSError: If the deskewed image cannot be written.
    """
    image = cv2.imread(filepath, cv2.IMREAD_COLOR)

    # cv2.imread signals failure by returning None instead of raising.
    if image is None:
        raise ValueError('Unable to read image: {!r}'.format(filepath))

    # OpenCV loads channels as BGR while rgb2gray expects RGB, so flip the
    # channel axis before converting.
    grayscale = rgb2gray(image[..., ::-1])
    angle = determine_skew(grayscale)

    # NOTE(review): determine_skew appears to return None when no dominant
    # angle is found; treat that as "already straight" -- confirm.
    if angle is None:
        angle = 0.0

    # skimage's rotate yields floats scaled to [0, 1]; rescale to 8-bit.
    rotated = rotate(image, angle, resize=True) * 255
    image = rotated.astype(np.uint8)

    # Create a unique file *inside* the temp directory.  Concatenating
    # gettempdir() with the extension would give e.g. "/tmp.jpg", which is
    # outside the temp directory and shared by every call.
    handle, temp = tempfile.mkstemp(suffix=extension)
    os.close(handle)

    try:
        written = cv2.imwrite(temp, image)
    except Exception:
        os.remove(temp)
        raise

    if not written:
        os.remove(temp)
        raise OSError('Unable to write deskewed image: {!r}'.format(temp))

    return temp

def colors(filepath, thumb_size=40, MSE_cutoff=22, adjust_color_bias=True):
    """Classify an image as grayscale / colour / single-band and list its palette.

    For RGB and RGBA images a ``thumb_size`` x ``thumb_size`` thumbnail is
    examined: for each pixel the squared deviation of every colour channel
    from the pixel's own channel mean is accumulated, and the image counts as
    grayscale when the mean of those sums is at most ``MSE_cutoff``.

    Args:
        filepath: Path of the image file.
        thumb_size: Edge length, in pixels, of the thumbnail that is sampled.
        MSE_cutoff: Largest mean squared error still reported as grayscale.
        adjust_color_bias: When true, subtract each channel's average offset
            from the overall mean before measuring deviations, so a uniform
            colour cast does not count as colour.

    Returns:
        A dict with boolean keys ``grayscale``, ``colorize`` and
        ``black_white`` (the latter set for single-band images) and a
        ``dominant`` list of ``'rgb(r,g,b)'`` strings from ColorThief.
    """
    thief = ColorThief(filepath)

    grayscale = False
    colorize = False
    black_white = False

    # Use a context manager so the underlying file handle is released.
    with Image.open(filepath) as image:
        bands = image.getbands()

        if bands in (('R', 'G', 'B'), ('R', 'G', 'B', 'A')):
            thumb = image.resize((thumb_size, thumb_size))
            bias = [0, 0, 0]

            if adjust_color_bias:
                channel_means = ImageStat.Stat(thumb).mean[:3]
                overall_mean = sum(channel_means) / 3
                bias = [mean - overall_mean for mean in channel_means]

            SSE = 0

            for pixel in thumb.getdata():
                # Only the colour channels take part; including the alpha
                # value of RGBA pixels would distort the per-pixel mean.
                rgb = pixel[:3]
                mu = sum(rgb) / 3
                SSE += sum(
                    (rgb[i] - mu - bias[i]) ** 2 for i in range(3)
                )

            MSE = float(SSE) / (thumb_size * thumb_size)

            if MSE <= MSE_cutoff:
                grayscale = True
            else:
                colorize = True

        elif len(bands) == 1:
            black_white = True

    thief_palette = [
        'rgb({})'.format(','.join(str(channel) for channel in swatch))
        for swatch in thief.get_palette(color_count=6)
    ]

    return {
        'grayscale': grayscale,
        'colorize': colorize,
        'black_white': black_white,
        'dominant': thief_palette
    }

# Ordered (pattern, replacement) pairs applied to the extracted text, each with
# re.MULTILINE.  The order matters: later rules see the output of earlier ones.
_REFLOW_RULES = (
    # "IV.\nTitle" / "3.\nTitle" -> "IV. Title" / "3. Title"
    (r'((M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3}))|\d+)\.\n([A-Z])', r'\g<1>. \g<6>'),
    # "iv)\nTitle" / "3)\nTitle" -> "iv) Title" / "3) Title"
    (r'((m{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3}))|\d+)\)\n([A-Z])', r'\g<1>) \g<6>'),
    # "[label]\nnext line\n" -> "[label](next line)\n"
    (r'\[(.*?)\]\s?\n(.*?)\n', '[\\g<1>](\\g<2>)\n'),
    # Start a new line before every "<roman numeral or number>. " marker.
    (r'\s((m{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3}))|(M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3}))|\d+)\.\s', '\n\\g<1>. '),
    # Drop runs of two or more dots.
    (r'\.{2,}', ''),
    # Re-join a single leading letter with the line(s) that follow it.
    (r'^([a-zA-Z])\n\n?([a-zA-Z])', r'\g<1>\g<2>'),
    # "3\nText" -> "3. Text"
    (r'^([0-9])\n([a-zA-Z])', r'\g<1>. \g<2>'),
)

# Lines must be longer than this many characters to be kept.
_MIN_LINE_LENGTH = 7


def _reflow_text(content):
    """Apply every rule in ``_REFLOW_RULES`` to *content*, in order."""
    for pattern, replacement in _REFLOW_RULES:
        content = re.sub(pattern, replacement, content, flags=re.MULTILINE)

    return content


def _keep_prose_lines(content):
    """Keep only prose-like lines of *content* and merge them into paragraphs."""
    kept = [
        line for line in content.split('\n')
        # Must start with an alphanumeric character or "[" ...
        if re.search(r'^([0-9a-zA-Z]|\[)', line)
        # ... be long enough ...
        and len(line) > _MIN_LINE_LENGTH
        # ... and contain no run of three or more whitespace characters.
        and re.search(r'\s{2}\s+', line) is None
    ]
    bracket = ''.join(line + '\n\n' for line in kept)

    # Collapse a blank-line break into a space when at least ten characters
    # precede it.
    return re.sub(r'([\w\W]{10})\n{2}([\w\W])', r'\g<1> \g<2>', bracket).strip()


def read_text(filepath: str = ''):
    """Extract and clean up the text of the document or image at *filepath*.

    Extraction is attempted in up to three stages, each used only when the
    previous one produced nothing:

    1. ``textract.process`` on the file as-is.
    2. Rendering the file with ``convert_from_path`` and running
       ``pytesseract`` on every page.
    3. Deskewing the file with ``straighten`` and running ``textract`` on the
       deskewed copy.

    Supported extensions: .csv, .doc, .docx, .epub, .gif, .jpg, .jpeg, .json,
    .html, .htm, .odt, .pdf, .pdf (scan), .png, .pptx, .rtf, .tiff, .tif,
    .txt, .xlsx, .xls

    Args:
        filepath: Path of the file to read.

    Returns:
        A tuple ``(extension, mime, content, size, colorize)``: the file
        extension and guessed MIME type (both replaced by the PDF values when
        stage 2 supplied the text), the cleaned text, the file size in bytes,
        and the result of ``colors`` or ``None`` when that analysis failed.
    """
    _, extension = os.path.splitext(filepath)
    mime = mimetypes.guess_type(filepath)[0]
    encoding = 'utf-8'

    # Colour analysis is best-effort: it fails for anything that is not an
    # image, which is expected for most document types.
    try:
        colorize = colors(filepath=filepath)
    except Exception:
        colorize = None

    content = textract.process(filename=filepath, encoding=encoding, extension=extension).strip()

    if len(content) == 0:
        # Best-effort OCR of rendered pages; failure just means the file is
        # not something convert_from_path can handle.
        try:
            pages = convert_from_path(filepath)
            page_string = ''.join(
                str(pytesseract.image_to_string(page)) for page in pages
            )
        except Exception:
            page_string = ''

        if len(page_string) > 0:
            mime = 'application/pdf'
            extension = '.pdf'
            content = page_string

    if len(content) == 0:
        straighten_image = straighten(filepath=filepath, extension=extension)

        # Always delete the temporary deskewed copy, even if extraction fails.
        try:
            content = textract.process(filename=straighten_image, encoding=encoding, extension=extension).strip()
        finally:
            os.remove(straighten_image)

    size = os.stat(filepath).st_size

    # textract yields bytes while the OCR path yields str; normalise to str.
    if isinstance(content, bytes):
        content = content.decode(encoding, errors='replace')

    content = _keep_prose_lines(_reflow_text(content))

    return (extension, mime, content, size, colorize)

#### Test DOC File

In [None]:
read_text(filepath='./test/document/sample-doc-file.doc')

#### Test DOCX File

In [None]:
read_text(filepath='./test/document/sample-docx-file.docx')

In [None]:
read_text(filepath='./test/images/01-handwritten.png')

In [None]:
read_text(filepath='./test/images/02-receipt.png')

In [None]:
read_text(filepath='./test/images/03-numeric.png')

In [None]:
read_text(filepath='./test/images/04-paragraph.png')

In [None]:
read_text(filepath='./test/images/05-gradient.png')

In [None]:
read_text(filepath='./test/images/06-scan-document.pdf')

In [None]:
read_text(filepath='./test/images/07-scan-monochrome.pdf')

In [None]:
read_text(filepath='./test/images/08-book-skew.jpg')

In [None]:
read_text(filepath='./test/images/09-book-rotated.jpg')

In [None]:
read_text(filepath='./test/images/10-book-straight.jpg')

In [None]:
read_text(filepath='./test/images/11-book-small-rotated.jpg')

In [None]:
read_text(filepath='./test/images/12-book-script-straight.jpg')

In [None]:
read_text(filepath='./test/images/13-book-script-rotated.jpg')

In [None]:
read_text(filepath='./test/pdf/01-simple.pdf')

In [None]:
read_text(filepath='./test/pdf/02-text-image.pdf')

In [None]:
read_text(filepath='./test/pdf/03-invoice.pdf')

In [None]:
read_text(filepath='./test/pdf/04-journal.pdf')

In [None]:
read_text(filepath='./test/pdf/05-complex.pdf')

In [None]:
read_text(filepath='./test/xls/01-tests.xls')