In [18]:
pip install img2table

^C
Note: you may need to restart the kernel to use updated packages.


In [None]:
pip install pytesseract


In [None]:
pip install PyMuPDF

In [None]:
pip install numpy

In [None]:
pip install matplotlib

In [None]:
pip install opencv-python

In [21]:
import fitz
import numpy
from PIL import Image
import io
from matplotlib import pyplot as plt
import cv2
import numpy as np
import os
import pytesseract

pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

## Image processing functions

In [3]:
def pdf_to_img(path):
    pdf = fitz.open(path)
    counter = 1
    pages = []
    for i in range (len(pdf)):
        page = pdf[i]
        images = page.get_images()
        for image in images:
            base_img = pdf.extract_image(image[0])
            image_data = base_img["image"]
            img = Image.open(io.BytesIO(image_data))
            #extension = base_img["ext"]
            #img.save(open(f"image{counter}.{extension}", "wb"))
            pages.append(img)
            #counter += 1

    return pages

In [4]:
def display(im_path):
    dpi = 80
    im_data = plt.imread(im_path)

    height, width = im_data.shape[:2]

    # What size does the figure need to be in inches to fit the image?
    figsize = width / float(dpi), height / float(dpi)

    # Create a figure of the right size with one axes that takes up the full figure
    fig = plt.figure(figsize=figsize)
    ax = fig.add_axes([0, 0, 1, 1])

    # Hide spines, ticks, etc.
    ax.axis('off')

    # Display the image.
    ax.imshow(im_data, cmap='gray')

    plt.show()

In [5]:
def getSkewAngle(cvImage) -> float:
    # Prep image, copy, convert to gray scale, blur, and threshold
    gray = cvImage.copy()
    newImage = cvImage.copy()
    blur = cv2.GaussianBlur(gray, (9, 9), 0)
    thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    # Apply dilate to merge text into meaningful lines/paragraphs.
    # Use larger kernel on X axis to merge characters into single line, cancelling out any spaces.
    # But use smaller kernel on Y axis to separate between different blocks of text
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (30, 5))
    dilate = cv2.dilate(thresh, kernel, iterations=2)

    # Find all contours
    contours, hierarchy = cv2.findContours(dilate, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    contours = sorted(contours, key = cv2.contourArea, reverse = True)
    for c in contours[:1]:
        rect = cv2.boundingRect(c)
        x,y,w,h = rect
        cv2.rectangle(newImage,(x,y),(x+w,y+h),(0,255,0),2)

    # Find largest contour and surround in min area box
    largestContour = contours[0]
    minAreaRect = cv2.minAreaRect(largestContour)
    # Determine the angle. Convert it to the value that was originally used to obtain skewed image
    angle = minAreaRect[-1]
    if angle > 45:
        angle = angle - 90
    return -1.0 * angle
# Rotate the image around its center
def rotateImage(cvImage, angle: float):
    newImage = cvImage.copy()
    (h, w) = newImage.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    newImage = cv2.warpAffine(newImage, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return newImage

# Deskew image
def deskew(cvImage):
    angle = getSkewAngle(cvImage)
    return rotateImage(cvImage, -1.0 * angle)

In [6]:
def noise_removal(image):
    import numpy as np
    kernel = np.ones((1, 1), np.uint8)
    image = cv2.dilate(image, kernel, iterations=1)
    kernel = np.ones((1, 1), np.uint8)
    image = cv2.erode(image, kernel, iterations=1)
    image = cv2.morphologyEx(image, cv2.MORPH_CLOSE, kernel)
    image = cv2.medianBlur(image, 3)
    return (image)

In [7]:
def find_tables(image):
    BLUR_KERNEL_SIZE = (17, 17)
    STD_DEV_X_DIRECTION = 0
    STD_DEV_Y_DIRECTION = 0
    blurred = cv2.GaussianBlur(image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION)
    MAX_COLOR_VAL = 255
    BLOCK_SIZE = 15
    SUBTRACT_FROM_MEAN = -2
    
    img_bin = cv2.adaptiveThreshold(
        ~blurred,
        MAX_COLOR_VAL,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        BLOCK_SIZE,
        SUBTRACT_FROM_MEAN,
    )
    vertical = horizontal = img_bin.copy()
    SCALE = 5
    image_width, image_height = horizontal.shape
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(image_width / SCALE), 1))
    horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(image_height / SCALE)))
    vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
    
    horizontally_dilated = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1)))
    vertically_dilated = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60)))
    
    mask = horizontally_dilated + vertically_dilated
    contours, heirarchy = cv2.findContours(
        mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
    )

    MIN_TABLE_AREA = 1e5
    contours = [c for c in contours if cv2.contourArea(c) > MIN_TABLE_AREA]
    perimeter_lengths = [cv2.arcLength(c, True) for c in contours]
    epsilons = [0.1 * p for p in perimeter_lengths]
    approx_polys = [cv2.approxPolyDP(c, e, True) for c, e in zip(contours, epsilons)]
    bounding_rects = [cv2.boundingRect(a) for a in approx_polys]

    # The link where a lot of this code was borrowed from recommends an
    # additional step to check the number of "joints" inside this bounding rectangle.
    # A table should have a lot of intersections. We might have a rectangular image
    # here though which would only have 4 intersections, 1 at each corner.
    # Leaving that step as a future TODO if it is ever necessary.
    images = [image[y:y+h, x:x+w] for x, y, w, h in bounding_rects]
    return images

In [8]:
def preprocessing_assembler(image):
    image = np.array(image)
    thresh, thresholdedPage = cv2.threshold(image, 200, 255, cv2.THRESH_BINARY)
    image = thresholdedPage
    image = noise_removal(image)
    image = deskew(image)
    return image

## Image preprocessing

In [10]:
pdfs = []

for filename in os.listdir("Probe"):
    pdfs.append(f"Probe/{filename}")

for pdf in pdfs[:1]: 
    pages = pdf_to_img(pdf)
    text = []
    
    for i in range(len(pages)):
        pages[i] = np.array(pages[i])
        thresh, thresholdedPage = cv2.threshold(pages[i], 200, 255, cv2.THRESH_BINARY)
        pages[i] = thresholdedPage
        pages[i] = noise_removal(pages[i])
        pages[i] = deskew(pages[i])
        text.append(pytesseract.image_to_string(Image.fromarray(pages[i])))

In [None]:
proba1 = pdf_to_img("proba1.pdf")

text = pytesseract.image_to_string(proba1[0])

print(text)

In [None]:
text2 = pytesseract.image_to_string(preprocessing_assembler(proba1[0]))
print(text2)

In [78]:
text2 = pytesseract.image_to_string(preprocessing_assembler(Image.open("example-table.png")), config="--psm 6")
print(text2)

Adresa $au-zona | 7 ‘<: -|\dob&ndirii Suprafata parte| dobandire’"{ =" Titularul
PODU DAMBOVIJEI PITEA NICULINA 3/6
JUD. ARGES CASA DE ~ PITEA NICOLETA 1/6
LOCUIT 2006 194 mp 1/6 | SUCCESIUNE PITEA ION 1/6
LAMBESCU DANIELAI/6
PODU DAMBOVITEI PITEA NICULINA 3/6
JUD. ARGES PITEA NICOLETA 1/6
ANEXA 2006 40 mp 1/46 | SUCCESIUNE PITEA ION 1/6
LAMBESCU DANIELA1/6



In [65]:
proba5 = pdf_to_img("proba1.pdf")

page = np.array(proba5[0])   # n-a gasit primul tabel, l-a dat direct pe al doilea

page = find_tables(preprocessing_assembler(page))[0]                 #da tabelele de jos in sus

cv2.imwrite("example-table.png", page)

True

## Experimental:

In [80]:
from sklearn.cluster import AgglomerativeClustering
from pytesseract import Output
from tabulate import tabulate
import pandas as pd
import numpy as np
import pytesseract
import argparse
import imutils
import cv2

In [95]:
args = {}
args["min_conf"] = 2
args["min_size"] = 2
args["dist_thresh"] = 2

In [96]:
# set the PSM mode to detect sparse text, and then localize text in
# the table
options = "--psm 6"
results = pytesseract.image_to_data(
	preprocessing_assembler(Image.open("example-table.png")),
	config=options,
	output_type=Output.DICT)
# initialize a list to store the (x, y)-coordinates of the detected
# text along with the OCR'd text itself
coords = []
ocrText = []

# loop over each of the individual text localizations
for i in range(0, len(results["text"])):
	# extract the bounding box coordinates of the text region from
	# the current result
	x = results["left"][i]
	y = results["top"][i]
	w = results["width"][i]
	h = results["height"][i]
	# extract the OCR text itself along with the confidence of the
	# text localization
	text = results["text"][i]
	conf = int(results["conf"][i])
	# filter out weak confidence text localizations
	if conf > args["min_conf"]:
		# update our text bounding box coordinates and OCR'd text,
		# respectively
		coords.append((x, y, w, h))
		ocrText.append(text)


# extract all x-coordinates from the text bounding boxes, setting the
# y-coordinate value to zero
xCoords = [(c[0], 0) for c in coords]
# apply hierarchical agglomerative clustering to the coordinates
clustering = AgglomerativeClustering(
	n_clusters=None,
	metric ="manhattan",
	linkage="complete",
	distance_threshold=args["dist_thresh"])
clustering.fit(xCoords)
# initialize our list of sorted clusters
sortedClusters = []


# loop over all clusters
for l in np.unique(clustering.labels_):
	# extract the indexes for the coordinates belonging to the
	# current cluster
	idxs = np.where(clustering.labels_ == l)[0]
	# verify that the cluster is sufficiently large
	if len(idxs) > args["min_size"]:
		# compute the average x-coordinate value of the cluster and
		# update our clusters list with the current label and the
		# average x-coordinate
		avg = np.average([coords[i][0] for i in idxs])
		sortedClusters.append((l, avg))
# sort the clusters by their average x-coordinate and initialize our
# data frame
sortedClusters.sort(key=lambda x: x[1])
df = pd.DataFrame()


# loop over the clusters again, this time in sorted order
for (l, _) in sortedClusters:
	# extract the indexes for the coordinates belonging to the
	# current cluster
	idxs = np.where(clustering.labels_ == l)[0]
	# extract the y-coordinates from the elements in the current
	# cluster, then sort them from top-to-bottom
	yCoords = [coords[i][1] for i in idxs]
	sortedIdxs = idxs[np.argsort(yCoords)]
	# generate a random color for the cluster
	color = np.random.randint(0, 255, size=(3,), dtype="int")
	color = [int(c) for c in color]

	# loop over the sorted indexes
	for i in sortedIdxs:
		# extract the text bounding box coordinates and draw the
		# bounding box surrounding the current element
		(x, y, w, h) = coords[i]
		cv2.rectangle(table, (x, y), (x + w, y + h), color, 2)
	# extract the OCR'd text for the current column, then construct
	# a data frame for the data where the first entry in our column
	# serves as the header
	cols = [ocrText[i].strip() for i in sortedIdxs]
	currentDF = pd.DataFrame({cols[0]: cols[1:]})
	# concatenate *original* data frame with the *current* data
	# frame (we do this to handle columns that may have a varying
	# number of rows)
	df = pd.concat([df, currentDF], axis=1)

print("[INFO] saving CSV file to disk...")
df.to_csv(args["output"], index=False)

NameError: name 'table' is not defined

## Experiment 2: 

In [20]:
from img2table.document import Image
from img2table.ocr import TesseractOCR

# Instantiation of the image
doc = Image(src="example-table.png", detect_rotation=False)
ocr = TesseractOCR(n_threads=1, lang="eng")

extracted_tables = doc.extract_tables(ocr=ocr,
                                      implicit_rows=False,
                                      borderless_tables=False,
                                      min_confidence=50)

# Result of table identification
img_tables

[ExtractedTable(title=None, bbox=(10, 8, 745, 314),shape=(6, 3)),
 ExtractedTable(title=None, bbox=(936, 9, 1129, 111),shape=(2, 2))]

ModuleNotFoundError: No module named 'img2table.tables.image'