In [1]:
from imutils.perspective import four_point_transform
from matplotlib import pyplot as plt
from imutils import contours
import pytesseract
from skimage.measure import find_contours
import numpy as np
import argparse
import os
import imutils
import cv2
from skimage.morphology import skeletonize, thin
import skimage.io as io
from scipy import stats
import arabic_reshaper
import math


def show_images(images,titles=None):
    """Display several images side by side in one matplotlib figure.

    images: sequence of arrays accepted by ``plt.imshow``.
    titles: optional sequence of subplot titles; when omitted the subplots
        are labelled "(1)", "(2)", ...
    """
    count = len(images)
    if titles is None:
        titles = ['(%d)' % k for k in range(1, count + 1)]
    figure = plt.figure()
    for position, (picture, caption) in enumerate(zip(images, titles), start=1):
        axis = figure.add_subplot(1, count, position)
        # 2-D arrays carry a single channel, so switch to the gray colormap.
        if picture.ndim == 2:
            plt.gray()
        plt.imshow(picture)
        axis.set_title(caption)
    # Scale the default figure size by the number of images shown.
    figure.set_size_inches(np.array(figure.get_size_inches()) * count)
    plt.show()

In [2]:
# return the Canny edge detection of image
def getEdges(image):
    """Return the Canny edge map of a BGR image.

    The image is converted to grayscale and smoothed with a 5x5 Gaussian
    (sigma 0) before Canny is applied with thresholds 75 and 200.
    """
    smoothed = cv2.GaussianBlur(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY), (5, 5), 0)
    return cv2.Canny(smoothed, 75, 200)
# Return the Canny edge map of the table image, using a Gaussian sigma of 1 and lower Canny thresholds (30, 100) than getEdges.
def getEdgesTable(image):
    """Return the Canny edge map used when looking for the table outline.

    Same pipeline as getEdges, but the 5x5 Gaussian uses sigma 1 and the
    Canny thresholds are 30 and 100.
    """
    smoothed = cv2.GaussianBlur(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY), (5, 5), 1)
    return cv2.Canny(smoothed, 30, 100)

In [3]:
# Return the largest contour of the image that approximates to four points.
def getContours(edged):
    """Return the largest external contour that approximates to four points.

    edged: single-channel edge image; a copy is passed to cv2.findContours.

    Contours are visited from largest to smallest area and each one is
    simplified with cv2.approxPolyDP (tolerance: 2% of its perimeter).
    Returns the first 4-point approximation, or None when there is none.
    """
    found = imutils.grab_contours(
        cv2.findContours(edged.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    )
    for candidate in sorted(found, key=cv2.contourArea, reverse=True):
        perimeter = cv2.arcLength(candidate, True)
        polygon = cv2.approxPolyDP(candidate, 0.02 * perimeter, True)
        if len(polygon) == 4:
            return polygon
    return None

In [4]:
# Remove the border of the image; here it is used for tables.
def CropImg(img):
    """Return the image with a thin margin sliced off every side.

    Rows from 1% to 98% of the height and columns from 2% to 98% of the
    width are kept. The result is a slice of ``img``, not a copy.
    """
    rows, cols = img.shape[0], img.shape[1]
    top, bottom = int(rows * .01), int(rows * .98)
    left, right = int(cols * .02), int(cols * .98)
    return img[top:bottom, left:right]

In [5]:
# Extract the table from the image and correct its perspective.
def exetractTable(image,docCnt):
    """Warp the quadrilateral ``docCnt`` out of ``image``.

    docCnt: contour holding exactly four points; it is reshaped to (4, 2)
        before being handed to imutils' four_point_transform.
    """
    corners = docCnt.reshape(4, 2)
    return four_point_transform(image, corners)

In [6]:
# Function to remove the internal details of the image so that the paper itself becomes the biggest contour.
def RemoveText(image):
    """Apply a morphological closing (8x8 kernel, 3 iterations) to the image."""
    closing_kernel = np.ones((8, 8), np.uint8)
    return cv2.morphologyEx(image, cv2.MORPH_CLOSE, closing_kernel, iterations=3)

In [7]:
# return the position of the vertical lines of the table
def get_Ver_Lines(image):
    """Locate the long vertical lines of a table image.

    image: BGR table image.

    Returns a 1-tuple ``(spans,)`` where ``spans`` is a sorted list of
    ``[min_x, max_x]`` pairs, one per vertical line whose minimum-area
    rectangle has a side of at least 80% of the image height.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    # Canvas the kept contours are drawn on (only useful for debugging).
    canvas = np.zeros(image.shape).astype(np.uint8)

    # Opening with a 1x25 kernel keeps only tall, thin strokes.
    tall_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 25))
    strokes = cv2.morphologyEx(binary, cv2.MORPH_OPEN, tall_kernel, iterations=2)

    found = cv2.findContours(strokes, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # findContours returns 2 or 3 values depending on the OpenCV version.
    found = found[0] if len(found) == 2 else found[1]

    required = int(.8 * image.shape[0])

    def is_long_enough(contour):
        side_a, side_b = cv2.minAreaRect(contour)[1]
        return side_a >= required or side_b >= required

    long_lines = [contour for contour in found if is_long_enough(contour)]

    spans = []
    for line in long_lines:
        cv2.drawContours(canvas, [line], -1, (255, 255, 255), 2)
        xs = line[:, 0][:, 0]
        spans.append([min(xs), max(xs)])
    spans.sort()

    # Deliberately a 1-tuple: get_cells reads the list through index [0].
    return (spans,)


In [8]:
# Save the image to the Cells directory.
def Save_img(imageo,Name):
    """Write an image into the ``./Cells`` directory.

    imageo: image array handed to cv2.imwrite.
    Name: file name (including extension) to use inside ``./Cells``.

    Returns the status value of cv2.imwrite so callers can detect a failed
    write; existing callers that ignore the return value are unaffected.
    """
    target_dir = "./Cells"
    # cv2.imwrite does not create missing directories, so make sure the
    # target exists before writing.
    os.makedirs(target_dir, exist_ok=True)
    return cv2.imwrite(os.path.join(target_dir, Name), imageo)

In [9]:
# Extract the exact column from the column image.
def exestractCloum(colum):
    """Extract the column quadrilateral from a column image and display it.

    The edge map of ``colum`` is dilated with a 20x20 kernel and the largest
    4-point contour is warped out. When no such contour exists the input
    image is used unchanged.

    Shows the input, the edge map and the result with show_images, then
    returns the extracted column (previously the result was discarded).
    """
    edged = getEdges(colum)
    kernel = np.ones((20,20), np.uint8)
    img_dilation = cv2.dilate(edged, kernel, iterations=1)
    docCnt = getContours(img_dilation)
    # Fall back to the input when no quadrilateral was found.
    PureColum = colum
    if docCnt is not None:
        PureColum = exetractTable(colum,docCnt)
    show_images([colum, edged , PureColum],["colum ", "edged" , "PureColum"])
    return PureColum

In [10]:
def dist(p1, p2):
    """Return the Euclidean distance between two 2-D points ``(x, y)``."""
    delta_x = p2[0] - p1[0]
    delta_y = p2[1] - p1[1]
    return math.sqrt(delta_x**2 + delta_y**2)

# Get the average difference between consecutive line positions.
def getAvgDiff(LinesPos):
    """Return the mean gap between consecutive positions.

    LinesPos: iterable of numeric positions in any order; it is not modified.

    The positions are sorted and the sum of the gaps between neighbours is
    divided by the number of gaps (``len - 1``). Returns 0 when there are
    fewer than two positions, because there is no gap to average.
    """
    ordered = sorted(LinesPos)
    gap_count = len(ordered) - 1
    if gap_count < 1:
        return 0
    # ordered is ascending, so every difference is already non-negative.
    total = sum(cur - prev for prev, cur in zip(ordered, ordered[1:]))
    return total / gap_count


In [11]:
# get the position of the Horizontal lines of the Table
def get_HOR_Lines(image,lineWidthPercenatge):
    """Return the sorted y positions of the horizontal lines of a table image.

    image: BGR table image.
    lineWidthPercenatge: fraction of the image width a stroke must span to
        count as a table line (get_cells passes 0.4).

    Returns a list of y coordinates (top of each detected line), ascending.
    Lines closer to the previous detected line than half the average gap
    are dropped. Returns an empty list when no line is detected.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 41, 5)

    image_width = image.shape[1]
    min_width = int(image_width * lineWidthPercenatge)

    # 1) Keep strokes at least half the required width.
    horizontal_kernel_Small = cv2.getStructuringElement(cv2.MORPH_RECT, (min_width // 2, 1))
    thresh_lttleBigger = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, horizontal_kernel_Small, iterations=1)

    # 2) Close with a 10 x min_width kernel to bridge gaps in those strokes.
    kernal_Diolate_before_hor = np.ones((10, min_width), np.uint8)
    VerticalDiolateBeforeHor = cv2.morphologyEx(thresh_lttleBigger, cv2.MORPH_CLOSE, kernal_Diolate_before_hor)

    # 3) Keep only strokes that now reach the full required width.
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (min_width, 1))
    horizontal = cv2.morphologyEx(VerticalDiolateBeforeHor, cv2.MORPH_OPEN, horizontal_kernel, iterations=1)

    # 4) Dilate with a kernel 3 rows high and 1.6x the image width wide,
    #    then thin the result (skimage ``thin`` with second argument 10).
    kernal_Diolate = np.ones((3, int(image_width * 1.6)), np.uint8)
    HorImgDia = cv2.morphologyEx(horizontal, cv2.MORPH_DILATE, kernal_Diolate)
    Thin_HOR = thin(HorImgDia, 10).astype(np.uint8)

    cnts = cv2.findContours(Thin_HOR, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # findContours returns 2 or 3 values depending on the OpenCV version.
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]

    # Keep contours whose minimum-area rectangle has a side >= min_width.
    goodc = []
    for c in cnts:
        width, height = cv2.minAreaRect(c)[1]
        if width >= min_width or height >= min_width:
            goodc.append(c)

    LinesPos = sorted(min(c[:, 0][:, 1]) for c in goodc)
    if not LinesPos:
        # No horizontal line detected: nothing to filter, and indexing
        # LinesPos[0] below would raise IndexError.
        return []

    avgCellHeight = getAvgDiff(LinesPos)
    accepted = [LinesPos[0]]
    for i in range(1, len(LinesPos)):
        if LinesPos[i] - LinesPos[i-1] >= int(avgCellHeight * .5):
            accepted.append(LinesPos[i])

    if len(accepted) != len(LinesPos):
        # When near-duplicate lines were dropped the first line is moved up
        # by 15 pixels; clamp at 0 because a negative value would later be
        # used as a slice index and wrap around to the bottom of the image.
        accepted[0] = max(accepted[0] - 15, 0)

    return accepted

In [12]:
# Get a slightly wider table contour so that the outer vertical lines are visible.
def widerTable(docCnt, imageShape):
    """Return a copy of a contour with its points pushed outwards.

    docCnt: contour array laid out so that ``docCnt[k][0]`` is ``[x, y]``.
        The input is left untouched; a widened copy is returned.
    imageShape: shape tuple; ``imageShape[0]`` is read as the height and
        ``imageShape[1]`` as the width.

    Points right of the vertical centre line (width / 2) move right by 1% of
    the width and points left of it move left by the same amount. Points
    below / above the horizontal centre line move down / up by 1.5% of the
    height. Points exactly on a centre line do not move along that axis.
    """
    heightincrese = int(imageShape[0]*.015)
    widthincrese = int(imageShape[1]*.01)
    halfHor = int(imageShape[1]/2)
    halfVer = int(imageShape[0]/2)

    widerdoc = np.copy(docCnt)
    # np.sign gives -1 / 0 / +1, i.e. the direction away from the centre line.
    widerdoc[:, 0, 0] += np.sign(widerdoc[:, 0, 0] - halfHor) * widthincrese
    widerdoc[:, 0, 1] += np.sign(widerdoc[:, 0, 1] - halfVer) * heightincrese
    return widerdoc

In [13]:
# Extract the table from every image in the directory.
def GetTagbleOfImagesInDir(folder_dir):
    """Extract the table of every image in ``folder_dir`` and save it.

    folder_dir: directory containing the input images; a trailing path
        separator is no longer required.

    Each table is written with Save_img under the input file's own name.
    Entries that cv2.imread cannot decode are reported and skipped.
    """
    for image_name in os.listdir(folder_dir):
        image_path = os.path.join(folder_dir, image_name)
        print("Start : "+image_path)
        image = cv2.imread(image_path,1)
        if image is None:
            # cv2.imread returns None instead of raising on unreadable input.
            print("Skipped (not a readable image) : "+image_path)
            continue
        Paper = getPaper(image)
        Table = getTable(Paper,image)
        Save_img(Table,image_name)
        print("Done : "+image_path)
        
# Extract the cells from every image in the directory.
def GetCellsOfImagesInDir(folder_dir):
    """Extract the table cells of every image in ``folder_dir``.

    folder_dir: directory containing the input images; a trailing path
        separator is no longer required.

    Returns a dict mapping each file name to the value get_cells returned
    for its table (previously the cells were computed and thrown away).
    Entries that cv2.imread cannot decode are reported and skipped.
    """
    cells_by_image = {}
    for image_name in os.listdir(folder_dir):
        image_path = os.path.join(folder_dir, image_name)
        print("Start : "+image_path)
        image = cv2.imread(image_path,1)
        if image is None:
            # cv2.imread returns None instead of raising on unreadable input.
            print("Skipped (not a readable image) : "+image_path)
            continue
        Paper = getPaper(image)
        Table = getTable(Paper,image)
        cells_by_image[image_name] = get_cells(Table)
        print("Done : "+image_path)
    return cells_by_image
        
# Extract the paper from a single image.
def getPaper(image):
    """Return the perspective-corrected paper found in ``image``.

    The edge map of the image is dilated with a 15x15 kernel and the largest
    4-point contour is warped out with exetractTable.

    When no 4-point contour is found the input image is returned unchanged
    (the same fallback exestractCloum uses) instead of failing on a None
    contour.
    """
    edged = getEdges(image)
    kernel = np.ones((15, 15), np.uint8)
    img_dilation = cv2.dilate(edged, kernel, iterations=1)
    docCnt = getContours(img_dilation)
    if docCnt is None:
        return image
    # NOTE(review): the quadrilateral is used exactly as detected; no
    # widening is applied at this stage.
    return exetractTable(image,docCnt)

# extract the Table image 
def getTable(Paper,image):
    """Return the table region of ``Paper``.

    Paper: image returned by getPaper.
    image: the original image ``Paper`` was extracted from.

    When Paper covers half of the original image's area or less, Paper is
    returned as is. Otherwise Paper is cropped with CropImg, the largest
    4-point contour of its edge map is located, widened with widerTable and
    warped out. If no such contour is found, Paper is returned unchanged
    instead of failing on a None contour.
    """
    imageArea = image.shape[0] * image.shape[1]
    PaperArea = Paper.shape[0] * Paper.shape[1]
    if not (imageArea < PaperArea * 2):
        return Paper

    CropPaper = CropImg(Paper)
    edged = getEdgesTable(CropPaper)
    kernel2 = np.ones((5,1), np.uint8)
    img_dilation = cv2.dilate(edged, kernel2, iterations=1)
    docCnt = getContours(img_dilation)
    if docCnt is None:
        return Paper

    # The first extraction only supplies the shape widerTable needs.
    Table = exetractTable(CropPaper,docCnt)
    docCnt = widerTable(docCnt, Table.shape)
    return exetractTable(CropPaper,docCnt)

# Remove noise lines that are very near to each other.
def remove_nears(arr,K):
    """Return the sorted values of ``arr`` with near-duplicates removed.

    A value is dropped when it is less than ``K`` above the most recently
    kept value. The input is not modified; a new list is returned.
    """
    kept = []
    for value in sorted(arr):
        if kept and kept[-1] + K > value:
            continue
        kept.append(value)
    return kept

# Get the cells of a table image and return them as a 2D list.
def get_cells(image):
    """Cut a table image into its cells.

    image: BGR table image.

    Returns ``(cells, column_count)``. ``cells`` is column-major:
    ``cells[c][r]`` is the slice of ``image`` for column ``c`` and row ``r``.
    Each cell runs from the left edge of its left vertical line to the right
    edge of its right vertical line, so the line strokes are included.
    ``column_count`` equals ``len(cells)`` and is 0 (not -1) when fewer than
    two vertical lines are detected.
    """
    Ver_min_max_pos = get_Ver_Lines(image)[0]
    hor_positions = get_HOR_Lines(image,.4)

    cuttenCells = []
    for i in range(1, len(Ver_min_max_pos)):
        left = Ver_min_max_pos[i-1][0]
        right = Ver_min_max_pos[i][1]
        column = [
            image[hor_positions[j-1]:hor_positions[j], left:right]
            for j in range(1, len(hor_positions))
        ]
        cuttenCells.append(column)

    return cuttenCells,len(cuttenCells)
    

In [14]:
# return all cells of image 
def getCellsofImg(image):
    """Run the whole pipeline on one image: paper, then table, then cells.

    Returns the ``(cells, column_count)`` pair produced by get_cells.
    """
    paper = getPaper(image)
    cells, column_count = get_cells(getTable(paper, image))
    return cells, column_count

In [15]:
# Return the text inside the image using an existing OCR engine (pytesseract).
def getId(idCell,typeee="eng"):
    """Return the text pytesseract reads from a cell image.

    idCell: image handed to pytesseract.image_to_string.
    typeee: value passed as pytesseract's ``lang`` argument ("eng" by default).

    Every "|" and "/" character is stripped from the recognised text.
    """
    recognised = pytesseract.image_to_string(idCell, lang=typeee)
    for unwanted in ("|", "/"):
        recognised = recognised.replace(unwanted, "")
    return recognised

In [19]:

# Function to replace every image in the directory with its binary (black/white) version.
def changeToBW(folder_dir):
    """Overwrite every image in ``folder_dir`` with its binarised version.

    folder_dir: directory containing the images; a trailing path separator
        is no longer required, because the same joined path is now used for
        reading and writing.

    Each image is converted to grayscale and thresholded with inverted Otsu
    binarisation (values 0 and 255), then written back over the original
    file. Entries that cv2.imread cannot decode are skipped and left as is.
    """
    for image_name in os.listdir(folder_dir):
        image_path = os.path.join(folder_dir, image_name)
        image = cv2.imread(image_path,1)
        if image is None:
            # cv2.imread returns None instead of raising on unreadable input.
            continue
        graycell = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        threadhold = cv2.threshold(graycell,0,255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]
        cv2.imwrite(image_path, threadhold)

In [20]:
# Function to return the cells of a sheet => getCellsofImg(image): give it the sheet image and it returns its cells as a 2D array.
# Function to get the ID of a cell using an existing OCR engine => getId(idCell): give it the cell and it returns a string containing the value inside.
# Function to return the characters of a cell as images => getchars(cell): give it a cell and it returns an array of images, each holding one character of the cell.
# Function to return the symbols of a cell as images => getSymbols(cell): give it a cell and it returns an array of images, each holding one symbol of the cell.
# Function to return the cell without its black borders => removeCellBlackBorder(Borderdcell): returns the image without the black border.