In [3]:
import cv2
import os
import numpy as np
import pandas as pd
import spacy
from spacy import displacy
import pytesseract
from pytesseract import Output
import matplotlib.pyplot as plt
pytesseract.pytesseract.tesseract_cmd="/home/aman/anaconda3/envs/tallyInvoiceParser.env/bin/tesseract"
os.environ['TESSDATA_PREFIX'] = "/home/aman/anaconda3/envs/tallyInvoiceParser.env/share/tessdata"
import import_ipynb
from textPreProcessor import *
from imagePreProcessor import *
from datetime import datetime
import math

In [4]:
# Directory of sample images. This is an absolute, machine-specific path; within
# this notebook it is only referenced by the commented-out batchCreate call.
imagesFolderPath = '/home/aman/Documents/Tally/DocumentAI/Code/Tesseract-Modelv3/Dataset/SampleImages/'

In [6]:
def preProcessDataFrame(df, image):
    """Clean an OCR word table in place and append layout columns.

    The frame is mutated directly and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "level", "page_num", "conf", "text",
        "left", "top", "width" and "height".
    image : object exposing ``.shape``
        ``shape[0]`` is stored as the page height and ``shape[1]`` as the
        page width on every row.
    """
    # Discard every row that has a missing value in any column.
    df.dropna(inplace=True)

    # These two columns hold the same value on every row, so they are removed.
    df.drop(columns=["level", "page_num"], inplace=True)

    # Remove words with confidence <= 40 and tokens that are only
    # punctuation/whitespace.
    noiseTokens = ["", " ", "  ", "/", ":", "-", ".", ",", "\n", "\t", "\\", "(", ")", "[", "]", "{", "}", "*", "&", "%", "$", "#", "|"]
    isLowConfidence = df["conf"] <= 40
    isNoise = df["text"].isin(noiseTokens)
    rejected = (isLowConfidence | isNoise).to_numpy()
    df.drop(df.index[rejected], inplace=True)

    # Renumber the surviving rows 0..n-1.
    df.reset_index(drop=True, inplace=True)

    # Reference point of each word box.
    # NOTE(review): left + width / top + height is the right and bottom edge of
    # the box, not its centroid - confirm which one is intended.
    df["x"] = df["left"] + df["width"]
    df["y"] = df["top"] + df["height"]

    # Page dimensions repeated on every row.
    pageHeight, pageWidth = image.shape[0], image.shape[1]
    df["PageHeight"] = pageHeight
    df["PageWidth"] = pageWidth

    # Position flags, all initialised to 0.
    for flag in ("isTop", "isBottom", "isRight", "isLeft"):
        df[flag] = 0

In [7]:
def findDate(dateDF):
    """Return the "text" values of dateDF that parse as a date.

    Three formats are tried in this order: day-month-year ("%d-%m-%Y"),
    month-day-year ("%m-%d-%Y") and two-digit-year-month-day ("%y-%m-%d").

    Parameters
    ----------
    dateDF : pandas.DataFrame
        Frame with a "text" column.

    Returns
    -------
    list
        Matching values grouped by format (all matches of the first format,
        then the second, then the third), each group in row order. A value
        that is valid under more than one format (e.g. "12-03-2021") appears
        once per matching format.
    """
    dateFormats = ("%d-%m-%Y", "%m-%d-%Y", "%y-%m-%d")
    texts = list(dateDF["text"])

    dates = []
    for dateFormat in dateFormats:
        for text in texts:
            try:
                datetime.strptime(text, dateFormat)
            except (ValueError, TypeError):
                # ValueError: the text does not match this format.
                # TypeError: the cell is not a string (e.g. a numeric OCR
                # value); it cannot be a formatted date, so skip it.
                continue
            dates.append(text)
    return dates

In [None]:
def extractDateDataFrame(df):
    """Build a text-normalised copy of df and list its date-like values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "text" column. It is not modified.

    Returns
    -------
    tuple
        ``(dateDF, dates)`` where ``dateDF`` is a copy of ``df`` whose "text"
        column has been passed through ``preProcessText`` and ``dates`` is
        the result of ``findDate(dateDF)``.
    """
    dateDF = df.copy()
    # Replace the whole column in one assignment. The previous element-wise
    # `dateDF["text"][index] = ...` was chained assignment, which pandas warns
    # about and which does not update the frame under copy-on-write.
    dateDF["text"] = dateDF["text"].map(preProcessText)
    dates = findDate(dateDF)
    return dateDF, dates
    
        

In [None]:
def returnDist(x1, y1, x2, y2):
    """Return the Euclidean distance between points (x1, y1) and (x2, y2)."""
    return math.dist((x1, y1), (x2, y2))

In [None]:
def allocateNeighbours(df, neighbourFields):
    """Set keyword flag columns to 1 for rows that have that keyword nearby.

    For every row, the lower-cased text of each row (including itself) is
    treated as a neighbour when either

    * the distance between the two (x, y) points is below a radius derived
      from the page size, or
    * the two rows are on roughly the same line (|y1 - y2| below 5% of the
      page height), ``x2 - x1`` is below the page width, and both the block
      number and the paragraph number differ by at most 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "x", "y", "block_num", "par_num", "PageHeight",
        "PageWidth", "text" and one column per entry of ``neighbourFields``.
        Modified in place.
    neighbourFields : iterable of str
        Lower-case keywords; ``df[keyword]`` is set to 1 on each row whose
        neighbour texts contain exactly that keyword.
    """
    xs = df["x"].tolist()
    ys = df["y"].tolist()
    blocks = df["block_num"].tolist()
    paras = df["par_num"].tolist()
    pageHeights = df["PageHeight"].tolist()
    pageWidths = df["PageWidth"].tolist()
    # str() keeps non-string OCR cells (e.g. numbers) from crashing .lower().
    texts = [str(text).lower() for text in df["text"]]

    # The search radius depends only on the candidate row's page size, so it
    # is computed once per row rather than once per pair of rows.
    # NOTE(review): `//` is floor division, so the height/width ratio is 0
    # whenever the page is wider than it is tall, and the radius collapses to
    # 0.2 * PageWidth - confirm this is intended.
    radii = []
    for pageHeight, pageWidth in zip(pageHeights, pageWidths):
        hr = pageHeight // pageWidth
        x = pageWidth * 0.2
        y = pageHeight * 0.2 * hr
        radii.append(math.sqrt((x ** 2) + (y ** 2)))

    rowCount = len(texts)
    for pos, index in enumerate(df.index):
        x1, y1 = xs[pos], ys[pos]
        block, para = blocks[pos], paras[pos]
        neighbours = set()
        for pos2 in range(rowCount):
            x2, y2 = xs[pos2], ys[pos2]
            # Same Euclidean distance as returnDist(x1, y1, x2, y2).
            dist = math.dist((x1, y1), (x2, y2))
            withinRadius = dist < radii[pos2]
            # NOTE(review): (x2 - x1) has no abs(), so this term only excludes
            # rows a full page width or more to the right - confirm intent.
            onSameLine = (
                abs(y1 - y2) < pageHeights[pos2] * 0.05
                and (x2 - x1) < pageWidths[pos2]
                and abs(blocks[pos2] - block) <= 1
                # Was `abs(para2==para)<=1`, which is always true.
                and abs(paras[pos2] - para) <= 1
            )
            if withinRadius or onSameLine:
                neighbours.add(texts[pos2])
        for n in neighbourFields:
            if n in neighbours:
                # Single-step label assignment; chained `df[n][index] = 1`
                # is not guaranteed to write through to df.
                df.at[index, n] = 1

In [None]:
def addNeighbours(df):
    """Add one 0-initialised column per keyword to df, then flag neighbours.

    The columns are created in place and ``allocateNeighbours`` is called
    with the same keyword list to fill them in. Nothing is returned.
    """
    neighbourFields = [
        "date", "dated", "invoice", "delivery", "order", "due",
        "payment", "tax", "bill", "receipt", "issue",
    ]
    # Create the flag columns before they are filled in.
    for keyword in neighbourFields:
        df[keyword] = 0
    allocateNeighbours(df, neighbourFields)
    

In [None]:
def findDateDF(dates, dateDF):
    """Collect the index labels of rows whose "text" equals one of dates.

    Labels are returned grouped by date, in the order the dates are given;
    a date listed twice contributes its labels twice.
    """
    textColumn = dateDF["text"]
    return [
        label
        for date in dates
        for label in dateDF[textColumn == date].index
    ]

In [None]:
def dropIndexes(indexes, df):
    """Remove, in place, every row of df whose index label is not in indexes."""
    unwanted = [label for label in df.index if label not in indexes]
    df.drop(unwanted, inplace=True)

In [None]:
def create(imagePath):
    """Run OCR on one image and return the rows whose text looks like a date.

    Steps: preprocess the image, run Tesseract word-level OCR into a
    DataFrame, clean it with ``preProcessDataFrame``, find date-like texts
    with ``extractDateDataFrame``, add keyword-neighbour flags with
    ``addNeighbours``, and keep only the rows found by ``findDateDF``.

    Parameters
    ----------
    imagePath : str
        Path handed to ``preProcessImage``.

    Returns
    -------
    pandas.DataFrame
        A copy of the cleaned OCR table restricted to the date rows. The
        "text" column holds the OCR text as cleaned by
        ``preProcessDataFrame``; the ``preProcessText`` normalisation is only
        applied to the intermediate frame used for date matching.
    """
    # The raw image is not read here: the previous cv2.imread result was
    # never used, so it only decoded each file a second time.
    # NOTE(review): assumes preProcessImage returns an array-like with
    # `.shape`, since preProcessDataFrame reads image.shape - confirm.
    preProcessedImage = preProcessImage(imagePath)
    data = pytesseract.image_to_data(preProcessedImage, output_type=Output.DATAFRAME)
    df = pd.DataFrame(data)
    preProcessDataFrame(df, preProcessedImage)
    dateDF, dates = extractDateDataFrame(df)
    addNeighbours(df)
    # dateDF is a copy of df, so its index labels also identify rows of df.
    indexes = findDateDF(dates, dateDF)
    resultDF = df.copy()
    dropIndexes(indexes, resultDF)
    return resultDF
    
    

In [None]:
def batchCreate(folderPath):
    """Run ``create`` on every file in folderPath.

    Parameters
    ----------
    folderPath : str
        Directory whose files are processed. Sub-directories are skipped.

    Returns
    -------
    list of pandas.DataFrame
        One frame per file, in sorted file-name order, each with two extra
        columns: "imageName" (the bare file name) and "output" (always 0).

    Notes
    -----
    The process working directory is left untouched; files are addressed by
    joining their name onto ``folderPath`` instead of calling ``os.chdir``.
    """
    dfList = []
    # sorted() makes the result order reproducible; os.listdir order is arbitrary.
    for imageName in sorted(os.listdir(folderPath)):
        imagePath = os.path.join(folderPath, imageName)
        if not os.path.isfile(imagePath):
            continue
        df = create(imagePath)
        df["imageName"] = imageName
        df["output"] = 0
        dfList.append(df)
    return dfList

In [None]:
#dfList = batchCreate(imagesFolderPath)

In [None]:
#len(dfList)

In [None]:
#resultDF = pd.concat(dfList)

In [None]:
#resultDF.shape

In [None]:
#resultDF

In [None]:
#resultDF.reset_index(inplace=True)

In [None]:
#resultDF.drop(columns=["index"], inplace=True)

In [None]:
#resultDF.head()

In [None]:
#pwd

In [None]:
#os.chdir('/home/aman/Documents/Tally/DocumentAI/Code/Tesseract-Modelv2/Dataset/')

In [None]:
#pwd

In [None]:
#resultDF.to_csv("tesseract.csv")