In [3]:
#importing libraries

import cv2
import os
import numpy as np
import pandas as pd
import pytesseract
from pytesseract import Output
import matplotlib.pyplot as plt
pytesseract.pytesseract.tesseract_cmd="/home/aman/anaconda3/envs/tallyInvoiceParser.env/bin/tesseract"
os.environ['TESSDATA_PREFIX'] = "/home/aman/anaconda3/envs/tallyInvoiceParser.env/share/tessdata"
import import_ipynb
from imagePreProcessor import *
from TesseractModify import *
from datetime import datetime
import math
import dateparser
import datefinder

In [4]:
# Folder of sample invoice images; it is the argument of the commented-out
# batchCreate(...) call further down in this notebook.
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
imagesFolderPath = '/home/aman/Documents/Tally/Git-Document-AI/Document-AI/InvoiceDateModel/Dataset/SampleImages/'

In [5]:
def preProcessDataFrame(df, image):
    """Clean an OCR word table in place and add the feature columns.

    Rows containing nulls, and rows whose ``text`` is a bare punctuation or
    whitespace token, are removed; the index is then renumbered from 0.
    The following columns are added: ``x``/``y``, ``PageHeight``/``PageWidth``,
    the four zone flags (``isTop``, ``isBottom``, ``isRight``, ``isLeft``) and
    the ``isPast``/``isFuture`` flags. Every flag starts at 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``text``, ``left``, ``top``, ``width`` and ``height`` columns.
        Modified in place.
    image : object exposing ``shape``
        ``shape[0]`` is stored as the page height, ``shape[1]`` as the width.

    Returns
    -------
    None
    """
    # tokens that are dropped when they are the entire text of a row
    stopwords = ["", " ", "  ", "/", ":", "-", ".", ",", "\n", "\t", "\\", "(", ")", "[", "]", "{", "}", "*", "&", "%", "$", "#", "|"]

    # drop null values
    df.dropna(inplace=True)

    # rows having text as a stopword removed (vectorised membership test)
    df.drop(index=df[df["text"].isin(stopwords)].index, inplace=True)

    # resetting the index; drop=True discards the old labels directly, so this
    # also works when the index is named or an "index" column already exists
    df.reset_index(drop=True, inplace=True)

    # NOTE(review): the original comment called these "centroid" columns, but
    # the values are left + width and top + height (the far edges of the box),
    # not its centre. The computation is kept as-is so existing features do
    # not change.
    df["x"] = df["left"] + df["width"]
    df["y"] = df["top"] + df["height"]

    # new columns for page width and height for further normalization
    df["PageHeight"] = image.shape[0]
    df["PageWidth"] = image.shape[1]

    # zonal info (4 zones) and past/future markers, all initialised to 0
    for flag in ("isTop", "isBottom", "isRight", "isLeft", "isPast", "isFuture"):
        df[flag] = 0

In [6]:
def verifyDate(text, maxYearGap=2):
    """Tell whether ``text`` contains a date close to the current year.

    ``datefinder.find_dates`` is run in strict mode and every date it yields
    is checked. The original loop returned after looking at the first date
    only, so a later in-range date could never be accepted.

    Parameters
    ----------
    text : str
        Text to scan for dates.
    maxYearGap : int, optional
        Largest allowed absolute difference between a date's year and the
        current year. Defaults to 2.

    Returns
    -------
    bool
        True if at least one date found is within ``maxYearGap`` years of the
        current year; False otherwise, including when no date is found.
    """
    presentYear = datetime.today().year
    return any(
        abs(found.year - presentYear) <= maxYearGap
        for found in datefinder.find_dates(text, strict=True)
    )

In [7]:
def findDate(dateDF):
    """Collect the texts that pass ``verifyDate`` and flag them past/future.

    For every row whose ``text`` passes ``verifyDate``, the text is appended
    to the result and the first date returned by a non-strict
    ``datefinder.find_dates`` call is compared with the current time:
    ``isPast`` is set to 1 when it is not later than now, otherwise
    ``isFuture`` is set to 1.

    Parameters
    ----------
    dateDF : pandas.DataFrame
        Needs ``text``, ``isPast`` and ``isFuture`` columns. The two flag
        columns are updated in place.

    Returns
    -------
    list
        The accepted ``text`` values, in row order.
    """
    dates = []
    today = datetime.today()

    for index in dateDF.index:
        text = dateDF.loc[index, "text"]
        if not verifyDate(text):
            continue
        dates.append(text)

        found = list(datefinder.find_dates(text))[0]
        # a timezone-aware result cannot be compared with a naive "now"
        reference = today if found.tzinfo is None else datetime.now(found.tzinfo)

        # .loc writes into dateDF itself; the chained form
        # dateDF["isPast"][index] = 1 assigns to a temporary under pandas
        # copy-on-write and leaves the frame unchanged
        if found <= reference:
            dateDF.loc[index, "isPast"] = 1
        else:
            dateDF.loc[index, "isFuture"] = 1

    return dates

In [8]:
def extractDateDataFrame(df):
    """Return a lower-cased copy of ``df`` together with its date texts.

    The input frame is left untouched. The copy's ``text`` column is
    converted to lower-case strings and then handed to ``findDate``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column, plus whatever columns ``findDate``
        reads and writes.

    Returns
    -------
    tuple
        ``(dateDF, dates)``: the lower-cased copy and the list returned by
        ``findDate`` for it.
    """
    dateDF = df.copy()
    # Whole-column assignment replaces the per-row chained write
    # (dateDF["text"][index] = ...), which has no effect under pandas
    # copy-on-write. astype(str) keeps non-string tokens from raising.
    dateDF["text"] = dateDF["text"].astype(str).str.lower()
    dates = findDate(dateDF)
    return dateDF, dates
    
        

In [9]:
def returnDist(x1, y1, x2, y2):
    """Return the Euclidean distance between points (x1, y1) and (x2, y2)."""
    return math.dist((x1, y1), (x2, y2))

In [10]:
def allocateNeighbours(df, neighbourFields):
    """Fill each keyword column with the distance to a nearby keyword row.

    Positions are normalised by page size (``x / PageWidth``,
    ``y / PageHeight``). For a given row, another row counts as a neighbour
    when either its normalised distance is below 0.2, or its vertical offset
    is below 0.2 and its ``block_num`` differs by at most 1. When a
    neighbour's lower-cased ``text`` equals one of ``neighbourFields``, the
    distance is written to the column of that name. If several neighbours
    share the same keyword, the one latest in row order wins (not
    necessarily the closest), as in the original implementation.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``x``, ``y``, ``PageWidth``, ``PageHeight``, ``block_num`` and
        ``text`` columns, plus one existing column per entry of
        ``neighbourFields``. Updated in place.
    neighbourFields : iterable of str
        Keywords to look for; compared against lower-cased text.

    Returns
    -------
    None
    """
    wanted = set(neighbourFields)

    # pull everything out of pandas once instead of per-cell lookups
    xs = (df["x"] / df["PageWidth"]).tolist()
    ys = (df["y"] / df["PageHeight"]).tolist()
    blocks = df["block_num"].tolist()
    texts = [str(text).lower() for text in df["text"]]

    # only rows whose text is a keyword can ever produce a value, so the
    # inner scan is restricted to them
    candidates = [pos for pos, text in enumerate(texts) if text in wanted]

    for pos, index in enumerate(df.index):
        neighbours = {}
        for other in candidates:
            dist = returnDist(xs[pos], ys[pos], xs[other], ys[other])
            # original condition read "<=1<=1", a harmless typo for "<=1"
            sameBand = (
                abs(ys[pos] - ys[other]) < 0.2
                and abs(blocks[other] - blocks[pos]) <= 1
            )
            if dist < 0.2 or sameBand:
                neighbours[texts[other]] = dist

        for field, dist in neighbours.items():
            # the column may still hold integer zeros; make it float before
            # storing a distance so no implicit upcast is needed
            if df[field].dtype.kind != "f":
                df[field] = df[field].astype(float)
            # .loc writes into df itself, unlike the chained df[n][index] = ...
            df.loc[index, field] = dist

In [11]:
def addNeighbours(df):
    """Add one zero-filled column per keyword, then run allocateNeighbours.

    ``df`` is modified in place; nothing is returned.
    """
    neighbourFields = [
        "date",
        "dated",
        "invoice",
        "delivery",
        "order",
        "due",
        "payment",
        "tax",
        "bill",
        "receipt",
        "issue",
    ]
    # every keyword column starts at 0 before the neighbour scan fills it
    for field in neighbourFields:
        df[field] = 0
    allocateNeighbours(df, neighbourFields)
    

In [12]:
def findDateDF(dates, dateDF):
    """Return the index labels of rows whose ``text`` equals any of ``dates``.

    Labels are gathered date by date, so a label appears once for every
    matching entry of ``dates`` (repeats are not removed).
    """
    matched = []
    for wanted in dates:
        matched.extend(dateDF.loc[dateDF["text"] == wanted].index)
    return matched

In [13]:
def dropIndexes(indexes, df):
    """Remove, in place, every row of ``df`` whose label is not in ``indexes``."""
    unwanted = [label for label in df.index if label not in indexes]
    df.drop(unwanted, inplace=True)

In [14]:
def create(imagePath):
    """Build the feature rows for the date-like tokens of one image.

    Steps: pre-process the image, run ``pytesseract.image_to_data``, clean
    the word table, detect date texts on a lower-cased copy, add the keyword
    neighbour distances, and keep only the rows detected as dates.

    Parameters
    ----------
    imagePath : str
        Path handed to ``preProcessImage``.

    Returns
    -------
    pandas.DataFrame
        A copy of the word table restricted to the rows whose text was
        accepted as a date.
    """
    # The original also called cv2.imread(imagePath) here, but the result was
    # never used; only the pre-processed image feeds OCR and the page size.
    preProcessedImage = preProcessImage(imagePath)
    data = pytesseract.image_to_data(preProcessedImage, output_type=Output.DATAFRAME)
    data.dropna(inplace=True)
    data.reset_index(drop=True, inplace=True)
    processData(data)

    df = pd.DataFrame(data)
    preProcessDataFrame(df, preProcessedImage)
    dateDF, dates = extractDateDataFrame(df)

    # findDate sets isPast/isFuture on the lower-cased copy only; copy them
    # back, otherwise the returned frame always carries zeros in both columns
    df["isPast"] = dateDF["isPast"]
    df["isFuture"] = dateDF["isFuture"]

    addNeighbours(df)
    indexes = findDateDF(dates, dateDF)
    resultDF = df.copy()
    dropIndexes(indexes, resultDF)
    return resultDF
    
    

In [15]:
def batchCreate(folderPath):
    """Run ``create`` on every file directly inside ``folderPath``.

    Files are processed in sorted name order so repeated runs give the same
    list. Sub-directories are skipped. The working directory is not changed:
    each file is addressed by its joined path instead of ``os.chdir``.

    Parameters
    ----------
    folderPath : str
        Directory containing the images.

    Returns
    -------
    list of pandas.DataFrame
        One frame per file, each with an ``imageName`` column (the bare file
        name) and an ``output`` column initialised to 0.
    """
    dfList = []
    for imageName in sorted(os.listdir(folderPath)):
        imagePath = os.path.join(folderPath, imageName)
        if not os.path.isfile(imagePath):
            # e.g. checkpoint folders; not something OCR can be run on
            continue
        df = create(imagePath)
        df["imageName"] = imageName
        df["output"] = 0
        dfList.append(df)
    return dfList

In [16]:
#dfList = batchCreate(imagesFolderPath)

In [17]:
#len(dfList)

In [18]:
#resultDF = pd.concat(dfList)

In [19]:
#resultDF.shape

In [20]:
#len(resultDF["imageName"].unique())

In [21]:
#resultDF

In [22]:
#resultDF.reset_index(inplace=True)

In [23]:
#resultDF.drop(columns=["index", "nextDist", "block_num", "line_num"], inplace=True)

In [24]:
#resultDF.head()

In [25]:
#pwd

In [28]:
#os.chdir('/home/aman/Documents/Tally/Git-Document-AI/Document-AI/InvoiceDateModel/Dataset/')

In [29]:
#pwd

In [30]:
#resultDF.to_csv("tesseract.csv")