In [None]:
import cv2
import os
import numpy as np
import pandas as pd
import spacy
from spacy import displacy
import pytesseract
from pytesseract import Output
import matplotlib.pyplot as plt
pytesseract.pytesseract.tesseract_cmd="/home/aman/anaconda3/envs/tallyInvoiceParser.env/bin/tesseract"
os.environ['TESSDATA_PREFIX'] = "/home/aman/anaconda3/envs/tallyInvoiceParser.env/share/tessdata"
import import_ipynb
from textPreProcessor import *
from imagePreProcessor import *
from datetime import datetime
import math

In [2]:
imagesFolderPath = '/home/aman/Documents/Tally/DocumentAI/Code/Tesseract-Modelv4/Dataset/SampleImages/'

In [3]:
def preProcessDataFrame(df, image):
    #drop null values
    df.dropna(inplace=True)
    
    #columns having same values throughout removed
    toDrop = ["level", "page_num"]
    df.drop(columns=toDrop, inplace=True)
    
    #rows having text as a stopword removed
    indexesToDrop = []
    stopwords = ["", " ", "  ", "/", ":", "-", ".", ",", "\n", "\t", "\\", "(", ")", "[", "]", "{", "}", "*", "&", "%", "$", "#", "|"]
    for index in df.index:
        if df["conf"][index]<=40:
            indexesToDrop.append(index)
        elif df["text"][index] in stopwords:
            indexesToDrop.append(index)   
    df.drop(indexesToDrop, inplace=True)
    
    #resetting the index
    df.reset_index(inplace=True)
    df.drop(columns=["index"], inplace=True)
    
    #creating new columns for centroid information
    df["x"] = df["left"] + df["width"] 
    df["y"] = df["top"] + df["height"]
    
    df["PageHeight"] = image.shape[0]
    df["PageWidth"] = image.shape[1]
    
    df["isTop"] = 0
    df["isBottom"] = 0
    df["isRight"] = 0
    df["isLeft"] = 0

In [4]:
def findDate(dateDF):
    dates = []
    
    format = "%d-%m-%Y"
    for index in dateDF.index:
        text = dateDF["text"][index]
        res = True
        try:
            res = bool(datetime.strptime(text, format))
        except ValueError:
            res = False
        if str(res)=="True":
            dates.append(dateDF["text"][index])
            
    format = "%m-%d-%Y"
    for index in dateDF.index:
        text = dateDF["text"][index]
        res = True
        try:
            res = bool(datetime.strptime(text, format))
        except ValueError:
            res = False
        if str(res)=="True":
            dates.append(dateDF["text"][index])
            
    format = "%y-%m-%d"
    for index in dateDF.index:
        text = dateDF["text"][index]
        res = True
        try:
            res = bool(datetime.strptime(text, format))
        except ValueError:
            res = False
        if str(res)=="True":
            dates.append(dateDF["text"][index])
            
    return dates

In [5]:
def extractDateDataFrame(df):
    dateDF = df.copy()
    for index in dateDF.index:
        text = dateDF["text"][index]
        dateDF["text"][index] = preProcessText(text)
    dates = findDate(dateDF)
    return dateDF, dates
    
        

In [6]:
def returnDist(x1, y1, x2, y2):
    p = [x1, y1]
    q = [x2, y2]
    return math.dist(p, q)

In [7]:
def allocateNeighbours(df, neighbourFields):
    for index in df.index:
        x1 = df["x"][index]/df["PageWidth"][index]
        y1 = df["y"][index]/df["PageHeight"][index]
        block = df["block_num"][index]
        neighbours = {}
        for index2 in df.index:
            x2 = df["x"][index2]/df["PageWidth"][index2]
            y2 = df["y"][index2]/df["PageHeight"][index2]
            block2 = df["block_num"][index2]
            dist = returnDist(x1, y1, x2, y2)
            if dist<0.2:
                neighbours[df["text"][index2].lower()]=dist
            if abs(y1-y2)<0.2 and abs(block2-block)<=1<=1:
                neighbours[df["text"][index2].lower()] = dist
        for n in neighbourFields:
            if n in neighbours:
                df[n][index]=neighbours[n]

In [8]:
def addNeighbours(df):
    neighbourFields = ["date", "dated", "invoice", "delivery", "order", "due", "payment", "tax", "bill", "receipt", "issue"]
    for col in neighbourFields:
        df[col]=0
    allocateNeighbours(df, neighbourFields)
    

In [9]:
def findDateDF(dates, dateDF):
    indexes = []
    for date in dates:
        i = dateDF[dateDF["text"]==date].index
        for index in i:
            indexes.append(index)
    return indexes

In [10]:
def dropIndexes(indexes, df):
    indexesToDrop=[]
    for index in df.index:
        if index not in indexes:
            indexesToDrop.append(index)
    df.drop(indexesToDrop, inplace=True)

In [11]:
def create(imagePath):
    image = cv2.imread(imagePath)
    preProcessedImage = preProcessImage(imagePath)
    data = pytesseract.image_to_data(preProcessedImage, output_type=Output.DATAFRAME)
    df = pd.DataFrame(data)
    preProcessDataFrame(df, preProcessedImage)
    dateDF, dates = extractDateDataFrame(df)
    addNeighbours(df)
    indexes = findDateDF(dates, dateDF)
    resultDF = df.copy()
    dropIndexes(indexes, resultDF)
    return resultDF
    
    

In [12]:
def batchCreate(folderPath):
    os.chdir(folderPath)
    images = os.listdir()
    dfList = []
    for imagePath in images:
        #print(imagePath)
        df = create(imagePath)
        df["imageName"] = imagePath
        df["output"] = 0
        dfList.append(df)
    return dfList

In [13]:
dfList = batchCreate(imagesFolderPath)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dateDF["text"][index] = preProcessText(text)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[n][index]=neighbours[n]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dateDF["text"][index] = preProcessText(text)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[n][index]=neighbours[n]
A value is trying to be 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dateDF["text"][index] = preProcessText(text)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[n][index]=neighbours[n]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dateDF["text"][index] = preProcessText(text)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[n][index]=neighbours[n]
A value is trying to be 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dateDF["text"][index] = preProcessText(text)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[n][index]=neighbours[n]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dateDF["text"][index] = preProcessText(text)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[n][index]=neighbours[n]
A value is trying to be 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dateDF["text"][index] = preProcessText(text)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[n][index]=neighbours[n]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dateDF["text"][index] = preProcessText(text)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dateDF["text"][index] = preProcessText(text)
A valu

In [14]:
len(dfList)

53

In [15]:
resultDF = pd.concat(dfList)

In [16]:
resultDF.shape

(44, 31)

In [17]:
len(resultDF["imageName"].unique())

26

In [18]:
resultDF

Unnamed: 0,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,...,delivery,order,due,payment,tax,bill,receipt,issue,imageName,output
108,7,9,2,3,300,1105,155,23,96.191299,02.04.2023,...,0.0,0.151753,0.0,0.0,0.528837,0.0,0.0,0.0,20.jpeg,0
114,7,9,3,4,1379,1146,155,23,95.732468,02.04.2023,...,0.0,0.804302,0.0,0.0,0.131721,0.0,0.0,0.0,20.jpeg,0
157,20,1,2,7,873,1833,106,18,69.672714,"02/04/2023,",...,0.0,0.0,0.0,0.162403,0.421051,0.0,0.0,0.0,20.jpeg,0
174,26,1,1,4,873,1904,106,18,96.788101,"02/04/2023,",...,0.0,0.0,0.0,0.132623,0.0,0.0,0.0,0.0,20.jpeg,0
51,9,1,1,3,588,897,88,29,71.175232,"9/10/2022,",...,0.0,0.0,0.0,0.148762,0.0,0.0,0.0,0.0,23.jpeg,0
52,9,1,1,4,745,898,92,28,64.239807,"16-10-2022,",...,0.0,0.0,0.0,0.139577,0.0,0.0,0.0,0.0,23.jpeg,0
68,9,3,1,3,588,1037,84,19,94.234421,9/10/2022,...,0.0,0.0,0.0,0.102512,0.0,0.0,0.0,0.0,23.jpeg,0
69,9,3,1,4,745,1031,88,32,84.635925,16-10-2022,...,0.0,0.0,0.0,0.083202,0.0,0.0,0.0,0.0,23.jpeg,0
12,4,1,2,3,84,320,113,19,70.813957,04/14/2022,...,0.0,0.0,0.0,0.0,0.050256,0.094122,0.0,0.0,18.jpeg,0
19,9,1,6,2,1216,328,127,22,73.269348,15/04/2022,...,0.0,0.0,0.0,0.41068,0.0,0.0,0.25286,0.0,39.jpeg,0


In [19]:
resultDF.reset_index(inplace=True)

In [20]:
resultDF.drop(columns=["index"], inplace=True)

In [21]:
resultDF.head()

Unnamed: 0,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,...,delivery,order,due,payment,tax,bill,receipt,issue,imageName,output
0,7,9,2,3,300,1105,155,23,96.191299,02.04.2023,...,0.0,0.151753,0.0,0.0,0.528837,0.0,0.0,0.0,20.jpeg,0
1,7,9,3,4,1379,1146,155,23,95.732468,02.04.2023,...,0.0,0.804302,0.0,0.0,0.131721,0.0,0.0,0.0,20.jpeg,0
2,20,1,2,7,873,1833,106,18,69.672714,"02/04/2023,",...,0.0,0.0,0.0,0.162403,0.421051,0.0,0.0,0.0,20.jpeg,0
3,26,1,1,4,873,1904,106,18,96.788101,"02/04/2023,",...,0.0,0.0,0.0,0.132623,0.0,0.0,0.0,0.0,20.jpeg,0
4,9,1,1,3,588,897,88,29,71.175232,"9/10/2022,",...,0.0,0.0,0.0,0.148762,0.0,0.0,0.0,0.0,23.jpeg,0


In [22]:
pwd

'/home/aman/Documents/Tally/DocumentAI/Code/Tesseract-Modelv4/Dataset/SampleImages'

In [23]:
os.chdir('/home/aman/Documents/Tally/DocumentAI/Code/Tesseract-Modelv4/Dataset/')

In [24]:
pwd

'/home/aman/Documents/Tally/DocumentAI/Code/Tesseract-Modelv4/Dataset'

In [25]:
resultDF.to_csv("tesseract.csv")