In [4]:
#importing libraries

import cv2
import os
import numpy as np
import pandas as pd
import pytesseract
from pytesseract import Output
import matplotlib.pyplot as plt
pytesseract.pytesseract.tesseract_cmd="/home/aman/anaconda3/envs/tallyInvoiceParser.env/bin/tesseract"
os.environ['TESSDATA_PREFIX'] = "/home/aman/anaconda3/envs/tallyInvoiceParser.env/share/tessdata"
import import_ipynb
from imagePreProcessor import *
from TesseractModify import *
from datetime import datetime
import math
from nltk.tag import StanfordNERTagger
os.environ['CLASSPATH'] = '/home/aman/Documents/Tally/Dependencies/StanfordNER/stanford-ner-2020-11-17/stanford-ner.jar'
os.environ['STANFORD_MODELS'] = '/home/aman/Documents/Tally/Dependencies/StanfordNER/stanford-corenlp-4.4.0-models-english/edu/stanford/nlp/models/ner/'
java_path = "/usr/lib/jvm/java-7-openjdk-amd64"
os.environ['JAVAHOME'] = java_path
stanford_classifier  =  '/home/aman/Documents/Tally/Dependencies/StanfordNER/stanford-corenlp-4.4.0-models-english/edu/stanford/nlp/models/ner/english.all.3class.caseless.distsim.crf.ser.gz'
from IPython.display import clear_output
from sklearn.cluster import KMeans

In [5]:
st = StanfordNERTagger(stanford_classifier)

In [6]:
imagesFolderPath = '/home/aman/Documents/Tally/Git-Document-AI/Document-AI/SellerNameModel/Dataset/SampleImages/'

In [22]:
def extractOrg(df):
    orgs = []
    texts = df["text"].to_numpy()
    T = ""
    for text in texts:
        T+=text+" "
    tagged = st.tag(str(T).split())
    for tag in tagged:
        if tag[1]=="ORGANIZATION" or tag[1]=="PERSON":
            orgs.append(tag[0])
    return orgs

In [23]:
def extract(df, orgs):
    indexes = []
    for index in df.index:
        if df["text"][index] not in orgs:
            indexes.append(index)
    D = df.drop(indexes)
    return D

In [24]:
def process(data, image):
    data.reset_index(inplace=True)
    data.drop(columns=["index"], inplace=True)
    data["x"] = data["left"] + data["width"]/2
    data["y"] = data["top"] + data["height"]/2
    data["PageWidth"] = image.shape[1]
    data["PageHeight"] = image.shape[0]
    for index in data.index:
        data["text"][index] = data["text"][index].lower()
    neighbours = ["seller", "buyer", "receiver", "bill", "to", "from", "by", "ship", "customer", "details", "purchaser", "supplier", "consignee", "client", "owner", "invoice", "for", "billing", "store", "information", "receipt", "recipient"]
    for col in neighbours:
        data[col]=0
    
    

In [25]:
neighbours = ["seller", "buyer", "receiver", "bill", "to", "from", "by", "ship", "customer", "details", "purchaser", "supplier", "consignee", "client", "owner", "invoice", "for", "billing", "store", "information", "receipt", "recipient"]

In [26]:
def returnDist(x1, y1, x2, y2):
    p = [x1, y1]
    q = [x2, y2]
    return math.dist(p, q)

In [27]:
def allocateNeighbours(df, neighbourFields):
    for index in df.index:
        x1 = df["x"][index]/df["PageWidth"][index]
        y1 = df["y"][index]/df["PageHeight"][index]
        block = df["block_num"][index]
        neighbours = {}
        for index2 in df.index:
            x2 = df["x"][index2]/df["PageWidth"][index2]
            y2 = df["y"][index2]/df["PageHeight"][index2]
            block2 = df["block_num"][index2]
            dist = returnDist(x1, y1, x2, y2)
            if dist<0.2:
                neighbours[df["text"][index2].lower()]=dist
            if abs(y1-y2)<0.2 and abs(block2-block)<=1<=1:
                neighbours[df["text"][index2].lower()] = dist
        for n in neighbourFields:
            if n in neighbours:
                df[n][index]=neighbours[n]

In [28]:
def cluster(df):
    kmeans = KMeans(n_clusters=2, n_init="auto", random_state=0)
    X = df["x"].to_numpy()
    Y = df["y"].to_numpy()
    train = [[X[index], Y[index]] for index in range(len(X))]
    if len(train)<2:
        df["cluster"]=-1
        return
    kmeans.fit(train)
    df["cluster"]=0
    for index in df.index:
        x = df["x"][index]
        y = df["y"][index]
        pred = kmeans.predict([[x, y]])
        if pred==0:
            pass
        else:
            df["cluster"][index]=1

In [29]:
def merge(data):
    df = data.drop(columns=["level", "block_num", "page_num", "par_num", "line_num", "word_num", "left", "top", "width", "height"])
    conf = []
    T = ""
    x = []
    y = []
    for index in df.index:
        conf.append(df["conf"][index])
        T+=df["text"][index] + " "
        x.append(df["x"][index])
        y.append(df["y"][index])
        PW = df["PageWidth"][index]
        PH = df["PageHeight"][index]
        cluster = df["cluster"][index]
    dict = {}
    for col in neighbours:
        if df[df[col]>0].shape[0]==0:
            dict[col]=0
        else:
            dict[col]=min(df[df[col]>0][col].to_numpy())
    C = sum(conf)/len(conf)
    X = sum(x)/len(x)
    Y = sum(y)/len(y)
    merged = [C, T, X, Y, PW, PH]
    for col in neighbours:
        merged.append(dict[col])
    merged.append(cluster)
    return merged
        
    

In [30]:
def concat(df):
    
    df.reset_index(inplace=True)
    df.drop(columns=["index"], inplace=True)
    
    if df[df["cluster"]==0].shape[0] == 0 :
        df.drop(columns=["level", "block_num", "page_num", "par_num", "line_num", "word_num", "left", "top", "width", "height"], inplace=True)
        return df
        
    df1 = df[df["cluster"]==0]
    df2 = df[df["cluster"]==1]
    rows = []
    row1 = merge(df1)
    row2 = merge(df2)
    rows.append(row1)
    rows.append(row2)
    for block in df1["block_num"].unique():
        df = df1[df1["block_num"]==block]
        r = merge(df)
        rows.append(r)
    for block in df2["block_num"].unique():
        df = df2[df2["block_num"]==block]
        r = merge(df)
        rows.append(r)
        
    df.drop(columns=["level", "block_num", "page_num", "par_num", "line_num", "word_num", "left", "top", "width", "height"], inplace=True)

    indexes = []
    for index in df.index:
        indexes.append(index)
    
    for row in rows:
        df.loc[len(df.index)] = row
        
    df.drop(indexes, inplace=True)
        
    return df
    
        
    
    

In [31]:
def create(imagePath):
    image = cv2.imread(imagePath)
    preProcessedImage = preProcessImage(imagePath)
    data = pytesseract.image_to_data(preProcessedImage, output_type=Output.DATAFRAME)
    data.dropna(inplace=True)
    data.reset_index(inplace=True)
    data.drop(columns=["index"], inplace=True)
    df = pd.DataFrame(data)
    process(df, image)
    allocateNeighbours(df, neighbours)
    orgs = extractOrg(df)
    df = extract(df, orgs)
    cluster(df)
    d = concat(df)
    return d
    

In [32]:
def batchCreate(folderPath):
    os.chdir(folderPath)
    images = os.listdir()
    dfList = []
    for imagePath in images:
        #print(imagePath)
        df = create(imagePath)
        df["imageName"] = imagePath
        df["output"] = 0
        dfList.append(df)
    return dfList

In [33]:
pwd

'/home/aman/Documents/Tally/Git-Document-AI/Document-AI/SellerNameModel'

In [34]:
imagePath = '/home/aman/Documents/Tally/Git-Document-AI/Document-AI/SellerNameModel/Dataset/SampleImages/8.jpeg'

In [35]:
df = create(imagePath)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["text"][index] = data["text"][index].lower()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[n][index]=neighbours[n]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["cluster"][index]=1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["cluster"][index]=1
A value is trying to be set on a copy of a sl

In [36]:
df

Unnamed: 0,conf,text,x,y,PageWidth,PageHeight,seller,buyer,receiver,bill,...,client,owner,invoice,for,billing,store,information,receipt,recipient,cluster
6,89.46575,solutions pvt ltd amr tech park karnataka indi...,195.333333,902.916667,1654,2339,0,0,0,0,...,0,0,0.102091,0.061838,0,0,0,0,0,0
7,83.922053,‘databytes consulting technologies pvt ltd. co...,885.9375,355.53125,1654,2339,0,0,0,0,...,0,0,0.498386,0.0,0,0,0,0,0,1
8,85.890165,solutions pvt ltd amr tech park karnataka india,174.125,651.8125,1654,2339,0,0,0,0,...,0,0,0.102091,0.0,0,0,0,0,0,0
9,96.616918,state bank of india,237.75,1405.125,1654,2339,0,0,0,0,...,0,0,0.0,0.061838,0,0,0,0,0,0
10,78.79592,‘databytes consulting technologies pvt ltd.,915.7,96.1,1654,2339,0,0,0,0,...,0,0,0.0,0.0,0,0,0,0,0,1
11,88.994286,consulting,439.5,163.0,1654,2339,0,0,0,0,...,0,0,0.0,0.0,0,0,0,0,0,1
12,96.61779,karnataka india,691.0,235.0,1654,2339,0,0,0,0,...,0,0,0.0,0.0,0,0,0,0,0,1


In [37]:
df.columns

Index(['conf', 'text', 'x', 'y', 'PageWidth', 'PageHeight', 'seller', 'buyer',
       'receiver', 'bill', 'to', 'from', 'by', 'ship', 'customer', 'details',
       'purchaser', 'supplier', 'consignee', 'client', 'owner', 'invoice',
       'for', 'billing', 'store', 'information', 'receipt', 'recipient',
       'cluster'],
      dtype='object')

In [38]:
df[df["cluster"]==1]["text"].to_numpy()

array(['‘databytes consulting technologies pvt ltd. consulting karnataka india of karnataka amr tech park i. karnataka india ',
       '‘databytes consulting technologies pvt ltd. ', 'consulting ',
       'karnataka india '], dtype=object)

In [39]:
dfList = batchCreate(imagesFolderPath)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["text"][index] = data["text"][index].lower()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[n][index]=neighbours[n]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["cluster"][index]=1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["text"][index] = data["text"][index].lower()
A value is trying 

In [41]:
len(dfList)

53

In [42]:
resultDF = pd.concat(dfList)

In [43]:
resultDF.shape

(82, 31)

In [44]:
len(resultDF["imageName"].unique())

20

In [45]:
resultDF.drop(columns=["cluster"], inplace=True)

In [46]:
resultDF.reset_index(inplace=True)
resultDF.drop(columns=["index"], inplace=True)

In [47]:
resultDF.head()

Unnamed: 0,conf,text,x,y,PageWidth,PageHeight,seller,buyer,receiver,bill,...,owner,invoice,for,billing,store,information,receipt,recipient,imageName,output
0,94.571243,shankar shankar murthy shankar murthy,1364.6,642.1,1654,2339,0.0,0.0,0.0,0.0,...,0.0,0.190667,0.098689,0.017295,0.0,0.0,0.0,0.0,20.jpeg,0
1,96.840569,amazon,964.0,2143.0,1654,2339,0.316808,0.0,0.0,0.0,...,0.0,0.104751,0.031766,0.0,0.0,0.0,0.0,0.0,20.jpeg,0
2,91.489166,shankar,1320.5,337.5,1654,2339,0.0,0.0,0.0,0.0,...,0.0,0.0,0.098689,0.017295,0.0,0.0,0.0,0.0,20.jpeg,0
3,95.341763,shankar murthy shankar murthy,1375.625,718.25,1654,2339,0.0,0.0,0.0,0.0,...,0.0,0.190667,0.0,0.169185,0.0,0.0,0.0,0.0,20.jpeg,0
4,85.131195,ibibo ibibo group private limited gol,505.916667,1569.75,1700,2200,0.0,0.0,0.0,0.0,...,0.0,0.1985,0.167333,0.0,0.0,0.17164,0.0,0.0,39.jpeg,0


In [None]:
pwd

In [22]:
os.chdir('/home/aman/Documents/Tally/Git-Document-AI/Document-AI/SellerNameModel/Dataset/')

In [23]:
pwd

'/home/aman/Documents/Tally/Git-Document-AI/Document-AI/SellerNameModel/Dataset'

In [None]:
resultDF.to_csv("dataset.csv")