## Imports
*   Importing dependencies

*   Making sure TensorFlow doesn't hog all of the GPU memory




In [0]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from keras import backend as K
from keras.engine import Layer
from keras.models import load_model

# Create the TensorFlow session with allow_growth enabled so that TensorFlow
# does not claim the whole GPU, then register that session with Keras.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
K.set_session(sess)

## Defining the Elmo Embedding Layer to reload it in the saved model

In [0]:
class ElmoEmbeddingLayer(Layer):
    """Keras layer wrapping the TF-Hub ELMo v2 module.

    The class must be available when the saved model is loaded, because the
    model file only stores the layer's class name and config.
    """

    def __init__(self, **kwargs):
        # Both attributes are assigned before delegating to Layer.__init__,
        # which receives any remaining keyword arguments (name, trainable, ...).
        self.dimensions = 1024
        self.trainable = True
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Instantiate the hub module and register its trainable variables."""
        module_name = "{}_module".format(self.name)
        self.elmo = hub.Module('https://tfhub.dev/google/elmo/2',
                               trainable=self.trainable,
                               name=module_name)

        # Collect the variables created under this layer's module scope so that
        # Keras treats them as weights of the layer.
        scope_pattern = "^{}_module/.*".format(self.name)
        self.trainable_weights += K.tf.trainable_variables(scope=scope_pattern)
        super().build(input_shape)

    def call(self, x, mask=None):
        """Return the module's 'default' output for a batch of strings.

        The input is cast to a string tensor and its axis 1 is squeezed away
        before being passed to the module.
        """
        sentences = K.squeeze(K.cast(x, tf.string), axis=1)
        outputs = self.elmo(sentences, as_dict=True, signature='default')
        return outputs['default']

    def compute_mask(self, inputs, mask=None):
        """Mask out every input equal to the '--PAD--' placeholder."""
        return K.not_equal(inputs, '--PAD--')

    def compute_output_shape(self, input_shape):
        """Output shape is (batch size, self.dimensions)."""
        batch_size = input_shape[0]
        return (batch_size, self.dimensions)


## Loading the trained saved model

In [4]:
# custom_objects maps the class name stored in the .h5 file to the class
# itself; Keras rebuilds the layer from its saved config, so no instance of
# the layer has to be created here.
model=load_model("halfnonehalfsac.h5", custom_objects={'ElmoEmbeddingLayer': ElmoEmbeddingLayer})

W0712 12:18:09.679744 139887749801856 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0712 12:18:12.524860 139887749801856 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0712 12:18:12.639963 139887749801856 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W0712 12:18:12.647618 139887749801856 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:190: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead.

W0712 12:18:14.663913 13988774980

## Base imports and setting google credentials for performing OCR

In [0]:
import argparse
import io
import re
import os
from os import listdir
from os.path import isfile, join
import google.cloud
import uploadfiletovision as sendtovision

# Tell the Google Cloud client libraries which credentials file to use.
# The path is relative, so the key file is looked up in the working directory.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="vision_key.json"

## Entering the file name as 'filename.pdf' and uploading the file to Google Cloud Storage for OCR

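Replace the string in `mypath` with the path of the folder that contains the PDF, ending with a path separator. Keep one trailing space after it: the code drops the last character of the string.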

In [11]:
# Folder that contains the PDF. The literal ends with one extra space that is
# stripped on the next line -- presumably so that a path ending in a backslash
# can be pasted into a raw string (a raw string cannot end with a backslash).
mypath=r' '
mypath=mypath[0:-1]
print("Enter the File Name in format 'filename.pdf'")
# Stray whitespace around the typed name would otherwise end up in the local
# path, the bucket object name and the gs:// URI built from `name` later on.
name=input().strip()
# os.path.join adds a separator only when mypath does not already end with
# one, and returns just the file name when mypath is empty.
file_path=os.path.join(mypath,name)
# Object name under which the file is stored in the bucket.
file_name='uploads/'+name

Enter the File Name in format 'filename.pdf'
uploads_220114508.pdf


**Uploading the PDF to the project's storage bucket**

In [12]:
# Upload the local file at file_path to the project bucket under the object
# name file_name (uploads/<name>).
sendtovision.upload_blob('actionboard-219211-vcm',file_path,file_name)

File uploads_220114508.pdf uploaded to uploads/uploads_220114508.pdf.


## Performing OCR
* API request sent to Google Cloud Vision OCR which returns a json string

* API used:
https://cloud.google.com/vision/docs/pdf

In [14]:
!pip install google-cloud-vision

Collecting google-cloud-vision
[?25l  Downloading https://files.pythonhosted.org/packages/82/07/c6eb18a9bfaf5383424f17d1b3ca0fcc0cd12fd9676460b179d3c4821da6/google_cloud_vision-0.38.0-py2.py3-none-any.whl (413kB)
[K     |████████████████████████████████| 419kB 5.0MB/s 
Installing collected packages: google-cloud-vision
Successfully installed google-cloud-vision-0.38.0


In [15]:
import detect as det
# gs:// URI of the PDF uploaded above; `name` is the file name typed by the user.
path='gs://actionboard-219211-vcm/uploads/'+name
# Run the OCR helper on the PDF. The second argument is the bucket prefix for
# the result files; the returned JSON string is parsed in the next section.
json_string=det.async_detect_document(path, r'gs://actionboard-219211-vcm/dest/')

Waiting for the operation to finish.
Output files:
dest/
dest/output-1-to-2.json
Full text:
Tax Invoice
MA'ALAKSHMI PAPER WORKS
31/868B. PONNURUNNI
VYTTILA
GSTIN/UIN: 32AXQPM8931K1ZW
State Name : Kerala, Code : 32
Invoice No.
1402
Challan No
DN-132
Supplier's Ref.
Dated
13-Feb-2019
Mode/Terms of Payment
Other Reference
Order No.
Dated
Despatch Doc No
Consignee
MATRIMONY.COM LTD.
3RD. FLOOR, JOS ANEX BUILDING.
JOS JUNCTION
KOCHI
Kerala, Code : 32
GSTIN/UIN 32AADCM0845M1ZI
State Name : Kerala, Code : 32
Dated
13-Feb-2019
Destination
Despatch Through
Terms of Delivery
Description of Goods
HSN/SAC
Quantity
Rate
per
Amount
No.
9989
5 Nos
1,722.96 Nos
8,614.80
9989
10 Nos
1,722.96 Nos
|
17,229.60
PRINTED PHOTO BOOK
Printing &Reproduction
ODR-1477(A)
SHEET-95
5 ALBUMS
2 PRINTED PHOTO BOOK
Printing & Reproduction
ODR-1477(B)
SHEET-190
10 ALBUMS
3 PRINTED PHOTO BOOK
Printing&Reproduction
ODR-1940(A)
SHEET-95
5 ALBUMS
9989
5 Nos
1,722.96 Nos
8,614.80
continued...
This is a Computer Generated Inv

## Extracting Text Blocks from the received JSON file:

In [0]:
from google.cloud import vision
from google.cloud import storage
from google.protobuf import json_format
final=[]
source=[]

# Deserialize the OCR result into an AnnotateFileResponse message.
response = json_format.Parse(
        json_string, vision.types.AnnotateFileResponse())

# One list of non-empty text lines per page-level response in the file.
x = [
    [line for line in page_response.full_text_annotation.text.split("\n") if line != '']
    for page_response in response.responses
]

# Flatten the pages into `final`; `source` records the originating file name
# once per extracted line so both lists stay index-aligned.
for page_lines in x:
    final.extend(page_lines)
    source.extend([name] * len(page_lines))

## Prediction and Preliminary Post-Processing
* Parsing the block data to extract GST numbers and eliminating possible OCR errors in GST

In [17]:
Tot_in_words=[]
Amounts=[]
Amounts_conf=[]
Amounts_name=[]
Vendor_gst=[]
Org_gst=[]
cin=[]

# Class names, in the order of the scores returned by model.predict.
LABEL_NAMES = ['label_address', 'label_amount', 'label_bill_period',
               'label_cin', 'label_date', 'label_description', 'label_email',
               'label_intents_OrgGST', 'label_intents_VendorGST',
               'label_none', 'label_pan', 'label_ph_no', 'label_quantity', 'label_sac',
               'label_tax_amount', 'label_tax_percent', 'label_tot_in_words',
               'label_total_amount', 'label_vendor_name']
AMOUNT_LABELS = ('label_amount', 'label_tax_amount', 'label_total_amount')
GST_LABELS = ('label_intents_VendorGST', 'label_intents_OrgGST')


def _fix_gst_ocr_errors(candidate):
    """Repair the two OCR confusions handled for a 15-character GST candidate.

    * second-to-last character read as '2' or '%' is replaced by 'Z'
    * third-to-last character read as 'I' or 'i' is replaced by '1'

    All other characters (and their case) are returned unchanged.
    NOTE(review): presumably these positions are fixed by the GSTIN format --
    confirm against the specification.
    """
    if candidate[-2] in ('2', '%'):
        candidate = candidate[:-2] + 'Z' + candidate[-1:]
    if candidate[-3] in ('I', 'i'):
        candidate = candidate[:-3] + '1' + candidate[-2:]
    return candidate


def _best_label(scores, allowed=None):
    """Return (label, score) of the highest positive score.

    When `allowed` is given, only labels contained in it are considered.
    Returns (None, 0) if no considered label has a score above zero, so a
    label from a previous block can never be reused by accident.
    """
    best_label, best_score = None, 0
    for score, label in zip(scores, LABEL_NAMES):
        if allowed is not None and label not in allowed:
            continue
        if score > best_score:
            best_label, best_score = label, score
    return best_label, best_score


for block_text in final:
    # A run of 21 letters/digits that is not purely alphabetic is stored as a
    # CIN, and the block is not classified any further.
    cin_match = re.search(r"[a-zA-Z0-9%]{21}", block_text)
    if cin_match is None or cin_match.group().isalpha():
        print('not cin')
    else:
        cin.append(cin_match.group())
        continue

    gst_match = re.search(r"[a-zA-Z0-9%]{15}", block_text)
    if gst_match is None or gst_match.group().isalpha():
        # Ordinary text block: pad colons with spaces, lowercase, classify.
        text = block_text.replace(":", " : ").lower()
        model_input = np.array([text])
        print(model_input)
        label, conf = _best_label(model.predict(model_input)[0])
        print(label)
        if label in AMOUNT_LABELS:
            Amounts.append(text)
            Amounts_name.append(label)
            Amounts_conf.append(conf)
        elif label == 'label_tot_in_words':
            Tot_in_words.append(text)
    else:
        # GST candidate: repair OCR errors once, classify the lowercased value
        # as vendor or organisation GST, and store the repaired value.
        gst_number = _fix_gst_ocr_errors(gst_match.group())
        model_input = np.array([gst_number.lower()])
        print(model_input)
        label, conf = _best_label(model.predict(model_input)[0], allowed=GST_LABELS)
        print(label)
        if label == 'label_intents_VendorGST':
            Vendor_gst.append(gst_number)
        elif label == 'label_intents_OrgGST':
            Org_gst.append(gst_number)

not cin
['tax invoice']
label_none
not cin
["ma'alakshmi paper works"]
label_vendor_name
not cin
['31/868b. ponnurunni']
label_address
not cin
['vyttila']
label_address
not cin
['32axqpm8931k1zw']
label_intents_VendorGST
not cin
['state name  :  kerala, code  :  32']
label_none
not cin
['invoice no.']
label_none
not cin
['1402']
label_total_amount
not cin
['challan no']
label_none
not cin
['dn-132']
label_date
not cin
["supplier's ref."]
label_none
not cin
['dated']
label_none
not cin
['13-feb-2019']
label_date
not cin
['mode/terms of payment']
label_none
not cin
['other reference']
label_none
not cin
['order no.']
label_none
not cin
['dated']
label_none
not cin
['despatch doc no']
label_none
not cin
['consignee']
label_none
not cin
['matrimony.com ltd.']
label_vendor_name
not cin
['3rd. floor, jos anex building.']
label_address
not cin
['jos junction']
label_none
not cin
['kochi']
label_address
not cin
['kerala, code  :  32']
label_none
not cin
['32aadcm0845m1zi']
label_intents_OrgGST

## Dictionaries for extracting numbers from words (one to 1):
* Loading a dictionary from a pickle file which contains the numbers in words and their corresponding numerical values (from 0 to 99,99,999)
* Loading all possible numerical words to transform misspelt words(OCR/Human errors in invoices) to the correct representation

In [26]:
from google.colab import drive
# Mount Google Drive at /content/drive; the pickle files loaded in the next
# cell are read from a folder under this mount point.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [31]:
import pickle
import difflib

# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickle files that come from a trusted source.
DATA_PATH = "/content/drive/My Drive/Data"

# Mapping from a number written in words to its numeric value.
with open(DATA_PATH+'/words2num.pickle','rb') as infile:
    magic = pickle.load(infile)

# Vocabulary of number words used to correct misspelt words.
# NOTE(review): this file is read from the working directory while the
# dictionary above is read from Drive -- confirm that this is intended.
with open('unique_words_in_total.pickle', 'rb') as handle:
    test= pickle.load(handle)
print(test)
#Sample Usage of word2num and difflib
print(magic[difflib.get_close_matches('fibe', test)[0]])

{'fifty', 'thirteen', 'fifteen', 'seventeen', 'eighty', 'eleven', 'hundred', 'eight', 'ninety', 'fourteen', 'forty', 'ten', 'sixteen', 'nineteen', 'two', 'seven', 'three', 'one', 'eighteen', 'nine', 'thirty', 'five', 'twelve', 'lakh', 'four', 'zero', 'seventy', 'twenty', 'thousand', 'six', 'sixty'}
5


## Transforming the tot_in_words to a numerical form

In [33]:
# Normalise the spelling variants of "lakh" in place so that the phrases match
# the vocabulary of the lookup dictionary.
for idx, phrase in enumerate(Tot_in_words):
    phrase = phrase.lower()
    phrase = re.sub("lakhs","lakh",phrase)
    phrase = re.sub("lacs","lakh",phrase)
    Tot_in_words[idx] = phrase

# A phrase starting with one of these words gets an explicit leading "one".
LEADING_MULTIPLIERS = ('hundred', 'thousand', 'lakh')

cleaned=[]
for phrase in Tot_in_words:
    # Drop the "... and <n> paise" part; only the whole amount is converted.
    phrase = re.sub(' and.*?paise','',phrase)
    tokens = phrase.split()
    number_words = []
    # `tokens` can be empty (blank phrase), so test it before indexing.
    if tokens and tokens[0] in LEADING_MULTIPLIERS:
        number_words.append('one')
    for token in tokens:
        # Replace each word by its closest vocabulary word; words without a
        # close match are not number words and are dropped.
        matches = difflib.get_close_matches(token.lower(), test)
        if matches:
            number_words.append(matches[0])
    cleaned.append(' '.join(number_words))

total_from_words=[]
for phrase in cleaned:
    # Blocks wrongly labelled as "total in words" produce phrases that are not
    # in the dictionary; report and skip them instead of aborting the cell.
    if phrase not in magic:
        print("could not convert to a number:", repr(phrase))
        continue
    total_from_words.append(magic[phrase])
    print(phrase,total_from_words[-1])

forty eight thousand two hundred forty three 48243
five thousand one hundred sixty eight 5168


## Logic to find out tax amount from the list of amounts in an invoice:

In [0]:
# Amounts
# Module-level views of the last find_tax() call: every positive number that
# was parsed (sorted) and every pair sum that matched a parsed number.
values=[]
tax=[]
def find_tax(Amounts, amount_labels=None):
    """Estimate the tax amount of an invoice from its amount text blocks.

    Each block is split on whitespace; commas, letters and the characters
    ``+*?^$()[]{}|`` are removed from every token, and tokens that then parse
    as a positive float are collected.

    The estimate is the sum of two collected numbers (a number may be paired
    with itself) that is itself one of the collected numbers; if several sums
    qualify, the one with the most matches wins and ties go to the sum found
    first. Sums are compared after rounding to two decimals, so binary
    floating-point error (0.1 + 0.7 != 0.8) does not hide a match.

    If no pair sum matches, twice the largest number found in blocks labelled
    'label_tax_amount' is returned (0 when there is none).

    Parameters
    ----------
    Amounts : list of str
        Text blocks that were classified as amounts.
    amount_labels : list of str, optional
        Label of each block in `Amounts`, index-aligned. Defaults to the
        module-level `Amounts_name` list.

    Returns
    -------
    float or int
        The estimated tax amount.

    Notes
    -----
    The module-level lists `values` and `tax` are cleared and refilled on
    every call, so calling the function again gives the same result.
    """
    from collections import Counter

    labels = Amounts_name if amount_labels is None else amount_labels

    # Start from a clean slate; leftovers from an earlier call would be mixed
    # into the pair search below.
    values.clear()
    tax.clear()

    highest_tax = 0
    for idx, block in enumerate(Amounts):
        is_tax_block = idx < len(labels) and labels[idx] == 'label_tax_amount'
        for token in block.split():
            token = token.replace(",", "")
            token = re.sub(r'[A-Za-z\+\*\?\^\$\(\)\[\]\{\}\|]', '', token)
            try:
                value = float(token)
            except ValueError:
                continue
            if value > 0:
                values.append(value)
                if is_tax_block and value > highest_tax:
                    highest_tax = value

    values.sort()

    # How often each amount (rounded to two decimals) occurs, so that every
    # pair sum can be checked with one lookup instead of a scan.
    occurrences = Counter(round(value, 2) for value in values)
    for first in range(len(values)):
        for second in range(first, len(values)):
            pair_sum = round(values[first] + values[second], 2)
            hits = occurrences.get(pair_sum, 0)
            if hits:
                tax.extend([pair_sum] * hits)

    if tax:
        print("the right way")
        return Counter(tax).most_common(1)[0][0]
    # Fallback: double the largest tax-labelled amount.
    return highest_tax * 2

In [37]:
# Estimate the tax amount from the amount blocks collected during prediction.
find_tax(Amounts)

the right way


5168.9