In [1]:
import pandas as pd
import numpy as np
import os
train_data= pd.read_csv("trainingdata.csv",encoding="unicode escape")
#test_data=pd.read_csv("testdata.csv",encoding="unicode escape")


In [2]:
train_data

Unnamed: 0,Tag,Text
0,bank statement,page 10f5 03/02/2022 dc 1090001004230 gs anyco...
1,invoice,"anycompany retail services date mar26,2018_ ll..."
2,invoice,anycompany llc 7688 florencio causeway millsbu...
3,bank statement,"page 1of5 03/02/2022 dc 1090001004290, ge anyc..."
4,bank statement,page 10f5 03/02/2022 dc 1090001004290 . 999999...
...,...,...
342,invoice,"anycompany dealers date jun 20, 2018 85899 her..."
343,business proof,nagaswara hhusie publishing surat keterangan k...
344,invoice,"anycompany hardware date jan 20, 2020 s s invo..."
345,address proof,affidavit of residence . application implement...


In [4]:
from sklearn.preprocessing import OneHotEncoder as OHE

y_encoder= OHE().fit(np.array(train_data.Tag).reshape(-1,1))

In [6]:
ytr_encoded= y_encoder.transform(np.array(train_data.Tag).reshape(-1,1)).toarray()
#yts_encoded= y_encoder.transform(np.array(tags).reshape(-1,1)).toarray()

In [7]:
import nltk
from nltk import word_tokenize
nltk.download('punkt')
train_data["tokenized"]= train_data.Text.map(word_tokenize)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SJain\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
from nltk.corpus import stopwords
from string import punctuation
nltk.download('stopwords')
def remove_stop(strings, stop_list):
    classed= [s for s in strings if s not in stop_list]
    return classed

stop= stopwords.words("english")
stop_punc= list(set(punctuation))+ stop

train_data["selected"]= train_data.tokenized.map(lambda df: remove_stop(df, stop_punc))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SJain\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
from nltk.stem import PorterStemmer

def normalize(text):
    return " ".join(text)

stemmer= PorterStemmer()

train_data["stemmed"]= train_data.selected.map(lambda xs: [stemmer.stem(x) for x in xs])
train_data["normalized"]= train_data.stemmed.apply(normalize)


In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer= Tokenizer(num_words= 10000)
tokenizer.fit_on_texts(train_data.normalized)

tokenized_train= tokenizer.texts_to_sequences(train_data.normalized)

In [12]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

train_padded= pad_sequences(tokenized_train, maxlen= 20, padding= "pre")


In [13]:
def transform_x(data, tokenizer):
    output_shape= [data.shape[0],
                  data.shape[1],
                  tokenizer.word_index.keys().__len__()]
    results= np.zeros(output_shape)
    
    for i in range(data.shape[0]):
        for ii in range(data.shape[1]):
            results[i, ii, data[i,ii]-1]= 1
    return results

xtr_transformed= transform_x(train_padded, tokenizer)

In [14]:
from tensorflow.keras.layers import Dense, LSTM, BatchNormalization, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy as CC
from tensorflow.keras.activations import relu, softmax
from tensorflow.keras.initializers import he_uniform, glorot_uniform
from tensorflow.keras.metrics import AUC
from tensorflow.keras import Model
from tensorflow.keras.regularizers import l2
import tensorflow as tf

#@tf.function
class LSTMModel(object):
    
    #@tf.function
    def build_model(self, input_dim, output_shape, steps, dropout_rate, kernel_regularizer, bias_regularizer):
        input_layer= Input(shape= (steps, input_dim))
        
        #make lstm_layer
        lstm= LSTM(units= steps)(input_layer)
        dense_1= Dense(output_shape, kernel_initializer= he_uniform(),
                       bias_initializer= "zeros", 
                       kernel_regularizer= l2(l= kernel_regularizer),
                       bias_regularizer= l2(l= bias_regularizer))(lstm)
        x= BatchNormalization()(dense_1)
        x= relu(x)
        x= Dropout(rate= dropout_rate)(x)
        o= Dense(output_shape, kernel_initializer= glorot_uniform(),
                 bias_initializer= "zeros", 
                 kernel_regularizer= l2(l= kernel_regularizer), 
                 bias_regularizer= l2(l= bias_regularizer))(dense_1)
        o= BatchNormalization()(o)
        output= softmax(o, axis= 1)
        
        loss= CC()
        metrics= AUC()
        optimizer= Adam()
        self.model= Model(inputs= [input_layer], outputs= [output])
        self.model.compile(optimizer= optimizer, loss= loss, metrics= [metrics])
        
        
    def train(self, x, y, validation_split, epochs):
        self.model.fit(x, y, validation_split= validation_split, epochs= epochs)
        
    def predict(self, x):
        return self.model.predict(x)

In [15]:
steps= xtr_transformed.shape[1]
dim= xtr_transformed.shape[2]
output_shape= ytr_encoded.shape[1]

model= LSTMModel()
model.build_model(input_dim= dim,
                  output_shape= output_shape,
                  steps= steps, 
                  dropout_rate= 0.5, 
                  bias_regularizer= 0.3, 
                  kernel_regularizer= 0.3)

In [16]:
model.train(xtr_transformed, ytr_encoded,
           0.2, 120)

Epoch 1/120
Epoch 2/120
Epoch 3/120
Epoch 4/120
Epoch 5/120
Epoch 6/120
Epoch 7/120
Epoch 8/120
Epoch 9/120
Epoch 10/120
Epoch 11/120
Epoch 12/120
Epoch 13/120
Epoch 14/120
Epoch 15/120
Epoch 16/120
Epoch 17/120
Epoch 18/120
Epoch 19/120
Epoch 20/120
Epoch 21/120
Epoch 22/120
Epoch 23/120
Epoch 24/120
Epoch 25/120
Epoch 26/120
Epoch 27/120
Epoch 28/120
Epoch 29/120
Epoch 30/120
Epoch 31/120
Epoch 32/120
Epoch 33/120
Epoch 34/120
Epoch 35/120
Epoch 36/120
Epoch 37/120
Epoch 38/120
Epoch 39/120
Epoch 40/120
Epoch 41/120
Epoch 42/120
Epoch 43/120
Epoch 44/120
Epoch 45/120
Epoch 46/120
Epoch 47/120
Epoch 48/120
Epoch 49/120
Epoch 50/120
Epoch 51/120
Epoch 52/120
Epoch 53/120
Epoch 54/120
Epoch 55/120
Epoch 56/120
Epoch 57/120
Epoch 58/120
Epoch 59/120
Epoch 60/120
Epoch 61/120
Epoch 62/120
Epoch 63/120
Epoch 64/120
Epoch 65/120
Epoch 66/120
Epoch 67/120
Epoch 68/120
Epoch 69/120
Epoch 70/120
Epoch 71/120
Epoch 72/120
Epoch 73/120
Epoch 74/120
Epoch 75/120
Epoch 76/120
Epoch 77/120
Epoch 78

In [17]:
from sklearn.metrics import classification_report

prediction= y_encoder.inverse_transform(model.predict(xtr_transformed))
#print(classification_report(train_data.tag, prediction))



In [18]:
prediction

array([['bank statement'],
       ['invoice'],
       ['invoice'],
       ['bank statement'],
       ['bank statement'],
       ['invoice'],
       ['receipt'],
       ['invoice'],
       ['salary slip'],
       ['employment proof'],
       ['bank statement'],
       ['bank statement'],
       ['receipt'],
       ['receipt'],
       ['receipt'],
       ['invoice'],
       ['bank statement'],
       ['bank statement'],
       ['receipt'],
       ['receipt'],
       ['receipt'],
       ['invoice'],
       ['address proof'],
       ['receipt'],
       ['invoice'],
       ['bank statement'],
       ['invoice'],
       ['invoice'],
       ['receipt'],
       ['address proof'],
       ['invoice'],
       ['invoice'],
       ['bank statement'],
       ['employment proof'],
       ['bank statement'],
       ['receipt'],
       ['bank statement'],
       ['receipt'],
       ['receipt'],
       ['bank statement'],
       ['receipt'],
       ['receipt'],
       ['receipt'],
       ['address proof

In [23]:
from PIL import Image
from pytesseract import pytesseract
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.luhn import LuhnSummarizer
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import pandas as pd
import numpy as np

def receipt1(image_path):
    path_to_tesseract = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
#image_path = r"bank1.png"
    #image_path="C:\Users\SJain\Documents\samples\bank statements\"
# Opening the image & storing it in an image object
    img = Image.open(image_path)

# Providing the tesseract executable
# location to pytesseract library
    pytesseract.tesseract_cmd = path_to_tesseract

# Passing the image object to image_to_string() function
# This function will extract the text from the image
    text = pytesseract.image_to_string(img)
    text=text.replace("\n\n"," ").replace("[","").replace("-","").replace(".—","").replace("|","").replace(":","")
    text=text.replace("|","").replace("§","").replace("@","").replace("¥","").replace("©","")
    text=text.replace("«","")
    text=text.replace("\n"," ")
    text=text.replace(">","")
# Displaying the extracted text
    #print(text[:-1])
    
    l=[]
    text_tokens = word_tokenize(text)
    tokens_without_sw = [word for word in text_tokens if not word in stopwords.words()]
    #print(tokens_without_sw)
    l.append(tokens_without_sw)
    l2=[]
    for i in l:
        r=TreebankWordDetokenizer().detokenize(i)
        l2.append(r)
    lst=[]
    ps = PorterStemmer()
    for i in l2:
    #text_tokens = word_tokenize(i)
    #tokens_without_sw = [word for word in text_tokens if not word in stopwords.words()]
        res=ps.stem(i)
   # res=res.replace(" | "," ")
    #res = i.translate(remove_digits)
        lst.append(res)
    df1 = {
     'Text' :lst
    }
 
    df1 = pd.DataFrame(df1,columns=['Text'])
    #print(df1)
    return df1

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SJain\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [24]:
re1=receipt1("receipt_99.png")

In [25]:
test = re1['Text']
test

0    the aiml store 1234 somewhere . rd poway, cali...
Name: Text, dtype: object

In [30]:
re1.to_csv("testsample.csv")

In [34]:
def build1():
    #text=['NSF Checks','Supply Inventory','PayPal Bank','Fixed Assets','Proshop Inventory','TD-operations 9798','Emp W/H-Medical','Rent - 55+ Housing']
    #text=['Emp W/H-Medical','Rent - 55+ Housing']
    #text1=['Laptop']
    text=['the aiml store 1234 somewhere . rd poway, california 92129 02/13/2022 1228 am tances eea belkin soreenforce tempered; oy $35.00 blas screen oc . 4 hogwarts wooden steamer trek 1 sar; sbt sed . t = ca tax 7 . 7800 161 75 1; total 8161 . 745 002 visa char s161 . 76 oe aid ag000407307 162 visa debit oauth code 07162 some promotions may . reduce the "refund valle of items appic peacoat aindiinchpancbacdat pobsece we feedback! take quick survey enter chance â€œ to hin $260 store gift card . go â€™ "myfakestore . com/win â€œ winners monthiy * (rce darori rocco aoi ata ioick cashier jot doe store 123 pos 003 transact ion 2629 sunday, february 12, 2022 1228 am']
    data = [name.lower() for name in text]
    tokenized_test=tokenizer.texts_to_sequences(data)
    test_padded= pad_sequences(tokenized_test, maxlen= 20, padding= "pre")
    xts_transformed=transform_x(test_padded,tokenizer)
    prediction_test1= y_encoder.inverse_transform(model.predict(xts_transformed))
    
   # prediction_test2= y_encoder.inverse_transform(model.predict_(xts_transformed))
    print(prediction_test1)
    #print(prediction_test2)

In [35]:
build1()



AttributeError: 'LSTMModel' object has no attribute 'predict_proba'

In [37]:
print(classification_report(train_data.Tag, prediction))

                  precision    recall  f1-score   support

   address proof       1.00      0.80      0.89        10
  bank statement       0.99      0.98      0.99       105
  business proof       1.00      0.80      0.89         5
employment proof       1.00      0.86      0.92         7
    fund raising       1.00      1.00      1.00         2
         invoice       1.00      1.00      1.00        98
         receipt       1.00      1.00      1.00       102
     salary slip       0.59      1.00      0.74        10
      tax return       1.00      0.75      0.86         8

        accuracy                           0.98       347
       macro avg       0.95      0.91      0.92       347
    weighted avg       0.99      0.98      0.98       347

