In [1]:
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import spacy
import json
import io
from spacy.util import minibatch, compounding
import pandas as pd
import re

In [2]:
# Entity labels for the credit-SMS NER model; main() registers each of them
# on the entity recognizer before training.
LABELS = [
    'CREDIT AMOUNT',
    'DATE OF TRANSACTION',
    'TIME OF TRANSACTION',
    'REFERENCE NO',
    'ACCOUNT NO',
    'TOTAL BALANCE',
    'CARD NO',
    'LEDGER BALANCE',
    'BENEFICIARY NAME',
    'BENEFICIARY ACCOUNT',
    'AVAILABLE CREDIT LIMIT',
    'COMBINED BALANCE',
    'CHEQUE NO',
    'CLEARING BALANCE',
    'VENDOR LIST',
    'ACCOUNT BALANCE',
    'TOTAL CREDIT LIMIT',
    'TXN INSTRUMENT',
]

# Annotated training examples, parsed from a UTF-8 JSON file.
# The training loop unpacks every item as a (text, annotations) pair.
with io.open('C:/Users/Simrandeep/Desktop/train_credit.json', encoding='utf8') as train_file:
    TRAIN_DATA = json.load(train_file)

In [3]:
@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    new_model_name=("New model name for model meta.", "option", "nm", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int))
def main(model=None, new_model_name='model', output_dir='C:\\Users\\Simrandeep\\Desktop\\Think\\ner output\\credit', n_iter=50):
    """Set up the pipeline and entity recognizer, and train the new entities.

    Args:
        model: Name or path passed to ``spacy.load``. When ``None`` a blank
            'en' pipeline is created instead.
        new_model_name: Value written to ``nlp.meta['name']`` before saving.
        output_dir: Directory the trained pipeline is written to. Pass
            ``None`` to skip saving (and the reload check that follows it).
        n_iter: Number of passes over ``TRAIN_DATA``.

    Side effects:
        Prints the per-iteration losses and the entities found in a fixed
        sample sentence; writes the pipeline to ``output_dir`` when given.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # Add entity recognizer to model if it's not in the pipeline.
    # nlp.create_pipe works for built-ins that are registered with spaCy.
    ner_is_new = 'ner' not in nlp.pipe_names
    if ner_is_new:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    else:
        # otherwise, get it, so we can add labels to it
        ner = nlp.get_pipe('ner')

    for label in LABELS:
        ner.add_label(label)   # add new entity label to entity recognizer

    # Shuffle a private copy so the module-level TRAIN_DATA keeps its
    # original order; otherwise a second call would start from an already
    # shuffled list and the fixed seed would no longer reproduce the run.
    train_data = list(TRAIN_DATA)
    random.seed(7)

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        if ner_is_new:
            optimizer = nlp.begin_training()
        else:
            # begin_training() re-initializes the component's weights, which
            # would throw away what a pretrained 'ner' already learned.
            optimizer = ner.create_optimizer()
        for _ in range(n_iter):
            random.shuffle(train_data)
            losses = {}
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
                           losses=losses)
            print(losses)

    # test the trained model
    test_text = 'A/C 3XXXXX9551 CREDITED BY RS. 3,000 TOTAL BAL: RS. 28,962.90 CR CLR BAL: RS. 28,962.90 CR. VISIT BRANCH TO SEED AADHAAR IN ACCOUNT.'
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            # parents=True: the target may be nested under folders that do
            # not exist yet; a plain mkdir() raises in that case.
            output_dir.mkdir(parents=True)
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)

main()

Created blank 'en' model
{'ner': 109.01367341963395}
{'ner': 57.299971091566597}
{'ner': 41.909280748913901}
{'ner': 31.989531356043624}
{'ner': 22.167971010239324}
{'ner': 19.425541479587231}
{'ner': 18.961193648718826}
{'ner': 13.339550811505877}
{'ner': 11.964714546242172}
{'ner': 11.309812877033915}
{'ner': 10.606648838207791}
{'ner': 8.8650144158881083}
{'ner': 8.4943487168516665}
{'ner': 8.620483534566441}
{'ner': 6.1894131792900255}
{'ner': 5.8563125953802908}
{'ner': 4.347802495046837}
{'ner': 3.5083463049475827}
{'ner': 5.0337368429157072}
{'ner': 5.0904530521427045}
{'ner': 4.5510886070383281}
{'ner': 3.38002894346599}
{'ner': 4.0125662233818362}
{'ner': 2.3535674336171502}
{'ner': 3.9751723505218992}
{'ner': 3.0375255547565878}
{'ner': 2.5891446843406678}
{'ner': 3.4361951383980127}
{'ner': 3.4098874903313874}
{'ner': 3.5829313035788664}
{'ner': 2.0276037317844167}
{'ner': 2.6950135763340199}
{'ner': 2.5614088438974227}
{'ner': 1.9016992668504105}
{'ner': 2.5978016500463919}

In [4]:
# Held-out credit-alert messages used to eyeball the saved model's predictions.
test_text=['09093070000182@SYNB0000909.IFSC.NPCI LINKED TO A/C XXXXXXXX000182 CREDITED FOR RS.200.00 ON 15/JUL/2018 19:37:27 UPI REF NO 819657998822',
           'A/C 3XXXXX9551 CREDITED BY RS. 4,420 TOTAL BAL: RS. 25,759.10 CR CLR BAL: RS. 25,759.10 CR. VISIT BRANCH TO SEED AADHAAR IN ACCOUNT.',
           'AC XXXXXXXX00123995 CREDITED WITH RS.1.00, 21-07-2018 17:31:51 THRU UPI . AVAL BAL RS.283.00 CR. HELPLINE 18001802222',
           'AC XXXXXXXX00006635 CREDITED WITH RS.2.25, 19-03-2018 10:51:50. AVAL BAL RS.110.36 CR. HELPLINE 18001802222',
           'AN AMOUNT OF 7000 INR HAS BEEN CREDITED TO A/C NO XXXXXXX1007872 BY EFT / ATM CARD TRANSACTION. ON 07-JUL-18 19:17:52. NOW CLEAR BALANCE IS CREDIT INR 8102.75',
           'AN AMOUNT OF INR 1,350.00 HAS BEEN CREDITED TO YOUR ACCOUNT XXXX5798 ON 07/07/2018 .AVL BAL RS 7,193.00',
           'BOI - RS 128.82 HAS BEEN CREDITED IN YOUR ACCOUNT XXXX0354 THROUGH NACH ON 09-05-2018.AVAILABLE BALANCE 235.55',
           'BOI - RS 21382.00 CREDITED(TRF)SALARY JULY 2018 IN YOUR AC XX4176 ON 07-08-2018. .AVL BAL RS 21949.82. PLS LINK AADHAAR TO A/C, IF NOT LINKED.',
           'BOI - RS 5000 CREDITED IN YOUR AC XX1487 ON 15-06-2018. BY IMPS INWARD REF NO.816615714115 AVL BAL RS. 5043.75 PLS LINK AADHAAR TO A/C, IF NOT LINKED',
           'CASH DEPOSIT TRANSACTION OF RS. 3000 FOR ACCOUNT 1136XXXX7164 HAS BEEN COMPLETED ON 2018-06-29 VIDE TRANSACTION REFERENCE KCD663651477164.',
           'CHEQUE NO 12599 FOR RS.13000 IS CREDITED TO A/C ...2446 ON 07-06-18 AT SERVICE BRANCH, NEW DELHI. (CURRENT AVLBL BAL RS.28036.57 AT 07-06-18 17:41:00)',
           'DEAR CUSTOMER, ACCT XX0452 IS CREDITED WITH RS.10.00 ON 28-FEB-18 FROM GOOG-PAYMENT@OKAXIS. UPI REF NO 805921001666',
           'DEAR CUSTOMER, WE HAVE SUCCESSFULLY RECEIVED PAYMENT OF RS. 1522. PLEASE WAIT UPTO 24 HOURS FOR THE AMOUNT TO REFLECT IN YOUR LOAN',
           'RS. 1,376.55 IS CREDITED TO YOUR A/C XXXX9499 ON 29-MAY-18 ON ACCOUNT OF 700181435 .',
           'YOUR A/C XXXXX211927 HAS A CREDIT BY TRANSFER OF RS 4,000.00 ON 02/05/18. AVL BAL RS 4,772.62. DOWNLOAD YONO @ WWW.YONOSBI.COM',
           'UR TRANSACTION ON HDFC BANK DEBIT/ATM CARD ENDING 3328 FOR RS. 1530.00 HAS BEEN CREDITED/REVERSED BY IRCTC12027 ON 2018-04-16:16:01:41'
          ]

# Directory the trained pipeline was written to by main().
output_dir= 'C:\\Users\\Simrandeep\\Desktop\\Think\\ner output\\credit'

# test the saved model
print("Loading from", output_dir)
# Load the pipeline once, outside the loop: nothing changes on disk between
# samples, so reloading it for every text only repeats the same disk read.
nlp2 = spacy.load(output_dir)
for text in test_text:
    print("")
    print("Entities in '%s'" % text)
    doc2 = nlp2(text)
    # One line per predicted entity: label followed by the matched span.
    for ent in doc2.ents:
        print(ent.label_, ent.text)

Loading from C:\Users\Simrandeep\Desktop\Think\ner output\credit

Entities in '09093070000182@SYNB0000909.IFSC.NPCI LINKED TO A/C XXXXXXXX000182 CREDITED FOR RS.200.00 ON 15/JUL/2018 19:37:27 UPI REF NO 819657998822'
ACCOUNT NO XXXXXXXX000182
CREDIT AMOUNT RS.200.00
DATE OF TRANSACTION 15/JUL/2018
TIME OF TRANSACTION 19:37:27
TXN INSTRUMENT UPI
REFERENCE NO 819657998822

Entities in 'A/C 3XXXXX9551 CREDITED BY RS. 4,420 TOTAL BAL: RS. 25,759.10 CR CLR BAL: RS. 25,759.10 CR. VISIT BRANCH TO SEED AADHAAR IN ACCOUNT.'
ACCOUNT NO 3XXXXX9551
CREDIT AMOUNT RS. 4,420
TOTAL BALANCE RS. 25,759.10
CLEARING BALANCE RS. 25,759.10

Entities in 'AC XXXXXXXX00123995 CREDITED WITH RS.1.00, 21-07-2018 17:31:51 THRU UPI . AVAL BAL RS.283.00 CR. HELPLINE 18001802222'
ACCOUNT NO XXXXXXXX00123995
CREDIT AMOUNT RS.1.00
DATE OF TRANSACTION 21-07-2018
TIME OF TRANSACTION 17:31:51
TXN INSTRUMENT UPI
CLEARING BALANCE RS.283.00

Entities in 'AC XXXXXXXX00006635 CREDITED WITH RS.2.25, 19-03-2018 10:51:50. AVAL BA

In [5]:
# A single extra message to spot-check the saved model.
test_text=['AN AMOUNT OF INR 1,30,000.00 HAS BEEN CREDITED TO XXXXXXXXX1945 ON 25/06/2018 TOWARDS NEFT REF NO. SBIN518176003378.TOTAL AVAIL.BAL INR 1,39,173.10']

# Directory the trained pipeline was written to by main().
output_dir= 'C:\\Users\\Simrandeep\\Desktop\\Think\\ner output\\credit'

# test the saved model
print("Loading from", output_dir)
# Load the pipeline once, outside the loop, so that adding more samples to
# test_text does not trigger one disk load per sample.
nlp2 = spacy.load(output_dir)
for text in test_text:
    print("")
    print("Entities in '%s'" % text)
    doc2 = nlp2(text)
    # One line per predicted entity: label followed by the matched span.
    for ent in doc2.ents:
        print(ent.label_, ent.text)

Loading from C:\Users\Simrandeep\Desktop\Think\ner output\credit

Entities in 'AN AMOUNT OF INR 1,30,000.00 HAS BEEN CREDITED TO XXXXXXXXX1945 ON 25/06/2018 TOWARDS NEFT REF NO. SBIN518176003378.TOTAL AVAIL.BAL INR 1,39,173.10'
CREDIT AMOUNT INR 1,30,000.00
ACCOUNT NO XXXXXXXXX1945
DATE OF TRANSACTION 25/06/2018
TXN INSTRUMENT NEFT
TOTAL BALANCE INR 1,39,173.10
