In [1]:
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import spacy
import json
import io
from spacy.util import minibatch, compounding
import pandas as pd
import re

In [2]:
# Entity types the NER model is trained to recognise in debit SMS messages.
# Each label is later registered on the entity recognizer via ner.add_label().
LABELS = [
    'DEBIT AMOUNT',
    'DATE OF TRANSACTION',
    'TIME OF TRANSACTION',
    'REFERENCE NO',
    'ACCOUNT NO',
    'TOTAL BALANCE',
    'CARD NO',
    'LEDGER BALANCE',
    'AVAILABLE CREDIT LIMIT',
    'BENEFICIARY NAME',
    'COMBINED BALANCE',
    'CHEQUE NO',
    'CLEARING BALANCE',
    'VENDOR LIST',
    'ACCOUNT BALANCE',
    'TOTAL CREDIT LIMIT',
    'TXN INSTRUMENT',
    'BENEFICIARY ACCOUNT',
    'CURRENT OUTSTANDING',
]

# Read the annotated training examples (UTF-8 encoded JSON) into memory.
# NOTE(review): the training loop unpacks each item as (text, annotations);
# confirm the JSON file is a list of such pairs.
with io.open(
    'C:\\Users\\Simrandeep\\Desktop\\train_debit.json',
    encoding='utf8',
) as f:
    TRAIN_DATA = json.load(f)

In [3]:
@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    new_model_name=("New model name for model meta.", "option", "nm", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int))
def main(model=None, new_model_name='model', output_dir='C:\\Users\\Simrandeep\\Desktop\\Think\\ner output\\debit', n_iter=50):
    """Train a spaCy entity recognizer on TRAIN_DATA for every label in LABELS.

    Args:
        model: Name or path of an existing spaCy model to continue training.
            When None, a blank 'en' pipeline is created.
        new_model_name: Value stored in ``nlp.meta['name']`` before saving.
        output_dir: Directory the trained pipeline is written to. Pass None
            to skip saving (and the reload check that follows it).
        n_iter: Number of passes over the training data.

    Side effects:
        Prints the per-iteration losses and the entities found in a sample
        message, writes the model to ``output_dir`` and reloads it from there
        to confirm the saved copy produces entities as well.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    ner_is_new = 'ner' not in nlp.pipe_names
    if ner_is_new:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    for label in LABELS:
        ner.add_label(label)   # add new entity label to entity recognizer

    # Shuffle a private copy so the module-level TRAIN_DATA keeps its original
    # order; together with the fixed seed this makes re-running the cell
    # reproduce the same batches instead of starting from a shuffled list.
    random.seed(7)
    train_data = list(TRAIN_DATA)

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        if model is None or ner_is_new:
            # Fresh weights are required for a blank / newly added NER pipe.
            optimizer = nlp.begin_training()
        else:
            # begin_training() re-initialises the model and would wipe the
            # entity types the loaded model already knows, so only create an
            # optimizer when continuing training of an existing NER pipe.
            optimizer = ner.create_optimizer()
        for itn in range(n_iter):
            random.shuffle(train_data)
            losses = {}
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
                           losses=losses)
            print(losses)

    # test the trained model
    test_text = 'DEAR SBI UPI USER, YOUR ACCOUNT IS DEBITED INR 1444.0 ON DATE 2018-05-19 04:20:21 PM BY UPI REF NO 813916610352.'
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            # parents=True: the default path is several levels deep and the
            # intermediate folders may not exist yet.
            output_dir.mkdir(parents=True)
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)

# Called directly (not through plac.call) so the notebook kernel's own
# command-line arguments are never parsed; the defaults above apply.
main()

Created blank 'en' model
{'ner': 216.90629269221313}
{'ner': 109.45352817916894}
{'ner': 78.419865913078496}
{'ner': 63.210563657110768}
{'ner': 52.258344089209793}
{'ner': 42.44354678841691}
{'ner': 38.324832887251532}
{'ner': 30.823337498877148}
{'ner': 27.225512379928009}
{'ner': 22.839966388407102}
{'ner': 21.590503314773812}
{'ner': 21.766772920194221}
{'ner': 16.895602929831362}
{'ner': 14.863990518194317}
{'ner': 15.913148859965519}
{'ner': 14.586008178955014}
{'ner': 13.939061192643061}
{'ner': 11.328613838895578}
{'ner': 12.169256670453604}
{'ner': 11.155785641661726}
{'ner': 10.994515924842492}
{'ner': 12.636921556061941}
{'ner': 8.9722627688480738}
{'ner': 8.9621050441901762}
{'ner': 7.5129150295587319}
{'ner': 8.179202538388056}
{'ner': 9.2422652486652339}
{'ner': 6.4460833579917924}
{'ner': 7.6254772838604366}
{'ner': 7.3543577422067887}
{'ner': 7.0243018144568614}
{'ner': 6.6855773852329108}
{'ner': 8.2118928897180794}
{'ner': 5.4291756471984138}
{'ner': 5.602163444032831

In [3]:
# Hand-picked message(s) used to spot-check the model saved on disk.
test_text=['TRANX OF GBP 29.99 USING CREDIT CARD 4XXX6002 IS MADE AT WWW.THEREGISTER ON 16-JUL-17. AVBL CR LMT:INR 11,210.78, TOTAL CR LMT: INR 33,000.00']

output_dir= 'C:\\Users\\Simrandeep\\Desktop\\Think\\ner output\\debit'

# test the saved model
print("Loading from", output_dir)
# Load the pipeline once; reloading it from disk for every text is pure
# overhead because the model does not change between messages.
nlp2 = spacy.load(output_dir)
for text in test_text:
    print("")
    print("Entities in '%s'" % text)
    doc2 = nlp2(text)
    for ent in doc2.ents:
        print(ent.label_, ent.text)

Loading from C:\Users\Simrandeep\Desktop\Think\ner output\debit

Entities in 'TRANX OF GBP 29.99 USING CREDIT CARD 4XXX6002 IS MADE AT WWW.THEREGISTER ON 16-JUL-17. AVBL CR LMT:INR 11,210.78, TOTAL CR LMT: INR 33,000.00'
DEBIT AMOUNT GBP 29.99
TXN INSTRUMENT CREDIT CARD
CARD NO 4XXX6002
VENDOR LIST WWW.THEREGISTER
DATE OF TRANSACTION 16-JUL-17
AVAILABLE CREDIT LIMIT INR 11,210.78
TOTAL CREDIT LIMIT INR 33,000.00


In [7]:
# Spreadsheet holding the batch of SMS messages to run through the NER model.
batch_path = "C:/Users/Simrandeep/Desktop/test/batch1_22ndjan.xlsx"
df = pd.read_excel(batch_path)

# Preview the first rows to confirm the sheet loaded as expected.
df.head()

Unnamed: 0,message
0,"DEAR SBI UPI USER, YOUR ACCOUNT IS DEBITED INR 174.0 ON DATE 2018-06-04 10:12:03 PM BY UPI REF NO 815544439649.DOWNLOAD YONO @ WWW.YONOSBI.COM"
1,"DEAR SBI UPI USER, YOUR ACCOUNT IS DEBITED INR 323.0 ON DATE 2018-06-05 02:18:19 PM BY UPI REF NO 815642401830.DOWNLOAD YONO @ WWW.YONOSBI.COM"
2,"DEAR SBI UPI USER, YOUR ACCOUNT IS DEBITED INR 259.0 ON DATE 2018-06-07 03:46:15 PM BY UPI REF NO 815845332392.DOWNLOAD YONO @ WWW.YONOSBI.COM"
3,"DEAR SBI UPI USER, YOUR ACCOUNT IS DEBITED INR 5000.0 ON DATE 2018-07-03 07:38:51 PM BY UPI REF NO 818419568485.DOWNLOAD YONO @ WWW.YONOSBI.COM"
4,"DEAR SBI UPI USER, YOUR ACCOUNT IS DEBITED INR 4391.0 ON DATE 2018-07-28 12:02:18 PM BY UPI REF NO 820936971166.DOWNLOAD YONO @ WWW.YONOSBI.COM"


In [8]:
# Show full message text instead of truncating long cells. Current pandas
# expects None for "no limit"; older releases only accept the legacy -1.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

# Ordered (literal text, replacement) pairs applied to every message.
# Empty replacements strip boilerplate; the others insert the spacing that
# separates a value from the punctuation next to it.
MESSAGE_REPLACEMENTS = [
    ("DOWNLOAD YONO @ WWW.YONOSBI.COM", ""),
    ("IF NOT DONE BY YOU, FORWARD THIS SMS FROM MOBILE NUMBER REGISTERED WITH SBI TO 9223008333 TO BLOCK YOUR CARD.", ""),
    ("YOU MAY ALSO CALL 1-800-111109", ""),
    ("PLZ DOWNLOAD BUDDY", ""),
    (",REF-", ", REF- "),
    ("TID-", "TID -"),
    ("UPDATE YOUR EMAIL ID IN ACCOUNT.", ""),
    ("FREE TXN ON OTH BANK ATM-5(INCL MAX 3 IN METRO) IN SB A/C ONLY.", ""),
    ("AVL BAL:", "AVL BAL :"),
    ("IF YOU FIND TXN SUSPICIOUS, SEND BLOCKUPI TO 9220092200.", ""),
    ("TOLLFREE 18002083333 OR 180030113333", ""),
    ("IF THIS TXN IS NOT DONE BY YOU PLEASE HOTLIST YOUR CARD BY CALLING ON 18004251112 OR 022-40429123.", ""),
    ("DOWNLOAD YONO /U00A1 WWW.YONOSBI.COM", ""),
    (".INFO:", ". INFO:"),
    ("TOT BAL:", "TOT BAL :"),
    (".AVL", ". AVL"),
]


def clean_message(message, replacements=MESSAGE_REPLACEMENTS):
    """Apply the literal replacements to one message, in order.

    The phrases are matched as plain text rather than regular expressions.
    Treated as regex patterns, '.' matches any character (so ".AVL" rewrote
    " AVL" to ". AVL", and again on every re-run of the cell) and "(...)" is
    a group (so the "ATM-5(INCL MAX 3 IN METRO)" phrase never matched).

    Args:
        message: The SMS text; missing values (NaN/None) are returned as is.
        replacements: Iterable of (old, new) literal string pairs.

    Returns:
        The cleaned message text.
    """
    if pd.isnull(message):
        return message
    for old, new in replacements:
        message = message.replace(old, new)
    return message


# NOTE(review): if the training data was prepared with the previous
# regex-based cleaning, confirm the model still extracts balances correctly
# now that " AVL" is no longer rewritten to ". AVL".
df['message'] = df['message'].map(clean_message)
df['message'].head()

0    DEAR SBI UPI USER, YOUR ACCOUNT IS DEBITED INR 174.0 ON DATE 2018-06-04 10:12:03 PM BY UPI REF NO 815544439649. 
1    DEAR SBI UPI USER, YOUR ACCOUNT IS DEBITED INR 323.0 ON DATE 2018-06-05 02:18:19 PM BY UPI REF NO 815642401830. 
2    DEAR SBI UPI USER, YOUR ACCOUNT IS DEBITED INR 259.0 ON DATE 2018-06-07 03:46:15 PM BY UPI REF NO 815845332392. 
3    DEAR SBI UPI USER, YOUR ACCOUNT IS DEBITED INR 5000.0 ON DATE 2018-07-03 07:38:51 PM BY UPI REF NO 818419568485.
4    DEAR SBI UPI USER, YOUR ACCOUNT IS DEBITED INR 4391.0 ON DATE 2018-07-28 12:02:18 PM BY UPI REF NO 820936971166.
Name: message, dtype: object

In [9]:
output_dir= 'C:\\Users\\Simrandeep\\Desktop\\Think\\ner output\\debit'

# Entity labels that are copied into same-named DataFrame columns. Kept local
# so this cell can run against the saved model without the training cells.
ENTITY_COLUMNS = frozenset([
    'DEBIT AMOUNT', 'DATE OF TRANSACTION', 'TIME OF TRANSACTION',
    'REFERENCE NO', 'ACCOUNT NO', 'TOTAL BALANCE', 'CARD NO',
    'LEDGER BALANCE', 'AVAILABLE CREDIT LIMIT', 'COMBINED BALANCE',
    'CHEQUE NO', 'CLEARING BALANCE', 'VENDOR LIST', 'ACCOUNT BALANCE',
    'TOTAL CREDIT LIMIT', 'CURRENT OUTSTANDING', 'BENEFICIARY NAME',
    'BENEFICIARY ACCOUNT', 'TXN INSTRUMENT',
])

# test the saved model
print("Loading from", output_dir)
# Load the pipeline once; reloading it from disk for every row made the run
# time grow with the number of messages for no benefit.
nlp2 = spacy.load(output_dir)

# Iterate over the actual index labels so .loc addresses the right row even
# if the frame does not carry the default 0..n-1 index.
for i in df.index:
    text=df.loc[i,"message"]
    print("")
    print("Entities in '%s'" % text)
    doc2 = nlp2(text)
    for ent in doc2.ents:
        print(ent.label_, ent.text)
        # Store the entity text under the column named after its label.
        # When a label occurs more than once in a message the last one wins.
        if ent.label_ in ENTITY_COLUMNS:
            df.loc[i, ent.label_] = ent.text

Loading from C:\Users\Simrandeep\Desktop\Think\ner output\debit

Entities in 'DEAR SBI UPI USER, YOUR ACCOUNT IS DEBITED INR 174.0 ON DATE 2018-06-04 10:12:03 PM BY UPI REF NO 815544439649.'
TXN INSTRUMENT UPI
DEBIT AMOUNT INR 174.0
DATE OF TRANSACTION 2018-06-04
TIME OF TRANSACTION 10:12:03 PM
REFERENCE NO 815544439649

Entities in 'DEAR SBI UPI USER, YOUR ACCOUNT IS DEBITED INR 323.0 ON DATE 2018-06-05 02:18:19 PM BY UPI REF NO 815642401830.'
TXN INSTRUMENT UPI
DEBIT AMOUNT INR 323.0
DATE OF TRANSACTION 2018-06-05
TIME OF TRANSACTION 02:18:19 PM
REFERENCE NO 815642401830

Entities in 'DEAR SBI UPI USER, YOUR ACCOUNT IS DEBITED INR 259.0 ON DATE 2018-06-07 03:46:15 PM BY UPI REF NO 815845332392.'
TXN INSTRUMENT UPI
DEBIT AMOUNT INR 259.0
DATE OF TRANSACTION 2018-06-07
TIME OF TRANSACTION 03:46:15 PM
REFERENCE NO 815845332392

Entities in 'DEAR SBI UPI USER, YOUR ACCOUNT IS DEBITED INR 5000.0 ON DATE 2018-07-03 07:38:51 PM BY UPI REF NO 818419568485.'
TXN INSTRUMENT UPI
DEBIT AMOUNT IN

Entities in 'YOUR A/C NO XXXX2833 HAS BEEN DEBITED BY RS. 94.50 ON 07-JUL-2018. A/C BAL IS RS. 552.02 CR AND. AVL BAL IS RS. 552.02'
ACCOUNT NO XXXX2833
DEBIT AMOUNT RS. 94.50
DATE OF TRANSACTION 07-JUL-2018
ACCOUNT BALANCE RS. 552.02
TOTAL BALANCE RS. 552.02

Entities in 'YOUR A/C NO XXXXXXX2833 HAS BEEN DEBITED BY RS. 55.000 ON 10-07-18 . A/C BAL IS RS. 426.02 CR AND. AVL BAL IS RS. 426.02( UPI REF NO. 819140574257 )'
ACCOUNT NO XXXXXXX2833
DEBIT AMOUNT RS. 55.000
DATE OF TRANSACTION 10-07-18
ACCOUNT BALANCE RS. 426.02
TOTAL BALANCE RS. 426.02
TXN INSTRUMENT UPI
REFERENCE NO 819140574257

Entities in 'YOUR A/C NO XXXX2833 HAS BEEN DEBITED BY RS. 500.00 ON 19-JUL-2018 VIA TJ007901/652155XXXXXX5990/820012514937. A/C NO XXXX2833 BAL IS RS. 22.02 CR AND. AVL BAL IS RS. 22.02'
ACCOUNT NO XXXX2833
DEBIT AMOUNT RS. 500.00
DATE OF TRANSACTION 19-JUL-2018
ACCOUNT NO XXXX2833
ACCOUNT BALANCE RS. 22.02
TOTAL BALANCE RS. 22.02

Entities in 'YOUR A/C NO XXXX2833 HAS BEEN DEBITED BY RS. 3,000.00 O

DATE OF TRANSACTION 2018-06-07
TIME OF TRANSACTION 07:15:47 AM
REFERENCE NO 815807690877

Entities in 'DEAR SBI UPI USER, YOUR ACCOUNT IS DEBITED INR 102.0 ON DATE 2018-06-30 06:42:13 PM BY UPI REF NO 818172649526.'
TXN INSTRUMENT UPI
DEBIT AMOUNT INR 102.0
DATE OF TRANSACTION 2018-06-30
TIME OF TRANSACTION 06:42:13 PM
REFERENCE NO 818172649526

Entities in 'DEAR SBI UPI USER, YOUR ACCOUNT IS DEBITED INR 5000.0 ON DATE 2018-07-30 04:53:58 PM BY UPI REF NO 821164290228.'
TXN INSTRUMENT UPI
DEBIT AMOUNT INR 5000.0
DATE OF TRANSACTION 2018-07-30
TIME OF TRANSACTION 04:53:58 PM
REFERENCE NO 821164290228

Entities in 'RS.577.00 WAS SPENT ON YOUR SBI CARD ENDING 5320 AT VODAFONE INDIA LTD ON 21/06/18. AVAILABLE CREDIT LIMIT: RS.646.47. DOWNLOAD APP AT HTTPS://GOO.GL/B7QDQU'
DEBIT AMOUNT RS.577.00
CARD NO 5320
VENDOR LIST VODAFONE INDIA LTD
DATE OF TRANSACTION 21/06/18
AVAILABLE CREDIT LIMIT RS.646.47

Entities in 'RS.229.00 WAS SPENT ON YOUR SBI CARD ENDING 5320 AT DOMINOS PIZZA. ON 21/06/18

DEBIT AMOUNT RS. 53.00
DATE OF TRANSACTION 08-JUL-2018
ACCOUNT BALANCE RS. 481.02
TOTAL BALANCE RS. 481.02

Entities in 'YOUR A/C NO XXXXXXX2833 HAS BEEN DEBITED BY RS. 9.000 ON 11-07-18 . A/C BAL IS RS. 26.02 CR AND. AVL BAL IS RS. 26.02( UPI REF NO. 819218023212 )'
ACCOUNT NO XXXXXXX2833
DEBIT AMOUNT RS. 9.000
DATE OF TRANSACTION 11-07-18
ACCOUNT BALANCE RS. 26.02
TOTAL BALANCE RS. 26.02
TXN INSTRUMENT UPI
REFERENCE NO 819218023212

Entities in 'YOUR A/C NO XXXX2833 HAS BEEN DEBITED BY RS. 1,000.00 ON 25-JUL-2018 VIA TJ007901/652155XXXXXX5990/820609679616. A/C NO XXXX2833 BAL IS RS. 2.02 CR AND. AVL BAL IS RS. 2.02'
ACCOUNT NO XXXX2833
DEBIT AMOUNT RS. 1,000.00
DATE OF TRANSACTION 25-JUL-2018
ACCOUNT NO XXXX2833
ACCOUNT BALANCE RS. 2.02
TOTAL BALANCE RS. 2.02

Entities in 'YOUR A/C NO XXXX2833 HAS BEEN DEBITED BY RS. 200.00 ON 01-AUG-2018 VIA S1ANNN11/652155XXXXXX5990/821312030926. A/C NO XXXX2833 BAL IS RS. 2.02 CR AND. AVL BAL IS RS. 2.02'
ACCOUNT NO XXXX2833
DEBIT AMOUNT RS. 200.0

In [10]:
# Write the current contents of `df` to a CSV file at this path.
df.to_csv("C:/Users/Simrandeep/Desktop/test/batch1_22jan_output.csv")