In [1]:
import numpy as np
import pandas as pd
import spacy
import re, random
from pathlib import Path
from tqdm import tqdm 

# Show every row and column and never truncate cell text when a frame is displayed.
# Fully-qualified option names are used because short patterns such as 'max_rows'
# are ambiguous on pandas versions that also register 'styler.render.max_rows'.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# None means "no limit"; the former -1 sentinel has been deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)



## BUILDING NER MODELS FOR BANK SMS DATA

In [2]:
# Restore the previously saved spaCy pipeline from disk; it is used below for
# further training and for prediction.
output_dir = Path("/home/ec2-user/SageMaker/Bulk Sms Data/Ner_model_to_use_V12")
print(f"Loading from {output_dir}")
ner_model = spacy.load(output_dir)

Loading from /home/ec2-user/SageMaker/Bulk Sms Data/Ner_model_to_use_V12


In [3]:
import json

# Annotated training examples, parsed from the JSON file in the working directory.
TRAIN_DATA = json.loads(Path("train_data_new_wema.json").read_text())

In [4]:
# Preview the first five entries of the loaded training data.
TRAIN_DATA[:5]

[['Wema Credit  NGN600.00  Acct No 024639  Desc.  NIP Kehinde Johnson Bakare Transfer from to KEHIN   Bal  884.37  23 05 2021 03 45',
  {'entities': [[5, 11, 'TYPE'],
    [13, 22, 'AMOUNT'],
    [40, 96, 'DESCRIPTION'],
    [112, 122, 'DATE'],
    [99, 110, 'BALANCE']]}],
 ['Wema Debit  NGN18,000.00  Acct No 024133  Desc.  ALAT NIP TRANSFER TO GTB Tr   Bal  171,368.72  31 05 2021 10 42',
  {'entities': [[5, 10, 'TYPE'],
    [12, 24, 'AMOUNT'],
    [42, 76, 'DESCRIPTION'],
    [96, 106, 'DATE'],
    [79, 94, 'BALANCE']]}],
 ['Wema Credit  NGN15,000.00  Acct No 024402  Desc.  ATM Trf on 25 05 2021@ADDAX PETROLEUM   Bal  15,055.78  25 05 2021 02 21',
  {'entities': [[5, 11, 'TYPE'],
    [13, 25, 'AMOUNT'],
    [43, 87, 'DESCRIPTION'],
    [106, 116, 'DATE'],
    [90, 104, 'BALANCE']]}],
 ['Wema Debit  NGN6,250.00  Acct No 024177  Desc.  POS Purchase on 03 06 2021@PAYCOM NIGERIA LIM   Bal  4,826.25  03 06 2021 07 38',
  {'entities': [[5, 10, 'TYPE'],
    [12, 23, 'AMOUNT'],
    [41, 93, 'D

## Add labels, Train data based on annotations 

In [7]:
# Grab the pipeline component registered under the name 'ner' so labels can be added to it.
ner = ner_model.get_pipe('ner')

In [8]:
# Make sure the NER component knows every label used in the training annotations.
for _, annotations in TRAIN_DATA:
    # Default to an empty list so an example without an 'entities' key is
    # skipped instead of raising TypeError when iterating over None.
    for ent in annotations.get('entities', []):
        ner.add_label(ent[2])  # the label is the third item of each entity entry

In [9]:
# Names of every pipeline component except 'ner'.
other_pipes = [name for name in ner_model.pipe_names if name != 'ner']

In [10]:
%%time
from spacy.training.example import Example
with ner_model.disable_pipes(*other_pipes): 
    optimizer = ner_model.resume_training()
    for itn in range(5):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            doc = ner_model.make_doc(text)
            example =  Example.from_dict(doc, annotations)
            ner_model.update(
                [example],  # batch of annotations
                drop=0.4,  # dropout 
                sgd=optimizer,  # callable to update weights
                losses=losses)
    print(losses)

{'ner': 4.000334420342062}
CPU times: user 4.27 s, sys: 5.59 ms, total: 4.27 s
Wall time: 4.27 s


In [30]:
# NOTE(review): `msgtxt` is only assigned in a later cell (In [23]), so this cell raises
# NameError on a fresh Restart & Run All; its execution count (In [30]) suggests it was
# run out of order.
doc = ner_model(msgtxt[0])
# Render the predicted entity spans inline in the notebook.
spacy.displacy.render(doc, style='ent', jupyter=True) 
# Also list the predicted (text, label) pairs as plain text.
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Entities [('CR', 'TYPE'), ('N3,000.00', 'AMOUNT'), ('Desc TRANSFER FROM RICHARD JOHN', 'DESCRIPTION'), ('DT 25/MAY/21', 'DATE'), ('Bal N1,244,944.03CR', 'BALANCE')]


## Model Evaluation on out of sample data
### To see if model will generalize well.

In [12]:
import json

# Held-out annotated examples used only for evaluation, parsed from the JSON file.
TEST_DATA = json.loads(Path("test_data_new.json").read_text())

In [13]:
import spacy
from spacy.scorer import Scorer
from spacy.tokens import Doc
from spacy.training.example import Example


# NOTE(review): a bare name that is not the last expression of a cell is not displayed,
# so this line has no visible effect.
TEST_DATA

# Module-level list; it starts out empty every time this cell is executed.
example = []
def evaluate(ner_model, examples):
    """Score ``ner_model`` on annotated examples and print the entity metrics.

    Parameters
    ----------
    ner_model
        Object exposing ``make_doc(text)`` and ``evaluate(examples)``
        (the loaded spaCy pipeline in this notebook).
    examples
        Iterable of ``(text, annotations)`` pairs.

    Returns
    -------
    dict
        The scores mapping returned by ``ner_model.evaluate``; the keys
        ``ents_p``, ``ents_r``, ``ents_f`` and ``ents_per_type`` are printed.
    """
    # A fresh list per call: reusing a list defined outside the function would make
    # repeated calls accumulate (and re-score) the examples of earlier calls.
    gold_examples = []
    for input_, annot in examples:
        doc = ner_model.make_doc(input_)
        gold_examples.append(Example.from_dict(doc, annot))
    scores_model = ner_model.evaluate(gold_examples)

    def _as_percent(score):
        # Guard against a missing (None) score instead of failing on None * 100.
        return 'n/a' if score is None else f'{round(score*100,2)}%'

    print(f'The precision of the model is {_as_percent(scores_model["ents_p"])}')
    print(f'The recall of the model is {_as_percent(scores_model["ents_r"])}')
    print(f'The f1 of the model is {_as_percent(scores_model["ents_f"])}')
    print(f'Score entities is {scores_model["ents_per_type"]}')
    return scores_model

# ner_model = spacy.load('en_core_web_sm') # for spaCy's pretrained use 'en_core_web_sm'
# Score the current pipeline on the held-out examples; the printed metrics appear below the cell.
results = evaluate(ner_model, TEST_DATA)

The precision of the model is 94.57%
The recall of the model is 96.13%
The f1 of the model is 95.34%
Score entities is {'TYPE': {'p': 1.0, 'r': 1.0, 'f': 1.0}, 'AMOUNT': {'p': 1.0, 'r': 1.0, 'f': 1.0}, 'DESCRIPTION': {'p': 0.8648648648648649, 'r': 0.8648648648648649, 'f': 0.8648648648648649}, 'DATE': {'p': 0.9166666666666666, 'r': 0.9428571428571428, 'f': 0.9295774647887323}, 'BALANCE': {'p': 0.9459459459459459, 'r': 1.0, 'f': 0.9722222222222222}}


# MAKING PREDICTIONS

In [14]:
# Optional down-sampling (currently disabled): with n = 500, passing
# skiprows=lambda i: i % n != 0 would keep only every n-th line of the file.
# Reads at most the first 100,000 rows (nrows) of the CSV export stored on S3.
# NOTE(review): the bucket path is hard-coded; consider moving it to a config cell.
df=pd.read_csv("s3://aws-athena-query-results-101063123548-eu-west-1/Unsaved/2021/08/02/df700ad6-4373-41d7-9297-47c8ed092b99.csv",nrows=100000)

### Separate Transactional Text from Non-Transactional Text

In [16]:
def detect_message(row):
    """Classify one SMS row as transactional or not.

    Parameters
    ----------
    row
        Mapping-like object (for example a DataFrame row) with a
        ``"short_message"`` entry.

    Returns
    -------
    str
        ``'Trans'`` when the message contains one of the keywords
        credit, debit, acc, charged or ngn (case-insensitive, matched
        anywhere in the text), otherwise ``'Non_trans'``.
    """
    message = row["short_message"]
    # Missing values (NaN/None) are not text and therefore not transactional.
    if not isinstance(message, str):
        return 'Non_trans'
    # Whole keywords are spelled out: character classes such as [Cre]dit or
    # [Char]ged match a single letter, so they also fire on unrelated words
    # like "edit" or "managed".
    if re.search(r'credit|debit|acc|charged|ngn', message, re.IGNORECASE):
        return 'Trans'
    return 'Non_trans'
# Label every row as 'Trans' or 'Non_trans' with detect_message.
# DataFrame.apply(axis=1) already returns a Series aligned to df's index,
# so it is assigned directly instead of being wrapped in a one-column DataFrame.
df['Message_Type'] = df.apply(detect_message, axis=1)

In [17]:
def get_trans(*args, **kwargs):
    """Return the transactional rows of a message frame.

    Parameters
    ----------
    *args
        Optional; when given, the first positional argument is the DataFrame
        to filter. With no positional argument the notebook-level ``df`` is
        used, which keeps argument-less calls working.
    **kwargs
        Accepted for call compatibility and ignored.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``Message_Type`` is not ``'Non_trans'``, restricted to the
        columns ``encrypted_msisdn``, ``source_addr`` and ``short_message``.
    """
    # Honour the frame that was passed in rather than silently reading the global.
    frame = args[0] if args else df
    data = frame[frame['Message_Type'] != 'Non_trans']
    return data[['encrypted_msisdn','source_addr','short_message']]

In [18]:
# Transactional subset of the messages, and its (rows, columns) shape.
# NOTE(review): the later cells shown in this notebook keep working on `df`, not `df_1`,
# so messages labelled 'Non_trans' are still present downstream — confirm this is intended.
df_1=get_trans(df)
print(df_1.shape)

(94010, 3)


In [19]:
# Drop every message that contains one of these marker words. The words are joined
# into a single alternation and matched case-sensitively as a regular expression.
searchfor = ['created', 'successfully','convenience']
unwanted_pattern = '|'.join(searchfor)
has_unwanted_marker = df.short_message.str.contains(unwanted_pattern)
df = df[~has_unwanted_marker]

In [20]:
# Renumber the rows 0..n-1 after filtering and discard the old index labels.
df=df.reset_index(drop=True)

In [21]:
def clean_text(line):
    """Normalise one raw SMS body into the flat text form used for prediction.

    The value is coerced with ``str`` and a fixed list of literal substitutions
    is applied in order: line breaks and most punctuation become spaces, while
    '#', '~', the backslash and '*' are removed outright.

    Parameters
    ----------
    line
        Raw message; any object is accepted because it is passed through ``str``.

    Returns
    -------
    str
        The cleaned message.
    """
    # (old, new) pairs, applied strictly top to bottom. The order is significant:
    # 'x200B' is replaced after '#' is removed but before '~', the backslash and '*'
    # are removed, and ',COMM' is replaced last.
    # Multi-character patterns whose characters are already gone by the time they
    # would run (two or three consecutive newlines, ':-', '~~', CR+LF) can never
    # match and are therefore not listed.
    substitutions = (
        ('\n', '    '),
        (':', ' '),
        ('&', ' '),
        ('#', ''),
        ('x200B', ' '),
        ('[', ' '),
        (']', ' '),
        ('(', ' '),
        (')', ' '),
        ('|', ' '),
        ('~', ''),
        ('-', ' '),
        ('\\', ''),
        ('*', ''),
        ('§', ' '),
        ('\r', '   '),
        (',COMM', ' '),
    )
    cleaned = str(line)
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned

In [22]:
# Clean every raw SMS body. Series.map calls clean_text once per value of the
# column, which avoids building a full row object for each message as a
# row-wise DataFrame.apply(axis=1) does.
df['clean_message'] = df['short_message'].map(clean_text)

In [23]:
# Run the NER pipeline over every cleaned message and keep, per message,
# the list of (entity text, entity label) pairs it predicts.
msgtxt = df['clean_message'].to_list()
pred_list = [
    [(ent.text, ent.label_) for ent in ner_model(message).ents]
    for message in msgtxt
]

In [31]:
# The five entity labels expected for every message. Despite the name this is a set, not a dict.
label_dic = {'TYPE','AMOUNT','DESCRIPTION','BALANCE','DATE'}
from collections import Counter

In [34]:
# Post-process the first 10,000 predictions in place: whenever a label occurs more than
# once in a message, one occurrence is removed or relabelled, depending on the label.
for elem in pred_list[:10000]:
    # Parallel snapshots of the labels and texts of this message.
    # NOTE(review): they are not refreshed after elem.pop()/elem.append(), so the stored
    # indices can be stale when more than one label is duplicated in the same message.
    ent_list=[]
    val_list = []
    # Created once per message and shared by the DESCRIPTION and AMOUNT branches.
    length_list = []
    for val,ent in elem:
        ent_list.append(ent)
        val_list.append(val)
    # (label, count) pairs, most frequent first.
    counts = Counter(ent_list).most_common()
    for ent_count in counts:
        key,val  = ent_count
        if val>1:
            duplicate_key = key
            if duplicate_key =='DESCRIPTION':
                # Positions of every DESCRIPTION entry in the snapshot.
                find_index = [index for index, element in enumerate(ent_list) if element == duplicate_key]
                for i in range(val):
                    length_list.append(len(val_list[find_index[i]]))
                # Removes the entry at the position of the maximum length.
                # NOTE(review): this drops the LONGEST span — confirm that is the intended one.
                maximum_len = max(length_list)
                index_of_max = length_list.index(maximum_len)
                get_dup_index = find_index[index_of_max]
                elem.pop(get_dup_index)
            elif duplicate_key =='AMOUNT':
                find_index = [index for index, element in enumerate(ent_list) if element == duplicate_key]
                if 'BALANCE' in ent_list:
                    # NOTE(review): unlike the DESCRIPTION branch, the max/pop below sits inside
                    # the loop, so one entry is popped per duplicate — confirm the indentation.
                    for i in range(val):
                        length_list.append(len(val_list[find_index[i]]))
                        maximum_len = max(length_list)
                        index_of_max = length_list.index(maximum_len)
                        get_dup_index = find_index[index_of_max]
                        elem.pop(get_dup_index)
                else:
                    # No BALANCE predicted: the last AMOUNT is re-added as BALANCE and the
                    # original AMOUNT entry is removed.
                    duplicate = find_index[-1]
                    balance = val_list[[duplicate][0]]
                    bal_tup = (balance,'BALANCE')
                    elem.append(bal_tup)
                    elem.pop(duplicate)
            elif duplicate_key == 'DATE':
                find_index = [index for index, element in enumerate(ent_list) if element == duplicate_key]
                check1 = ['BVN','88A','@','2011Q692/','2011P180/','2011P180/','OL','000087 ZENITHATM3','O.A.U']
                # NOTE(review): both arms of the conditional yield i, so dup_index is always
                # [0, 1, ..., val-1] and check1 has no effect on the result.
                dup_index = [i if any(x in val_list[find_index[i]] for x in check1)  else i for i in range(val)]
                # print(dup_index)
                # Removes the second DATE occurrence.
                get_dup_index = find_index[dup_index[1]]
                elem.pop(get_dup_index)
            elif duplicate_key == 'TYPE':
                find_index = [index for index, element in enumerate(ent_list) if element == duplicate_key]
                check2 = ['Campu','CS','DI','CE','DE','CH','C33','CS DEP']
                # NOTE(review): same pattern as the DATE branch — check2 does not influence dup_index.
                dup_index = [i if any(x in val_list[find_index[i]] for x in check2) else i for i in range(val)]
                # Removes the second TYPE occurrence.
                get_dup_index = find_index[dup_index[1]]
                elem.pop(get_dup_index)
            elif duplicate_key == 'BALANCE':
                find_index = [index for index, element in enumerate(ent_list) if element == duplicate_key]
                check3 = ['B84APLL18334400','88A','BenueNG','FE9']
                # NOTE(review): same pattern again — check3 does not influence dup_index.
                dup_index = [i if any(x in val_list[find_index[i]] for x in check3) else i for i in range(val)]
                # The first BALANCE text, with the literal 'B84APLL18334400 ' stripped, is
                # re-added as a DATE entry and the original BALANCE entry is removed.
                bal_str = val_list[find_index[dup_index[0]]]
                date = bal_str.replace('B84APLL18334400 ','')
                bal_tup = (date,'DATE')
                elem.append(bal_tup)
                get_dup_index = find_index[dup_index[0]]
                elem.pop(get_dup_index)
            else:
                pass

### CONVERTING PREDICTIONS TO DATAFRAME

In [69]:
# Pivot the first 420 predictions into one list of entity texts per label, ready to be
# turned into a DataFrame. Every label gets exactly one cell per message, so all lists
# end up with the same length even when a prediction still contains a duplicated or an
# unexpected label (unequal list lengths would make pd.DataFrame(test) fail).
test = {}
for row_number, elem in enumerate(pred_list[:420]):
    # Predictions with fewer than five entities are padded in place with a ' '
    # placeholder for each label of label_dic they lack.
    if len(elem) < 5:
        present_labels = {label for _, label in elem}
        for missing_label in sorted(label_dic - present_labels):
            elem.append((' ', missing_label))

    # One value per label for this message: the first occurrence wins, and labels
    # of label_dic that are still absent get the ' ' placeholder.
    row = {}
    for ent, label in elem:
        row.setdefault(label, ent)
    for label in sorted(label_dic):
        row.setdefault(label, ' ')

    for label, ent in row.items():
        if label not in test:
            # Label seen for the first time: backfill the earlier messages.
            test[label] = [' '] * row_number
        test[label].append(ent)

    # Columns whose label does not occur in this message still need a cell.
    for column in test.values():
        if len(column) <= row_number:
            column.append(' ')

In [76]:
# One column per entity label, one row per processed message.
df_use=pd.DataFrame(test)

In [80]:
# Attach the cleaned source text of the first 420 messages next to the extracted entities.
# NOTE(review): the assignment aligns on index labels, so it relies on `df` having a
# 0..n-1 index; the 420 must also match the slice used when building `test`.
df_use['clean_message'] = df['clean_message'][:420]

In [81]:
# Inspect up to the first 100 rows of extracted entities.
# NOTE(review): the rendered output contains message text — review or clear it before sharing.
df_use.head(100)

Unnamed: 0,TYPE,AMOUNT,DESCRIPTION,DATE,BALANCE,clean_message
0,CR,"N3,000.00",Desc TRANSFER FROM RICHARD JOHN,DT 25/MAY/21,"Bal N1,244,944.03CR","Acct 9282 CR N3,000.00 Desc TRANSFER FROM RICHARD JOHN DT 25/MAY/21 22 50PM Bal N1,244,944.03CR Help 070034335489 Dial 77008 for a loan"
1,DR,"N1,000.00",Desc POS PURCHASE @2033UYY0 SOBAZ NIGERIA LIMITE,DT 28/MAY/21,"Bal N1,243,940.03CR","Acct 9282 DR N1,000.00 Desc POS PURCHASE @2033UYY0 SOBAZ NIGERIA LIMITE DT 28/MAY/21 18 45PM Bal N1,243,940.03CR Help 070034335489 Dial 77008 for a loan"
2,DR,"N8,850.00",Desc POS PURCHASE @20637G89 WELLDONE SUPERMARKET,DT 20/MAY/21,"Bal N1,242,001.78CR","Acct 9282 DR N8,850.00 Desc POS PURCHASE @20637G89 WELLDONE SUPERMARKET DT 20/MAY/21 11 54AM Bal N1,242,001.78CR Help 070034335489 Dial 77008 for a loan"
3,CR,"N55,000.00",Desc FUNDSTRF FOR UGOCHI GLORY NWAOHA 9282,DT 31/MAY/21,"Bal N1,292,082.03CR","Acct 9282 CR N55,000.00 Desc FUNDSTRF FOR UGOCHI GLORY NWAOHA 9282 DT 31/MAY/21 19 26PM Bal N1,292,082.03CR Help 070034335489 Dial 77008 for a loan"
4,DR,"N4,850.00",Desc POS PURCHASE @20637G89 WELLDONE SUPERMARKET,DT 31/MAY/21,"Bal N1,237,086.03CR","Acct 9282 DR N4,850.00 Desc POS PURCHASE @20637G89 WELLDONE SUPERMARKET DT 31/MAY/21 14 42PM Bal N1,237,086.03CR Help 070034335489 Dial 77008 for a loan"
5,DR,"N2,000.00",Desc POS PURCHASE @2076OM15 CHARLES CHIMEKA OPAR,DT 30/MAY/21,"Bal N1,241,936.03CR","Acct 9282 DR N2,000.00 Desc POS PURCHASE @2076OM15 CHARLES CHIMEKA OPAR DT 30/MAY/21 08 26AM Bal N1,241,936.03CR Help 070034335489 Dial 77008 for a loan"
6,CR,"NGN6,500.00",Desc RUTH KADIRI AIWUMUAGBONRIE/Tran,DT,"Bal NGN11,945.75CR","Acct 7683 CR NGN6,500.00 Desc RUTH KADIRI AIWUMUAGBONRIE/Tran DT 26/MAY/2021 13 30 57 Available Bal NGN11,945.75CR"
7,DR,"NGN11,000.00",Desc POS@2057CP84/AGLOW HOMEWARES LA,DT 26/MAY/2021,Bal NGN945.75CR,"Acct 7683 DR NGN11,000.00 Desc POS@2057CP84/AGLOW HOMEWARES LA LANG DT 26/MAY/2021 14 02 33 Available Bal NGN945.75CR"
8,CR,"NGN5,500.00",Desc RUTH KADIRI AIWUMUAGBONRIE/Tran,DT,"Bal NGN5,445.75CR","Acct 7683 CR NGN5,500.00 Desc RUTH KADIRI AIWUMUAGBONRIE/Tran DT 26/MAY/2021 13 19 14 Available Bal NGN5,445.75CR"
9,DR,NGN900.00,Desc POS@2232475E/OPAY DIGITAL SERVICE,DT 26/MAY/2021,Bal NGN1.75CR,Acct 7683 DR NGN900.00 Desc POS@2232475E/OPAY DIGITAL SERVICE DL LANG DT 26/MAY/2021 21 24 32 Available Bal NGN1.75CR
