In [1]:
import numpy as np
import pandas as pd
import cv2
import pytesseract
from glob import glob
import spacy
import re
import string

In [2]:
def cleanText(txt):
    whitespace = string.whitespace
    punctuation = "!#$%&\'()*+:;<=>?[\\]^`{|}~"
    tableWhitespace = str.maketrans('','',whitespace)
    tablePunctuation = str.maketrans('','',punctuation)
    text = str(txt)
    text = text.lower()
    removewhitespace = text.translate(tableWhitespace)
    removepunctuation = removewhitespace.translate(tablePunctuation)
    
    return str(removepunctuation)

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
### Load NER model
model_ner = spacy.load('./output/model-best/')

In [158]:
# Load Image
image = cv2.imread('./data/242.jpeg')

# cv2.imshow('businesscard',image)
# cv2.waitKey(0)
# cv2.destroyAllWindows()

# extract data using Pytesseract 
tessData = pytesseract.image_to_data(image)
# convert into dataframe
tessList = list(map(lambda x:x.split('\t'), tessData.split('\n')))
df = pd.DataFrame(tessList[1:],columns=tessList[0])
df.dropna(inplace=True) # drop missing values
df['text'] = df['text'].apply(cleanText)

# convet data into content
df_clean = df.query('text != "" ')
content = " ".join([w for w in df_clean['text']])
print(content)
# get prediction from NER model
doc = model_ner(content)

dr. t. s. reddy senior consultant traffic transportation lea associates south asia pvt. ltd. leadership in consulting engineers planners engineering hl no 1-4-879/54/1a, street no. 8, p lannin g near vijaya bank, lower tank bund, gandhinagar, hyderabad-500080 a.p. india solutions tel 91-40-66747122 / 66747135 . mobile 91-91778 82230 email tsr@lasaindia.com ———— limmappagari@ymait.com _ alea group company website - www lasaindia.com


In [159]:
from spacy import displacy

In [160]:
#displacy.serve(doc,style='ent')

In [161]:
displacy.render(doc,style='ent')

### Tagging

In [162]:
docjson = doc.to_json()
docjson.keys()

dict_keys(['text', 'ents', 'tokens'])

In [163]:
doc_text = docjson['text']

In [164]:
datafram_tokens = pd.DataFrame(docjson['tokens'])
datafram_tokens['token'] = datafram_tokens[['start','end']].apply(
    lambda x:doc_text[x[0]:x[1]] , axis = 1)
datafram_tokens.head(10)

Unnamed: 0,id,start,end,token
0,0,0,2,dr
1,1,2,3,.
2,2,4,6,t.
3,3,7,9,s.
4,4,10,15,reddy
5,5,16,22,senior
6,6,23,33,consultant
7,7,34,41,traffic
8,8,42,56,transportation
9,9,57,60,lea


In [165]:
right_table = pd.DataFrame(docjson['ents'])[['start','label']]
datafram_tokens = pd.merge(datafram_tokens,right_table,how='left',on='start')

In [166]:
datafram_tokens.fillna('O',inplace=True)
datafram_tokens.head(10)

Unnamed: 0,id,start,end,token,label
0,0,0,2,dr,B-NAME
1,1,2,3,.,O
2,2,4,6,t.,I-NAME
3,3,7,9,s.,I-NAME
4,4,10,15,reddy,I-NAME
5,5,16,22,senior,B-DES
6,6,23,33,consultant,I-DES
7,7,34,41,traffic,O
8,8,42,56,transportation,O
9,9,57,60,lea,B-ORG


In [167]:
# join lable to df_clean dataframe
df_clean['end'] = df_clean['text'].apply(lambda x: len(x)+1).cumsum() - 1 
df_clean['start'] = df_clean[['text','end']].apply(lambda x: x[1] - len(x[0]),axis=1)

In [168]:
# inner join with start 
dataframe_info = pd.merge(df_clean,datafram_tokens[['start','token','label']],how='inner',on='start')

In [169]:
dataframe_info.tail(10)

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label
50,5,1,6,2,1,1,181,1238,161,76,0,————,363,359,—,B-WEB
51,5,1,6,2,1,2,1207,1241,666,67,9,limmappagari@ymait.com,386,364,limmappagari@ymait.com,B-EMAIL
52,5,1,6,3,1,1,49,1369,14,12,82,_,388,387,_,O
53,5,1,6,3,1,2,99,1302,181,48,92,alea,393,389,alea,O
54,5,1,6,3,1,3,303,1305,174,55,96,group,399,394,group,O
55,5,1,6,3,1,4,494,1317,255,45,96,company,407,400,company,O
56,5,1,6,3,1,5,1016,1322,203,54,93,website,415,408,website,O
57,5,1,6,3,1,6,1242,1338,6,34,56,-,417,416,-,O
58,5,1,6,3,1,7,1271,1298,106,93,44,www,421,418,www,B-WEB
59,5,1,6,3,1,8,1399,1323,361,53,44,lasaindia.com,435,422,lasaindia.com,I-WEB


### Bounding Box

In [170]:
bb_df = dataframe_info.query("label != 'O' ")
img = image.copy()

for x,y,w,h,label in bb_df[['left','top','width','height','label']].values:
    x = int(x)
    y = int(y)
    w = int(w)
    h = int(h)
    
    cv2.rectangle(img,(x,y),(x+w,y+h),(0,255,0),2)
    cv2.putText(img,str(label),(x,y),cv2.FONT_HERSHEY_PLAIN,1,(255,0,255),2)
    
    
cv2.imshow('Predictions',img)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [171]:
bb_df['label'] = bb_df['label'].apply(lambda x: x[2:])
bb_df.head()

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label
0,5,1,1,1,1,1,1015,116,105,65,93,dr.,3,0,dr,NAME
1,5,1,1,1,1,2,1153,116,59,65,86,t.,6,4,t.,NAME
2,5,1,1,1,1,3,1247,114,74,68,92,s.,9,7,s.,NAME
3,5,1,1,1,1,4,1357,115,256,83,96,reddy,15,10,reddy,NAME
4,5,1,2,1,1,1,1013,210,210,53,96,senior,22,16,senior,DES


In [172]:
# group the label
class groupgen():
    def __init__(self):
        self.id = 0
        self.text = ''
        
    def getgroup(self,text):
        if self.text == text:
            return self.id
        else:
            self.id +=1
            self.text = text
            return self.id
        
grp_gen = groupgen()

In [173]:
bb_df['group'] = bb_df['label'].apply(grp_gen.getgroup)

In [174]:
# right and bottom of bounding box
bb_df[['left','top','width','height']] = bb_df[['left','top','width','height']].astype(int)
bb_df['right'] = bb_df['left'] + bb_df['width']
bb_df['bottom'] = bb_df['top'] + bb_df['height']

In [175]:
# tagging: groupby group
col_group = ['left','top','right','bottom','label','token','group']
group_tag_img = bb_df[col_group].groupby(by='group')

In [176]:
img_tagging = group_tag_img.agg({
    
    'left':min,
    'right':max,
    'top':min,
    'bottom':max,
    'label':np.unique,
    'token':lambda x: " ".join(x)
    
})

In [177]:
img_tagging

Unnamed: 0_level_0,left,right,top,bottom,label,token
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1015,1613,114,198,NAME,dr t. s. reddy
2,1013,1599,210,263,DES,senior consultant
3,1015,2349,531,621,ORG,lea associates south asia pvt ltd
4,1121,1884,1001,1136,PHONE,91 66747135 91 82230
5,1206,1678,1160,1226,EMAIL,tsr@lasaindia.com
6,181,342,1238,1314,WEB,—
7,1207,1873,1241,1308,EMAIL,limmappagari@ymait.com
8,1271,1760,1298,1391,WEB,www lasaindia.com


In [179]:
img_bb = image.copy()
for l,r,t,b,label,token in img_tagging.values:
    cv2.rectangle(img_bb,(l,t),(r,b),(0,255,0),2)
    
    cv2.putText(img_bb,label,(l,t),cv2.FONT_HERSHEY_PLAIN,1,(255,0,255),2)
    
    
cv2.imshow('Bounding Box BusinessCard',img_bb)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [180]:
# parser
def parser(text,label):
    if label == 'PHONE':
        text = text.lower()
        text = re.sub(r'\D','',text)
        
    elif label == 'EMAIL':
        text = text.lower()
        allow_special_char = '@_.-'
        text = re.sub(r'[^{}A-Za-z ]'.format(allow_special_char),'',text)
        
    elif label in ('NAME','DES','ORG'):
        text = text.lower()
        text = re.sub(r'[^A-Za-z ]','',text)
        text = text.title()
        
    elif label == 'WEB':
        text = text.lower()
        allow_special_char = '/.:%-'
        text = re.sub(r'[^{}A-Za-z ]'.format(allow_special_char),'',text)
        
    return text

In [181]:

info_array = dataframe_info.query('label != "O"')[['token','label']].values

In [182]:
entities = dict(NAME=[],ORG=[],DES=[],PHONE=[],EMAIL=[],WEB=[])
previous = ''
for tok, lab in info_array:
    #print(tok,lab)
    bio_tag = lab[:1]
    label_tag = lab[2:]
    #print(label_tag)
    
    text = parser(tok,label_tag)
    
    
    
    if previous != label_tag:
        #print('yes')
        entities[label_tag].append(text)
        
    else:
        #print('no')
        if bio_tag == 'B':
            #print('bio =',bio_tag)
            entities[label_tag].append(text)
            
        else:
            #print('bio =',bio_tag)
            if label_tag in ('NAME','ORG','DES'):
                entities[label_tag][-1] = entities[label_tag][-1]+" "+text
                
            else:
                entities[label_tag][-1] = entities[label_tag][-1]+text
            
    #print('-'*19)
    previous = label_tag

In [183]:
entities

{'NAME': ['Dr T S Reddy'],
 'ORG': ['Lea Associates South Asia Pvt Ltd'],
 'DES': ['Senior Consultant'],
 'PHONE': ['9166747135', '9182230'],
 'EMAIL': ['tsr@lasaindia.com', 'limmappagari@ymait.com'],
 'WEB': ['', 'wwwlasaindia.com']}