In [1]:
import numpy as np
import pandas as pd
import spacy
import PIL
import cv2
import re
import pytesseract

In [2]:
#Cleaning text
import string


def cleanText(txt):
    whitespace = string.whitespace
    punctuation = '!#$%\'()*+-:;<=>?[\\]^_`{|}~'

    tableWhiteSpace = str.maketrans('','',whitespace)
    tablePunctuation = str.maketrans('','',punctuation)
    text = str(txt)
    text = text.lower()
    removeWhiteSapce = text.translate(tableWhiteSpace)
    removePunctuation = removeWhiteSapce.translate(tablePunctuation)
    
    return str(removePunctuation)

In [3]:
#Load NER model
model_ner = spacy.load('./output/model-best/')



In [4]:
#Load Image
image = cv2.imread('./data/041.jpeg')
cv2.imshow('business_card',image)
cv2.waitKey(0)
cv2.destroyAllWindows()

#Extract the Data using Pytesseract
text_data = pytesseract.image_to_data(image)
#print(text_data)

#Convert the DataFrame
dataList = list(map(lambda x: x.split('\t'),text_data.split('\n')))
df = pd.DataFrame(data = dataList[1:],columns = dataList[0])
df.dropna(inplace=True) #Drop Missing Values
df['text'] = df['text'].apply(cleanText)


#convert data into content
df_clean = df.query('text != ""')
content = " ".join([w for w in df_clean['text']])


#Get prediction from the NER Model



In [5]:
content

'cell 8099948528 g2 8466045457 email lictsrikant@gmail.com life insurance corporation of india seosrika ntht@gmail “com thathineni srikanth insurance advisor agent code no. 0316164y life insurance corporation of india br. off. lic office, trimulgherry, sec’bad 500 016. add. borabanda, hyderabad 500 018. lictsrikant8099948528.blogspot.in, interviewsinhyderabad.blogspot.in facebook.com/lictsrikant8099948528, facebook.com/thathineni.srikanth.9 promote your business online pybo'

In [6]:
doc = model_ner(content)

In [7]:
from spacy import displacy

In [8]:
colors = {'B-PHONE':'linear-gradient(45deg,orange,red)','B-EMAIL':'linear-gradient(45deg,green,yellow)'}
options={'colors':colors}
displacy.render(doc,style='ent',jupyter=True,options=options)

In [136]:
#Tagging
doc_json = doc.to_json()

In [137]:
#doc_json

In [138]:
data_tokens = pd.DataFrame(doc_json['tokens'])
doc_text = doc_json['text']

In [139]:
data_tokens['word'] = data_tokens[['start','end']].apply(
lambda x: doc_text[x[0]:x[1]],axis=1)

In [140]:
data_tokens.head()

Unnamed: 0,id,start,end,word
0,0,0,4,cell
1,1,5,15,8099948528
2,2,16,18,g2
3,3,19,29,8466045457
4,4,30,35,email


In [141]:
right_dataLabel = pd.DataFrame(doc_json['ents'])[['start','label']]
data_token = pd.merge(data_tokens,right_dataLabel,how='left',on='start')
data_token['label'].fillna('O',inplace=True)

In [142]:
data_token.head(20)

Unnamed: 0,id,start,end,word,label
0,0,0,4,cell,O
1,1,5,15,8099948528,B-PHONE
2,2,16,18,g2,O
3,3,19,29,8466045457,B-PHONE
4,4,30,35,email,O
5,5,36,57,lictsrikant@gmail.com,B-EMAIL
6,6,58,62,life,B-ORG
7,7,63,72,insurance,I-ORG
8,8,73,84,corporation,I-ORG
9,9,85,87,of,I-ORG


In [143]:
#Join Table to df_clean DataFrame
df_clean['end'] = df_clean['text'].apply(lambda x: len(x)+1).cumsum() -1
df_clean['start'] = df_clean[['text','end']].apply(lambda x: x[1] - len(x[0]),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['end'] = df_clean['text'].apply(lambda x: len(x)+1).cumsum() -1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['start'] = df_clean[['text','end']].apply(lambda x: x[1] - len(x[0]),axis=1)


In [144]:
df_clean.head()

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start
12,5,1,3,1,1,1,722,53,64,28,91.176964,cell,4,0
14,5,1,3,1,1,3,822,53,203,28,96.606537,8099948528,15,5
17,5,1,3,2,1,1,55,55,85,89,46.902012,g2,18,16
18,5,1,3,2,1,2,822,95,203,28,96.438461,8466045457,29,19
20,5,1,3,2,2,1,593,136,93,25,90.471046,email,35,30


In [145]:
#Inner Join
dataframe_info = pd.merge(df_clean,data_token[['start','word','label']],how='inner',on='start')

In [146]:
dataframe_info.tail()

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,word,label
44,5,1,6,3,4,1,46,571,106,21,96.386932,promote,451,444,promote,O
45,5,1,6,3,4,2,161,576,56,22,96.813004,your,456,452,your,O
46,5,1,6,3,4,3,226,571,111,21,96.439957,business,465,457,business,O
47,5,1,6,3,4,4,347,571,74,21,93.266945,online,472,466,online,O
48,5,1,6,3,4,5,432,571,97,27,92.782974,pybo,477,473,pybo,O


### Drawing the Bounding Box

In [147]:
bb_df = dataframe_info.query("label != 'O' ")
img = image.copy()

for x,y, w,h,label in bb_df[['left','top','width','height','label']].values:
    x = int(x)
    y = int(y)
    w = int(w)
    h = int(h)
    
    cv2.rectangle(img,(x,y),(x+w,y+h),(0,255,0),2)
    cv2.putText(img,str(label),(x,y),cv2.FONT_HERSHEY_PLAIN,1,(255,0,0),2)
    
cv2.imshow('Predictions',img)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [148]:
bb_df['label'] = bb_df['label'].apply(lambda x: x[2:])
bb_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bb_df['label'] = bb_df['label'].apply(lambda x: x[2:])


Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,word,label
1,5,1,3,1,1,3,822,53,203,28,96.606537,8099948528,15,5,8099948528,PHONE
3,5,1,3,2,1,2,822,95,203,28,96.438461,8466045457,29,19,8466045457,PHONE
5,5,1,3,2,2,3,709,136,316,31,88.662399,lictsrikant@gmail.com,57,36,lictsrikant@gmail.com,EMAIL
6,5,1,3,3,1,1,46,170,33,14,96.542465,life,62,58,life,ORG
7,5,1,3,3,1,2,85,151,92,42,92.259232,insurance,72,63,insurance,ORG


In [149]:
#Grouping the Labels
class Groupgen():
    def __init__(self):
        self.id = 0
        self.text = ''
    def getGroup(self,text):
        if self.text == text:
            return self.id
        else:
            self.id = self.id + 1
            self.text = text
            return self.id

grp_gen = Groupgen()

In [150]:
bb_df['group'] = bb_df['label'].apply(grp_gen.getGroup)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bb_df['group'] = bb_df['label'].apply(grp_gen.getGroup)


In [151]:
#Right and Bottom of Bounding Box
bb_df[['left','top','width','height']] = bb_df[['left','top','width','height']].astype('int')
bb_df['right'] = bb_df['left'] + bb_df['width']
bb_df['bottom'] = bb_df['top'] + bb_df['height']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bb_df['right'] = bb_df['left'] + bb_df['width']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bb_df['bottom'] = bb_df['top'] + bb_df['height']


In [152]:
bb_df.head()

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,word,label,group,right,bottom
1,5,1,3,1,1,3,822,53,203,28,96.606537,8099948528,15,5,8099948528,PHONE,1,1025,81
3,5,1,3,2,1,2,822,95,203,28,96.438461,8466045457,29,19,8466045457,PHONE,1,1025,123
5,5,1,3,2,2,3,709,136,316,31,88.662399,lictsrikant@gmail.com,57,36,lictsrikant@gmail.com,EMAIL,2,1025,167
6,5,1,3,3,1,1,46,170,33,14,96.542465,life,62,58,life,ORG,3,79,184
7,5,1,3,3,1,2,85,151,92,42,92.259232,insurance,72,63,insurance,ORG,3,177,193


In [153]:
#tagging
#Top - Minimum
#Left - Minimum
#Right - Maximum
#Bottom - Maximum
col_grp = ['left','top','right','bottom','label','word','group']
grp_tag_img = bb_df[col_grp].groupby(by = 'group')

In [154]:
img_tagging = grp_tag_img.agg({
    'left':min,
    'right':max,
    'top':min,
    'bottom':max,
    'label':np.unique,
    'word':lambda x: " ".join(x)
})

In [155]:
img_tagging

Unnamed: 0_level_0,left,right,top,bottom,label,word
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,822,1025,53,123,PHONE,8099948528 8466045457
2,709,1025,136,167,EMAIL,lictsrikant@gmail.com
3,46,374,151,193,ORG,life insurance corporation of india
4,667,1025,165,209,EMAIL,seosrika ntht@gmail “
5,310,755,228,259,NAME,thathineni srikanth
6,400,669,271,296,DES,insurance advisor
7,47,882,395,427,ORG,life insurance corporation of india
8,46,917,506,533,WEB,lictsrikant8099948528.blogspot.in interviewsin...


In [158]:
img_bb = image.copy()
for l,r,t,b,lbl, word in img_tagging.values:
    cv2.rectangle(img_bb,(l,t),(r,b),(0,0,200),2)
    cv2.putText(img_bb,lbl,(l,t),cv2.FONT_HERSHEY_PLAIN,1,(255,0,0),2)
    
cv2.imshow('Predictions-BB',img_bb)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [159]:
#Parser
import re
def parser(text,label):
    if label == 'PHONE':
        text = text.lower()
        text = re.sub(r'\D','',text)
    
    elif label == 'EMAIL':
        text = text.lower()
        #text = re.findall('\S+@\S+',text)[0]
        allow_special_chars = '@_.\-'
        text = re.sub(r'[^A-Za-z0-9{} ]'.format(allow_special_chars),'',text)
        
    elif label == 'WEB':
        text = text.lower()
        allow_special_chars = ':/.'
        text = re.sub(r'[^A-Za-z0-9{} ]'.format(allow_special_chars),'',text)
    
    elif label in('NAME','DES'):
        text = text.lower()
        text = re.sub(r'[^A-Za-z]','',text)
        text = text.title()
        
    elif label == 'ORG':
        text = text.lower()
        text = re.sub(r'[^A-Za-z0-9]','',text)
        text = text.title()
    return text

In [160]:
parser('Sayantan gsayantan1999@gmail.com','EMAIL')

'sayantan gsayantan1999@gmail.com'

In [161]:
#info_array

In [162]:
#Entities
info_array = dataframe_info[['word','label']].values
entities = dict(NAME=[],ORG=[],DES=[],EMAIL=[],PHONE=[],WEB=[])



previous = 'O'
for word,label in info_array:
    #print(word, label)
    bio_tag = label[0]
    label_tag = label[2:]
    #print(bio_tag, label_tag)
    #Step 1: Parse the word/token
    text = parser(word,label_tag)
    if bio_tag in ('B','I'):
        
        #If the word label doesn't match insert into entities
        if previous != label_tag:
            entities[label_tag].append(text)
        #If it is sname check for the inside properties 
        else:
            if bio_tag == "B":
                entities[label_tag].append(text)
            else:
                entities[label_tag][-1] = entities[label_tag][-1] + " " + text
                
    previous = label_tag

In [163]:
entities

{'NAME': ['Thathineni Srikanth'],
 'ORG': ['Life Insurance Corporation Of India',
  'Life Insurance Corporation Of India'],
 'DES': ['Insurance Advisor'],
 'EMAIL': ['lictsrikant@gmail.com', 'seosrika ntht@gmail '],
 'PHONE': ['8099948528', '8466045457'],
 'WEB': ['lictsrikant8099948528.blogspot.in',
  'interviewsinhyderabad.blogspot.in']}