In [7]:
import numpy as np
import pandas as pd
import cv2
import pytesseract
pytesseract.pytesseract.tesseract_cmd = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
from glob import glob
import spacy
import re
import string

In [2]:
def clean_text(txt):
    whitespace = string.whitespace
    punctuation = '!#$%&\'()*+:;<=>?[\\]^`{|}~'
    table_whitespace = str.maketrans('','',whitespace)
    table_punctuation = str.maketrans('','',punctuation)
    text = str(txt)
    text = text.lower()
    remove_whitespace = text.translate(table_whitespace)
    remove_punctuation = remove_whitespace.translate(table_punctuation)

    return str(remove_punctuation)

In [3]:
### Load NER model
model_ner = spacy.load('output/model-best/')

In [21]:
# Load image
image = cv2.imread('data/010.jpeg')

# cv2.imshow('businesscard',image)
# cv2.waitKey()
# cv2.destroyAllWindows()

# extract data using Pytesseract
tess_data = pytesseract.image_to_data(image)

# convert into dataframe
tess_list = list(map(lambda x:x.split('\t'), tess_data.split('\n')))
df = pd.DataFrame(tess_list[1:], columns=tess_list[0])
df.dropna(inplace=True)
df['text'] = df['text'].apply(clean_text)

# convert data into content
df_clean = df[df['text'] != '']
content = ' '.join([w for w in df_clean['text']])
print(content)

# get prediction from NER model
doc = model_ner(content)

dr. michael dean clarinet performing teaching and consulting 682.888.7639 clarinetmike.com clarinetmike.wordpress.com clarinetmiketexas@ yahoo.com


In [15]:
from spacy import displacy

In [22]:
displacy.serve(doc, style='ent')




Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [23]:
displacy.render(doc, style='ent')

# Tagging

In [25]:
doc_json = doc.to_json()
doc_json.keys()

dict_keys(['text', 'ents', 'tokens'])

In [26]:
doc_text = doc_json['text']

In [34]:
df_tokens = pd.DataFrame(doc_json['tokens'])
df_tokens['token'] = df_tokens[['start', 'end']].apply(lambda x:doc_text[x[0]:x[1]], axis=1)
df_tokens.head(10)

Unnamed: 0,id,start,end,token
0,0,0,2,dr
1,1,2,3,.
2,2,4,11,michael
3,3,12,16,dean
4,4,17,25,clarinet
5,5,26,36,performing
6,6,37,45,teaching
7,7,46,49,and
8,8,50,60,consulting
9,9,61,73,682.888.7639


In [40]:
right_table = pd.DataFrame(doc_json['ents'])[['start','label']]
df_tokens = pd.merge(df_tokens,right_table,how='left',on='start')

In [42]:
df_tokens.fillna('O', inplace=True)
df_tokens.head(10)

Unnamed: 0,id,start,end,token,label
0,0,0,2,dr,B-NAME
1,1,2,3,.,O
2,2,4,11,michael,I-NAME
3,3,12,16,dean,I-NAME
4,4,17,25,clarinet,O
5,5,26,36,performing,O
6,6,37,45,teaching,B-DES
7,7,46,49,and,I-DES
8,8,50,60,consulting,I-DES
9,9,61,73,682.888.7639,B-PHONE


In [50]:
df_tokens

Unnamed: 0,id,start,end,token,label
0,0,0,2,dr,B-NAME
1,1,2,3,.,O
2,2,4,11,michael,I-NAME
3,3,12,16,dean,I-NAME
4,4,17,25,clarinet,O
5,5,26,36,performing,O
6,6,37,45,teaching,B-DES
7,7,46,49,and,I-DES
8,8,50,60,consulting,I-DES
9,9,61,73,682.888.7639,B-PHONE


In [48]:
# join label to df_clean
df_clean['end'] = df_clean['text'].apply(lambda x: len(x)+1).cumsum() - 1  # end posotion
df_clean['start'] = df_clean[['text', 'end']].apply(lambda x: x[1] - len(x[0]), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['end'] = df_clean['text'].apply(lambda x: len(x)+1).cumsum() - 1  # end posotion
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['start'] = df_clean[['text', 'end']].apply(lambda x: x[1] - len(x[0]), axis=1)


In [58]:
# inner join with start
df_info = pd.merge(df_clean, df_tokens[['start', 'token', 'label']], how='inner', on='start')

In [60]:
df_info.tail()

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label
8,5,1,3,1,1,1,456,309,363,46,94.920631,682.888.7639,73,61,682.888.7639,B-PHONE
9,5,1,4,1,1,1,493,397,289,29,91.978172,clarinetmike.com,90,74,clarinetmike.com,B-WEB
10,5,1,4,1,2,1,402,439,471,38,91.383377,clarinetmike.wordpress.com,117,91,clarinetmike.wordpress.com,B-WEB
11,5,1,4,1,3,1,380,476,322,46,70.143196,clarinetmiketexas@,136,118,clarinetmiketexas@,B-EMAIL
12,5,1,4,1,3,2,717,480,178,38,96.741821,yahoo.com,146,137,yahoo.com,I-EMAIL


# Bounding Box

In [65]:
df_bb = df_info[df_info['label'] != 'O']
img = image.copy()

for x,y,w,h,l in df_bb[['left', 'top', 'width', 'height', 'label']].values:
    x = int(x)
    y = int(y)
    w = int(w)
    h = int(h)
    
    cv2.rectangle(img, (x,y), (x+w,y+h), (0,255,0), 2)
    cv2.putText(img, str(l), (x,y), cv2.FONT_HERSHEY_PLAIN, 1, (255,0,0),2)

cv2.imshow('Predictions',img)
cv2.waitKey()
cv2.destroyAllWindows

<function destroyAllWindows>

In [67]:
# group the label

df_bb['label'] = df_bb['label'].apply(lambda x: x[2:])
df_bb.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bb['label'] = df_bb['label'].apply(lambda x: x[2:])


Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label
0,5,1,1,1,1,1,339,88,102,56,95.660957,dr.,3,0,dr,NAME
1,5,1,1,1,1,2,468,86,270,58,95.660957,michael,11,4,michael,NAME
2,5,1,1,1,1,3,762,88,174,56,96.329689,dean,16,12,dean,NAME
5,5,1,2,1,2,1,429,235,153,38,96.882019,teaching,45,37,teaching,DES
6,5,1,2,1,2,2,595,235,58,29,96.704781,and,49,46,and,DES


In [68]:
# group the label

class groupgen():
    def __init__(self):
        self.id = 0
        self.text = ''
        
    def getgroup(self, text):
        if self.text == text:
            return self.id
        else:
            self.id += 1
            self.text = text
            return self.id
        
grp_gen = groupgen()

In [70]:
df_bb['group'] = df_bb['label'].apply(grp_gen.getgroup)
df_bb

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bb['group'] = df_bb['label'].apply(grp_gen.getgroup)


Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label,group
0,5,1,1,1,1,1,339,88,102,56,95.660957,dr.,3,0,dr,NAME,1
1,5,1,1,1,1,2,468,86,270,58,95.660957,michael,11,4,michael,NAME,1
2,5,1,1,1,1,3,762,88,174,56,96.329689,dean,16,12,dean,NAME,1
5,5,1,2,1,2,1,429,235,153,38,96.882019,teaching,45,37,teaching,DES,2
6,5,1,2,1,2,2,595,235,58,29,96.704781,and,49,46,and,DES,2
7,5,1,2,1,2,3,665,235,181,38,96.409622,consulting,60,50,consulting,DES,2
8,5,1,3,1,1,1,456,309,363,46,94.920631,682.888.7639,73,61,682.888.7639,PHONE,3
9,5,1,4,1,1,1,493,397,289,29,91.978172,clarinetmike.com,90,74,clarinetmike.com,WEB,4
10,5,1,4,1,2,1,402,439,471,38,91.383377,clarinetmike.wordpress.com,117,91,clarinetmike.wordpress.com,WEB,4
11,5,1,4,1,3,1,380,476,322,46,70.143196,clarinetmiketexas@,136,118,clarinetmiketexas@,EMAIL,5


In [71]:
# right and bottom of bounding box
df_bb[['left', 'top', 'width', 'height']] = df_bb[['left', 'top', 'width', 'height']].astype(int)
df_bb['right'] = df_bb['left'] + df_bb['width']
df_bb['bottom'] = df_bb['top'] + df_bb['height']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bb[['left', 'top', 'width', 'height']] = df_bb[['left', 'top', 'width', 'height']].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bb['right'] = df_bb['left'] + df_bb['width']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bb['bottom'] = df_bb['top'] + df_bb['height']


In [79]:
# tagging: groupby by group
# left: min, right:max, top:mim, right:max
col_group = ['left', 'top', 'right', 'bottom', 'label', 'token', 'group']
group_tag_img = df_bb[col_group].groupby(by='group')

In [80]:
img_tagging = group_tag_img.agg({
    
    'left':min,
    'right':max,
    'top':min,
    'bottom':max,
    'label':np.unique,
    'token':lambda x: ' '.join(x)

})

In [81]:
img_tagging

Unnamed: 0_level_0,left,right,top,bottom,label,token
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,339,936,86,144,[NAME],dr michael dean
2,429,846,235,273,[DES],teaching and consulting
3,456,819,309,355,[PHONE],682.888.7639
4,402,873,397,477,[WEB],clarinetmike.com clarinetmike.wordpress.com
5,380,895,476,522,[EMAIL],clarinetmiketexas@ yahoo.com


In [91]:
img_bb = image.copy()
for l,r,t,b,label,token in img_tagging.values:
    cv2.rectangle(img_bb,(l,t),(r,b),(0,255,0),2)
    
    cv2.putText(img_bb,str(label),(l,t),cv2.FONT_HERSHEY_PLAIN,1,(0,0,255),2)

cv2.imshow('Bounding box business card',img_bb)
cv2.waitKey()
cv2.destroyAllWindows()