In [29]:
import spacy
import string
import cv2 as cv
import pytesseract
import numpy as np
import pandas as pd

from spacy import displacy

import warnings
warnings.filterwarnings('ignore')

In [30]:
def cleanText(txt):
    """Normalise a single piece of text for the NER pipeline.

    The input is converted to ``str``, lower-cased, and then every ASCII
    whitespace character (``string.whitespace``) and every character of a
    fixed punctuation set is deleted. Characters that are not in that set
    (for example ``, - . / @ _`` and the double quote) are kept.

    Parameters
    ----------
    txt : any
        Value to clean; it is passed through ``str()`` first.

    Returns
    -------
    str
        The cleaned text, possibly empty.
    """
    # One deletion table covering both character classes.
    unwanted = string.whitespace + "!#$%&\'()*+:;<=>?[\\]^`{|}~"
    deletion_table = str.maketrans('', '', unwanted)
    return str(txt).lower().translate(deletion_table)

In [31]:
### Load NER model
# Loads a spaCy pipeline from a directory given relative to the notebook's
# working directory.
# NOTE(review): presumably the best checkpoint written by a training run — confirm.
model_ner = spacy.load('../output/model-best/')

In [55]:
# Load image
image_path = '../image/test-image/test-1.png'
image = cv.imread(image_path)
# cv.imread reports a missing or unreadable file by returning None, not by raising
if image is None:
    raise FileNotFoundError(f'Could not read image: {image_path}')
# extract data using Pytesseract (tab-separated text whose first line is the header)
# NOTE(review): the image is passed in OpenCV's channel order as before — confirm
# whether a BGR->RGB conversion is wanted for pytesseract.
tessData = pytesseract.image_to_data(image)
# convert into dataframe: first row holds the column names, the rest are records
tessList = [row.split('\t') for row in tessData.split('\n')]
df = (
    pd.DataFrame(tessList[1:], columns=tessList[0])
    # drop missing values
    .dropna()
    # normalise every OCR word with cleanText
    .assign(text=lambda frame: frame['text'].apply(cleanText))
)
# convert data into content: keep only rows whose cleaned text is non-empty.
# .copy() gives df_clean its own data so later cells can add columns to it
# without writing into a slice of df.
df_clean = df.query('text != "" ').copy()
content = " ".join(df_clean['text'])

In [56]:
# df[['text']][4:]

In [57]:
# get prediction from NER model: run the loaded pipeline over the OCR text.
# The resulting `doc` is rendered and converted to JSON in the cells below.
doc = model_ner(content)

In [58]:
# Visualise the predicted entities in the text using displaCy's 'ent' style
displacy.render(doc,style='ent')

In [59]:
# NOTE(review): identical to the render call in the previous cell — presumably a
# leftover duplicate; confirm and remove.
displacy.render(doc,style='ent')

### Add predictions to the image

In [60]:
# Serialise the spaCy Doc into a plain dict; the cells below load parts of it into pandas
docjson = doc.to_json()
# Show which top-level sections the dict contains
docjson.keys()

dict_keys(['text', 'ents', 'tokens'])

In [61]:
# Build one row per token from the serialised Doc
doc_text = docjson['text']
datafram_tokens = pd.DataFrame(docjson['tokens'])
# Recover each token's text by slicing the document with its character offsets.
# Iterating over the two columns avoids integer indexing (x[0], x[1]) on a
# label-indexed row, which pandas has deprecated as positional access.
datafram_tokens['token'] = [
    doc_text[start:end]
    for start, end in zip(datafram_tokens['start'], datafram_tokens['end'])
]
datafram_tokens.head()

Unnamed: 0,id,start,end,token
0,0,0,13,etk-10007010e
1,1,14,16,to
2,2,17,19,to
3,3,20,27,florida
4,4,28,31,may


In [62]:
# One row per predicted entity; only its start offset and label are needed.
# Passing `columns` keeps the frame well-formed (with 'start' and 'label')
# even when the model returns no entities at all.
right_table = pd.DataFrame(docjson['ents'], columns=['start', 'label'])
# keep the join key integer-typed even for an empty entity list
right_table = right_table.astype({'start': 'int64'})
# Attach the label to the token that begins at the same character offset.
# Dropping any existing 'label' column first makes the cell safe to re-run
# (otherwise a second merge would create label_x / label_y).
datafram_tokens = pd.merge(
    datafram_tokens.drop(columns='label', errors='ignore'),
    right_table,
    how='left',
    on='start',
)
# Tokens with no matching entity get the "outside" tag; only the label column is filled
datafram_tokens['label'] = datafram_tokens['label'].fillna('O')
datafram_tokens.head()

Unnamed: 0,id,start,end,token,label
0,0,0,13,etk-10007010e,B-CARRIER
1,1,14,16,to,O
2,2,17,19,to,O
3,3,20,27,florida,O
4,4,28,31,may,B-DATE


In [63]:
# join label to df_clean dataframe
# `content` is the words of df_clean joined by single spaces, so the running
# total of (word length + 1), minus 1, is each word's end offset in the text
# and end - length is its start offset.
word_lengths = df_clean['text'].str.len()
# assign() returns a new frame, so no column is written into a slice of `df`
df_clean = df_clean.assign(
    end=(word_lengths + 1).cumsum() - 1,
    start=lambda frame: frame['end'] - word_lengths,
)
# inner join with start: keeps the OCR words whose start offset matches a token start
dataframe_info = pd.merge(
    df_clean,
    datafram_tokens[['start','token','label']],
    how='inner',
    on='start',
)
dataframe_info.tail()

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label
29,5,1,4,1,5,1,409,905,58,56,56,©,133,132,©,O
30,5,1,4,1,7,1,389,1030,78,28,66,—,135,134,—,O
31,5,1,4,1,8,1,424,1092,21,79,40,oo,138,136,oo,O
32,5,1,4,1,11,1,409,1286,58,32,0,w,140,139,w,O
33,5,1,4,1,12,1,409,1324,58,56,43,0,142,141,0,O


### Draw bounding boxes

In [64]:
# Keep only the words whose label is something other than 'O'
bb_df = dataframe_info.loc[dataframe_info['label'] != 'O']
# Preview the box geometry together with the label
bb_df.loc[:, ['left','top','width','height','label']].head()

Unnamed: 0,left,top,width,height,label
0,2578,447,399,40,B-CARRIER
4,1335,964,106,52,B-DATE
5,1457,963,69,50,I-DATE
6,1544,963,109,41,I-DATE
7,1730,942,175,116,B-BAGGAGE


In [65]:
# Draw on a copy so the original `image` stays untouched
img = image.copy()
box_columns = ['left','top','width','height','label']
for left, top, width, height, label in bb_df[box_columns].itertuples(index=False, name=None):
    # the geometry columns come from parsed OCR text, so cast them to int
    x, y, w, h = int(left), int(top), int(width), int(height)

    # green box around the word, magenta label at its top-left corner
    cv.rectangle(img,(x,y),(x+w,y+h),(0,255,0),2)
    cv.putText(img,str(label),(x,y),cv.FONT_HERSHEY_PLAIN,1,(255,0,255),2)

# Scale the preview to fit inside an 840x870 box while keeping the aspect
# ratio; a fixed (840,870) resize would stretch any image of another shape.
max_w, max_h = 840, 870
img_h, img_w = img.shape[:2]
scale = min(max_w / img_w, max_h / img_h)
imS = cv.resize(img, (max(1, round(img_w * scale)), max(1, round(img_h * scale))))
try:
    cv.imshow('Predictions',imS)
    # block until a key is pressed in the preview window
    cv.waitKey(0)
finally:
    # always close the window, even if the wait is interrupted
    cv.destroyAllWindows()