In [29]:
import spacy
import string
import cv2 as cv
import pytesseract
import numpy as np
import pandas as pd

from spacy import displacy

import warnings
warnings.filterwarnings('ignore')

In [30]:
def cleanText(txt):
    """Normalize a raw OCR token before it is fed to the NER model.

    The value is coerced to ``str``, lower-cased, and then stripped of every
    whitespace character and of the punctuation characters listed in the
    deletion set below. Characters that are not in that set (for example
    the hyphen, full stop, comma, slash, at-sign and underscore) are kept.

    Parameters
    ----------
    txt : any
        Value to clean; it is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned token, which may be an empty string.
    """
    unwanted = string.whitespace + "!#$%&\'()*+:;<=>?[\\]^`{|}~"
    # One deletion table removes whitespace and punctuation in a single pass.
    return str(txt).lower().translate(str.maketrans('', '', unwanted))

In [31]:
### Load NER model
# Load the spaCy pipeline stored under ../output/model-best/ (relative to the
# notebook's working directory); it is called on the OCR text further down.
model_ner = spacy.load('../output/model-best/')

In [77]:
# Load image
IMAGE_PATH = '../image/test-image/test-2.png'
image = cv.imread(IMAGE_PATH)
# cv.imread returns None (it does not raise) when the file is missing or
# unreadable, so fail here with a clear message instead of inside pytesseract.
if image is None:
    raise FileNotFoundError(f"Could not read image: {IMAGE_PATH}")
# Extract word-level OCR data (tab-separated text) using pytesseract
tessData = pytesseract.image_to_data(image)
# Convert into a dataframe: the first TSV row is the header
tessList = list(map(lambda x:x.split('\t'), tessData.split('\n')))
# Drop incomplete rows and normalize the OCR text. Built as one chain so
# re-running the cell never mutates a previously displayed frame in place.
df = (
    pd.DataFrame(tessList[1:], columns=tessList[0])
    .dropna()
    .assign(text=lambda frame: frame['text'].apply(cleanText))
)
# Keep only non-empty tokens. The explicit copy makes df_clean an independent
# frame, so columns can be added to it later without SettingWithCopyWarning.
df_clean = df.query('text != "" ').copy()
# Convert data into content: cleaned tokens joined by single spaces
content = " ".join(df_clean['text'])

In [78]:
# df[['text']][4:]

In [79]:
# get prediction from NER model
# `content` is the space-joined, cleaned OCR text built above.
doc = model_ner(content)

In [80]:
# Visualize the entities predicted in `doc` using displaCy's 'ent' style.
displacy.render(doc,style='ent')

In [81]:
# NOTE(review): same render call as the preceding cell — looks like a leftover
# duplicate; confirm whether this cell is still needed.
displacy.render(doc,style='ent')

### Add prediction on images

In [82]:
# Serialize the Doc to a plain dict; its 'text', 'tokens' and 'ents' entries
# are used below to tabulate token offsets and entity labels.
docjson = doc.to_json()
docjson.keys()

dict_keys(['text', 'ents', 'tokens'])

In [83]:
# Build one row per spaCy token and recover each token's surface text from
# its character offsets into the document text.
doc_text = docjson['text']
datafram_tokens = pd.DataFrame(docjson['tokens'])
# Slice using the two offset columns directly. A row-wise apply that indexes
# each row as x[0]/x[1] relies on integer keys being treated as positions on a
# label-indexed Series, which pandas 2.x deprecates.
datafram_tokens['token'] = [
    doc_text[start:end]
    for start, end in zip(datafram_tokens['start'], datafram_tokens['end'])
]
datafram_tokens.head()

Unnamed: 0,id,start,end,token
0,0,0,13,etk-30207752e
1,1,14,17,san
2,2,18,27,francisco
3,3,28,30,to
4,4,31,38,chicago


In [84]:
# Entity start offsets and labels. Naming the columns keeps this working when
# the model predicts no entities: an empty list would otherwise give a frame
# without 'start'/'label' columns and raise KeyError on selection.
right_table = (
    pd.DataFrame(docjson['ents'], columns=['start','end','label'])[['start','label']]
    .astype({'start': 'int64'})  # keep the join key integer even when empty
)
# Drop any label column left by an earlier run so that re-executing this cell
# does not produce label_x / label_y.
datafram_tokens = pd.merge(
    datafram_tokens.drop(columns='label', errors='ignore'),
    right_table, how='left', on='start')
# Tokens with no entity starting at their offset get the outside tag 'O'.
datafram_tokens['label'] = datafram_tokens['label'].fillna('O')
datafram_tokens.head()

Unnamed: 0,id,start,end,token,label
0,0,0,13,etk-30207752e,B-TNUMBER
1,1,14,17,san,B-FDEPART
2,2,18,27,francisco,I-FDEPART
3,3,28,30,to,O
4,4,31,38,chicago,B-SFARRIVAL


In [85]:
# Join labels onto the OCR words in df_clean.
# `content` was built by joining the cleaned words with single spaces, so each
# word's character span inside `content` follows from the running word lengths.
word_len = df_clean['text'].str.len()
word_end = (word_len + 1).cumsum() - 1
# assign() returns a new frame, so this neither writes into a possible slice
# of `df` (SettingWithCopyWarning) nor depends on positional x[0]/x[1] lookups
# on label-indexed rows, which pandas 2.x deprecates.
df_clean = df_clean.assign(end=word_end, start=word_end - word_len)
# Inner join on the start offset: OCR words with no token beginning at the
# same offset are dropped from the result.
dataframe_info = pd.merge(df_clean,datafram_tokens[['start','token','label']],how='inner',on='start')
dataframe_info.tail()

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label
26,5,1,5,1,5,1,409,844,58,56,56,©,137,136,©,O
27,5,1,5,1,7,1,389,969,78,28,66,—,139,138,—,O
28,5,1,5,1,8,1,424,1031,21,79,40,oo,142,140,oo,O
29,5,1,5,1,11,1,409,1225,58,32,0,w,144,143,w,O
30,5,1,5,1,12,1,409,1263,58,56,43,0,146,145,0,O


### Add bounding box

In [86]:
# Keep only the words that carry an entity label (anything other than 'O').
bb_df = dataframe_info[dataframe_info['label'] != 'O']
# Preview the box geometry next to its label.
bb_df.loc[:, ['left','top','width','height','label']].head()

Unnamed: 0,left,top,width,height,label
0,2578,386,399,40,B-TNUMBER
1,2557,630,83,41,B-FDEPART
2,2656,629,224,42,I-FDEPART
4,653,901,190,54,B-SFARRIVAL
5,1309,901,122,54,B-DATE


In [87]:
# Draw every labelled word's bounding box and tag on a copy of the image,
# leaving the original `image` untouched.
img = image.copy()
for x,y,w,h,label in bb_df[['left','top','width','height','label']].values:
    # Box values originate from the tab-split OCR output, so cast to int
    x, y, w, h = int(x), int(y), int(w), int(h)

    cv.rectangle(img,(x,y),(x+w,y+h),(0,255,0),2)
    cv.putText(img,str(label),(x,y),cv.FONT_HERSHEY_PLAIN,1,(255,0,255),2)

# Scale to fit inside an 840x870 window while preserving the aspect ratio;
# resizing straight to (840, 870) stretches the image and its boxes.
max_w, max_h = 840, 870
img_h, img_w = img.shape[:2]
scale = min(max_w / img_w, max_h / img_h)
imS = cv.resize(img, (max(1, round(img_w * scale)), max(1, round(img_h * scale))))
cv.imshow('Predictions',imS)
# Block until a key is pressed, then close the preview window
cv.waitKey(0)
cv.destroyAllWindows()