In [46]:
import numpy as np
import pandas as pd
import cv2
import pytesseract
from glob import glob
import spacy
from spacy import displacy
import re
import string
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): a blanket "ignore" filter may also hide pandas deprecation and
# SettingWithCopy warnings emitted by later cells — consider narrowing it.
warnings.filterwarnings("ignore")

# Load the spaCy pipeline stored under ./output/model-best; later cells call it
# on the OCR text and read the predicted entities from its output.
model_ner = spacy.load("./output/model-best")

In [47]:
def cleanText(text):
    """Normalise a piece of OCR text for downstream matching.

    The value is converted to ``str``, lower-cased, and then every whitespace
    character and every character of the punctuation set below is deleted.
    Characters outside that set (for example ``.``, ``,``, ``@``, ``-``, ``_``
    and ``/``) are kept.

    Args:
        text: Value to clean; anything accepted by ``str()``.

    Returns:
        str: The cleaned, lower-cased text (possibly empty).
    """
    # A single deletion table removes whitespace and punctuation in one pass;
    # deleting both sets at once is equivalent to deleting them one after another.
    unwanted = string.whitespace + '!#$%&\'()*+:;<=>?[\\]^`{|}~'
    return str(text).lower().translate(str.maketrans('', '', unwanted))

In [48]:
class groupgen:
    """Hands out integer ids to consecutive runs of equal values.

    Each call to ``getgroup`` compares its argument with the previous one:
    an equal value reuses the current id, a different value gets the next id.
    """

    def __init__(self):
        # Id returned for the current run; incremented whenever the value changes.
        self.id = 0
        # Value seen on the previous call (starts as the placeholder 0).
        self.text = 0

    def getgroup(self, text):
        """Return the group id for ``text``.

        Args:
            text: Value to classify; compared with ``==`` against the value
                from the previous call.

        Returns:
            int: The current id if ``text`` equals the previous value,
            otherwise the freshly incremented id.
        """
        if self.text == text:
            return self.id

        # A new run begins: remember the value and advance the counter.
        self.text = text
        self.id += 1
        return self.id


# Shared generator instance used by later cells.
grp_gen = groupgen()

In [49]:
# Load the input image. cv2.imread signals failure by returning None instead of
# raising, so check explicitly and fail here with a clear message.
image_path = "./data/6.jpeg"
image = cv2.imread(image_path)
if image is None:
    raise FileNotFoundError(
        "cv2.imread could not load {!r} (missing or unreadable file)".format(image_path)
    )

# Run Tesseract; the result is tab-separated text whose first line is the header.
tessdata = pytesseract.image_to_data(image)
dataList = [row.split("\t") for row in tessdata.split("\n")]

# Build the word-level frame without in-place mutation so the cell is idempotent:
# drop incomplete rows, then normalise the recognised text.
dataFrame = (
    pd.DataFrame(dataList[1:], columns=dataList[0])
    .dropna()
    .assign(text=lambda frame: frame["text"].apply(cleanText))
)

# Keep only rows with non-empty text. The explicit copy makes dataClean an
# independent frame, so columns can be added to it later without pandas
# treating it as a slice of dataFrame.
dataClean = dataFrame.query("text != ''").copy()

# Feed the space-joined words to the NER model.
content = " ".join(dataClean["text"])
doc = model_ner(content)

In [50]:
# Convert the model output to plain Python data (document text, tokens, entities).
doc_json = doc.to_json()
doc_text = doc_json["text"]

# One row per token; recover each token's text from its character offsets.
# Iterating the two columns directly avoids integer-position lookups on a
# label-indexed row Series, which newer pandas versions deprecate.
dataFrame_token = pd.DataFrame(doc_json["tokens"])
dataFrame_token["token"] = [
    doc_text[start:end]
    for start, end in zip(dataFrame_token["start"], dataFrame_token["end"])
]

# Entity start offset -> label. Passing `columns` keeps the frame well-formed
# (and mergeable) even when the model predicted no entities at all; the cast
# keeps the join key an integer in that empty case.
right_table = pd.DataFrame(doc_json["ents"], columns=["start", "label"]).astype(
    {"start": "int64"}
)

# Left-join so every token is kept; tokens without an entity get the label "O".
dataFrame_token = pd.merge(
    dataFrame_token, right_table, how="left", on="start"
).fillna("O")

In [51]:
# Character offsets of every OCR word inside the space-joined `content` string:
# each word occupies len(word) characters followed by one separator.
# Computed with vectorised string lengths (no row-wise apply with positional
# indexing) and attached via assign, which returns a new frame instead of
# writing into a query() result.
word_lengths = dataClean["text"].str.len()
word_ends = (word_lengths + 1).cumsum() - 1
dataClean = dataClean.assign(end=word_ends, start=word_ends - word_lengths)

In [52]:
# Attach the model's token text and label to each OCR word whose start offset
# coincides with a token start; words without a matching token are dropped.
token_labels = dataFrame_token[["start", "token", "label"]]
dataFrame_info = dataClean.merge(token_labels, how="inner", on="start")

In [None]:
# Keep only the words that were tagged as part of an entity. The copy makes
# this an independent frame, so the column assignments further down do not
# write into a slice of dataFrame_info.
boundingBoxFrame = dataFrame_info.query("label != 'O'").copy()

# Draw on a copy so the original image stays untouched.
img = image.copy()

box_columns = ["left", "top", "width", "height"]
for x, y, w, h, label in boundingBoxFrame[box_columns + ["label"]].values:
    # The geometry columns come from split OCR text, so convert to plain ints
    # before handing them to OpenCV.
    x, y, w, h = int(x), int(y), int(w), int(h)

    # Box around the word and the full tag drawn at its top-left corner.
    cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 3)
    cv2.putText(img, str(label), (x, y), cv2.FONT_HERSHEY_PLAIN, 1, (0, 0, 255), 2)

# Show the annotated image in a native window and wait for a key press.
cv2.imshow("Predictions", img)
cv2.waitKey(0)
cv2.destroyAllWindows()

# Drop the two-character tag prefix (e.g. "B-") so only the entity name remains.
boundingBoxFrame["label"] = boundingBoxFrame["label"].str[2:]

# Number consecutive runs of the same label.
# NOTE(review): grp_gen keeps its counter between calls, so re-running this
# cell continues numbering from the previous run instead of restarting at 1.
boundingBoxFrame["group"] = boundingBoxFrame["label"].apply(grp_gen.getgroup)

# Numeric geometry plus derived right/bottom edges.
boundingBoxFrame[box_columns] = boundingBoxFrame[box_columns].astype(int)
boundingBoxFrame["right"] = boundingBoxFrame["left"] + boundingBoxFrame["width"]
boundingBoxFrame["bottom"] = boundingBoxFrame["top"] + boundingBoxFrame["height"]
boundingBoxFrame

In [53]:
def parser(text, label):
    """Normalise the text of one token according to its entity label.

    Args:
        text (str): Token text to clean.
        label (str): Entity name without the BIO prefix, e.g. ``"PHONE"``.

    Returns:
        str: The cleaned text:

        * ``PHONE`` - digits only.
        * ``EMAIL`` - lower-cased; only letters, digits and ``@ _ . -`` kept.
        * ``WEB`` - lower-cased; only letters, digits and ``_ . - : / % #`` kept.
        * ``NAME`` / ``DES`` - only letters and spaces kept, then title-cased.
        * ``ORG`` - only letters, digits and spaces kept, then title-cased.
        * any other label - ``text`` returned unchanged.
    """
    # All patterns are raw string literals: the previous non-raw "\-" was an
    # invalid escape sequence, which newer Python versions warn about.
    if label == "PHONE":
        return re.sub(r"\D", "", text.lower())

    if label == "EMAIL":
        return re.sub(r"[^A-Za-z0-9@_.\-]", "", text.lower())

    if label == "WEB":
        return re.sub(r"[^A-Za-z0-9_.\-:/%#]", "", text.lower())

    if label in ("NAME", "DES"):
        return re.sub(r"[^a-z ]", "", text.lower()).title()

    if label == "ORG":
        return re.sub(r"[^a-z0-9 ]", "", text.lower()).title()

    # Unknown or empty label: leave the text untouched.
    return text

In [54]:
# (token, label) pairs in reading order, and one result list per entity type.
info_array = dataFrame_info[["token", "label"]].values
entities = dict(NAME=[], ORG=[], DES=[], PHONE=[], EMAIL=[], WEB=[])
previous = "O"

for token, label in info_array:
    # A label is either "O" or "<B|I>-<ENTITY>": split it into tag and entity name.
    bio_tag, label_tag = label[0], label[2:]
    text = parser(token, label_tag)

    if bio_tag in ("B", "I"):
        # A new entry starts when the entity type changes or on an explicit B tag;
        # otherwise the token continues the most recent entry of that type.
        starts_new_entity = previous != label_tag or bio_tag == "B"
        if starts_new_entity:
            entities[label_tag].append(text)
        else:
            # Multi-word fields are joined with a space, the rest are concatenated.
            separator = " " if label_tag in ("NAME", "ORG", "DES") else ""
            entities[label_tag][-1] = entities[label_tag][-1] + separator + text

    previous = label_tag

In [55]:
# Display the extracted entities, grouped by label.
entities

{'NAME': ['Thathineni Srikanth'],
 'ORG': ['Life Insurance Corporation Of India',
  'Life Insurance Corporation Of India'],
 'DES': ['Insurance Advisor'],
 'PHONE': ['8099948528', '8466045457'],
 'EMAIL': ['lictsrikant@gmail.com', 'seosrikantht@gmail.com'],
 'WEB': ['interviewsinhyderabad.blogspot.in']}