In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import spacy
import cv2
import pytesseract

import os
from glob import glob
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

In [2]:
imgPaths = glob('./Selected/*.jpeg')

In [3]:
allBusinessCard = pd.DataFrame(columns=['id','text'])

for imgPath in tqdm(imgPaths,desc='BusinessCard'):
    _, filename = os.path.split(imgPath)
    #print(filename)

    #Extract Data & text
    image = cv2.imread(imgPath)
    data = pytesseract.image_to_data(image)
    dataList = list(map(lambda x: x.split('\t'),data.split('\n')))
    df = pd.DataFrame(data = dataList[1:],columns = dataList[0])
    df.dropna(inplace=True)
    df['conf'] = df['conf'].astype('float')

    #Filtering only the confidence >30
    useFulData = df.query('conf >= 30.0')

    #DataFrame
    businessCard = pd.DataFrame()
    businessCard['text'] = useFulData['text']
    businessCard['id'] = filename
    
    #Concatination
    allBusinessCard = pd.concat((allBusinessCard,businessCard))

BusinessCard: 100%|██████████████████████████████████████████████████████████████████| 293/293 [02:32<00:00,  1.92it/s]


In [4]:
#allBusinessCard.to_csv('Business_Card.csv',index=False)

In [5]:
#Labeling using BIO
data = pd.read_csv('./businessCard.csv')
data.head()

Unnamed: 0,id,text,tag
0,000.jpeg,,O
1,000.jpeg,.,O
2,000.jpeg,040-4852,B-PHONE
3,000.jpeg,8881,I-PHONE
4,000.jpeg,90309,B-PHONE


In [6]:
#Data Preprocessing
with open('businessCard.txt', mode='r', encoding='utf8',errors='ignore') as f:
    text = f.read()

In [7]:
data = list(map(lambda x: x.split('\t'),text.split('\n')))
df = pd.DataFrame(data = data[1:],columns= data[0])
df.head()

Unnamed: 0,id,text,tag
0,000.jpeg,,O
1,000.jpeg,.,O
2,000.jpeg,040-4852,B-PHONE
3,000.jpeg,"""8881,""",I-PHONE
4,000.jpeg,90309,B-PHONE


In [8]:
#Cleaning text
import string
whitespace = string.whitespace
punctuation = '!#$%\'()*+-:;<=>?[\\]^_`{|}~'

tableWhiteSpace = str.maketrans('','',whitespace)
tablePunctuation = str.maketrans('','',punctuation)

def cleanText(txt):
    text = str(txt)
    text = text.lower()
    removeWhiteSapce = text.translate(tableWhiteSpace)
    removePunctuation = removeWhiteSapce.translate(tablePunctuation)
    
    return str(removePunctuation)

In [9]:
df['text'] = df['text'].apply(cleanText)

In [10]:
dataClean = df.query("text != '' ")
dataClean.dropna(inplace=True)

In [11]:
dataClean.head(10)

Unnamed: 0,id,text,tag
1,000.jpeg,.,O
2,000.jpeg,0404852,B-PHONE
3,000.jpeg,"""8881,""",I-PHONE
4,000.jpeg,90309,B-PHONE
5,000.jpeg,52549,I-PHONE
6,000.jpeg,fi,O
7,000.jpeg,/laurelsoverseaseducation,O
8,000.jpeg,@,O
9,000.jpeg,laurels,B-ORG
10,000.jpeg,overseas,I-ORG


## #Convert the Data into Spacy Format

In [12]:

group = dataClean.groupby(by='id')

In [13]:
cards = group.groups.keys()

In [20]:
#card

In [14]:
allCardData = []
for card in cards:
    cardData = []
    grpArray = group.get_group(card)[['text','tag']].values
    content = ''
    annotations = {'entities':[]}

    start = 0
    end = 0

    for text, label in grpArray:
        text = str(text)
        stringLength = len(text) + 1

        start = end
        end = start + stringLength

        #Will not inclue the Output
        if label != 'O':
            annot = (start, end-1,label)
            annotations['entities'].append(annot)

        content = content + text + ' '
    cardData = (content,annotations)
    allCardData.append(cardData)

In [15]:
allCardData

[('. 0404852 "8881," 90309 52549 fi /laurelsoverseaseducation @ laurels overseas educational consultancy pvt. ltd. sea u.k australia canada ireland www.laurelseducation.com info@laurelseducation.com ',
  {'entities': [(2, 9, 'B-PHONE'),
    (10, 17, 'I-PHONE'),
    (18, 23, 'B-PHONE'),
    (24, 29, 'I-PHONE'),
    (61, 68, 'B-ORG'),
    (69, 77, 'I-ORG'),
    (78, 89, 'I-ORG'),
    (90, 101, 'I-ORG'),
    (102, 106, 'I-ORG'),
    (107, 111, 'I-ORG'),
    (145, 169, 'B-WEB'),
    (170, 195, 'B-EMAIL')]}),
 ('john smith marketing manager web www.psdgraphics.com phone 1234567890 mail email@psdgraphics.com ',
  {'entities': [(0, 4, 'B-NAME'),
    (5, 10, 'I-NAME'),
    (11, 20, 'B-DES'),
    (21, 28, 'I-DES'),
    (33, 52, 'B-WEB'),
    (59, 69, 'B-PHONE'),
    (75, 96, 'B-EMAIL')]}),
 ('sau 0 98489 24441 dy "08672," 224441 /enkateswapa & wie ',
  {'entities': [(6, 11, 'B-PHONE'), (12, 17, 'I-PHONE'), (37, 49, 'B-ORG')]}),
 ('prasad @ "9,96,31,73,53,59,49,04,00,000" i flex design album des

In [16]:
#allCardData

In [17]:
#Split the Data into Training and Testing
import random
random.shuffle(allCardData)

In [18]:
len(allCardData)

267

## Splitting The Data into Train & Test Data

In [19]:
trainData = allCardData[:240]
testData = allCardData[240:]

In [20]:
import pickle
pickle.dump(trainData,open('./data/trainData.pickle',mode='wb'))
pickle.dump(testData,open('./data/testData.pickle',mode='wb'))