In [1]:
import numpy as np
import pandas as pd
import string
import re

In [3]:
with open('businessCard.txt', mode='r', encoding='utf8', errors='ignore') as f:
    text = f.read()

In [9]:
data = list(map(lambda x : x.split('\t'), text.split('\n')))

In [10]:
data

[['id', 'text', 'tag'],
 ['000.jpeg', ' ', 'O'],
 ['000.jpeg', '.', 'O'],
 ['000.jpeg', '040-4852', 'B-PHONE'],
 ['000.jpeg', '"8881,"', 'I-PHONE'],
 ['000.jpeg', '90309', 'B-PHONE'],
 ['000.jpeg', '52549', 'I-PHONE'],
 ['000.jpeg', 'Fi', 'O'],
 ['000.jpeg', '/laurelsoverseaseducation', 'O'],
 ['000.jpeg', '@:', 'O'],
 ['000.jpeg', 'LAURELS', 'B-ORG'],
 ['000.jpeg', 'OVERSEAS', 'I-ORG'],
 ['000.jpeg', 'EDUCATIONAL', 'I-ORG'],
 ['000.jpeg', 'CONSULTANCY', 'I-ORG'],
 ['000.jpeg', 'PVT.', 'I-ORG'],
 ['000.jpeg', 'LTD.', 'I-ORG'],
 ['000.jpeg', 'Sea', 'O'],
 ['000.jpeg', '|', 'O'],
 ['000.jpeg', 'U.K', 'O'],
 ['000.jpeg', 'AUSTRALIA', 'O'],
 ['000.jpeg', 'CANADA', 'O'],
 ['000.jpeg', 'IRELAND', 'O'],
 ['000.jpeg', ' ', 'O'],
 ['000.jpeg', ' ', 'O'],
 ['000.jpeg', ' ', 'O'],
 ['000.jpeg', ' ', 'O'],
 ['000.jpeg', ' ', 'O'],
 ['000.jpeg', ' ', 'O'],
 ['000.jpeg', 'www.laurelseducation.com', 'B-WEB'],
 ['000.jpeg', ')%info@laurelseducation.com', 'B-EMAIL'],
 ['000.jpeg', ' ', 'O'],
 ['001.jpe

In [11]:
df = pd.DataFrame(data[1:], columns=data[0])

In [12]:
df

Unnamed: 0,id,text,tag
0,000.jpeg,,O
1,000.jpeg,.,O
2,000.jpeg,040-4852,B-PHONE
3,000.jpeg,"""8881,""",I-PHONE
4,000.jpeg,90309,B-PHONE
...,...,...,...
10441,290.jpeg,Richard,B-NAME
10442,290.jpeg,Pretorius,I-NAME
10443,290.jpeg,,O
10444,290.jpeg,Director,B-DES


### Cleaning Data

* Remove whitespaces
* Remove unwanted special characters

In [16]:
whitespace = string.whitespace
punctuation = '!#$%\'()*+,:;<=>?[\\]^`{|}~'

tableWhitespace = str.maketrans('', '', whitespace)
tablePunctuation = str.maketrans('','', punctuation)

def cleanText(txt):
    text = str(txt)
    text = text.lower()
    removewhitespace = text.translate(tableWhitespace)
    removepunctuation = removewhitespace.translate(tablePunctuation)
    
    return str(removepunctuation)

In [17]:
df['text'] = df['text'].apply(cleanText)

In [18]:
df

Unnamed: 0,id,text,tag
0,000.jpeg,,O
1,000.jpeg,.,O
2,000.jpeg,040-4852,B-PHONE
3,000.jpeg,"""8881""",I-PHONE
4,000.jpeg,90309,B-PHONE
...,...,...,...
10441,290.jpeg,richard,B-NAME
10442,290.jpeg,pretorius,I-NAME
10443,290.jpeg,,O
10444,290.jpeg,director,B-DES


In [19]:
dataClean = df.query('text != ""')
dataClean.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataClean.dropna(inplace=True)


In [21]:
dataClean.head(10)

Unnamed: 0,id,text,tag
1,000.jpeg,.,O
2,000.jpeg,040-4852,B-PHONE
3,000.jpeg,"""8881""",I-PHONE
4,000.jpeg,90309,B-PHONE
5,000.jpeg,52549,I-PHONE
6,000.jpeg,fi,O
7,000.jpeg,/laurelsoverseaseducation,O
8,000.jpeg,@,O
9,000.jpeg,laurels,B-ORG
10,000.jpeg,overseas,I-ORG


### Convert data into spacy format

In [22]:
group = dataClean.groupby(by='id')

In [26]:
grouparray = group.get_group('000.jpeg')[['text','tag']].values

In [34]:
cards = group.groups.keys()

In [36]:
grouparray

array([['.', 'O'],
       ['040-4852', 'B-PHONE'],
       ['"8881"', 'I-PHONE'],
       ['90309', 'B-PHONE'],
       ['52549', 'I-PHONE'],
       ['fi', 'O'],
       ['/laurelsoverseaseducation', 'O'],
       ['@', 'O'],
       ['laurels', 'B-ORG'],
       ['overseas', 'I-ORG'],
       ['educational', 'I-ORG'],
       ['consultancy', 'I-ORG'],
       ['pvt.', 'I-ORG'],
       ['ltd.', 'I-ORG'],
       ['sea', 'O'],
       ['u.k', 'O'],
       ['australia', 'O'],
       ['canada', 'O'],
       ['ireland', 'O'],
       ['www.laurelseducation.com', 'B-WEB'],
       ['info@laurelseducation.com', 'B-EMAIL']], dtype=object)

In [38]:
# converting all cards into spacy format
allCardsData = []

for card in cards:
    cardData = []
    grouparray = group.get_group(card)[['text','tag']].values
    
    content = ''
    annotations = {'entities':[]}
    start = 0
    end = 0

    for text, label in grouparray:
        text = str(text)
        stringLength = len(text) + 1
        start = end
        end = start + stringLength

        if label != 'O':
            annot = (start, end-1, label)
            annotations['entities'].append(annot)

        content = content + text + " "
    cardData = (content, annotations)
    allCardsData.append(cardData)

In [39]:
allCardsData

[('. 040-4852 "8881" 90309 52549 fi /laurelsoverseaseducation @ laurels overseas educational consultancy pvt. ltd. sea u.k australia canada ireland www.laurelseducation.com info@laurelseducation.com ',
  {'entities': [(2, 10, 'B-PHONE'),
    (11, 17, 'I-PHONE'),
    (18, 23, 'B-PHONE'),
    (24, 29, 'I-PHONE'),
    (61, 68, 'B-ORG'),
    (69, 77, 'I-ORG'),
    (78, 89, 'I-ORG'),
    (90, 101, 'I-ORG'),
    (102, 106, 'I-ORG'),
    (107, 111, 'I-ORG'),
    (145, 169, 'B-WEB'),
    (170, 195, 'B-EMAIL')]}),
 ('john smith marketing manager web www.psdgraphics.com phone 123-456-7890 mail email@psdgraphics.com ',
  {'entities': [(0, 4, 'B-NAME'),
    (5, 10, 'I-NAME'),
    (11, 20, 'B-DES'),
    (21, 28, 'I-DES'),
    (33, 52, 'B-WEB'),
    (59, 71, 'B-PHONE'),
    (77, 98, 'B-EMAIL')]}),
 ('sau 0 98489 24441 dy "08672" 224441 /enkateswapa & wie ',
  {'entities': [(6, 11, 'B-PHONE'), (12, 17, 'I-PHONE'), (36, 48, 'B-ORG')]}),
 ('prasad @ "99631735359490400000" i flex design album design vis

### Splitting data into traning and testing set

In [40]:
import random
random.shuffle(allCardsData)
len(allCardsData)

267

In [42]:
TrainData = allCardsData[:240]
TestData = allCardsData[240:]

In [43]:
import pickle
pickle.dump(TrainData, open('./data/TrainData.pickle', mode='wb'))
pickle.dump(TestData, open('./data/TestData.pickle', mode='wb'))