In [1]:
import numpy as np
import pandas as pd
import string
import re

In [2]:
with open('businessCard.txt',mode='r',encoding='utf8',errors='ignore') as f:
    text = f.read()

In [3]:
data = list(map(lambda x:x.split('\t'),text.split('\n')))

In [4]:
df = pd.DataFrame(data[1:],columns=data[0])

In [5]:
df.head(10)

Unnamed: 0,id,text,tag
0,000.jpeg,,O
1,000.jpeg,.,O
2,000.jpeg,040-4852,B-PHONE
3,000.jpeg,"""8881,""",I-PHONE
4,000.jpeg,90309,B-PHONE
5,000.jpeg,52549,I-PHONE
6,000.jpeg,Fi,O
7,000.jpeg,/laurelsoverseaseducation,O
8,000.jpeg,@:,O
9,000.jpeg,LAURELS,B-ORG


### Cleaning Text
- Remove white space
- Remove Unwanted special characters

In [23]:
whitespace = string.whitespace
punctuation = "!#$%&\'()*+:;<=>?[\\]^`{|}~"
tableWhitespace = str.maketrans('','',whitespace)
tablePunctuation = str.maketrans('','',punctuation)
def cleanText(txt):
    text = str(txt)
    text = text.lower()
    removewhitespace = text.translate(tableWhitespace)
    removepunctuation = removewhitespace.translate(tablePunctuation)
    
    return str(removepunctuation)

In [25]:
df['text'] = df['text'].apply(cleanText)

In [26]:
dataClean = df.query("text != '' ")
dataClean.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [27]:
dataClean.head(10)

Unnamed: 0,id,text,tag
1,000.jpeg,.,O
2,000.jpeg,040-4852,B-PHONE
3,000.jpeg,"""8881,""",I-PHONE
4,000.jpeg,90309,B-PHONE
5,000.jpeg,52549,I-PHONE
6,000.jpeg,fi,O
7,000.jpeg,/laurelsoverseaseducation,O
8,000.jpeg,@,O
9,000.jpeg,laurels,B-ORG
10,000.jpeg,overseas,I-ORG


### Convert Data into Spacy Format

In [28]:
group = dataClean.groupby(by='id')

In [56]:
cards = group.groups.keys()

In [57]:
allCardsData = []
for card in cards:
    cardData = []
    grouparray = group.get_group(card)[['text','tag']].values
    content = ''
    annotations = {'entities':[]}
    start = 0
    end = 0
    for text, label in grouparray:
        text = str(text)
        stringLength = len(text) + 1

        start = end
        end = start + stringLength

        if label != 'O':
            annot = (start,end-1,label)
            annotations['entities'].append(annot)

        content = content + text + ' '
        
        
    cardData = (content,annotations)
    allCardsData.append(cardData)

In [58]:
allCardsData

[('. 040-4852 "8881," 90309 52549 fi /laurelsoverseaseducation @ laurels overseas educational consultancy pvt. ltd. sea u.k australia canada ireland www.laurelseducation.com info@laurelseducation.com ',
  {'entities': [(2, 10, 'B-PHONE'),
    (11, 18, 'I-PHONE'),
    (19, 24, 'B-PHONE'),
    (25, 30, 'I-PHONE'),
    (62, 69, 'B-ORG'),
    (70, 78, 'I-ORG'),
    (79, 90, 'I-ORG'),
    (91, 102, 'I-ORG'),
    (103, 107, 'I-ORG'),
    (108, 112, 'I-ORG'),
    (146, 170, 'B-WEB'),
    (171, 196, 'B-EMAIL')]}),
 ('john smith marketing manager web www.psdgraphics.com phone 123-456-7890 mail email@psdgraphics.com ',
  {'entities': [(0, 4, 'B-NAME'),
    (5, 10, 'I-NAME'),
    (11, 20, 'B-DES'),
    (21, 28, 'I-DES'),
    (33, 52, 'B-WEB'),
    (59, 71, 'B-PHONE'),
    (77, 98, 'B-EMAIL')]}),
 ('sau 0 98489 24441 dy "08672," 224441 /enkateswapa wie ',
  {'entities': [(6, 11, 'B-PHONE'), (12, 17, 'I-PHONE'), (37, 49, 'B-ORG')]}),
 ('prasad @ "9,96,31,73,53,59,49,04,00,000" i flex design album d

## Split the Data into Training and Testing Set

In [60]:
import random

In [61]:
random.shuffle(allCardsData)

In [63]:
len(allCardsData)

267

In [64]:
TrainData = allCardsData[:240]
TestData = allCardsData[240:]

In [65]:
import pickle

In [67]:
pickle.dump(TrainData,open('./data/TrainData.pickle',mode='wb'))
pickle.dump(TestData,open('./data/TestData.pickle',mode='wb'))