In [170]:
import json
import os
import glob
import numpy as np
import regex as re

In [108]:
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    try:
        training_data = []
        lines=[]
        with open(dataturks_JSON_FilePath, 'r') as f:
            lines = f.readlines()

        for line in lines:
            data = json.loads(line)
            text = data['content']
            entities = []
            for annotation in data['annotation']:
                #only a single point in text annotation.
                point = annotation['points'][0]
                labels = annotation['label']
                # handle both list of labels or a single label.
                if not isinstance(labels, list):
                    labels = [labels]

                for label in labels:
                    #dataturks indices are both inclusive [start, end] but spacy is not [start, end)
                    entities.append((point['start'], point['end'] + 1 ,label))
            training_data.append((text, {"entities": entities}))
        return training_data
    except Exception as e:
        logging.exception("Unable to process " + dataturks_JSON_FilePath + "\n" + "error = " + str(e))
        return None

# read manually annotated data into one json file

In [311]:
path_lst = ['../data/annotate_emmy/*','../data/annotation_zqw/*', '../data/annotations_LHB/*','../data/Archive_YSY/*']
training_data = []
def read_data(path):
    first_is_special = r'^\W[a-zA-Z0-9]+$'
    last_is_special = r'^[a-zA-Z0-9]+\W$'
    json_lst = glob.glob(path)
    for direc in json_lst:
        f = open(direc)
        data = json.load(f)
        for text, entity in data['annotations']:
            if len(entity['entities'])==0:
                training_data.append((text, entity))
            # add to training data only if there is a text
            elif (len(text) != 0):
                for x in entity['entities']:
                    # do not append 'unlabelled' or 'unknown' entity
                    if (x[2] == 'Unlabelled') or (x[2] == 'UNKNOWN'):
                        continue
                    else:
                        training_data.append((text, entity))
                        break
        f.close()
    return


In [312]:
for x in path_lst:
    print(x)
    read_data(x)

../data/annotate_emmy/*
../data/annotation_zqw/*
../data/annotations_LHB/*
../data/Archive_YSY/*


In [295]:
def trim_special_characters(data: list) -> list:

    special_character = re.compile(r'\W')

    cleaned_data = []
    for text, annotations in data:
        entities = annotations['entities']
        valid_entities = []
        for start, end, label in entities:
            valid_start = start
            valid_end = end
            while valid_start < len(text) and special_character.match(
                    text[valid_start]):
                valid_start += 1
            while valid_end > 1 and special_character.match(
                    text[valid_end - 1]) and text[valid_end-1]!='#':
                valid_end -= 1

            valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])

    return cleaned_data

In [314]:
training_data = trim_special_characters(training_data)

In [297]:
# check any special character still exist
entity_w_special_characters_first = []
entity_w_special_characters_last = []
first_is_special = r'^\W[a-zA-Z0-9]+$'
last_is_special = r'^[a-zA-Z0-9]+\W$'
for text, entity in training_data:
    for x in entity['entities']:
        if re.search(first_is_special, text[x[0]:x[1]]):
            entity_w_special_characters_first.append(text[x[0]:x[1]])
        if re.search(last_is_special, text[x[0]:x[1]]):
            entity_w_special_characters_last.append(text[x[0]:x[1]])

In [300]:
entity_w_special_characters_last

['C#', 'C#', 'C#', 'C#', 'C#', 'C#', 'C#', 'C#', 'C#']

In [301]:
# find all types of entities
entities = []
for text, entity in training_data:
    for start, end, entity_type in entity['entities']:
        entities.append(entity_type)

In [302]:
len(entities)

11887

In [315]:
np.unique(np.array(entities), return_counts=True)

(array(['COLLEGE NAME', 'COMPANIES WORKED AT', 'DEGREE', 'DESIGNATION',
        'DESIGNATION ', 'EMAIL ADDRESS', 'GRADUATION YEAR', 'LOCATION',
        'NAME', 'SKILLS', 'UNKNOWN', 'Unlabelled', 'YEARS OF EXPERIENCE'],
       dtype='<U19'),
 array([ 489,  864,  317,  785,  535,  171,  216, 1657,  218, 5746,    1,
           1,  887]))

In [317]:
# export file
with open('../data/manually_annotation.json', 'w') as f:
    json.dump(training_data,f)

In [318]:
with open('../data/manually_annotation.json', 'r') as f:
    lines = f.readlines()

In [321]:
for line in lines:
    data = json.loads(line)
    print(len(data))

7636


In [111]:
# look at the pre-collected data from github to match with the format
train_fp = "../test/traindata.json"
convert_dataturks_to_spacy(train_fp)[0]

('Govardhana K\nSenior Software Engineer\n\nBengaluru, Karnataka, Karnataka - Email me on Indeed: indeed.com/r/Govardhana-K/\nb2de315d95905b68\n\nTotal IT experience 5 Years 6 Months\nCloud Lending Solutions INC 4 Month • Salesforce Developer\nOracle 5 Years 2 Month • Core Java Developer\nLanguages Core Java, Go Lang\nOracle PL-SQL programming,\nSales Force Developer with APEX.\n\nDesignations & Promotions\n\nWilling to relocate: Anywhere\n\nWORK EXPERIENCE\n\nSenior Software Engineer\n\nCloud Lending Solutions -  Bangalore, Karnataka -\n\nJanuary 2018 to Present\n\nPresent\n\nSenior Consultant\n\nOracle -  Bangalore, Karnataka -\n\nNovember 2016 to December 2017\n\nStaff Consultant\n\nOracle -  Bangalore, Karnataka -\n\nJanuary 2014 to October 2016\n\nAssociate Consultant\n\nOracle -  Bangalore, Karnataka -\n\nNovember 2012 to December 2013\n\nEDUCATION\n\nB.E in Computer Science Engineering\n\nAdithya Institute of Technology -  Tamil Nadu\n\nSeptember 2008 to June 2012\n\nhttps://www

In [129]:
label_correction_dic = {'Email Address': 'EMAIL ADDRESS',
                        'College Name': 'COLLEGE NAME',
                        'Degree': 'DEGREE',
                        'Location': 'LOCATION',
                        'Skills': 'SKILLS',
                        'Companies Worked at': 'COMPANIES WORKED AT',
                        'Name': 'NAME',
                        'Designation ': 'DESIGNATION',
                        'Years of Experience': 'YEARS OF EXPERIENCE',
                        'Graduation Year': 'GRADUATION YEAR'
                        }
'EMAIL ADDRESS' in label_correction_dic

False