In [None]:
# -----------------------------------------------------------
# Create an API to pull entities using our spaCy model.
# 
# Written primarily by Robbie, who stole T's code from export_entities_for_relation_extraction
# -----------------------------------------------------------

In [6]:
import os
import spacy

# Absolute, machine-specific root of the project checkout; adjust when running elsewhere.
ROUTETOROOTDIR = '/home/dssg-cfa/notebooks/dssg-cfa/'
# Directory holding the project's post-processing helper scripts.
IMPORTSCRIPTSDIR = ROUTETOROOTDIR + "pdf_to_text/Post-Processing/py_files"
# Switch the working directory to the scripts folder before importing from it.
# NOTE(review): presumably the import below only resolves because the current
# working directory is on sys.path in the notebook kernel -- confirm.
os.chdir(IMPORTSCRIPTSDIR)
# Project-local module; getListOfTexts calls its exportTrainData function.
import trainingDataForSpaCy

In [3]:
# The current best model is stored in this directory
# (absolute, machine-specific path; adjust when running elsewhere).
local_output_dir = '/home/dssg-cfa/notebooks/dssg-cfa/NER/T/spaCy/Modified_NER/custom_model_modified_and_improved/'
# Changes the process-wide working directory to the model directory; every
# later cell runs with this as its current directory.
os.chdir(local_output_dir)

# Module-level pipeline object; getNEROutput reads this global to tag each segment.
nlp = spacy.load(local_output_dir)   # load the model

In [4]:
def getListOfTexts(gazetteNum):
    """Collect the text portion of every training record for one gazette.

    Calls trainingDataForSpaCy.exportTrainData(gazetteNum) and keeps only the
    first element (index 0) of each record it returns.

    args:
    gazetteNum: index of the pre-processed gazette, passed unchanged to
        trainingDataForSpaCy.exportTrainData.

    returns: list with one entry per record, in the order exportTrainData
        produced them. Each entry is that record's first element -- per the
        original author's notes this is the inner text of a segment (cleaned,
        no headers or footers); verify against trainingDataForSpaCy."""

    texts = []
    for record in trainingDataForSpaCy.exportTrainData(gazetteNum):
        # Element 0 of each record is the text; the remainder is dropped.
        texts.append(record[0])
    return texts

def getNEROutput(gazetteNum):
    """For a given gazette number, run spaCy NER on each segment and return the outputs.

    args:
    gazetteNum: index of the pre-processed gazette, passed unchanged to
        getListOfTexts.

    returns: a nested list.
        Outer list: one item per segment, in the order getListOfTexts
            returned the segments.
        Inner list: one item per entity found in that segment (empty when
            the model finds no entities).
        Items in inner list: (label, text) tuples
            label: entity tag (ent.label_)
            text: text corresponding to said tag (ent.text)."""

    segments = getListOfTexts(gazetteNum)
    # nlp.pipe is spaCy's batched, streaming way to process many texts: it
    # yields one Doc per input text, in input order. Entities are extracted
    # as each Doc arrives, so the full list of Docs is never held in memory.
    return [
        [(ent.label_, ent.text) for ent in doc.ents]
        for doc in nlp.pipe(segments)
    ]