## Building NER model to identify the entities

In [1]:
import pandas as pd
import string

In [92]:
TRAIN_DATA = [
    ("Who is Shaka Khan?", {"entities": [(7, 17, "H0")]}),
    ("I like London and Berlin.", {"entities": [(7, 13, "H1"), (18, 24, "H0")]}),
]

In [2]:
data = pd.read_csv("sent_classifier_train.csv", index_col=False)

In [3]:
#Preparing Training Data for Spacy
#table = str.maketrans('[]','')
TRAIN_DATA=[]
for index, row in data.iterrows():
    #print(row['Spacy'])
    entities_dict={}
    
    sentence=row['Sentence']
    entities=row['Spacy'].split("],")
    entities_list=[]
    if(len(row['Spacy'].strip())>2):
        for entity_str in entities:
            #entity.translate(table)
            req_entity_str=entity_str.replace('[','').replace(']','')
            req_entity=req_entity_str.split(',')
            start_index=int(req_entity[0].strip())
            #print(start_index)
            end_index=int(req_entity[1].strip())
            #print(end_index)
            entity=req_entity[2].strip().replace("'","")
            #print(entity)
            entities_list.append((start_index,end_index,entity))
    entities_dict['entities']=entities_list
    TRAIN_DATA.append((row['Sentence'],entities_dict))
    
        


In [4]:
TRAIN_DATA[0]

('Isaac David Abella (June 20, 1934 – October 23, 2016) was a Professor of Physics at The University of Chicago. ',
 {'entities': [(0, 4, 'H0'),
   (6, 10, 'H0'),
   (12, 17, 'H0'),
   (25, 28, 'H1'),
   (29, 33, 'H1'),
   (34, 35, 'H1'),
   (36, 43, 'H1'),
   (44, 47, 'H1'),
   (48, 53, 'H1'),
   (54, 57, 'H1'),
   (60, 69, 'H1'),
   (70, 72, 'H1'),
   (73, 80, 'H1'),
   (81, 83, 'H1'),
   (84, 87, 'H1'),
   (88, 98, 'H1'),
   (99, 101, 'H1'),
   (102, 110, 'H1')]})

## Code for Spacy Model

In [5]:
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding

In [59]:
@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    new_model_name=("New model name for model meta.", "option", "nm", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int),
)

SyntaxError: unexpected EOF while parsing (<ipython-input-59-34251adb9d5b>, line 6)

In [6]:
def main(model=None, new_model_name="Privacy_Redaction", output_dir="Spacy_Model", n_iter=300):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    random.seed(0)
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe("ner")
        
    #Add H0 and H1 as entities
    ner.add_label("H0") 
    ner.add_label("H1") 
    
    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()
        
    move_names = list(ner.move_names)
    
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    
    with nlp.disable_pipes(*other_pipes):  # only train NER
        sizes = compounding(1.0, 4.0, 1.001)
        # batch up the examples using spaCy's minibatch
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            batches = minibatch(TRAIN_DATA, size=sizes)
            losses = {}
            for batch in batches:
                #print(batch)
                texts, annotations = zip(*batch)
                #print(zip(*batch))
                #print(text)
                #print(annotationsations)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
            print("Losses", losses)
    
    # test the trained model
    test_text = "Do you like horses?"
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)
        
    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta["name"] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        # Check the classes have loaded back consistently
        assert nlp2.get_pipe("ner").move_names == move_names
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)

In [7]:
if __name__ == "__main__":
    main()

Created blank 'en' model
Losses {'ner': 6984.608969391234}
Losses {'ner': 5149.077116403521}
Losses {'ner': 4378.090314007212}
Losses {'ner': 3734.79931748015}
Losses {'ner': 3281.2561587755295}
Losses {'ner': 3004.867279932058}
Losses {'ner': 2696.5801871788663}
Losses {'ner': 2399.5476219243337}
Losses {'ner': 2269.5227938266244}
Losses {'ner': 2151.233255962891}
Losses {'ner': 1967.4936714395074}
Losses {'ner': 1778.8185092237288}
Losses {'ner': 1739.3674873866144}
Losses {'ner': 1566.7262238851197}
Losses {'ner': 1499.9920394995888}
Losses {'ner': 1438.9841819992075}
Losses {'ner': 1406.8210648244315}
Losses {'ner': 1246.8125117346779}
Losses {'ner': 1354.6964409755158}
Losses {'ner': 1279.0518894672423}
Losses {'ner': 1151.19263188842}
Losses {'ner': 1082.1056158472381}
Losses {'ner': 1126.5109987091773}
Losses {'ner': 1027.8371648012994}
Losses {'ner': 974.516844457925}
Losses {'ner': 1083.5228720340042}
Losses {'ner': 921.4054606219256}
Losses {'ner': 913.197112164008}
Losses {'

Losses {'ner': 209.49391273926227}
Losses {'ner': 172.65577178383418}
Losses {'ner': 190.90588602501936}
Losses {'ner': 177.30283806329953}
Losses {'ner': 150.20259574718398}
Losses {'ner': 179.0246873233989}
Losses {'ner': 193.2881553555283}
Losses {'ner': 188.22259838858122}
Losses {'ner': 153.96752538473368}
Losses {'ner': 172.39764563282185}
Losses {'ner': 199.79950392376722}
Losses {'ner': 197.11243844142058}
Losses {'ner': 176.49807230840003}
Losses {'ner': 164.72091888674817}
Losses {'ner': 150.44791108120833}
Losses {'ner': 173.43219515279063}
Losses {'ner': 197.56154646925242}
Losses {'ner': 131.2997177000286}
Losses {'ner': 142.36527790566132}
Losses {'ner': 136.9460709503694}
Losses {'ner': 202.70445746312657}
Losses {'ner': 191.56467712443964}
Losses {'ner': 142.97334103837338}
Losses {'ner': 174.55467383529717}
Losses {'ner': 204.66076371927772}
Losses {'ner': 159.8692621450786}
Losses {'ner': 188.12953316686938}
Losses {'ner': 149.02563853911414}
Losses {'ner': 158.467495

## Using the model
### STEP1: Reading a json file

In [17]:
import json
with open("Test/Doc1.json","r") as read_file:
    data = json.load(read_file)

### STEP2: Processing the json to get csv

In [18]:
#Original
myData=[]
for document in data:
    text=document['text']
    H0=document['H0']
    H1=document['H1']
    i=0
    sentence=""
    categ="None"
    entities=[]
    while(i<len(text)):
        if(len(sentence)==0):
            start_index=0
        else:
            start_index=len(sentence)
        if(H0[i]==1):
            entities.append([start_index,start_index+len(text[i])-1,"H0"])
            if(categ!="H0"):
                #categ='Priv'
                categ='H0'
        if(H1[i]==1 and H0[i]==0):
            entities.append([start_index,start_index+len(text[i]),"H1"])
            if(categ!="H1" and categ!="H0"):
                #categ='Priv'
                categ='H1'
        sentence=sentence+text[i]+" "
        if('.' in text[i]):
            myData.append([sentence,categ,entities])
            sentence=""
            categ="None"
            entities=[]
        i+=1

### STEP3: Writing into a csv file

In [19]:
import pandas as pd
my_df = pd.DataFrame(myData)
my_df.columns=['Sentence', 'Category', 'Spacy']

In [20]:
my_df.to_csv('Test/test.csv', index=False)

## Using Spacy to find entities

In [21]:
#Loading Spacy inbuilt model and our model
output_dir="Spacy_Model"
nlp2 = spacy.load(output_dir)
nlp = spacy.load("en_core_web_sm")
import string
table = str.maketrans(dict.fromkeys(string.punctuation)) #Used to remove the punctuations

In [22]:
# Spacy prebuilt model's entities
#NORP: Nationalities or religious or political groups. ---- H1
#PERSON: People, including fictional ------ H0

In [23]:
#Read the csv file
data = pd.read_csv("Test/test.csv", index_col=False)

In [24]:
final_output=[]
dictText={}
dictText['text']=[]
dictText['H0']=[]
dictText['H1']=[]
for index, row in data.iterrows():
    sent_mymodel = nlp2(row['Sentence'])
    sent_inbuilt=nlp(row['Sentence'])
    entities=[]
    for ent in sent_inbuilt.ents:
        if(ent.label_=='PERSON'):
            #print(ent.text, ent.start_char, ent.end_char, ent.label_)
            entities.append([ent,'H0'])
        elif( ent.label_=='NORP'):
            entities.append([ent,'H1'])
            #print([ent,'H1'])
    for ent in sent_mymodel.ents:
        #print(ent.text, ent.start_char, ent.end_char, ent.label_)
        entities.append([ent,ent.label_])
    #print(row['Sentence'])
    words= row['Sentence'].split(" ")
    for word in words:
        dictText['text'].append(word)
        entity_name=None
        #count=0
        for entity in entities:
            #print(word.translate(table))
            #print(entity[0])
            #print(word.translate(table) in str(entity[0]))
            #print("For word:{0}, check entity:{1}, status:{2}".format(word, entity[0], word.translate(table) in str(entity[0])))
            if(word.translate(table) in str(entity[0])):
                entity_name=str(entity[1])
                #print("Word:{0}, EntityName:{1}".format(word,entity[1])) 
                break
                
        if(entity_name!=None):
            if(entity_name=='H0'):
                dictText['H0'].append(1)
            else:
                dictText['H0'].append(0)
            dictText['H1'].append(1)
        else:
            dictText['H0'].append(0)
            dictText['H1'].append(0)
        
    
        
    

In [25]:
#Writing json
import json
with open('Test/Output.json', 'w') as outfile:
    json.dump(dictText, outfile)

In [27]:
from Predict import outputJson

In [37]:
outputJson("D:/Berkeley/1stSem/290/Project/Privacy_redaction/Test/Doc1.json","D:/Berkeley/1stSem/290/Project/Privacy_redaction/Test/Doc1_output.json")

NameError: name 'spacy' is not defined

In [31]:
import spacy