## 1. Import libraries and requirements

In [1]:
# Loading required packages
import spacy
import pandas as pd
import re
import gensim
from gensim.models import Word2Vec
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
import nltk
import sklearn
from sklearn.model_selection import train_test_split
import json
import csv
import ast

# Select Spacy model
# Efficiency
# nlp = spacy.load("en_core_web_sm")

# Accuracy
nlp = spacy.load("en_core_web_trf")

# Change working directory
%cd '/Users/richardfrissen/Documents/Maastricht University/Thesis/Development/Datasets'

/Users/richardfrissen/Documents/Maastricht University/Thesis/Development/Datasets


In [2]:
nlp.max_length

1000000

## 2. Import data

### Load EMSCAD Dataset

In [3]:
# Load the csv file into the environment
jobdescriptions = pd.read_csv('EMSCAD/Input data/JobDescriptions.csv', delimiter=',')
jobdescriptions.head(5)
len(jobdescriptions)

17880

### Subset data, remain only the column description

In [4]:
# Copy column description from DF jobdescription
descriptions = jobdescriptions['description']

# Convert Series into Dataframe
descriptions = descriptions.to_frame()
descriptions.head(5)

Unnamed: 0,description
0,"<p>Food52, a fast-growing, James Beard Award-w..."
1,<p>Organised - Focused - Vibrant - Awesome!<br...
2,"<p>Our client, located in Houston, is actively..."
3,<p><b>THE COMPANY: ESRI – Environmental System...
4,<p><b>JOB TITLE:</b> Itemization Review Manage...


## 3. Clean the data

### Remove HTML patterns in job descriptions

### Once cleaned, we can put the data through Spacy's NLP pipeline and tokenize each description

In [5]:
%%time
# Remove HTML codes based on pattern cleanr
# Remove all characters except whitespace an alphabetic characters.

result = []
docDF = pd.DataFrame()
max_length = 0
for i in descriptions["description"]:
    cleanr = re.compile('<.*?>')
    i = re.sub(cleanr, '', i)
    cleanr = re.compile('<[^>]+>')
    i = re.sub(cleanr, '', i)
    i = i.replace('\xa0', ' ')
    i = i.replace('\r', ' ')
    i = i.replace('&amp', ' ')
    i = i.replace('\N{SOFT HYPHEN}', '')
    doc = i
    if len(doc) > max_length:
        max_length = len(doc)
    result.append(doc)

print(max_length)
    
# Add the result
docDF["Result"] = result
docDF.head(5)

# %%time
# # Remove HTML codes based on pattern cleanr
# # Remove all characters except whitespace an alphabetic characters.

# result = []
# docDF = pd.DataFrame()
# count = 0
# for i in descriptions["description"]:
#     cleanr = re.compile('<.*?>')
#     i = re.sub(cleanr, '', i)
#     cleanr = re.compile('<[^>]+>')
#     i = re.sub(cleanr, '', i)
#     i = i.replace('\xa0', ' ')
#     i = i.replace('\r', ' ')
#     i = i.replace('&amp', ' ')
#     i = i.replace('\N{SOFT HYPHEN}', '')
#     if len(i)>512:
#         text = i[:512]
#         count = count +1
#     else:
#         text = i
#     doc = nlp(str(text))
#     result.append(doc)
# print(count)

# # Add the result
# docDF["Result"] = result
# docDF.head(5)

15136
CPU times: user 406 ms, sys: 19.5 ms, total: 426 ms
Wall time: 426 ms


Unnamed: 0,Result
0,"Food52, a fast-growing, James Beard Award-winn..."
1,Organised - Focused - Vibrant - Awesome!Do you...
2,"Our client, located in Houston, is actively se..."
3,THE COMPANY: ESRI – Environmental Systems Rese...
4,JOB TITLE: Itemization Review Manager \nLOCATI...


### Split descriptions into sentences
### By doing so, we prepare the data for annotation and training the custom NER model ---> Spacy

In [6]:
# # doc = nlp("This is a sentence. This is another sentence.")
# # for sent in doc.sents:
# #     print(sent.text)

# for i in docDF["Result"]:
#     for sent in i.sents:
# #         print(sent.text)
#         result.append(sent.text)
    
# sentences = pd.DataFrame(columns=['sentence'])
# sentences["sentence"] = result

# type(sentences["sentence"])

# sentences["sentence"].head(10)

# sentences['sentence'].to_csv('EMSCAD/Output data/sentence.csv')

In [7]:
%%time
# Split each description into sentences
import re
alphabets= "([A-Za-z])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov)"

def split_into_sentences(text):
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    return sentences

result = []

for i in docDF["Result"]:
    i = str(i)
    sentences = split_into_sentences(i)
    if len(sentences)>0:
        result.append(sentences)

sentences = pd.DataFrame(columns=['sentence'])
sentences["sentence"] = result
sentences['sentence'].to_csv('EMSCAD/Output data/sentence.csv', index = False, header = False)

CPU times: user 3.48 s, sys: 108 ms, total: 3.59 s
Wall time: 3.6 s


# SKIP CLEANING PROCESS DOWN HERE

In [8]:
# Use this cell to skip the cleaning process
# sentences = pd.read_csv('EMSCAD/Output data/sentence.csv', delimiter=',', names = ['sentence'])

In [9]:
# 80% / 20% split
Train, Eval = train_test_split(sentences, test_size=0.2, shuffle=False)

### Now that we created sentences, we can also see that not everything went right
### We remove each "sentence" that begins with a prefix: "."
### This action takes care of removing all invalid sentences from the dataset, for each description

In [10]:
%%time
Train_list = []
Eval_list = []
def remove_invalid_sentences(sentences):
    prefixes = ('.')
    tokens = []
    output = []
    for i in sentences["sentence"]:
        tokens = [token for token in i if not token.startswith(prefixes)]
        output.append(tokens)
    return output

Train_list = remove_invalid_sentences(Train)

Eval_list = remove_invalid_sentences(Eval)

CPU times: user 54.4 ms, sys: 3.4 ms, total: 57.8 ms
Wall time: 57.5 ms


In [11]:
# type(docDF)

# Cleaned = pd.DataFrame()
# type(Cleaned)

# Pos = pd.DataFrame()
# type(Pos)
# # cols = ['DocObject']
# # tempdocDF = pd.DataFrame(columns=cols)
# # type(tempdocDF)

In [12]:
# # Niet meer nodig omdat speciale tekens behouden blijven vanwege de zinnen.

# # %%time

# # Use Spacy to select the tokens using criteria
# result = []
# for i in docDF["Result"]:
#     i = [token.orth_ for token in i if not token.is_punct | token.is_space | token.is_stop | token.is_bracket | token.like_url | token.like_email | token.is_digit | token.is_currency] 
#     result.append(i)
# Cleaned["Result"] = result   
# print(Cleaned)


# # %%time

# # # Use Spacy to select the tokens using criteria
# # result = []
# # for i in docDF["Result"]:
# #     i = [token.orth_ for token in i if not token.is_punct | token.is_space | token.is_stop | token.is_bracket | token.like_url | token.like_email | token.is_digit | token.is_currency] 
# #     result.append(i)
# # Cleaned["Result"] = result   
# # print(Cleaned)

## 4. Prepare TRAIN_DATA

### We do this by splitting each sentence on tokens and putting them all in one column, creating a list of words.

In [13]:
%%time
Train_Annotation_data = pd.DataFrame(pd.DataFrame(columns=['Result', 'Label']))
Eval_Annotation_data = pd.DataFrame(pd.DataFrame(columns=['Result', 'Label']))
#########################
lemmatizer = nlp.get_pipe("lemmatizer")
#########################

prefixes = ["\"","#","$","%","&","'","(",")","*","+",","," ","-","/",":",";","<","=",">","@","[","\\","]","^","_","`","{","|","}","~"]
prefixes_end = ["?","!"]
full_stop = ["."]

def sentence_to_words(input_list):
    prefixes = ["\"","#","$","%","&","'","(",")","*","+",","," ","-","/",":",";","<","=",">","@","[","\\","]","^","_","`","{","|","}","~"]
    prefixes_end = ["?","!"]
    full_stop = ["."]
    result = []
    max_length = 0
    count = 0
    for i in input_list:
        i = str(i)
        ######################### USE FOR en_core_web_trf model only!
        if len(i) > max_length:
            max_length = len(i)
        if len(i)>512:
            i = i[:512]
            count = count +1
        #########################
        i = nlp(i)
        for token in i:
            #########################
            token = token.lemma_
            #########################
            if str(token) not in prefixes and str(token) not in prefixes_end:
                result.append(token)
            elif str(token) in prefixes_end:
                result.append(nlp(full_stop[0]))
    print(count)
    print(max_length)
    return result

Train_Annotation_data["Result"] = sentence_to_words(Train_list)
Eval_Annotation_data["Result"] = sentence_to_words(Eval_list)

# %%time
# result = []
# Annotation_data = pd.DataFrame(pd.DataFrame(columns=['Result', 'Label']))
# #########################
# # lemmatizer = nlp.get_pipe("lemmatizer")


# prefixes = ["\"","#","$","%","&","'","(",")","*","+",","," ","-","/",":",";","<","=",">","@","[","\\","]","^","_","`","{","|","}","~"]
# prefixes_end = ["?","!"]
# full_stop = ["."]
# # test = raw_input(test)
# # full_stop[0] = full_stop[0].replace('"', '')
# for i in tokens_total:
#     i = str(i)
#     i = nlp(i)
# #     print(i)
#     for token in i:
#         #########################
# #         token = token.lemma_
#         if str(token) not in prefixes and str(token) not in prefixes_end:
#             result.append(token)
#         elif str(token) in prefixes_end:
#             result.append(nlp(full_stop[0]))
# #         print(token)
            
# Annotation_data["Result"] = result
# Annotation_data["Result"]

Token indices sequence length is longer than the specified maximum sequence length for this model (931 > 512). Running this sequence through the model will result in indexing errors


10037
15128
2614
5806
CPU times: user 3h 55min 33s, sys: 4min 20s, total: 3h 59min 53s
Wall time: 1h 9min 43s


In [14]:
# Export data to be annotated (Can be used for manual annotation!)
Train_Annotation_data['Result'].to_csv('EMSCAD/Output data/Train_Annotation_data.csv')
Eval_Annotation_data['Result'].to_csv('EMSCAD/Output data/Eval_Annotation_data.csv')

In [15]:
# Import the source file that contains all biased word lists
biased_words = pd.read_csv('EMSCAD/Input data/biased_words.csv', delimiter=';')

In [16]:
%%time
# Automated annotation process (Based on the word lists imported.)
# Only exact matches will be annotated.
def automated_annotation(Annotation_data):
    result = []
    row = -1
    count = 0
    for i in Annotation_data['Result']:
        i = str(i)
        row = row + 1
        for j in biased_words:
            for k in biased_words[j]:
                word = str(k)
                if word == i:
                    Annotation_data['Label'][row] = j
                    count = count + 1
    Annotation_data['Label'] = Annotation_data['Label'].fillna("O")
    print(str(count) + " words have been annotated.")
    return Annotation_data

Train_Annotation_data = automated_annotation(Train_Annotation_data)
Eval_Annotation_data = automated_annotation(Eval_Annotation_data)

8629 words have been annotated.
2223 words have been annotated.
CPU times: user 1min 52s, sys: 1.19 s, total: 1min 53s
Wall time: 1min 59s


In [17]:
# Export annotated data (By "automated" annotator)
Train_Annotation_data.to_csv('EMSCAD/Output data/Train_Annotation_data_output.tsv', sep='\t', index = False, header = False)
Eval_Annotation_data.to_csv('EMSCAD/Output data/Eval_Annotation_data_output.tsv', sep='\t', index = False, header = False)

### Convert tsv to JSON format --> Used to be the input for Spacy V2!

In [18]:
%%time
import json
import logging
import sys
def tsv_to_json_format(input_path,output_path,unknown_label):
    try:
        f=open(input_path,'r') # input file
        fp=open(output_path, 'w') # output file
        data_dict={}
        annotations =[]
        label_dict={}
        s=''
        start=0
        for line in f:
            if line[0:len(line)-1]!='.\tO':
                word,entity=line.split('\t')
                s+=word+" "
                entity=entity[:len(entity)-1]
                if entity!=unknown_label:
                    if len(entity) != 1:
                        d={}
                        d['text']=word
                        d['start']=start
                        d['end']=start+len(word)-1
                        try:
                            label_dict[entity].append(d)
                        except:
                            label_dict[entity]=[]
                            label_dict[entity].append(d)
                start+=len(word)+1
            else:
                data_dict['content']=s
                s=''
                label_list=[]
                for ents in list(label_dict.keys()):
                    for i in range(len(label_dict[ents])):
                        if(label_dict[ents][i]['text']!=''):
                            l=[ents,label_dict[ents][i]]
                            for j in range(i+1,len(label_dict[ents])): 
                                if(label_dict[ents][i]['text']==label_dict[ents][j]['text']):  
                                    di={}
                                    di['start']=label_dict[ents][j]['start']
                                    di['end']=label_dict[ents][j]['end']
                                    di['text']=label_dict[ents][i]['text']
                                    l.append(di)
                                    label_dict[ents][j]['text']=''
                            label_list.append(l)                         
                            
                for entities in label_list:
                    label={}
                    label['label']=[entities[0]]
                    label['points']=entities[1:]
                    annotations.append(label)
                data_dict['annotation']=annotations
                annotations=[]
                json.dump(data_dict, fp)
                fp.write('\n')
                data_dict={}
                start=0
                label_dict={}
    except Exception as e:
        logging.exception("Unable to process file" + "\n" + "error = " + str(e))
        return None

tsv_to_json_format('EMSCAD/Output data/Train_Annotation_data_output.tsv','EMSCAD/Input model/Train_Annotation_data_model_input.json','abc')
tsv_to_json_format('EMSCAD/Output data/Eval_Annotation_data_output.tsv','EMSCAD/Input model/Eval_Annotation_data_model_input.json','abc')

CPU times: user 3.22 s, sys: 97.3 ms, total: 3.32 s
Wall time: 3.48 s


# Convert JSON to SpaCy V2 format
## We need to do this by OS Terminal!

# Convert SpaCy V2 format --> SpaCy V3 format
### First we need to import the data the data again

In [19]:
# Import file
Train_Spacy_file = open("/Users/richardfrissen/Documents/Maastricht University/Thesis/Development/Datasets/EMSCAD/Input model/Train_Spacy_v2_format.txt", "r")
Eval_Spacy_file = open("/Users/richardfrissen/Documents/Maastricht University/Thesis/Development/Datasets/EMSCAD/Input model/Eval_Spacy_v2_format.txt", "r")
TRAIN_DATA = ast.literal_eval(Train_Spacy_file.read())
EVAL_DATA = ast.literal_eval(Eval_Spacy_file.read())

In [None]:
# We transform the imported file back to tuple format.

# TRAIN_DATA = ast.literal_eval(TRAIN_DATA)

### Actual conversion of the data to SpaCy V3 format

In [20]:
import pandas as pd
from tqdm import tqdm
import spacy
from spacy.tokens import DocBin

def SpaCy_v3_format(DATA,FILENAME):
    nlp = spacy.blank("en") # load a new spacy model
    db = DocBin() # create a DocBin object

    for text, annot in tqdm(DATA): # data in previous format
        doc = nlp.make_doc(text) # create doc object from text
        ents = []
        for start, end, label in annot["entities"]: # add character indexes
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                print("Skipping entity")
            else:
                ents.append(span)
        doc.ents = ents # label the text with the ents
        db.add(doc)

    db.to_disk(FILENAME) # save the docbin object
    

SpaCy_v3_format(TRAIN_DATA,"EMSCAD/TRAIN_EVAL_DATA/train.spacy")
SpaCy_v3_format(EVAL_DATA,"EMSCAD/TRAIN_EVAL_DATA/eval.spacy")

100%|██████████| 42947/42947 [00:43<00:00, 983.93it/s] 
100%|██████████| 11573/11573 [00:12<00:00, 963.52it/s] 


# Model evaluation is done down here

### There is a evaluation cell for each model. We make use of evaluation data that is not seen by the model during training. The job description used for evaluation contains biased words.

In [None]:
# Import evaluation data
Eval_file = open("/Users/richardfrissen/Documents/Maastricht University/Thesis/Development/Model evaluation/Evaluation dataset.txt", "r")
EVAL_DATA = Eval_file.read()

### Evaluation - Model 1

In [None]:
Model_1 = spacy.load("/Users/richardfrissen/Documents/Maastricht University/Thesis/Development/Model training/Model 1/output/model-best/") #load the best model
doc = Model_1(EVAL_DATA) # input sample text

spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter

### Evaluation - Model 2

In [None]:
Model_2 = spacy.load("/Users/richardfrissen/Documents/Maastricht University/Thesis/Development/Model training/Model 2/output/model-best/") #load the best model
doc = Model_2(EVAL_DATA) # input sample text

spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter

### Evaluation - Model 3

In [None]:
Model_3 = spacy.load("/Users/richardfrissen/Documents/Maastricht University/Thesis/Development/Model training/Model 3/output/model-best/") #load the best model
doc = Model_3(EVAL_DATA) # input sample text

spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter

### Evaluation - Model 4

In [None]:
Model_4 = spacy.load("/Users/richardfrissen/Documents/Maastricht University/Thesis/Development/Model training/Model 4/output/model-best/") #load the best model
doc = Model_4(EVAL_DATA) # input sample text

spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter

### Evaluation - Model 5

In [None]:
Model_5 = spacy.load("/Users/richardfrissen/Documents/Maastricht University/Thesis/Development/Model training/Model 5/output/model-best/") #load the best model
doc = Model_5(EVAL_DATA) # input sample text

spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter

### Evaluation - Model 6

In [None]:
Model_6 = spacy.load("/Users/richardfrissen/Documents/Maastricht University/Thesis/Development/Model training/Model 6/output/model-best/") #load the best model
doc = Model_6(EVAL_DATA) # input sample text

spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter

### Evaluation - Model 7

In [None]:
Model_7 = spacy.load("/Users/richardfrissen/Documents/Maastricht University/Thesis/Development/Model training/Model 7/output/model-best/") #load the best model
doc = Model_7(EVAL_DATA) # input sample text

spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter

### Evaluation - Model 8

In [None]:
Model_8 = spacy.load("/Users/richardfrissen/Documents/Maastricht University/Thesis/Development/Model training/Model 8/output/model-best/") #load the best model
doc = Model_8(EVAL_DATA) # input sample text

spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter

In [None]:
print(TRAIN_DATA[0])

In [None]:
import spacy

nlp = spacy.blank("en")


In [None]:
nlp.add_pipe("ner", before="lemmatizer")

print(nlp.pipe_names)

In [None]:
def train_spacy(data, iterations):
    TRAIN_DATA = data
    nlp = spacy.blank("en")
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        print(ner)
        nlp.add_pipe("ner", last=True)
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update(
                [text],
                [annotations],
                drop=0.2,
                sgd=optimizer,
                losses=losses
                )
                print(losses)
    return(nlp)
                   

In [None]:
TRAIN_DATA = TRAIN_DATA_1

nlp = train_spacy(TRAIN_DATA, 30)
nlp.to_disk("ner_model_test")

In [None]:
nlp.pipe_names


In [None]:
model = "en_core_web_sm"

# Setting up the pipeline and entity recognizer.
if model is not None:
    nlp = spacy.load(model)  # load existing spacy model
    print("Loaded model '%s'" % model)
else:
    nlp = spacy.blank('en')  # create blank Language class
    print("Created blank 'en' model")
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)
else:
    ner = nlp.get_pipe('ner')

In [None]:
LABEL = 'Masculine-coded words'
LABEL

In [None]:

# Add new entity labels to entity recognizer
for i in LABEL:
    ner.add_label(i)
# Inititalizing optimizer
if model is None:
    optimizer = nlp.begin_training()
else:
    optimizer = nlp.entity.create_optimizer()

In [None]:
from __future__ import unicode_literals, print_function
import pickle
import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding


# New entity labels
# Specify the new entity labels which you want to add here
LABEL = ['I-geo', 'B-geo', 'I-art', 'B-art', 'B-tim', 'B-nat', 'B-eve', 'O', 'I-per', 'I-tim', 'I-nat', 'I-eve', 'B-per', 'I-org', 'B-gpe', 'B-org', 'I-gpe']

"""
geo = Geographical Entity
org = Organization
per = Person
gpe = Geopolitical Entity
tim = Time indicator
art = Artifact
eve = Event
nat = Natural Phenomenon
"""
# Loading training data 
with open ('Data/ner_corpus_260', 'rb') as fp:
    TRAIN_DATA = pickle.load(fp)

@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    new_model_name=("New model name for model meta.", "option", "nm", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int))

def main(model=None, new_model_name='new_model', output_dir=None, n_iter=10):
    """Setting up the pipeline and entity recognizer, and training the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spacy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe('ner')

    for i in LABEL:
        ner.add_label(i)   # Add new entity labels to entity recognizer

    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.entity.create_optimizer()

    # Get names of other pipes to disable them during training to train only NER
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
                           losses=losses)
            print('Losses', losses)

    # Test the trained model
    test_text = 'Gianni Infantino is the president of FIFA.'
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # Save model 
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # Test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)

In [None]:
# https://github.com/Jcharis/Natural-Language-Processing-Tutorials/blob/master/Training%20the%20Named%20Entity%20Recognizer%20in%20SpaCy.ipynb

In [None]:
from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy

In [None]:
# new entity label
LABEL = 'Masculine-coded words'
LABEL

In [None]:
TRAIN_DATA = Spacy_training_data

In [None]:
@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    new_model_name=("New model name for model meta.", "option", "nm", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int))


def main(model=None, new_model_name='Masculine-coded words', output_dir=None, n_iter=20):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)   # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in tqdm(TRAIN_DATA):
                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
                           losses=losses)
            print(losses)

    # test the trained model
    test_text = 'Do you like horses?'
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)


# if __name__ == '__main__':
#     plac.call(main)

In [None]:
model

In [None]:
# Run our Function
main(model)

In [None]:
# Training additional entity types using spaCy
from __future__ import unicode_literals, print_function
import pickle
import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding


# New entity labels
# Specify the new entity labels which you want to add here
LABEL = ['Masculine-coded words', 'Feminine-coded words']

"""
geo = Geographical Entity
org = Organization
per = Person
gpe = Geopolitical Entity
tim = Time indicator
art = Artifact
eve = Event
nat = Natural Phenomenon
"""
# Loading training data 
with open ('/Users/richardfrissen/Documents/Maastricht University/Thesis/Development/Datasets/EMSCAD/Input model/Annotation_data_model_input_spacy.txt', 'rb') as fp:
    TRAIN_DATA = pickle.load(fp)

@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    new_model_name=("New model name for model meta.", "option", "nm", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int))

def main(model=None, new_model_name='new_model', output_dir=None, n_iter=10):
    """Setting up the pipeline and entity recognizer, and training the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spacy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe('ner')

    for i in LABEL:
        ner.add_label(i)   # Add new entity labels to entity recognizer

    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.entity.create_optimizer()

    # Get names of other pipes to disable them during training to train only NER
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
                           losses=losses)
            print('Losses', losses)

    # Test the trained model
    test_text = 'Gianni Infantino is the president of FIFA.'
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # Save model 
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # Test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)


if __name__ == '__main__':
    plac.call(main)