In [28]:
import numpy as np
import spacy
from spacy.matcher import PhraseMatcher
from sklearn.model_selection import train_test_split
import random
from spacy.util import minibatch, compounding
from spacy.training.example import Example

# Load the small English pipeline that the rest of the notebook updates in place.
nlp = spacy.load("en_core_web_sm")
# Phrase matcher sharing the pipeline's vocabulary.
# NOTE(review): `matcher` is not used in the cells visible here - confirm it is still needed.
matcher = PhraseMatcher(nlp.vocab)
# Handle on the pipeline's NER component; new labels are registered on it further down.
ner = nlp.get_pipe("ner")
import warnings
# Blanket filter: every warning raised after this point is suppressed,
# including deprecation warnings from spaCy and scikit-learn.
warnings.filterwarnings('ignore')

### Reading files

In [29]:
def read_lines(path):
    """Return every line of the UTF-8 text file at `path`, newlines included.

    The file is closed before the function returns.
    """
    with open(path, encoding='utf-8') as file:
        return file.readlines()


# Forward slashes are accepted by Windows as well as POSIX systems; the
# previous backslash-separated paths only resolved on Windows.
cups_data = read_lines("data/cups.txt")
table_data = read_lines("data/table.txt")
chair_data = read_lines("data/chair.txt")
bottle_data = read_lines("data/bottle.txt")

### Converting to lower case and removing surrounding whitespace

In [30]:
def _normalise(sentence):
    """Lower-case `sentence`, drop newline characters and trim surrounding whitespace."""
    return sentence.lower().replace("\n", "").strip()


def _clean(sentences):
    """Normalise every sentence and discard the ones that end up empty."""
    normalised = (_normalise(sentence) for sentence in sentences)
    return [sentence for sentence in normalised if sentence != ""]


cups_data = _clean(cups_data)
table_data = _clean(table_data)
bottle_data = _clean(bottle_data)
chair_data = _clean(chair_data)

### Creating tokens

In [31]:
# Split every sentence on single spaces, one token list per sentence,
# for each of the four corpora in turn.
cups_data_tokens, table_data_tokens, bottle_data_tokens, chair_data_tokens = (
    [sentence.split(" ") for sentence in corpus]
    for corpus in (cups_data, table_data, bottle_data, chair_data)
)

### Collecting the surface forms that match each pattern

In [32]:
def _unique_tokens_containing(tokenised_sentences, keyword):
    """Return the distinct tokens containing `keyword`, as ordered by `np.unique`."""
    hits = [
        str(token)
        for tokens in tokenised_sentences
        for token in tokens
        if keyword in token
    ]
    return list(np.unique(hits))


patterns_cups = _unique_tokens_containing(cups_data_tokens, "cup")
patterns_table = _unique_tokens_containing(table_data_tokens, "table")
patterns_bottle = _unique_tokens_containing(bottle_data_tokens, "bottle")
patterns_chair = _unique_tokens_containing(chair_data_tokens, "chair")

### Creating the training data for the NER model

In [33]:
def lemma(value):
    """Return the lemmas of all tokens in `value`, concatenated without a separator.

    Runs the module-level `nlp` pipeline on `value`.
    """
    return ''.join(w.lemma_ for w in nlp(value))


patterns = ["cup", "table", "chair", "bottle"]


def _find_in_first_token(sent, pattern):
    """Locate `pattern` inside the first space-separated token that contains it.

    Returns `(start, token_end)` as character offsets into `sent`: `start` is
    where the pattern begins and `token_end` is one past the last character of
    the enclosing token. Returns None when no token contains the pattern.
    """
    offset = 0
    for token in sent.split(" "):
        idx = token.find(pattern)
        if idx != -1:
            return offset + idx, offset + len(token)
        offset += len(token) + 1  # +1 for the space consumed by split(" ")
    return None


def get_model_data(data, patterns, label_fn=None):
    """Build spaCy-style NER training tuples from raw sentences.

    For each sentence and each pattern, only the first token containing the
    pattern is annotated. If that token ends in 's', the span runs from the
    start of the pattern to the end of the token (so "cups" is covered in
    full); otherwise the span covers exactly the pattern.

    Args:
        data: iterable of sentences (strings).
        patterns: iterable of substrings to look for; empty strings are skipped.
        label_fn: callable mapping the matched text to a label string, which is
            then capitalised. Defaults to the module-level `lemma`.

    Returns:
        A list of `(sentence, {"entities": [(start, end, label), ...]})`
        tuples, one per input sentence, in input order.
    """
    if label_fn is None:
        label_fn = lemma

    model_data = []
    for sent in data:
        entities = []
        for pattern in patterns:
            if not pattern:
                continue
            hit = _find_in_first_token(sent, pattern)
            if hit is None:
                continue
            # A match at offset 0 is valid; only a missing match is skipped.
            start, token_end = hit
            if sent[token_end - 1] == 's':
                end = token_end
            else:
                end = start + len(pattern)
            # Label the text that is actually annotated, so a mid-token match
            # such as "teacup" is labelled from "cup" rather than "tea".
            entities.append((start, end, label_fn(sent[start:end]).capitalize()))
        model_data.append((sent, {"entities": entities}))
    return model_data

### Getting the complete datasets

In [34]:
# Pool the four per-object corpora into a single list of sentences.
full_data = [*cups_data, *table_data, *chair_data, *bottle_data]
# Pool the per-object surface forms into a single list as well.
patterns_data = [*patterns_bottle, *patterns_chair, *patterns_cups, *patterns_table]

In [35]:
# Annotate every pooled sentence with the character spans of the base patterns.
model_data = get_model_data(data=full_data, patterns=patterns)

### Adding the entity labels to the NER component

In [36]:
# Register the label of every annotated span with the NER component,
# in the order the spans appear in the training data.
for _text, annotation in model_data:
    for _start, _end, label in annotation.get("entities"):
        ner.add_label(label)

### Selecting the pipeline components to disable during training

In [37]:
# Components that stay enabled while training; every other component in the
# pipeline is collected into `unaffected_pipes`.
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = list(
    filter(lambda name: name not in pipe_exceptions, nlp.pipe_names)
)

In [38]:
# Seed Python's `random` and NumPy (plus any GPU backend spaCy is using)
# before the first stochastic step, so that this shuffle and the later
# train/test split and training run are repeatable under Restart & Run All.
spacy.util.fix_random_seed(42)

# Shuffling the dataset
random.shuffle(model_data)

In [39]:
# Generating the training and the test data:
# 30% of the annotated sentences are held out as test data, the rest is used for training.
train_data, test_data = train_test_split(model_data, test_size=0.3)

### Building the NER model

In [40]:
# Disable the components listed in `unaffected_pipes` for the duration of training.
with nlp.disable_pipes(*unaffected_pipes):
    # Ten passes over the training set.
    for iteration in range(10):
        # Re-order the examples before every pass.
        random.shuffle(train_data)
        # Losses are accumulated per pass and reset at the start of the next one.
        losses = {}
        # Fixed-size batches of 40 examples via spaCy's minibatch helper.
        for batch in minibatch(train_data, size=40):
            example = [
                Example.from_dict(nlp.make_doc(text), annotation)
                for text, annotation in batch
            ]
            nlp.update(
                example,
                losses=losses,
                drop=0.5,  # dropout - make it harder to memorise data
            )

### Using displaCy to check the model's predictions

In [41]:
# Run the updated pipeline on a sentence that mentions three of the four objects.
sample_text = "I was having a cup of tea sitting in a chair just beside the table."
doc = nlp(sample_text)
found_entities = [(ent.text, ent.label_) for ent in doc.ents]
print("Entities", found_entities)


from spacy import displacy

# Only the four custom labels are rendered.
# NOTE(review): the single colour override targets "ORG", which is not among
# the displayed labels - confirm whether custom colours were intended.
colors = {"ORG": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
options = {"ents": ["Cup","Chair","Table","Bottle"], "colors": colors}
displacy.render(doc, style="ent", options=options)

Entities [('cup', 'Cup'), ('chair', 'Chair'), ('table', 'Table')]


### Saving the model by using the disk serializer

In [43]:
# Save the `nlp` pipeline (the one updated by the training loop) to disk.
nlp.to_disk("my-model") # creates a directory at this path