# About
Named Entity Recognition using spacy

In [84]:
#importing modules
import pandas as pd
import spacy
import os
#for visalisation of NER
import spacy.displacy as displacy
#downloading the model
#import spacy.cli
#spacy.cli.download("en_core_web_lg")
import random
from spacy.util import minibatch, compounding

# Brief
Named entities are mostly subjects and objects in a sentence. They can be name of persons, locations, organisations, products, time etc. It can be considered as annotation in texts with IOB and POS.

Spacy is a powerful and production ready library which can do Tokenisation, Part of Speech Tagging(POS), Lemmatisation, Sentence Boundary detection(SBD), NER, Entity linking, Similarity, Text classification along with an option to train your own custom models and serialize them.

For more info, refer [link](https://spacy.io/)

Spacy has concept of Language Processing pipeline which takes in an input text, tokenizes it, pass it through a block of tagger and parser followed by NER to generate a DOC object which contains the final output.

The most common named entities in Spacy are


1.   Person : which contains people
2.   FAC : Name of buildings, bridges etc
3.   NORP: Name of Nationalities, Religious and Political Groups.
4.   ORG:  Geopolitical Entities 
5.   GPE: Monetary values

Various models are available in the pipeline, which can be explored [here](https://spacy.io/models)

Let's explore an example






In [25]:
#loading a large model as it is highly likely to yield much accurate pos
pipeline = spacy.load("en_core_web_lg")
text = '''A mountain is an elevated portion of the Earth's crust, generally with steep sides that show significant exposed bedrock. A mountain differs from a plateau in having a limited summit area, and is larger than a hill, typically rising at least 300 metres (1000 feet) above the surrounding land.'''

In [26]:
#processing and generating the doc
doc = pipeline(text)
for entity in doc.ents:
  #printing the entity name, start char, end char and the label
  print(entity.text,entity.start_char, entity.end_char, entity.label_)

Earth 41 46 LOC
at least 300 metres 233 252 QUANTITY
1000 feet 254 263 QUANTITY


In [27]:
#visualising the same using displacy
displacy.serve(doc)


Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


# Now let's spacy for the kaggle competition.

In [3]:
|#changing the path
os.chdir('/content/drive/MyDrive/Colab Notebooks/Dataset/NER_Dataset')

In [4]:
!unzip NER.zip

Archive:  NER.zip
  inflating: ner.csv                 
  inflating: ner_dataset.csv         


In [29]:
#reading the dataset
dataset = pd.read_csv("ner_dataset.csv", sep=",", encoding="latin1").fillna(method='ffill')
dataset.head(40)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
5,Sentence: 1,through,IN,O
6,Sentence: 1,London,NNP,B-geo
7,Sentence: 1,to,TO,O
8,Sentence: 1,protest,VB,O
9,Sentence: 1,the,DT,O


In [31]:
#combining the sentences and their and Tag.
combiner_function = lambda sentence: [[word,tag] for word,tag in zip(sentence["Word"].values.tolist(), 
                                            sentence["Tag"].values.tolist())]

In [34]:
combined_dataset = dataset.groupby("Sentence #").apply(combiner_function)


In [35]:
combined_dataset.head(10)

Sentence #
Sentence: 1        [[Thousands, O], [of, O], [demonstrators, O], ...
Sentence: 10       [[Iranian, B-gpe], [officials, O], [say, O], [...
Sentence: 100      [[Helicopter, O], [gunships, O], [Saturday, B-...
Sentence: 1000     [[They, O], [left, O], [after, O], [a, O], [te...
Sentence: 10000    [[U.N., B-geo], [relief, O], [coordinator, O],...
Sentence: 10001    [[Mr., B-per], [Egeland, I-per], [said, O], [t...
Sentence: 10002    [[He, O], [said, O], [last, O], [week, O], ['s...
Sentence: 10003    [[Some, O], [1,27,000, O], [people, O], [are, ...
Sentence: 10004    [[Aid, O], [is, O], [being, O], [rushed, O], [...
Sentence: 10005    [[Lebanese, B-gpe], [politicians, O], [are, O]...
dtype: object

In [36]:
#now extracting the sentence and the tags from the dataset
sentences = [[word[0] for word in sentence] for sentence in combined_dataset.values]
tags = [[token[1] for token in tag] for tag in combined_dataset.values]

In [38]:
len(sentences) == len(tags)

True

In [79]:
#joining the sentences and tags with spaces
joined_sentences = [' '.join(sentence) for sentence in sentences]
joined_tags = [' '.join(tag) for tag in tags]

In [80]:
len(joined_sentences) == len(joined_tags)

True

In [81]:
joined_sentences[0]

'Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .'

In [82]:
joined_tags[0]

'O O O O O O B-geo O O O O O B-geo O O O O O B-gpe O O O O O'

In [87]:
# write your own core function that creates a dataset which is a list of dictionary containining sentences and entities, mail me at hrishabhsuraj52@gmail.com if you want the helper function.
#creating a mini dataset with joined_sentences[0] and joined_tags[0]
TRAIN_DATA = [
              ('Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .', {'entities': [(1,6,'O'),(7,8,'B-geo'),(9,13,'O')]})]

In [88]:
#let's train the model with these custom entities using spacy
#creating a custom language class
nlp = spacy.blank('en')
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner)

In [89]:
# adding labels
LABELS = ['B-geo','O']
for label in LABELS:
  ner.add_label(label)


In [90]:
optimizer = nlp.begin_training()

In [94]:
# training stage
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    for itn in range(1000):
        random.shuffle(TRAIN_DATA)
        for text, annotation in TRAIN_DATA:
            nlp.update([text], [annotation], sgd=optimizer, drop=0.01,losses=losses)
        print('Losses', losses)

#saving model
nlp.to_disk("model.bin")

Losses {'ner': 129.1972965869118}
Losses {'ner': 129.1972965869118}
Losses {'ner': 129.1972965869118}
Losses {'ner': 129.1972965869118}
Losses {'ner': 129.1972965869118}
Losses {'ner': 129.1972965869118}
Losses {'ner': 129.1972965869118}
Losses {'ner': 129.1972965869118}
Losses {'ner': 129.1972965869118}
Losses {'ner': 129.1972965869118}
Losses {'ner': 129.1972965869118}
Losses {'ner': 129.1972965869118}
Losses {'ner': 129.1972965869118}
Losses {'ner': 129.1972965869118}
Losses {'ner': 129.1972965869118}
Losses {'ner': 129.1972965869118}
Losses {'ner': 129.1972965869118}
Losses {'ner': 129.1972965869118}
Losses {'ner': 129.1972965869118}
Losses {'ner': 129.1972965869118}
Losses {'ner': 129.1972965869118}
Losses {'ner': 129.1972965869118}
Losses {'ner': 129.1972965869118}
Losses {'ner': 129.1972965869118}
Losses {'ner': 129.1972965869118}
Losses {'ner': 129.1972965869118}
Losses {'ner': 129.1972965869118}
Losses {'ner': 129.1972965869118}
Losses {'ner': 129.1972965869118}
Losses {'ner':

KeyboardInterrupt: ignored

In [None]:
#inferencing stage
pipeline = spacy.load("model.bin")
doc= pipeline("I have won the lottery and I am supposed to collect it tommorow evening at Maharastra.")
for ent in doc2.ents:
    print(ent.label_, ent.text)