In [None]:
# https://realpython.com/natural-language-processing-spacy-python/

# https://www.machinelearningplus.com/nlp/training-custom-ner-model-in-spacy/

# https://www.youtube.com/watch?v=08NbfA9od9w&list=PLc2rvfiptPSQgsORc7iuv7UxhbRJox-pW&index=8

# https://github.com/prem2017/new-entity-labelling/blob/master/new_entity_labelling.ipynb

# https://github.com/explosion/projects/blob/master/nel-emerson/scripts/notebook_video.ipynb

In [None]:
from collections import Counter
# Sample paragraph used by the examples in this cell.
# Adjacent string literals are joined with no separator, so every continuation
# line must carry its own leading space (the original fused "currentlyworking").
complete_text = ('Gus Proto is a Python developer currently'
     ' working for a London-based Fintech company. He is'
     ' interested in learning Natural Language Processing.'
     ' There is a developer conference happening on 21 July'
     ' 2019 in London. It is titled "Applications of Natural'
     ' Language Processing". There is a helpline number '
     ' available at +1-1234567891. Gus is helping organize it.'
                )

# Load the pipeline in this cell so it runs on a fresh kernel; the original
# used `nlp` before any cell had created it.
import spacy

nlp = spacy.load('en_core_web_sm')

complete_doc = nlp(complete_text)
# Remove stop words and punctuation symbols
words = [token.text for token in complete_doc
          if not token.is_stop and not token.is_punct]
word_freq = Counter(words)
# 5 commonly occurring words with their frequencies
common_words = word_freq.most_common(5)
print (common_words)
# The original cell had a pasted output line at this point; it is kept as a
# comment (presumably from the source tutorial - counts for this text may differ):
# [('Gus', 4), ('London', 3), ('Natural', 3), ('Language', 3), ('Processing', 3)]
# Unique words
unique_words = [word for (word, freq) in word_freq.items() if freq == 1]
print (unique_words)


# Lemma: print each of the first ten tokens next to its lemma.
for token in complete_doc[:10]:
    print (token, token.lemma_)

# Stop words: import the set explicitly instead of reaching through
# `spacy.lang.en...`, which needs `spacy` to be imported already (this cell
# never imports it).
from spacy.lang.en.stop_words import STOP_WORDS

spacy_stopwords = STOP_WORDS
print('No of stop words:',len(spacy_stopwords))
# Tokens of the document that are not stop words.
about_no_stopword_doc = [token for token in complete_doc if not token.is_stop]
print (about_no_stopword_doc)

#Preprocessing
def is_token_allowed(token):
    '''Tell whether a token should be kept for further processing.

    Returns False for a falsy token, a token whose text is empty or only
    whitespace, a stop word, or a punctuation symbol; True otherwise.
    '''
    # `token.text` replaces the deprecated `token.string`; after strip() both
    # give the same result.
    if not token or not token.text.strip():
        return False
    return not (token.is_stop or token.is_punct)

def preprocess_token(token):
    """Return the token's lemma with surrounding whitespace removed, in lowercase."""
    lemma = token.lemma_
    trimmed = lemma.strip()
    return trimmed.lower()

# Keep only the allowed tokens and normalise each one on the way through.
complete_filtered_tokens = list(
    map(preprocess_token, filter(is_token_allowed, complete_doc))
)
complete_filtered_tokens

### FYI: run spaCy NER before stemming and lowercasing the text
#### (because "Apple" is recognised as a company, whereas lowercase "apple" is not an entity at all)

In [None]:
import spacy
# Load the model, then stream the sample sentences through it.
nlp = spacy.load('en_core_web_sm')
texts = [
    'net income was $9.4 million comapred to last year 3.4$ million',
    'revenue exceeds twelve billion dollars will loss of $1b',
]
# The tagger and parser components are switched off for this run.
docs = nlp.pipe(texts, disable=['tagger', 'parser'])

# Print text, character span and label for every entity found.
for doc in docs:
    for ent in doc.ents:
        print(f"{ent.text} {ent.start_char} {ent.end_char} {ent.label_}")

# List format & NER Visualization

In [None]:
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')
texts = [
    'net income was $9.4 million comapred to last year 3.4$ million',
    'revenue exceeds twelve billion dollars will loss of $1b',
    'Google is very great office',
]
# Both names refer to the same list object, as in the chained assignment before.
list_of_text_data = texts
# One Doc per text; displacy accepts the whole list at once.
docs = list(map(nlp, list_of_text_data))
displacy.render(docs, style='ent')

In [None]:
# Creating a doc on news articles
news_text="""Indian man has allegedly duped nearly 50 businessmen in the UAE of USD 1.6 million and fled the country in the most unlikely way -- on a repatriation flight to Hyderabad, according to a media report on Saturday.Yogesh Ashok Yariava, the prime accused in the fraud, flew from Abu Dhabi to Hyderabad on a Vande Bharat repatriation flight on May 11 with around 170 evacuees, the Gulf News reported.Yariava, the 36-year-old owner of the fraudulent Royal Luck Foodstuff Trading, made bulk purchases worth 6 million dirhams (USD 1.6 million) against post-dated cheques from unsuspecting traders before fleeing to India, the daily said.
The bought goods included facemasks, hand sanitisers, medical gloves (worth nearly 5,00,000 dirhams), rice and nuts (3,93,000 dirhams), tuna, pistachios and saffron (3,00,725 dirhams), French fries and mozzarella cheese (2,29,000 dirhams), frozen Indian beef (2,07,000 dirhams) and halwa and tahina (52,812 dirhams).
The list of items and defrauded persons keeps getting longer as more and more victims come forward, the report said.
The aggrieved traders have filed a case with the Bur Dubai police station.
The traders said when the dud cheques started bouncing they rushed to the Royal Luck's office in Dubai but the shutters were down, even the fraudulent company's warehouses were empty."""

# Run the article through the pipeline loaded earlier.
news_doc = nlp(news_text)

# Function to identify  if tokens are named entities and replace them with UNKNOWN
def remove_details(word):
  """Return ' UNKNOWN ' for a token that belongs to a PERSON, ORG or GPE
  entity; otherwise return the token's text including trailing whitespace."""
  if word.ent_type_ in ('PERSON', 'ORG', 'GPE'):
    return ' UNKNOWN '
  # `text_with_ws` replaces the deprecated `Token.string` alias and keeps the
  # trailing whitespace, so joined tokens reproduce the original spacing.
  return word.text_with_ws


# Function where each token of spacy doc is passed through remove_deatils()
def update_article(doc):
  """Rebuild the text of `doc` with every token passed through remove_details()."""
  pieces = [remove_details(token) for token in doc]
  return ''.join(pieces)

# Passing our news_doc to the function update_article()
# Redact the article and show the result as the cell output.
redacted_article = update_article(news_doc)
redacted_article

# default sentence boundary

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')
docs = nlp("Welcome to ... spacy world... please tell us how you feel")

# Print the sentences as split by the stock pipeline (no custom rule added).
for doc in docs.sents:
    print(f'sentence: {doc}')


# Custom function for sentence boundary

In [None]:
import spacy
# Fresh pipeline instance for the custom sentence-boundary rule below.
nlp = spacy.load("en_core_web_sm")

#CUSTOM RULE FUNCTION (Split sentence when 3 dots encountered)
def my_rule(doc):
    """Mark the token following every '...' token as a sentence start.

    Returns the same `doc` so the function can be used as a pipeline component.
    """
    # Stop one token short of the end: a trailing '...' has no following token,
    # and doc[token.i + 1] would raise IndexError there.
    for token in doc[:-1]:
        if token.text == '...':
            doc[token.i + 1].is_sent_start = True
    return doc

# Insert the custom boundary rule ahead of the parser component.
# To undo this later: nlp.remove_pipe('my_rule')
nlp.add_pipe(my_rule, before='parser')

docs = nlp("Welcome to ... spacy world... please tell us how you feel")

# Print every sentence with its running index.
for idx, sent in enumerate(docs.sents):
    print(f'sentence {idx} : {sent}')

# Custom Function: include prefix of names

In [None]:
import spacy
# from spacy.Matcher import Matcher
from spacy.tokens import Span
from spacy import displacy

nlp = spacy.load('en_core_web_sm')
doc = nlp('Dr. Alex Smith chaired first board meeting at Google and 1 million')

# Entities from the stock pipeline, before any custom component is added.
entity_pairs = [(ent.text, ent.label_) for ent in doc.ents]
print(entity_pairs)

In [None]:
#Custom function to include ('Dr','Dr.','Mr','Mr.') too
def add_title(doc):
    """Extend PERSON entities to include a directly preceding title.

    A PERSON entity preceded by 'Dr', 'Dr.', 'Mr' or 'Mr.' is replaced by a
    span that also covers the title token. Every other entity is kept as is.
    Sets `doc.ents` and returns `doc`.
    """
    titles = ('Dr', 'Dr.', 'Mr', 'Mr.')
    new_ents = []
    for ent in doc.ents:
        # Only a PERSON that does not start the document can have a title token before it.
        has_title = (
            ent.label_ == 'PERSON'
            and ent.start != 0
            and doc[ent.start - 1].text in titles
        )
        if has_title:
            new_ents.append(Span(doc, ent.start - 1, ent.end, label=ent.label))
        else:
            # The original only reached this branch for PERSON entities, so
            # ORG, MONEY, etc. (and a PERSON at index 0) were silently dropped.
            new_ents.append(ent)
    doc.ents = new_ents
    return doc

# Reload the pipeline and run add_title right after the NER component.
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe(add_title, after='ner')

doc = nlp('Dr. Alex Smith chaired first board meeting at Google and 1 million')
entity_pairs = [(ent.text, ent.label_) for ent in doc.ents]
print(entity_pairs)
    
    
    

# EntityRuler : Add custom NER with exact pattern match

In [None]:
import spacy

nlp = spacy.load('en_core_web_sm')
text = 'Tony Stark owns the company StarkEnterprises  She '
doc = nlp(text)

# Print each named entity as "<label> <text>".
for ent in doc.ents:
    print(f'{ent.label_} {ent.text}')

In [None]:
import spacy
from spacy.pipeline import EntityRuler
# Local import so this cell does not depend on an earlier cell having imported displacy.
from spacy import displacy

# Initialize
# Load the model by its full package name, as the other cells do; the 'en'
# shortcut only resolves when a shortcut link has been set up.
nlp=spacy.load('en_core_web_sm')
ruler = EntityRuler(nlp)
# Exact-match phrase patterns, each mapped to the custom label.
pattern=[{"label": "RESEARCH FIELDS", "pattern": "statistics"},{"label": "RESEARCH FIELDS", "pattern": "maths"}]
ruler.add_patterns(pattern)
nlp.add_pipe(ruler)
text="""I recently published my work fanfiction by Dr.X . 
        Tony Stark owns the company StarkEnterprises . 
        You should try My guide to statistics for clear concepts.
        and also in field of maths 
     """
doc = nlp(text)
print([(ent.text, ent.label_) for ent in doc.ents])

displacy.render(doc,style='ent')

# Custom NER


In [None]:
import spacy 
from spacy.gold import GoldParse 
# from spacy.language import EntityRecognizer 
from spacy.pipeline import EntityRecognizer

# Build a blank English pipeline with a fresh NER component that knows the
# ANIMAL label. NOTE(review): the original used spaCy v1 idioms
# (load(..., entity=False), a standalone EntityRecognizer, ner.update(docs, golds));
# this version follows the v2 API used by the other training cells - confirm
# against the installed spaCy version.
nlp = spacy.blank('en')
ner = nlp.create_pipe('ner')
ner.add_label('ANIMAL')
nlp.add_pipe(ner)

doc_list = []
doc = nlp.make_doc('Llamas make great pets.')
doc_list.append(doc)
gold_list = []
# One BILUO tag per token: the sentence has five tokens (the final "." is a
# token of its own) and a single-token entity is tagged "U-<LABEL>".
gold_list.append(GoldParse(doc, entities=[u'U-ANIMAL', u'O', u'O', u'O', u'O']))

# Initialise the weights, then run one update on the single example.
optimizer = nlp.begin_training()
nlp.update(doc_list, gold_list, sgd=optimizer)


In [None]:
import spacy
import random
from spacy.gold import GoldParse
from spacy.pipeline import EntityRecognizer

# (text, [(start_char, end_char, label), ...]) training pairs.
train_data = [
    ('Who is Chaka Khan?', [(7, 17, 'PERSON')]),
    ('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')])
]

# Blank English pipeline with a fresh NER component added to it, so that
# nlp(text) below actually runs the trained recognizer.
# NOTE(review): the original used spaCy v1 idioms (nlp.tagger on a blank
# model, a standalone EntityRecognizer, ner.model.end_training()); this
# follows the v2 API used by the other training cells - confirm against the
# installed spaCy version.
nlp = spacy.blank('en')
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner)
for label in ('PERSON', 'LOC'):
    ner.add_label(label)

optimizer = nlp.begin_training()

for itn in range(5):
    random.shuffle(train_data)
    for raw_text, entity_offsets in train_data:
        doc = nlp.make_doc(raw_text)
        gold = GoldParse(doc, entities=entity_offsets)
        nlp.update([doc], [gold], sgd=optimizer)


text = 'My name is Khan, Chaka Khan and I have been to London and Berlin'
doc = nlp(text)
print(doc.ents)
for x in doc.ents:
    if x.text:
        print(x.text.strip(), x.label_.strip())

# Blueprint of complete program

In [None]:
#!/usr/bin/env python
# coding: utf8
"""Example of training spaCy's named entity recognizer, starting off with an
existing model or a blank model.
For more details, see the documentation:
* Training: https://spacy.io/usage/training
* NER: https://spacy.io/usage/linguistic-features#named-entities
Compatible with: spaCy v2.0.0+
Last tested with: v2.2.4
"""
from __future__ import unicode_literals, print_function

import plac
import random
import warnings
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding


# training data
# Each example pairs a raw text with its entity spans, written as
# (start_char, end_char, label) where text[start_char:end_char] is the entity.
TRAIN_DATA = [
    (
        "Who is Shaka Khan?",
        {"entities": [(7, 17, "PERSON")]},
    ),
    (
        "I like London and Berlin.",
        {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]},
    ),
]


def _print_predictions(nlp, examples):
    """Run `nlp` over each example text and print its entities and per-token tags."""
    for text, _ in examples:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])


@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int),
)
def main(model=None, output_dir=None, n_iter=100):
    """Load the model, set up the pipeline and train the entity recognizer.

    model: name of an existing model to load; a blank 'en' model is created
        when None.
    output_dir: directory to save the trained model to; nothing is saved (and
        the save/reload check is skipped) when None.
    n_iter: number of passes over TRAIN_DATA.

    Note: TRAIN_DATA is shuffled in place on every iteration.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    # only train NER
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        # show warnings for misaligned entity spans once
        warnings.filterwarnings("once", category=UserWarning, module='spacy')

        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # test the trained model
    _print_predictions(nlp, TRAIN_DATA)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        # parents=True lets a nested output path be created in one go;
        # exist_ok=True replaces the separate exists() check.
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        _print_predictions(nlp2, TRAIN_DATA)


# Script entry point: plac.call() invokes main().
# NOTE(review): inside a Jupyter kernel __name__ is also "__main__", so this
# runs in the notebook too, where sys.argv presumably carries the kernel's own
# arguments - confirm plac accepts them, or call main() directly in a notebook.
if __name__ == "__main__":
    plac.call(main)

    # Expected output:
    # Entities [('Shaka Khan', 'PERSON')]
    # Tokens [('Who', '', 2), ('is', '', 2), ('Shaka', 'PERSON', 3),
    # ('Khan', 'PERSON', 1), ('?', '', 2)]
    # Entities [('London', 'LOC'), ('Berlin', 'LOC')]
    # Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3),
    # ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)]

# Transfer learning from "en_core_web_sm"

### ----> train with additional examples of ORG & PRODUCT

In [None]:
import spacy
# Load the pretrained model that the following cells update.
nlp = spacy.load("en_core_web_sm")
# Show the names of its pipeline components.
nlp.pipe_names

In [None]:
article_text="""India that previously comprised only a handful of players in the e-commerce space, is now home to many biggies and giants battling out with each other to reach the top. This is thanks to the overwhelming internet and smartphone penetration coupled with the ever-increasing digital adoption across the country. These new-age innovations not only gave emerging startups a unique platform to deliver seamless shopping experiences but also provided brick and mortar stores with a level-playing field to begin their online journeys without leaving their offline legacies.
In the wake of so many players coming together on one platform, the Indian e-commerce market is envisioned to reach USD 84 billion in 2021 from USD 24 billion in 2017. Further, with the rate at which internet penetration is increasing, we can expect more and more international retailers coming to India in addition to a large pool of new startups. This, in turn, will provide a major Philip to the organized retail market and boost its share from 12% in 2017 to 22-25% by 2021. 
Here’s a view to the e-commerce giants that are dominating India’s online shopping space:
Amazon – One of the uncontested global leaders, Amazon started its journey as a simple online bookstore that gradually expanded its reach to provide a large suite of diversified products including media, furniture, food, and electronics, among others. And now with the launch of Amazon Prime and Amazon Music Limited, it has taken customer experience to a godly level, which will remain undefeatable for a very long time. 
Flipkart – Founded in 2007, Flipkart is recognized as the national leader in the Indian e-commerce market. Just like Amazon, it started operating by selling books and then entered other categories such as electronics, fashion, and lifestyle, mobile phones, etc. And now that it has been acquired by Walmart, one of the largest leading platforms of e-commerce in the US, it has also raised its bar of customer offerings in all aspects and giving huge competition to Amazon. 
Snapdeal – Started as a daily deals platform in 2010, Snapdeal became a full-fledged online marketplace in 2011 comprising more than 3 lac sellers across India. The platform offers over 30 million products across 800+ diverse categories from over 125,000 regional, national, and international brands and retailers. The Indian e-commerce firm follows a robust strategy to stay at the forefront of innovation and deliver seamless customer offerings to its wide customer base. It has shown great potential for recovery in recent years despite losing Freecharge and Unicommerce. 
ShopClues – Another renowned name in the Indian e-commerce industry, ShopClues was founded in July 2011. It’s a Gurugram based company having a current valuation of INR 1.1 billion and is backed by prominent names including Nexus Venture Partners, Tiger Global, and Helion Ventures as its major investors. Presently, the platform comprises more than 5 lac sellers selling products in nine different categories such as computers, cameras, mobiles, etc. 
Paytm Mall – To compete with the existing e-commerce giants, Paytm, an online payment system has also launched its online marketplace – Paytm Mall, which offers a wide array of products ranging from men and women fashion to groceries and cosmetics, electronics and home products, and many more. The unique thing about this platform is that it serves as a medium for third parties to sell their products directly through the widely-known app – Paytm. 
Reliance Retail – Given Reliance Jio’s disruptive venture in the Indian telecom space along with a solid market presence of Reliance, it is no wonder that Reliance will soon be foraying into retail space. As of now, it has plans to build an e-commerce space that will be established on online-to-offline market program and aim to bring local merchants on board to help them boost their sales and compete with the existing industry leaders. 
Big Basket – India’s biggest online supermarket, Big Basket provides a wide variety of imported and gourmet products through two types of delivery services – express delivery and slotted delivery. It also offers pre-cut fruits along with a long list of beverages including fresh juices, cold drinks, hot teas, etc. Moreover, it not only provides farm-fresh products but also ensures that the farmer gets better prices. 
Grofers – One of the leading e-commerce players in the grocery segment, Grofers started its operations in 2013 and has reached overwhelming heights in the last 5 years. Its wide range of products includes atta, milk, oil, daily need products, vegetables, dairy products, juices, beverages, among others. With its growing reach across India, it has become one of the favorite supermarkets for Indian consumers who want to shop grocery items from the comforts of their homes. 
Digital Mall of Asia – Going live in 2020, Digital Mall of Asia is a very unique concept coined by the founders of Yokeasia Malls. It is designed to provide an immersive digital space equipped with multiple visual and sensory elements to sellers and shoppers. It will also give retailers exclusive rights to sell a particular product category or brand in their respective cities. What makes it unique is its zero-commission model enabling retailers to pay only a fixed amount of monthly rental instead of paying commissions. With its one-of-a-kind features, DMA is expected to bring
never-seen transformation to the current e-commerce ecosystem while addressing all the existing e-commerce worries such as counterfeiting. """

# Parse the article and list every entity with its label.
doc = nlp(article_text)
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")

### Getting NER data from en_core_web_sm (Transfer learning)

In [None]:
# Grab the NER component of the loaded pipeline for further training.
ner = nlp.get_pipe('ner')


### annotated data for more examples of ORG,PRODUCT

In [None]:
# training data
# Entity spans are (start_char, end_char, label), so text[start_char:end_char]
# must be exactly the entity text. Twelve offsets in the original did not cover
# the intended word (e.g. (19, 28) pointed at "esterday." instead of
# "Chennai"); they are corrected below.
TRAIN_DATA = [
              ("Walmart is a leading e-commerce company", {"entities": [(0, 7, "ORG")]}),
              ("I reached Chennai yesterday.", {"entities": [(10, 17, "GPE")]}),
              ("I recently ordered a book from Amazon", {"entities": [(31, 37, "ORG")]}),
              ("I was driving a BMW", {"entities": [(16, 19, "PRODUCT")]}),
              ("I ordered this from ShopClues", {"entities": [(20, 29, "ORG")]}),
              ("Fridge can be ordered in Amazon ", {"entities": [(0, 6, "PRODUCT")]}),
              ("I bought a new Washer", {"entities": [(15, 21, "PRODUCT")]}),
              ("I bought a old table", {"entities": [(15, 20, "PRODUCT")]}),
              ("I bought a fancy dress", {"entities": [(17, 22, "PRODUCT")]}),
              ("I rented a camera", {"entities": [(11, 17, "PRODUCT")]}),
              ("I rented a tent for our trip", {"entities": [(11, 15, "PRODUCT")]}),
              ("I rented a screwdriver from our neighbour", {"entities": [(11, 22, "PRODUCT")]}),
              ("I repaired my computer", {"entities": [(14, 22, "PRODUCT")]}),
              ("I got my clock fixed", {"entities": [(9, 14, "PRODUCT")]}),
              ("I got my truck fixed", {"entities": [(9, 14, "PRODUCT")]}),
              ("Flipkart started it's journey from zero", {"entities": [(0, 8, "ORG")]}),
              ("I recently ordered from Max", {"entities": [(24, 27, "ORG")]}),
              ("Flipkart is recognized as leader in market", {"entities": [(0, 8, "ORG")]}),
              ("I recently ordered from Swiggy", {"entities": [(24, 30, "ORG")]})
              ]

In [None]:
# Register every label that occurs in the annotations with the NER component.
for _, annotations in TRAIN_DATA:
    for ent in annotations.get("entities"):
        label = ent[2]
        ner.add_label(label)

# Pipes listed here stay enabled while training; all others get disabled.
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [
    name for name in nlp.pipe_names
    if name not in pipe_exceptions
]



In [None]:
# Display the labels currently registered on the NER component.
ner.labels

### Train & update the NER

In [None]:
# Import requirements
import random
from spacy.util import minibatch, compounding
from pathlib import Path

# Train with every pipe in `unaffected_pipes` switched off.
with nlp.disable_pipes(*unaffected_pipes):
    # 30 passes over the training data
    for iteration in range(30):
        # new example order on every pass
        random.shuffle(TRAIN_DATA)
        losses = {}
        # minibatches whose size follows the compounding schedule
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            # one update per batch, with dropout 0.5
            nlp.update(texts, annotations, drop=0.5, losses=losses)
            print("Losses", losses)
        

### Testing the model

In [None]:
# Try the updated model on a sentence that is not part of TRAIN_DATA.
doc = nlp("I was driving a Alto")
entity_pairs = [(ent.text, ent.label_) for ent in doc.ents]
print("Entities", entity_pairs)

### Load & Save Model

In [None]:
# Save the model to a directory.
# The location is built from the current user's home directory instead of a
# hard-coded '/Users/<name>/...' path, so the cell also runs on other machines
# (for the original author this presumably resolves to the same folder).
output_dir = Path.home() / 'Desktop' / 'NER'
# Create the folder, and any missing parents, before writing into it.
output_dir.mkdir(parents=True, exist_ok=True)
nlp.to_disk(output_dir)
print("Saved model to", output_dir)

# Load the saved model and predict
print("Loading from", output_dir)
nlp_updated = spacy.load(output_dir)
doc = nlp_updated("Fridge can be ordered in FlipKart" )
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

#  NER from a blank SpaCy model

In [None]:
import spacy

nlp=spacy.blank("en")

# Keep a reference to the new component. Without it, `ner` still names the NER
# pipe of the model loaded in an earlier cell, and the labels added below
# would go to that stale component instead of this blank pipeline.
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner)

nlp.begin_training()

In [None]:
# training data
# Entity spans are (start_char, end_char, label), so text[start_char:end_char]
# must be exactly the entity text. Twelve offsets in the original did not cover
# the intended word (e.g. (19, 28) pointed at "esterday." instead of
# "Chennai"); they are corrected below.
TRAIN_DATA = [
              ("Walmart is a leading e-commerce company", {"entities": [(0, 7, "ORG")]}),
              ("I reached Chennai yesterday.", {"entities": [(10, 17, "GPE")]}),
              ("I recently ordered a book from Amazon", {"entities": [(31, 37, "ORG")]}),
              ("I was driving a BMW", {"entities": [(16, 19, "PRODUCT")]}),
              ("I ordered this from ShopClues", {"entities": [(20, 29, "ORG")]}),
              ("Fridge can be ordered in Amazon ", {"entities": [(0, 6, "PRODUCT")]}),
              ("I bought a new Washer", {"entities": [(15, 21, "PRODUCT")]}),
              ("I bought a old table", {"entities": [(15, 20, "PRODUCT")]}),
              ("I bought a fancy dress", {"entities": [(17, 22, "PRODUCT")]}),
              ("I rented a camera", {"entities": [(11, 17, "PRODUCT")]}),
              ("I rented a tent for our trip", {"entities": [(11, 15, "PRODUCT")]}),
              ("I rented a screwdriver from our neighbour", {"entities": [(11, 22, "PRODUCT")]}),
              ("I repaired my computer", {"entities": [(14, 22, "PRODUCT")]}),
              ("I got my clock fixed", {"entities": [(9, 14, "PRODUCT")]}),
              ("I got my truck fixed", {"entities": [(9, 14, "PRODUCT")]}),
              ("Flipkart started it's journey from zero", {"entities": [(0, 8, "ORG")]}),
              ("I recently ordered from Max", {"entities": [(24, 27, "ORG")]}),
              ("Flipkart is recognized as leader in market", {"entities": [(0, 8, "ORG")]}),
              ("I recently ordered from Swiggy", {"entities": [(24, 30, "ORG")]})
              ]

# Fetch the NER component from the *current* pipeline. The original reused
# whatever `ner` an earlier cell had left behind, which belongs to a different
# model.
ner = nlp.get_pipe("ner")
for _,annotations in TRAIN_DATA:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])


# Import requirements
import random
from spacy.util import minibatch, compounding
from pathlib import Path

# TRAINING THE MODEL
# with nlp.disable_pipes(*unaffected_pipes): (This line not needed since its BLANK MODEL)

# Initialise the weights now that every label is registered.
optimizer = nlp.begin_training()

# Training for 30 iterations
for iteration in range(30):

    # shuffling examples before every iteration
    random.shuffle(TRAIN_DATA)
    losses = {}
    # batch up the examples using spaCy's minibatch
    batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    sgd=optimizer,
                    losses=losses,
                )
        print("Losses", losses)
# NOTE(review): the original called ner.model.end_training() here. That is a
# spaCy v1 call which the v2 training examples do not use, so it is dropped -
# confirm against the installed spaCy version.
# Save the model to a directory.
# The location is built from the current user's home directory instead of a
# hard-coded '/Users/<name>/...' path, so the cell also runs on other machines
# (for the original author this presumably resolves to the same folder).
output_dir = Path.home() / 'Desktop' / 'NER'
# Create the folder, and any missing parents, before writing into it.
output_dir.mkdir(parents=True, exist_ok=True)
nlp.to_disk(output_dir)
print("Saved model to", output_dir)

# Load the saved model and predict
print("Loading from", output_dir)
nlp_updated = spacy.load(output_dir)
doc = nlp_updated("Fridge can be ordered in FlipKart" )
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

# Training completely NEW ENTITY type

### "FOOD" is new entity introduced and updated to existing model

In [None]:
import spacy
# Name of the entity type that is introduced in this section.
LABEL = "FOOD"

# Pretrained pipeline and its NER component, which is trained further below.
nlp = spacy.load('en_core_web_sm')
ner = nlp.get_pipe("ner")

# Training examples in the required format:
# (text, {"entities": [(start_char, end_char, label)]}) where
# text[start_char:end_char] is the entity text.
# Two annotations were corrected: "noodles" was cut short as (8, 14), and
# "Udon" carried the label "ORG" although every example here is a FOOD item.
TRAIN_DATA =[ ("Pizza is a common fast food.", {"entities": [(0, 5, "FOOD")]}),
              ("Pasta is an italian recipe", {"entities": [(0, 5, "FOOD")]}),
              ("China's noodles are very famous", {"entities": [(8,15, "FOOD")]}),
              ("Shrimps are famous in China too", {"entities": [(0,7, "FOOD")]}),
              ("Lasagna is another classic of Italy", {"entities": [(0,7, "FOOD")]}),
              ("Sushi is extemely famous and expensive Japanese dish", {"entities": [(0,5, "FOOD")]}),
              ("Unagi is a famous seafood of Japan", {"entities": [(0,5, "FOOD")]}),
              ("Tempura , Soba are other famous dishes of Japan", {"entities": [(0,7, "FOOD")]}),
              ("Udon is a healthy type of noodles", {"entities": [(0,4, "FOOD")]}),
              ("Chocolate soufflé is extremely famous french cuisine", {"entities": [(0,17, "FOOD")]}),
              ("Flamiche is french pastry", {"entities": [(0,8, "FOOD")]}),
              ("Burgers are the most commonly consumed fastfood", {"entities": [(0,7, "FOOD")]}),
              ("Burgers are the most commonly consumed fastfood", {"entities": [(0,7, "FOOD")]}),
              ("Frenchfries are considered too oily", {"entities": [(0,11, "FOOD")]})
           ]



### Add new Label & Resume training (Transfer learning)

In [None]:
# Make the new entity type known to the NER component.
ner.add_label(LABEL)

# Continue training the existing weights; the returned optimizer is passed to
# nlp.update() in the training loop.
optimizer = nlp.resume_training()

# Snapshot of the component's move names, compared again after reloading the
# saved model.
move_names = [*ner.move_names]


In [None]:
# Pipes that stay enabled during training.
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]

# Every other pipe of the pipeline gets disabled while training.
other_pipes = [
    name for name in nlp.pipe_names
    if name not in pipe_exceptions
]

# Importing requirements
from spacy.util import minibatch, compounding
import random

# Begin training by disabling other pipeline components
with nlp.disable_pipes(*other_pipes):

    # Batch-size schedule shared by all iterations.
    sizes = compounding(1.0, 4.0, 1.001)
    # Training for 30 iterations
    for itn in range(30):
        # shuffle examples before training
        random.shuffle(TRAIN_DATA)
        # batch up the examples using spaCy's minibatch
        batches = minibatch(TRAIN_DATA, size=sizes)
        # dictionary to store losses
        losses = {}
        for batch in batches:
            texts, annotations = zip(*batch)
            # Calling update() over the iteration
            nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
            print("Losses", losses)

# NOTE(review): the original ended with ner.model.end_training(). That is a
# spaCy v1 call which the v2 training examples do not use, so it is dropped -
# confirm against the installed spaCy version.

In [None]:
# Try the further-trained model on a short text.
test_text = "I ate Sushi yesterday. Maggi is a common fast food "
doc = nlp(test_text)
print("Entities in '%s'" % test_text)
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")

#### Observe the above output. The model has correctly identified the FOOD items. Also, notice that “Maggi” was not passed to the model as a training example. Still, based on the similarity of context, the model has also identified “Maggi” as FOOD.

In [None]:
# Output directory
from pathlib import Path
# Build the output location from the current user's home directory instead of
# a hard-coded '/Users/<name>/...' path, so the cell also runs on other
# machines (for the original author this presumably resolves to the same folder).
output_dir = Path.home() / 'Desktop' / 'NER'

# Saving the model to the output directory
# parents=True also creates missing parent folders; exist_ok=True replaces the
# separate exists() check.
output_dir.mkdir(parents=True, exist_ok=True)
nlp.meta['name'] = 'my_ner_food_added'  # rename model
nlp.to_disk(output_dir)
print("Saved model to", output_dir)

# Loading the model from the directory
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)
# The reloaded NER component must expose the same move names as recorded
# before training.
assert nlp2.get_pipe("ner").move_names == move_names



In [None]:
# Check the reloaded model on a new sentence.
doc2 = nlp2('Dosa is an extremely famous south Indian dish')
for ent in doc2.ents:
    print(f"{ent.label_} {ent.text}")

# Pipeline options

In [None]:
# Display the names of the components in the current pipeline.
nlp.pipe_names

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')

# Create a text-categorizer component and put it at the front of the pipeline.
textcat = nlp.create_pipe('textcat')
nlp.add_pipe(textcat, first=True)


# #ADD Custom Function
def CUST_FUNC1(doc):
    """Print the length of `doc` and hand the same object back unchanged."""
    print('CUSTOM FUNC :length of doc is', len(doc))
    return doc
    
# Placement options for add_pipe: before=..., after=..., last=True, first=True
nlp.add_pipe(CUST_FUNC1, before='ner')

# Remove a pipeline component by name.
nlp.remove_pipe('textcat')

# Rename a pipeline component.
nlp.rename_pipe('ner', 'my_custom_ner')

# Calling nlp on a text runs the pipeline, including the custom function.
doc = nlp(" The Hindu Newspaper has increased the cost. I usually read the paper  ")

# PhraseMatcher & SPAN

In [None]:
# Importing PhraseMatcher from spacy and intialize with a model's vocab
from spacy.matcher import PhraseMatcher
nlp = spacy.load('en_core_web_sm')
matcher = PhraseMatcher(nlp.vocab)

# Titles the matcher should find.
book_names = [
    'Pride and prejudice',
    'Mansfield park',
    'The Tale of Two cities',
    'Great Expectations',
]

# Turn each title into a Doc via nlp.pipe() and register them all under one key.
book_patterns = [*nlp.pipe(book_names)]
matcher.add("identify_books", None, *book_patterns)

# Import Span to slice the Doc
from spacy.tokens import Span

# Define the custom pipeline component
def identify_books(doc):
    """Pipeline component: replace doc.ents with one "BOOKS" span per match
    of the module-level `matcher`, then return the doc."""
    spans = []
    for _match_id, start, end in matcher(doc):
        spans.append(Span(doc, start, end, label="BOOKS"))
    # Overwrites whatever entities were set on the doc before.
    doc.ents = spans
    return doc

# Place the custom component right after "ner" and show the resulting pipeline.
nlp.add_pipe(identify_books, after="ner")
print(nlp.pipe_names)

# Run the pipeline on a sample text.
doc = nlp("The library has got several new copies of Mansfield park and Great Expectations . I have filed a suggestion to buy more copies of The Tale of Two cities ")

# Show the entities and their labels to verify the component's effect.
entity_pairs = [(ent.text, ent.label_) for ent in doc.ents]
print(entity_pairs)

In [None]:
from spacy.matcher import Matcher
# Token-pattern matcher bound to the pipeline's vocabulary (rebinds `matcher`).
matcher = Matcher(vocab=nlp.vocab)
# Adjacent string literals are joined with no separator, so every continuation
# line must carry its own leading space (the original fused "conferencehappening").
conference_org_text = ('There is a developer conference'
     ' happening on 21 July 2019 in London. It is titled'
     ' "Applications of Natural Language Processing".'
     ' There is a helpline number available'
     ' at (123) 456-789')

def extract_phone_number(nlp_doc):
    """Return the text of the first phone number found in ``nlp_doc``.

    The pattern describes ``(ddd) ddd-ddd`` with an optional dash. It is
    registered on the module-level ``matcher`` under the key 'PHONE_NUMBER'
    only when that key is not present yet, so calling this function
    repeatedly does not keep adding the same pattern to the shared matcher.

    Args:
        nlp_doc: A processed document to search.

    Returns:
        The text of the first match reported by the matcher, or ``None``
        when nothing matches.
    """
    pattern = [{'ORTH': '('}, {'SHAPE': 'ddd'},
               {'ORTH': ')'}, {'SHAPE': 'ddd'},
               {'ORTH': '-', 'OP': '?'},
               {'SHAPE': 'ddd'}]
    if 'PHONE_NUMBER' not in matcher:
        matcher.add('PHONE_NUMBER', None, pattern)

    # Only the first reported match is of interest.
    for _match_id, start, end in matcher(nlp_doc):
        return nlp_doc[start:end].text
    return None

# Run the pipeline on the sample text, then search the resulting doc.
conference_org_doc = nlp(conference_org_text)
# Last expression of the cell, so the notebook displays the returned value.
extract_phone_number(conference_org_doc)

# BILUO way of annotating (GoldParse)

#### sample example

In [None]:
import spacy
# Blank English pipeline; only make_doc() is used from it here
nlp = spacy.blank('en')

text = u"This is Google Inc. "
# One entity as (start_char, end_char, label): characters 8-19 are "Google Inc."
entities = [(8, 19, 'ORG')]

doc = nlp.make_doc(text)
gold = spacy.gold.GoldParse(doc, entities=entities)
# Display the annotations stored on the GoldParse
gold.orig_annot

### By using EntityRecognizer

In [None]:
import spacy 
from spacy.gold import GoldParse 
# from spacy.language import EntityRecognizer 
from spacy.pipeline import EntityRecognizer

# Load the English model without its own NER and parser. spaCy v2 switches
# components off with `disable=[...]`; the `entity=False, parser=False`
# keywords are not how that is done there.
nlp = spacy.load('en', disable=['ner', 'parser'])
doc = nlp('Llamas make great pets.')
doc_list = [doc]

# One BILUO tag per token. The sentence has five tokens (the final "." is a
# token of its own) and a single-token entity is written "U-<LABEL>".
# The tags have to be passed as `entities=`: GoldParse's second positional
# parameter is not the entity list.
gold_list = [GoldParse(doc, entities=[u'U-ANIMAL', u'O', u'O', u'O', u'O'])]

# A freshly constructed EntityRecognizer knows no labels and has no model yet,
# so register the label and initialise it before the first update.
ner = EntityRecognizer(nlp.vocab)
ner.add_label('ANIMAL')
optimizer = ner.begin_training([])
ner.update(doc_list, gold_list, sgd=optimizer)


### By passing exact indices

In [25]:
import random

import spacy
# GoldParse is used in the loop below; import it here so this cell does not
# depend on an import made by a different cell.
from spacy.gold import GoldParse

# (text, annotations) pairs; each entity is (start_char, end_char, label).
TRAINING_DATA = [
    ("How to preorder the iPhone X", {'entities': [(20, 28, 'GADGET')]})
    #Lots of other things
]
LABEL = "GADGET"
N_ITERATIONS = 10

# Only the NER component is trained here, so the parser and tagger are not loaded.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger'])

# Register the new label first, then create the optimizer for continued training.
ner = nlp.get_pipe('ner')
ner.add_label(LABEL)
optimizer = nlp.resume_training()

for iteration in range(N_ITERATIONS):
    # Visit the examples in a different order on every pass.
    random.shuffle(TRAINING_DATA)
    losses = {}

    for text, annotations in TRAINING_DATA:
        doc = nlp.make_doc(text)
        entity_offsets = annotations["entities"]
        gold = GoldParse(doc, entities=entity_offsets)
        nlp.update([doc], [gold], drop=0.5, sgd=optimizer, losses=losses)

    # `losses` is reset at the top of each pass, so report it once per pass.
    print('Losses with gold', losses)

Losses with gold {'ner': 4.947793365689627}
Losses with gold {'ner': 7.457957863807753}
Losses with gold {'ner': 5.987422680586395}
Losses with gold {'ner': 8.068778574466705}
Losses with gold {'ner': 5.454485620803871}
Losses with gold {'ner': 8.001810789108276}
Losses with gold {'ner': 7.322129189968109}
Losses with gold {'ner': 5.561052802109079}
Losses with gold {'ner': 8.695716381072998}
Losses with gold {'ner': 5.7760476442449065}


### Creating a BILUO file

In [34]:
import spacy
import numpy as np
import pandas as pd

nlp = spacy.load('en')
text = ("When Sebastian Thrun started working on self-driving cars at "
        "Google in 2007, few people outside of the company took him "
        "seriously. “I can tell you very senior CEOs of major American "
        "car companies would shake my hand and turn away because I wasn’t "
        "worth talking to,” said Thrun, in an interview with Recode earlier "
        "this week.")
doc = nlp(text)

# One row per token. Every token starts out as 'O' (outside any entity);
# the real tags are meant to be filled in afterwards following the scheme.
words = [token.text for token in doc]
labels = ['O' for _ in words]

df = pd.DataFrame({'word': words, 'label': labels})
# The ".biluo" extension only signals the encoding; the content is plain CSV.
df.to_csv('ner-token-per-line.biluo', index=False)

sample_rows = df[5:10]
print("Sample entries of BILUO file:\n", sample_rows)
print("============================")

# Load the BILUO-format file back in
from spacy.gold import GoldParse
from spacy.tokens import Doc

dpath = 'ner-token-per-line.biluo'

# Read both columns as plain strings. With pandas' default NA handling, a
# token such as "NA", "null" or "nan" would come back as a float NaN.
df = pd.read_csv(dpath, sep=',', dtype=str, keep_default_na=False)
words = df.word.values
ents = df.label.values
text = ' '.join(words)

# Build the Doc directly from the stored tokens. Re-tokenizing the joined
# text is not guaranteed to reproduce the same tokens, and the tag list has
# to line up with the Doc one tag per token.
doc = Doc(nlp.vocab, words=[str(word) for word in words])
g = GoldParse(doc, entities=[str(tag) for tag in ents])
X = [doc]
Y = [g]

# Entity labels to introduce
add_ents = ['DATED']

# Re-use the pipeline's "ner" component when it has one; otherwise create one
# and append it (the case for a pipeline built with `spacy.blank('en')`).
if "ner" in nlp.pipe_names:
    ner = nlp.get_pipe("ner")
else:
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)

# Names listed by the component before the new labels are added
prev_ents = ner.move_names
print('\n[Existing Entities]:\n', ner.move_names)

for ent in add_ents:
    ner.add_label(ent)

# Same listing afterwards; the set difference is what the new labels added
new_ents = ner.move_names
added_moves = set(new_ents) - set(prev_ents)
print('\n\n[New Entities]: \n', list(added_moves))
print('\n\n')
## Training
n_iter = 20
# None selects nlp.begin_training() below; anything else selects nlp.resume_training().
# NOTE(review): `nlp` in this cell comes from spacy.load('en') - confirm that
# begin_training() rather than resume_training() is what is wanted here.
model = None

# Every component except "ner" is switched off while training
other_pipes = list(filter(lambda name: name != "ner", nlp.pipe_names))
print(f'[OtherPipes] = {other_pipes} will be disabled')

with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training() if model is None else nlp.resume_training()
    for i in range(n_iter):
        # Fresh loss dict per iteration; nlp.update() fills it in
        losses = {}
        nlp.update(X, Y, losses=losses, drop=0.0, sgd=optimizer)
        print("Losses", losses)

Sample entries of BILUO file:
       word label
5       on     O
6     self     O
7        -     O
8  driving     O
9     cars     O

[Existing Entities]:
 ['B-ORG', 'B-DATE', 'B-PERSON', 'B-GPE', 'B-MONEY', 'B-CARDINAL', 'B-NORP', 'B-PERCENT', 'B-WORK_OF_ART', 'B-LOC', 'B-TIME', 'B-QUANTITY', 'B-FAC', 'B-EVENT', 'B-ORDINAL', 'B-PRODUCT', 'B-LAW', 'B-LANGUAGE', 'I-ORG', 'I-DATE', 'I-PERSON', 'I-GPE', 'I-MONEY', 'I-CARDINAL', 'I-NORP', 'I-PERCENT', 'I-WORK_OF_ART', 'I-LOC', 'I-TIME', 'I-QUANTITY', 'I-FAC', 'I-EVENT', 'I-ORDINAL', 'I-PRODUCT', 'I-LAW', 'I-LANGUAGE', 'L-ORG', 'L-DATE', 'L-PERSON', 'L-GPE', 'L-MONEY', 'L-CARDINAL', 'L-NORP', 'L-PERCENT', 'L-WORK_OF_ART', 'L-LOC', 'L-TIME', 'L-QUANTITY', 'L-FAC', 'L-EVENT', 'L-ORDINAL', 'L-PRODUCT', 'L-LAW', 'L-LANGUAGE', 'U-ORG', 'U-DATE', 'U-PERSON', 'U-GPE', 'U-MONEY', 'U-CARDINAL', 'U-NORP', 'U-PERCENT', 'U-WORK_OF_ART', 'U-LOC', 'U-TIME', 'U-QUANTITY', 'U-FAC', 'U-EVENT', 'U-ORDINAL', 'U-PRODUCT', 'U-LAW', 'U-LANGUAGE', 'O']


[New Ent

## Link for other ways of using GoldParse:
https://github.com/prem2017/new-entity-labelling/blob/master/new_entity_labelling.ipynb

# Evaluate Model

In [None]:
import spacy
from spacy.gold import GoldParse
from spacy.scorer import Scorer
def evaluate(model, examples):
    """Score ``model`` against annotated examples.

    Args:
        model: Pipeline object. It is called on each text to get the
            prediction, and its ``make_doc`` builds the doc that the gold
            annotation is attached to.
        examples: Iterable of ``(text, annotations)`` pairs, where
            ``annotations['entities']`` holds the gold entities.

    Returns:
        The ``scores`` attribute of the ``Scorer`` once every example has
        been scored.
    """
    scorer = Scorer()
    for text, annotations in examples:
        reference = GoldParse(model.make_doc(text), entities=annotations['entities'])
        prediction = model(text)
        scorer.score(prediction, reference)
    return scorer.scores

# NOTE(review): `new_model` and `test_data` are not defined in the cells visible
# here; presumably they are created elsewhere in the notebook - confirm before running.
test_result = evaluate(new_model, test_data)