In [1]:
import spacy

In [2]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

corpus = 'India, officially the Republic of India (Hindi: Bhārat Gaṇarājya),[25] is a country in South Asia. It is the seventh-largest country by area, the second-most populous country, and the most populous democracy in the world. Bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, and the Bay of Bengal on the southeast, it shares land borders with Pakistan to the west;[f] China, Nepal, and Bhutan to the north; and Bangladesh and Myanmar to the east. In the Indian Ocean, India is in the vicinity of Sri Lanka and the Maldives; its Andaman and Nicobar Islands share a maritime border with Thailand, Myanmar, and Indonesia.'


In [3]:
corpus

'India, officially the Republic of India (Hindi: Bhārat Gaṇarājya),[25] is a country in South Asia. It is the seventh-largest country by area, the second-most populous country, and the most populous democracy in the world. Bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, and the Bay of Bengal on the southeast, it shares land borders with Pakistan to the west;[f] China, Nepal, and Bhutan to the north; and Bangladesh and Myanmar to the east. In the Indian Ocean, India is in the vicinity of Sri Lanka and the Maldives; its Andaman and Nicobar Islands share a maritime border with Thailand, Myanmar, and Indonesia.'

In [4]:
nlp = spacy.load('en_core_web_lg')

## Splitting The Tokens

In [5]:
nlp('GFG is looking for data science')

GFG is looking for data science

In [6]:
s = "GPT is one of the first of it's kind."

d = nlp(s)

In [7]:
print(d[0])

GPT


In [8]:
print(d)

GPT is one of the first of it's kind.


In [9]:
d[0].text

'GPT'

## Finding Part Of Speech

In [10]:
d[0].pos_

'PROPN'

In [11]:
d[4] , d[4].pos_

(the, 'DET')

In [12]:
d[6] , d[6].pos_

(of, 'ADP')

In [13]:
for i in range(0,len(d)):
    print(d[i], " : ", d[i].pos_)

GPT  :  PROPN
is  :  AUX
one  :  NUM
of  :  ADP
the  :  DET
first  :  ADJ
of  :  ADP
it  :  PRON
's  :  AUX
kind  :  ADJ
.  :  PUNCT


## Finding Grained Part Of Speech

In [14]:
for i in range(0,len(d)):
    print(d[i], " : ", d[i].tag_)

GPT  :  NNP
is  :  VBZ
one  :  CD
of  :  IN
the  :  DT
first  :  JJ
of  :  IN
it  :  PRP
's  :  VBZ
kind  :  JJ
.  :  .


In [15]:
for token in d:
    print(token)

GPT
is
one
of
the
first
of
it
's
kind
.


## spacy.explain

In [16]:
for token in d:
    print(f'{token.text:{15}}{token.pos_:{15}}{token.tag_:{15}}{spacy.explain(token.tag_)}')

GPT            PROPN          NNP            noun, proper singular
is             AUX            VBZ            verb, 3rd person singular present
one            NUM            CD             cardinal number
of             ADP            IN             conjunction, subordinating or preposition
the            DET            DT             determiner
first          ADJ            JJ             adjective (English), other noun-modifier (Chinese)
of             ADP            IN             conjunction, subordinating or preposition
it             PRON           PRP            pronoun, personal
's             AUX            VBZ            verb, 3rd person singular present
kind           ADJ            JJ             adjective (English), other noun-modifier (Chinese)
.              PUNCT          .              punctuation mark, sentence closer


In [17]:
spacy.explain(d[0].tag_)

'noun, proper singular'

## Visualisation Of Part Of Speech

<p>Displacy is tool for visualisation of part of speech.</p>

<b>displacy.render(document) : </b> This helps in visualising the POS (Part Of Speech) of the document.

In [18]:
from spacy import displacy

In [19]:
displacy.render(d, jupyter = True)

In [20]:
displacy.render(d)

In [21]:
displacy.render(d, jupyter = True, options = {'distance' : 100, 'color' : 'white', 'bg' : 'red', 'font' : 'times'})

## Named Identity Recognition

In [22]:
doc_2 = nlp('Disney Princess, also called the Princess Line, is a media franchise and toy line owned by the Walt Disney Company. Created by Disney Consumer Products chairman Andy Mooney, the franchise features a lineup of female protagonists who have appeared in various Disney franchises.')
doc_2

Disney Princess, also called the Princess Line, is a media franchise and toy line owned by the Walt Disney Company. Created by Disney Consumer Products chairman Andy Mooney, the franchise features a lineup of female protagonists who have appeared in various Disney franchises.

In [23]:
doc_2.ents

(Disney Princess,
 the Princess Line,
 the Walt Disney Company,
 Disney Consumer Products,
 Andy Mooney,
 Disney)

In [24]:
def show_entities(doc): 
    if doc_2.ents:
        for ent in doc.ents:
            print(f'{ent.text:{30}}{ent.label_:{30}}{spacy.explain(ent.label_)}')            
    else:
        print('No entities Found')

In [25]:
show_entities(doc_2)

Disney Princess               ORG                           Companies, agencies, institutions, etc.
the Princess Line             PRODUCT                       Objects, vehicles, foods, etc. (not services)
the Walt Disney Company       ORG                           Companies, agencies, institutions, etc.
Disney Consumer Products      ORG                           Companies, agencies, institutions, etc.
Andy Mooney                   PERSON                        People, including fictional
Disney                        ORG                           Companies, agencies, institutions, etc.


In [26]:
show_entities(nlp("I'm not feeling well today."))

today                         DATE                          Absolute or relative dates or periods


In [27]:
show_entities(nlp("The earth revolves around the sun in 24 hours."))
show_entities(nlp("The radius of the earth is 6371 kilometres."))

24 hours                      TIME                          Times smaller than a day
6371 kilometres               QUANTITY                      Measurements, as of weight or distance


In [28]:
show_entities(nlp("The earth revolves around the sun."))

## Adding New Entity

In [29]:
show_entities(nlp("Tesla is one of the biggest giant in the field of electric vehicles."))

Tesla                         ORG                           Companies, agencies, institutions, etc.


In [30]:
from spacy.tokens import Span as sp

d1 = nlp("Earth revolves around the sun.")
d2 = nlp("Tesla is one of the biggest giant in the field of electric vehicles.")

ORG = d2.vocab.strings['ORG']
new_entity1 = sp(d2, 0, 1, label = d1.vocab.strings['ORG'])

In [31]:
[new_entity1]

[Tesla]

## Adding New Multiple Entities at a time

In [32]:
d2 = nlp("Playing Cricket and Football are good enough for health.")

show_entities(d2)

In [33]:
from spacy.matcher import PhraseMatcher
m = PhraseMatcher(nlp.vocab)
phrase = ['Cricket','Football']

In [34]:
print([nlp(text) for text in phrase])

[Cricket, Football]


In [35]:
pattern = [nlp(text) for text in phrase]
m.add('Sports', None, *pattern)
m(d2)

[(9611670226552988807, 1, 2), (9611670226552988807, 3, 4)]

In [36]:
from spacy.tokens import Span as sp

sport = d2.vocab.strings['Sports'] 

found = m(d2)

In [37]:
for mtch in found:
    print(mtch[1], mtch[2])

1 2
3 4


In [38]:
new_ents = [sp(d2, mtch[1], mtch[2], label = sport) for mtch in found]
print(new_ents)

[Cricket, Football]


In [39]:
d2.ents = list(d2.ents) + new_ents

In [40]:
print(d2.ents)

(Cricket, Football)


In [41]:
show_entities(d2)

Cricket                       Sports                        None
Football                      Sports                        None




## Combining Two corpuses and Performing NER

In [42]:
d3 = nlp("Tesla is one of the biggest giant in the field of electric vehicles. Playing Cricket and Football are good enough for health.")
show_entities(d3)

Tesla                         ORG                           Companies, agencies, institutions, etc.


In [43]:
m = PhraseMatcher(nlp.vocab)
phrase = ['Cricket','Football']
pattern = [nlp(text) for text in phrase]
m.add('Sports', None, *pattern)
sport = d3.vocab.strings['Sports'] 
found = m(d3)
new_ents = [sp(d3, mtch[1], mtch[2], label = sport) for mtch in found]
d3.ents = list(d3.ents) + new_ents

In [44]:
show_entities(d3)

Tesla                         ORG                           Companies, agencies, institutions, etc.
Cricket                       Sports                        None
Football                      Sports                        None


## Finding Specific Tag Entities

In [45]:
d4 = nlp("Google, Apple, GFG, Tesla, Grapes are good.")
show_entities(d4)

Google                        ORG                           Companies, agencies, institutions, etc.
Apple                         ORG                           Companies, agencies, institutions, etc.
GFG                           ORG                           Companies, agencies, institutions, etc.
Tesla                         ORG                           Companies, agencies, institutions, etc.


In [46]:
d4 = nlp("Google, Apple, GFG, student")
show_entities(d4)

Google                        ORG                           Companies, agencies, institutions, etc.
Apple                         ORG                           Companies, agencies, institutions, etc.
GFG                           ORG                           Companies, agencies, institutions, etc.


In [47]:
[ent for ent in d4.ents]

[Google, Apple, GFG]

In [48]:
[ent for ent in d4.ents if ent.label_ == 'ORG']

[Google, Apple, GFG]

In [49]:
d5 = nlp("Google, Apple, GFG, 1 Million, Phone, 2 Million Dollars")
show_entities(d5)

Google                        ORG                           Companies, agencies, institutions, etc.
Apple                         ORG                           Companies, agencies, institutions, etc.
GFG                           ORG                           Companies, agencies, institutions, etc.
1 Million                     CARDINAL                      Numerals that do not fall under another type
2 Million Dollars             MONEY                         Monetary values, including unit


In [50]:
print([ent for ent in d5.ents if ent.label_ == 'CARDINAL'])
print('\n')
print([ent for ent in d5.ents if ent.label_ == 'MONEY'])

[1 Million]


[2 Million Dollars]


## REFERENCES OR APPENDIX

In [51]:
help(nlp)

Help on English in module spacy.lang.en object:

class English(spacy.language.Language)
 |  English(vocab: Union[spacy.vocab.Vocab, bool] = True, *, max_length: int = 1000000, meta: Dict[str, Any] = {}, create_tokenizer: Optional[Callable[[ForwardRef('Language')], Callable[[str], spacy.tokens.doc.Doc]]] = None, create_vectors: Optional[Callable[[ForwardRef('Vocab')], spacy.vectors.BaseVectors]] = None, batch_size: int = 1000, **kwargs) -> None
 |  
 |  Method resolution order:
 |      English
 |      spacy.language.Language
 |      builtins.object
 |  
 |  Data and other attributes defined here:
 |  
 |  Defaults = <class 'spacy.lang.en.EnglishDefaults'>
 |  
 |  default_config = {'paths': {'train': None, 'dev': None, 'vectors'...s'...
 |  
 |  factories = {'attribute_ruler': <function make_attribute_rul...<functi...
 |  
 |  lang = 'en'
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from spacy.language.Language:
 |  
 |  __call__(

In [52]:
help(show_entities(d3))

Tesla                         ORG                           Companies, agencies, institutions, etc.
Cricket                       Sports                        None
Football                      Sports                        None
Help on NoneType object:

class NoneType(object)
 |  Methods defined here:
 |  
 |  __bool__(self, /)
 |      self != 0
 |  
 |  __repr__(self, /)
 |      Return repr(self).
 |  
 |  ----------------------------------------------------------------------
 |  Static methods defined here:
 |  
 |  __new__(*args, **kwargs) from builtins.type
 |      Create and return a new object.  See help(type) for accurate signature.



In [53]:
help(d2.vocab.strings['ORG'])

Help on int object:

class int(object)
 |  int([x]) -> integer
 |  int(x, base=10) -> integer
 |  
 |  Convert a number or string to an integer, or return 0 if no arguments
 |  are given.  If x is a number, return x.__int__().  For floating point
 |  numbers, this truncates towards zero.
 |  
 |  If x is not a number or if base is given, then x must be a string,
 |  bytes, or bytearray instance representing an integer literal in the
 |  given base.  The literal can be preceded by '+' or '-' and be surrounded
 |  by whitespace.  The base defaults to 10.  Valid bases are 0 and 2-36.
 |  Base 0 means to interpret the base from the string as an integer literal.
 |  >>> int('0b100', base=0)
 |  4
 |  
 |  Built-in subclasses:
 |      bool
 |  
 |  Methods defined here:
 |  
 |  __abs__(self, /)
 |      abs(self)
 |  
 |  __add__(self, value, /)
 |      Return self+value.
 |  
 |  __and__(self, value, /)
 |      Return self&value.
 |  
 |  __bool__(self, /)
 |      self != 0
 |  
 |  __ceil_

In [54]:
help(PhraseMatcher(nlp.vocab))

Help on PhraseMatcher object:

class PhraseMatcher(builtins.object)
 |  PhraseMatcher(Vocab vocab, attr='ORTH', validate=False)
 |  Efficiently match large terminology lists. While the `Matcher` matches
 |      sequences based on lists of token descriptions, the `PhraseMatcher` accepts
 |      match patterns in the form of `Doc` objects.
 |  
 |      DOCS: https://spacy.io/api/phrasematcher
 |      USAGE: https://spacy.io/usage/rule-based-matching#phrasematcher
 |  
 |      Adapted from FlashText: https://github.com/vi3k6i5/flashtext
 |      MIT License (see `LICENSE`)
 |      Copyright (c) 2017 Vikash Singh (vikash.duliajan@gmail.com)
 |  
 |  Methods defined here:
 |  
 |  __call__(...)
 |      Find all sequences matching the supplied patterns on the `Doc`.
 |      
 |      doclike (Doc or Span): The document to match over.
 |      as_spans (bool): Return Span objects with labels instead of (match_id,
 |          start, end) tuples.
 |      RETURNS (list): A list of `(match_id, start