In [2]:
import spacy

In [21]:
# Start from a blank English pipeline and run the sample text through it.
nlp = spacy.blank("en")
doc = nlp("Dr. am confused between Pineapple Pastry or Chocolate Brownie. I am hungry!")
# Emit one token per line.
for tok in doc:
    print(tok.text)

Dr.
am
confused
between
Pineapple
Pastry
or
Chocolate
Brownie
.
I
am
hungry
!


In [22]:
# A blank pipeline has no components yet, so this list is empty.
nlp.pipe_names

[]

In [23]:
# Pretrained NLP pipeline.
# Loading a trained package attaches ready-made components to `nlp`
# (list them with nlp.pipe_names).
nlp = spacy.load("en_core_web_sm")

In [24]:
# List the names of the components in the loaded pipeline.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [25]:
# Show the (name, component object) pairs that make up the loaded pipeline.
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x2c7f7a090>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x2c71cb590>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x2c75a12a0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x2c6ff8b90>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x2c70378d0>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x2c75a13f0>)]

In [26]:
doc = nlp("Dr. am confused between Pineapple Pastry or Chocolate Brownie. I am hungry!")
divider = " | "
for tok in doc:
    # tok.pos_   -> part-of-speech tag
    # tok.lemma_ -> base form of the word
    print(tok, divider, tok.pos_, divider, tok.lemma_)

Dr.  |  PROPN  |  Dr.
am  |  AUX  |  be
confused  |  VERB  |  confuse
between  |  ADP  |  between
Pineapple  |  PROPN  |  Pineapple
Pastry  |  PROPN  |  Pastry
or  |  CCONJ  |  or
Chocolate  |  PROPN  |  Chocolate
Brownie  |  PROPN  |  Brownie
.  |  PUNCT  |  .
I  |  PRON  |  I
am  |  AUX  |  be
hungry  |  ADJ  |  hungry
!  |  PUNCT  |  !


#### Named Entity Recognition

In [27]:
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")
divider = " | "
# doc.ents holds the named entities detected in the text.
for entity in doc.ents:
    label = entity.label_
    print(entity.text, divider, label, divider, spacy.explain(label))

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
$45 billion  |  MONEY  |  Monetary values, including unit


In [28]:
# displaCy gives a better visualization of the entities than plain printing.
from spacy import displacy
displacy.render(docs=doc, style="ent")

In [29]:
# NER (Named Entity Recognition) lets us recognize the entities mentioned in our text.
# Example sentence: "Bloomberg founded a data company called Bloomberg"

In [37]:
# Swap in the French pipeline and run entity recognition on a French sentence.
nlp = spacy.load("fr_core_news_sm")
doc = nlp("Tesla Inc va acquérir Twitter pour 45 milliards de dollars")

divider = " | "
for entity in doc.ents:
    label = entity.label_
    print(entity.text, divider, label, divider, spacy.explain(label))

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
Twitter  |  MISC  |  Miscellaneous entities, e.g. events, nationalities, products or works of art


In [38]:
# Part-of-speech tag and lemma for every token of the current doc.
divider = " | "
for tok in doc:
    print(tok, divider, tok.pos_, divider, tok.lemma_)

Tesla  |  PROPN  |  Tesla
Inc  |  PROPN  |  Inc
va  |  VERB  |  aller
acquérir  |  VERB  |  acquérir
Twitter  |  NOUN  |  twitter
pour  |  ADP  |  pour
45  |  NUM  |  45
milliards  |  NOUN  |  milliard
de  |  ADP  |  de
dollars  |  NOUN  |  dollar


#### Adding a component to a blank pipeline

In [34]:
# A blank pipeline has no "ner" component, so doc.ents is empty
# and the loop below prints nothing.
nlp = spacy.blank("en")
doc = nlp("Tesla Inc va acquérir Twitter pour 45 milliards de dollars")

divider = " | "
for entity in doc.ents:
    label = entity.label_
    print(entity.text, divider, label, divider, spacy.explain(label))

In [39]:
# Custom pipeline: begin with an empty English pipeline ...
nlp = spacy.blank("en")

# ... then copy the trained "ner" component over from en_core_web_sm.
source_nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("ner", source=source_nlp)

# The sourced component should now be the only entry.
nlp.pipe_names

['ner']

In [40]:
# Run the custom (ner-only) pipeline and list the entities it finds.
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")
divider = " | "
for entity in doc.ents:
    label = entity.label_
    print(entity.text, divider, label, divider, spacy.explain(label))

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
$45 billion  |  MONEY  |  Monetary values, including unit
