

# ref : https://spacy.io/usage/processing-pipelines#pipelines



In [1]:
import spacy

In [2]:

nlp = spacy.blank("en")

doc = nlp("Captain america ate 100$ of samosa. Then he said I can do this all day.")

for token in doc:
    print(token)

Captain
america
ate
100
$
of
samosa
.
Then
he
said
I
can
do
this
all
day
.


In [3]:
nlp.pipe_names

[]

In [4]:
nlp = spacy.load("en_core_web_sm")
nlp.pipe_names


['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [5]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7ea01964bee0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7ea01ac19ae0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7ea00da2e1f0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7ea011387b40>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7ea00e083180>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7ea00da2eb90>)]

In [8]:
# pos = part of speach
# tager component
doc = nlp("Parth love data science which avg salary is $99000 .")

for token in doc:
    print(token, " | ", spacy.explain(token.pos_), " | ", token.lemma_)

Parth  |  adjective  |  parth
love  |  noun  |  love
data  |  noun  |  datum
science  |  noun  |  science
which  |  pronoun  |  which
avg  |  proper noun  |  avg
salary  |  noun  |  salary
is  |  auxiliary  |  be
$  |  symbol  |  $
99000  |  numeral  |  99000
.  |  punctuation  |  .


Named Entity Recognition (NER)

In [9]:
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")
for ent in doc.ents:
    print(ent.text, ent.label_)

Tesla Inc ORG
$45 billion MONEY


In [13]:
# dispancy
from spacy import displacy

displacy.render(doc, style="ent")

'<div class="entities" style="line-height: 2.5; direction: ltr">\n<mark class="entity" style="background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Tesla Inc\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">ORG</span>\n</mark>\n is going to acquire twitter for \n<mark class="entity" style="background: #e4e7d2; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    $45 billion\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">MONEY</span>\n</mark>\n</div>'

In [12]:
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")
for ent in doc.ents:
    print(ent.text, ent.label_,spacy.explain(ent.label_))

Tesla Inc ORG Companies, agencies, institutions, etc.
$45 billion MONEY Monetary values, including unit


In [14]:
#Adding a component to a blank pipeline (French pipeline )

In [15]:
source_nlp = spacy.load("en_core_web_sm")

nlp = spacy.blank("en")
nlp.add_pipe("ner", source=source_nlp)
nlp.pipe_names

['ner']

In [16]:


doc = nlp("Tesla Inc va racheter Twitter pour 45 milliards de dollars")
for ent in doc.ents:
    print(ent.text, ent.label_)


Tesla Inc ORG
45 milliards de dollars MONEY
