# __Language Processing Pipeline__
## Pipeline is basically a bunch of components which comes after tokenizer like tagger, parser, ner (Name Entity Recognition) and we can have number of processing steps.

In [12]:
import spacy

In [13]:
nlp = spacy.blank("en") # Here, we created a blank language processing pipeline

doc = nlp("Captain America ate 100$ of Apples. Then he said he can do that all day.")

for token in doc:
  print(token)

Captain
America
ate
100
$
of
Apples
.
Then
he
said
he
can
do
that
all
day
.


In [14]:
nlp.pipe_names

[]

## Here, we can see a blank array means by default there is no components of pipelines. Now we need to use pre-trained pipeline. Refer https://spacy.io/usage/models#quickstart and download the command using __python -m spacy download en_core_web_sm__ (NB: This is for English) command.

In [15]:
nlp = spacy.load("en_core_web_sm")
# en = english and sm = small

In [16]:
# Checking all the components
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7f79afed9340>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7f79afed9be0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7f79b1548c80>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7f79b1b1acc0>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7f79b17d3f00>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7f79b1548b30>)]

In [17]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [18]:
doc = nlp("Captain America ate 100$ of Apples. Then he said he can do that all day.")

for token in doc:
  print(token, " | ", token.pos_, " | ", token.lemma_)

Captain  |  PROPN  |  Captain
America  |  PROPN  |  America
ate  |  VERB  |  eat
100  |  NUM  |  100
$  |  NUM  |  $
of  |  ADP  |  of
Apples  |  PROPN  |  Apples
.  |  PUNCT  |  .
Then  |  ADV  |  then
he  |  PRON  |  he
said  |  VERB  |  say
he  |  PRON  |  he
can  |  AUX  |  can
do  |  VERB  |  do
that  |  PRON  |  that
all  |  DET  |  all
day  |  NOUN  |  day
.  |  PUNCT  |  .


## __pos = Parts of Speech.__
## When we are speaking an english sentence, every word has some meaning For ex: Siddharth loves to play Football (Here, Siddharth & Football are noun and loves & play are verb; Nouns also are of 2 types PROPN (Proper Noun) and Common Noun)
## __lemmatizer__ means it will show base word. For ex: ate --> eat, played --> play, etc.

In [20]:
# Checking what is NER
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")

for ent in doc.ents:
  print(ent.text, " | ", ent.label_)

Tesla Inc  |  ORG
$45 billion  |  MONEY


## So, here it is recognizing the entities as Organization and Money.

In [21]:
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")

for ent in doc.ents:
  print(ent.text, " | ", ent.label_, " | ", spacy.explain(ent.label_))

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
$45 billion  |  MONEY  |  Monetary values, including unit


In [23]:
# Showing these entities in a fancy way
from spacy import displacy

displacy.render(doc, style="ent")

'<div class="entities" style="line-height: 2.5; direction: ltr">\n<mark class="entity" style="background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Tesla Inc\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">ORG</span>\n</mark>\n is going to acquire twitter for \n<mark class="entity" style="background: #e4e7d2; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    $45 billion\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">MONEY</span>\n</mark>\n</div>'

In [24]:
# Adding NER component to our English Language pipeline
source_nlp = spacy.load("en_core_web_sm")

nlp = spacy.blank("en")

nlp.add_pipe("ner", source = source_nlp)
nlp.pipe_names

['ner']

In [26]:
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")

for ent in doc.ents:
  print(ent.text, " | ", ent.label_, " | ", spacy.explain(ent.label_))

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
$45 billion  |  MONEY  |  Monetary values, including unit
