#Tokenization


Tokenization is the task of splitting a text into meaningful segments,called tokens.The input to the tokenizer is a unicode text and the output is a doc object

In [1]:
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp('Apple is looking at buying U.K. startup for $1 billion')

In [2]:
for token in doc:
  print(token.text)

Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion


#Part of speech tagging
this is the process of assigning each word in a text or sentence a part of speech labelc,such as noun,verb,adjective


In [3]:
doc

Apple is looking at buying U.K. startup for $1 billion

In [5]:
for token in doc:
  print(token.text,token.lemma_)

Apple Apple
is be
looking look
at at
buying buy
U.K. U.K.
startup startup
for for
$ $
1 1
billion billion


In [10]:
for token in doc:
  print(f' {token.text:{15}} {token.lemma_:{15}} {token.pos_:{10}}    {token.is_stop}')

 Apple           Apple           PROPN         False
 is              be              AUX           True
 looking         look            VERB          False
 at              at              ADP           True
 buying          buy             VERB          False
 U.K.            U.K.            PROPN         False
 startup         startup         NOUN          False
 for             for             ADP           True
 $               $               SYM           False
 1               1               NUM           False
 billion         billion         NUM           False


#Dependency Parsing

Assigning syntactic dependency labels,describing the relations between individual tokens like subject or object

In [12]:
for chunk in doc.noun_chunks:
  print(f'{chunk.text:{15}} {chunk.root.text:{15}} {chunk.root.dep_}')

Apple           Apple           nsubj
U.K.            U.K.            dobj


#Named Entity Recognition

Labeling namedreal world objects like persons,companies or locations

In [13]:
doc

Apple is looking at buying U.K. startup for $1 billion

In [14]:
for ent in doc.ents:
  print(ent.text,ent.label_)

Apple ORG
U.K. GPE
$1 billion MONEY


#Sentence Segmentation
this is a process of dividing a text or document into individual sentences



In [15]:
#doc.sents

In [16]:
doc

Apple is looking at buying U.K. startup for $1 billion

In [17]:
for sent in doc.sents:
  print(sent)

Apple is looking at buying U.K. startup for $1 billion


In [18]:
doc1 = nlp('This is the first sentence.This is the second sentence.This is the third sentence.')

In [19]:
doc1

This is the first sentence.This is the second sentence.This is the third sentence.

In [20]:
for sent in doc1.sents:
  print(sent)

This is the first sentence.
This is the second sentence.
This is the third sentence.


In [25]:
doc1 = nlp("This is the first.*. sentence.This is the... second sentence")

In [26]:
for sent in doc1.sents:
  print(sent)

This is the first.
*. sentence.
This is the... second sentence


In [42]:
from spacy.language import Language
@Language.component("set_rule")
def set_rule(doc):
  for token in doc [:-1]:
     if token.text == '.*.':
      doc[token.i+1].sent_start = True
  return doc


In [43]:
nlp= spacy.load("en_core_web_sm")
nlp.add_pipe("set_rule",before="parser")
doc1 = nlp("This is the first.*. sentence.This is the.*. second sentence")

In [44]:
for sent in doc1.sents:
  print(sent)

This is the first.
*. sentence.
This is the.
*.
second sentence


In [45]:
for token in doc:
  print(token.text)

Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion


#Visualization

In [46]:
from spacy import displacy
doc

Apple is looking at buying U.K. startup for $1 billion

In [47]:
displacy.render(doc,style='dep',jupyter=True)

In [49]:
displacy.render(doc,style='dep',options={'compact':True, 'distance': 100})

In [50]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

In [51]:
displacy.render(doc,style='ent')

Conclusion visualize name entity recognition