In [1]:
import spacy

# Load the small English web model (en_core_web_sm) once; the cells below
# reuse this `nlp` object.
# Assumes the model package is already installed — TODO confirm for a fresh environment.
nlp = spacy.load('en_core_web_sm')

In [17]:
# Run the pipeline over one sentence; iterating the result yields its tokens.
# (On Python 3 every str literal is already unicode, so no u'' prefix is needed.)
doc = nlp('Apple is looking at buying new U.S. startups for many billion dollars $5')

# Three output lines per token:
#   1. raw text and lemma
#   2. part of speech (integer ID and label), tag and dependency label
#   3. word shape, alphabetic flag and stop-word flag
for token in doc:
    print(
        token.text, token.lemma_, '\n',
        token.pos, token.pos_, token.tag_, token.dep_, '\n',
        token.shape_, token.is_alpha, token.is_stop,
    )

# Last expression of the cell: part-of-speech label of the first token
doc[0].pos_


Apple apple 
 95 PROPN NNP nsubj 
 Xxxxx True False
is be 
 99 VERB VBZ aux 
 xx True True
looking look 
 99 VERB VBG ROOT 
 xxxx True False
at at 
 84 ADP IN prep 
 xx True True
buying buy 
 99 VERB VBG pcomp 
 xxxx True False
new new 
 83 ADJ JJ amod 
 xxx True False
U.S. u.s. 
 95 PROPN NNP compound 
 X.X. False False
startups startup 
 91 NOUN NNS dobj 
 xxxx True False
for for 
 84 ADP IN prep 
 xxx True True
many many 
 83 ADJ JJ amod 
 xxxx True True
billion billion 
 92 NUM CD nummod 
 xxxx True False
dollars dollar 
 91 NOUN NNS pobj 
 xxxx True False
$ $ 
 98 SYM $ nmod 
 $ False False
5 5 
 92 NUM CD npadvmod 
 d False False


'PROPN'

In [10]:
# Inspect the processing pipeline: a list of (name, component) pairs.
# For this model it holds the tagger, the dependency parser and the
# entity recognizer (ner), as the recorded output shows.

nlp.pipeline

[('tagger', <spacy.pipeline.Tagger at 0x114a02320>),
 ('parser', <spacy.pipeline.DependencyParser at 0x114a501a8>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x114a50200>)]

In [18]:
# Three short sentences, used in the following cells to look at sentence boundaries.
doc4 = nlp(
    'This is the first sentence. '
    'This is another sentence. '
    'This is the last sentence.'
)

In [28]:
# Print each sentence span that was detected in doc4, one per line.
# NOTE(review): the recorded output shows every sentence twice on its line,
# which this loop would not produce — the output looks stale; re-run to confirm.
for sentence in doc4.sents:
    print(sentence)


This is the first sentence. This is the first sentence.
This is another sentence. This is another sentence.
This is the last sentence. This is the last sentence.


In [32]:
# Other flags tried on the document: doc4.is_tagged, doc4.is_parsed

# Token 6 is "This", the first word of the second sentence (the period is
# split off as its own token), so it is marked as a sentence start.
doc4[6].is_sent_start

True

In [116]:
# Tokenization

# Sample text mixing abbreviations, an e-mail-like string, a URL and prices.
# Adjacent literals are joined by the parser, so no backslash continuations
# are needed inside the string.
# NOTE(review): there is no space between "www.rpoe.com." and "Tickets", so the
# two are fused in the text (and show up as a single token in the recorded
# output). Kept as is so the recorded outputs stay valid; add a space if the
# fusion was unintended.
mystring = (
    "Yay! We're going to Disneyland L.A.!! "
    "This is an awesome opportunity. Email me as today rpoe!poe@gmail.com! "
    "Or find me on www.rpoe.com."
    "Tickets cost $300 per head or $2 thousand per family."
)

In [117]:
# Display the assembled string to check how the continuation lines were joined
mystring

"Yay! We're going to Disneyland L.A.!! This is an awesome opportunity. Email me as today rpoe!poe@gmail.com! Or find me on www.rpoe.com.Tickets cost $300 per head or $2 thousand per family."

In [118]:
# Run the pipeline over the sample text
doc2 = nlp(mystring)

In [119]:
# One token per line: punctuation, the "'re" contraction and "$" come out as
# separate tokens, while "L.A." and the e-mail-like string stay whole.
for token in doc2:
    print(token)

Yay
!
We
're
going
to
Disneyland
L.A.
!
!
This
is
an
awesome
opportunity
.
Email
me
as
today
rpoe!poe@gmail.com
!
Or
find
me
on
www.rpoe.com.Tickets
cost
$
300
per
head
or
$
2
thousand
per
family
.


In [120]:
# Number of entries in the vocabulary attached to doc4
# (presumably the vocab shared by the en_core_web_sm pipeline — confirm).
len(doc4.vocab)

57852

In [121]:
# Named entities

# Tokens on a single line, separated by ' | ', for reference before looking
# at the entity spans.
for token in doc2:
    print(token.text, end=' | ')

Yay | ! | We | 're | going | to | Disneyland | L.A. | ! | ! | This | is | an | awesome | opportunity | . | Email | me | as | today | rpoe!poe@gmail.com | ! | Or | find | me | on | www.rpoe.com.Tickets | cost | $ | 300 | per | head | or | $ | 2 | thousand | per | family | . | 

In [122]:
# For every entity span found in doc2, print the span itself, then its label
# together with spaCy's human-readable explanation of that label.
for entity in doc2.ents:
    print(entity)
    # Escaped backslash: a bare '\ ' is an invalid escape sequence and triggers
    # a warning on recent Python versions. The printed separator is unchanged.
    print(entity.label_, '  \\  ', str(spacy.explain(entity.label_)))


Disneyland L.A.
ORG   \   Companies, agencies, institutions, etc.
today
DATE   \   Absolute or relative dates or periods
rpoe!poe@gmail.com
NORP   \   Nationalities or religious or political groups
www.rpoe.com.Tickets
ORG   \   Companies, agencies, institutions, etc.
300
MONEY   \   Monetary values, including unit
$2 thousand
MONEY   \   Monetary values, including unit


In [123]:
# Noun chunks found in the sample text, one per line
for chunk in doc2.noun_chunks:
    print(chunk)

We
Disneyland L.A.
an awesome opportunity
me
me
www.rpoe.com.Tickets
head
family


In [124]:
## Visualizing the document with displaCy

from spacy import displacy

In [125]:
# Render doc2 inline in the notebook using displaCy's default style
# (presumably the dependency view — confirm); 'distance' is passed through
# as a rendering option.
displacy.render(doc2, jupyter=True, options={'distance':60})

In [126]:
# Render the named entities of doc2 inline in the notebook.
# Author's remark: "missed TODAY as date".
# NOTE(review): the entity listing recorded earlier in this notebook labels
# "today" as DATE, so this remark may be stale — re-run to confirm.
displacy.render(doc2, style='ent', jupyter=True)

In [None]:
# Stemming 