In [1]:
import spacy

# preloaded English web small
nlp = spacy.load('en_core_web_sm')

In [2]:
doc = nlp(u'Apple is looking at buying new U.S. startups for many billion dollars $5')

# u' means uncode strings 
# parse the strings into tokens

for token in doc: 
    print(token.text, token.lemma_, 
          '\n', token.pos, token.pos_,token.tag_,  token.dep_,
         '\n', token.shape_, token.is_alpha, token.is_stop)

# print only the pos
doc[0].pos_


Apple apple 
 95 PROPN NNP nsubj 
 Xxxxx True False
is be 
 99 VERB VBZ aux 
 xx True True
looking look 
 99 VERB VBG ROOT 
 xxxx True False
at at 
 84 ADP IN prep 
 xx True True
buying buy 
 99 VERB VBG pcomp 
 xxxx True False
new new 
 83 ADJ JJ amod 
 xxx True False
U.S. u.s. 
 95 PROPN NNP compound 
 X.X. False False
startups startup 
 91 NOUN NNS dobj 
 xxxx True False
for for 
 84 ADP IN prep 
 xxx True True
many many 
 83 ADJ JJ amod 
 xxxx True True
billion billion 
 92 NUM CD nummod 
 xxxx True False
dollars dollar 
 91 NOUN NNS pobj 
 xxxx True False
$ $ 
 98 SYM $ nmod 
 $ False False
5 5 
 92 NUM CD npadvmod 
 d False False


'PROPN'

In [3]:
# pipeline object consists of tagger, parser and ner

nlp.pipeline

[('tagger', <spacy.pipeline.Tagger at 0x10e92a2e8>),
 ('parser', <spacy.pipeline.DependencyParser at 0x10e956fc0>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x10ea360a0>)]

In [10]:
doc1 = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')

In [11]:
for sentence in doc1.sents:
    print(sentence)


This is the first sentence.
This is another sentence.
This is the last sentence.


In [12]:
#doc4.is_tagged #is_parsed

doc1[6].is_sent_start

True

In [13]:
# Tokenization 

mystring = "Yay! We're going to Disneyland L.A.!! \
This is an awesome opportunity. Email me as today rpoe!poe@gmail.com! Or find me on www.rpoe.com.\
Tickets cost $300 per head or $2 thousand per family."

In [14]:
mystring

"Yay! We're going to Disneyland L.A.!! This is an awesome opportunity. Email me as today rpoe!poe@gmail.com! Or find me on www.rpoe.com.Tickets cost $300 per head or $2 thousand per family."

In [15]:
doc2 = nlp(mystring)

In [16]:
for token in doc2:
    print(token)

Yay
!
We
're
going
to
Disneyland
L.A.
!
!
This
is
an
awesome
opportunity
.
Email
me
as
today
rpoe!poe@gmail.com
!
Or
find
me
on
www.rpoe.com.Tickets
cost
$
300
per
head
or
$
2
thousand
per
family
.


In [19]:
len(doc1.vocab) #size of the tokens in en_core_web_sm and not the variable

57852

In [20]:
# Named entities 

for token in doc2:
    print(token.text, end=' | ')

Yay | ! | We | 're | going | to | Disneyland | L.A. | ! | ! | This | is | an | awesome | opportunity | . | Email | me | as | today | rpoe!poe@gmail.com | ! | Or | find | me | on | www.rpoe.com.Tickets | cost | $ | 300 | per | head | or | $ | 2 | thousand | per | family | . | 

In [21]:
for entity in doc2.ents:
    print(entity)
    print(entity.label_, '  \  ', str(spacy.explain(entity.label_)))


Disneyland L.A.
ORG   \   Companies, agencies, institutions, etc.
today
DATE   \   Absolute or relative dates or periods
rpoe!poe@gmail.com
NORP   \   Nationalities or religious or political groups
www.rpoe.com.Tickets
ORG   \   Companies, agencies, institutions, etc.
300
MONEY   \   Monetary values, including unit
$2 thousand
MONEY   \   Monetary values, including unit


In [22]:
for chunk in doc2.noun_chunks:
    print(chunk)

We
Disneyland L.A.
an awesome opportunity
me
me
www.rpoe.com.Tickets
head
family


In [23]:
## Displaying chunks

from spacy import displacy

In [28]:
displacy.render(doc2, jupyter=True, options={'distance':150})

In [29]:
displacy.render(doc2, style='ent', jupyter=True) #missed TODAY as date

In [32]:
# Stemming
# Porter stemmer 1980s, various rules and rule ordering 
# Snowball is another language for stemming 

import nltk
from nltk.stem.porter import PorterStemmer
p_stemmer = PorterStemmer()

In [44]:
words = ['go', 'goes', 'went', 'going', 'goer', 'gone', 'gophers', 
         'easily', 'quickly', 'quuuickly']

In [45]:
for word in words:
    print(word + '---->' + p_stemmer.stem(word))

go---->go
goes---->goe
went---->went
going---->go
goer---->goer
gone---->gone
gophers---->gopher
easily---->easili
quickly---->quickli
quuuickly---->quuuickli


In [46]:
# snowball 

from nltk.stem.snowball import SnowballStemmer
s_stemmer = SnowballStemmer(language='english')

#arabic danish dutch english finnish french german hungarian italian norwegian 
#portuguese romanian russian spanish swedish

In [47]:
for word in words: 
    print(word + ' ----> ' + s_stemmer.stem(word))

go ----> go
goes ----> goe
went ----> went
going ----> go
goer ----> goer
gone ----> gone
gophers ----> gopher
easily ----> easili
quickly ----> quick
quuuickly ----> quuuick


In [49]:
# Lemmatization - stemming + morphological analysis
# more informative - spacy has only lemmatization, also looks at POS

# Use spacy
doc3 = nlp(u'get going for going is good and I am a goer who have gone many lengths \
for go goes gone gooone easily, quickly and faster than anyone.')


In [52]:
for token in doc3:
    print(token.text, '\t', token.pos_, '\t', token.lemma, '\t', token.lemma_)
    
    # the number is an individual hash / number pointing to unique lemmas in the library 

get 	 VERB 	 2013399242189103424 	 get
going 	 VERB 	 8004577259940138793 	 go
for 	 ADP 	 16037325823156266367 	 for
going 	 VERB 	 8004577259940138793 	 go
is 	 VERB 	 10382539506755952630 	 be
good 	 ADJ 	 5711639017775284443 	 good
and 	 CCONJ 	 2283656566040971221 	 and
I 	 PRON 	 561228191312463089 	 -PRON-
am 	 VERB 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
goer 	 NOUN 	 14292368617582274880 	 goer
who 	 NOUN 	 3876862883474502309 	 who
have 	 VERB 	 14692702688101715474 	 have
gone 	 VERB 	 8004577259940138793 	 go
many 	 ADJ 	 9720044723474553187 	 many
lengths 	 NOUN 	 493173492801391902 	 length
for 	 ADP 	 16037325823156266367 	 for
go 	 NOUN 	 8004577259940138793 	 go
goes 	 VERB 	 8004577259940138793 	 go
gone 	 VERB 	 8004577259940138793 	 go
gooone 	 NOUN 	 3993162000191412813 	 gooone
easily 	 ADV 	 8007658219579238015 	 easily
, 	 PUNCT 	 2593208677638477497 	 ,
quickly 	 ADV 	 7007696535375059571 	 quickly
and 	 CCONJ 	 2283656566040971221 	 and


In [57]:
def show_lemmas(text):
    for token in text: 
        print(f'{token.text:{12}} {token.pos_:{6}} {token.lemma:{22}} {token.lemma_:{10}}')

In [58]:
show_lemmas(doc3)

get          VERB      2013399242189103424 get       
going        VERB      8004577259940138793 go        
for          ADP      16037325823156266367 for       
going        VERB      8004577259940138793 go        
is           VERB     10382539506755952630 be        
good         ADJ       5711639017775284443 good      
and          CCONJ     2283656566040971221 and       
I            PRON       561228191312463089 -PRON-    
am           VERB     10382539506755952630 be        
a            DET      11901859001352538922 a         
goer         NOUN     14292368617582274880 goer      
who          NOUN      3876862883474502309 who       
have         VERB     14692702688101715474 have      
gone         VERB      8004577259940138793 go        
many         ADJ       9720044723474553187 many      
lengths      NOUN       493173492801391902 length    
for          ADP      16037325823156266367 for       
go           NOUN      8004577259940138793 go        
goes         VERB      80045

In [60]:
# Stop words 

# remove stop words and add your own 

print(nlp.Defaults.stop_words, len(nlp.Defaults.stop_words))  #returns a set {}, not a dictiornary

{'many', 'front', 'herself', 'however', 'three', 'who', 'neither', 'one', 'either', 'everywhere', 'does', 'thus', 'top', 'again', 'only', 'all', 'while', 'twenty', 'elsewhere', 'herein', 'nor', 'anywhere', 'than', 'go', 'through', 'whereby', 'made', 'an', 'hence', 'latter', 'nine', 'never', 'whole', 'towards', 'back', 'two', 'after', 'beyond', 'alone', 'might', 'much', 'him', 'became', 'both', 'due', 'noone', 'now', 'first', 'where', 'this', 'always', 'have', 'part', 'too', 'are', 'she', 'along', 'done', 'our', 'although', 'another', 'anyhow', 'eight', 'own', 'on', 'mostly', 'fifty', 'whereupon', 'thereby', 'make', 'move', 'or', 'his', 'how', 'give', 'across', 'meanwhile', 'against', 'but', 'enough', 'really', 'the', 'there', 'why', 'during', 'less', 'whereafter', 'up', 'thence', 'doing', 'a', 'then', 'everyone', 'therefore', 'using', 'yourselves', 'fifteen', 'must', 'that', 'hereby', 'somehow', 'formerly', 'same', 'still', 'unless', 'down', 'various', 'whatever', 'else', 'sixty', 'bet

In [66]:
# find if 'is' is in vocab and check attribute if this is a stop

nlp.vocab['is'].is_stop 
nlp.vocab['buy'].is_stop

False

In [67]:
# Adding stop words to the data set 

nlp.Defaults.stop_words.add('btw')

In [71]:
nlp.vocab['btw'].is_stop  # False 

# so add btw
nlp.vocab['btw'].is_stop = True

nlp.vocab['btw'].is_stop  # True 

# similarly for removing something set .is_stop = False 

True

In [93]:
# Vocabulary and Matching 

# Rule based matching - matcher - matches the rules and returns the list of matches
# match text and annotation 

from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab) # matcher with specific nlp object 

pattern1 = [{'LOWER': 'solarpower'}] # attribute LOWER = lowercase form of the token text
# matchers solarpower
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}]
# matches solar-power
pattern3 = [{'LOWER': 'solar'}, {'LOWER':'power'}]
# matches solar power 



In [94]:
# add the patterns to the matcher

matcher.add('SolarPower',None, pattern1, pattern2, pattern3)

In [95]:
doc5 = nlp(u'The Solar Power grows as solarpower increases and Solar-Power is good. solar power too\
           sOlar pOWer SOlaR PoweR, solar--power, solar;power, solar...power, powersolar')

In [96]:
found_matches = matcher(doc5) # matchid = string id and start and the stop on the tokens in the string

In [97]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]
    span = doc5[start:end]
    print(match_id, string_id, start, end, span.text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 5 6 solarpower
8656102463236116519 SolarPower 8 11 Solar-Power
8656102463236116519 SolarPower 14 16 solar power
8656102463236116519 SolarPower 18 20 sOlar pOWer
8656102463236116519 SolarPower 20 22 SOlaR PoweR
8656102463236116519 SolarPower 23 26 solar--power
8656102463236116519 SolarPower 29 32 solar...power


In [98]:
# remove a pattern 

matcher.remove('SolarPower')

In [99]:
# making optional pattern rule with new pattern rules as we removed the previous 

# all relevant strings in upper and lowercase
pattern1 = [{'LOWER':'solarpower'}]

# with different punctuations in between 
pattern2 = [{'LOWER':'solar'}, {'IS_PUNCT':True,'OP':'*'},{'LOWER':'power'}]

matcher.add('SolarPower', None, pattern1, pattern2)

found_matches = matcher(doc5)

In [100]:
found_matches

[(8656102463236116519, 1, 3),
 (8656102463236116519, 5, 6),
 (8656102463236116519, 8, 11),
 (8656102463236116519, 14, 16),
 (8656102463236116519, 18, 20),
 (8656102463236116519, 20, 22),
 (8656102463236116519, 23, 26),
 (8656102463236116519, 29, 32)]

In [101]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]
    span = doc5[start:end]
    print(match_id, string_id, start, end, span.text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 5 6 solarpower
8656102463236116519 SolarPower 8 11 Solar-Power
8656102463236116519 SolarPower 14 16 solar power
8656102463236116519 SolarPower 18 20 sOlar pOWer
8656102463236116519 SolarPower 20 22 SOlaR PoweR
8656102463236116519 SolarPower 23 26 solar--power
8656102463236116519 SolarPower 29 32 solar...power


In [103]:
# Phrase matching and vocabulary 

# more efficient match on a terminology list - phrase matcher 

from spacy.matcher import PhraseMatcher
p_matcher = PhraseMatcher(nlp.vocab)

In [104]:
pwd

'/Users/pbiswas/Documents/Portilla_NLP'

In [110]:
with open('/Users/pbiswas/Documents/Portilla_NLP/UPDATED_NLP_COURSE/\
TextFiles/reaganomics.txt', encoding='utf-8', errors='ignore') as f:
    doc6 = nlp(f.read())

In [111]:
phrase_list = ['voodoo economics','supply-side economics', 'free-market economics']

In [112]:
phrase_patterns = [nlp(text) for text in phrase_list]

In [116]:
type(phrase_patterns[0])  # spacy doc

spacy.tokens.doc.Doc

In [117]:
p_matcher.add('EconMatcher', None, *phrase_patterns) 
# * Python insert keyword argument trick for insert each pattern argument 


In [118]:
found_matches = p_matcher(doc6)

In [119]:
found_matches

[(3680293220734633682, 41, 45),
 (3680293220734633682, 54, 56),
 (3680293220734633682, 61, 65),
 (3680293220734633682, 673, 677)]

In [125]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id] # get string representation
    span = doc6[start:end]  # get the matched span 
    #span = doc6[start:end+20]  # get the matched span and following words 
    print(match_id, string_id, start, end, span.text)

3680293220734633682 EconMatcher 41 45 supply-side economics
3680293220734633682 EconMatcher 54 56 voodoo economics
3680293220734633682 EconMatcher 61 65 free-market economics
3680293220734633682 EconMatcher 673 677 supply-side economics
