In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
# creating object "doc"
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million') # unicode string

In [4]:
type(doc)

spacy.tokens.doc.Doc

In [12]:
for token in doc:
    print(token.text,token.pos_,token.dep_) # text, part of speech, syntactic dependency

Tesla PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [6]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x1f1b7aff4c8>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x1f1b7afa768>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x1f1b7afad08>)]

In [13]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [30]:
# Sentence whose tokens are listed in the output below and whose first token
# ("Tesla") is used in the attribute table. The double space after "isn't" is
# intentional: it makes the tokenizer emit a separate SPACE token.
doc2 = nlp(u"Tesla isn't  looking into startup anymore.")

In [31]:
for token in doc2:
    print(token.text,token.pos_,token.dep_)

Tesla PROPN nsubj
is AUX aux
n't PART neg
  SPACE 
looking VERB ROOT
into ADP prep
startup NOUN pobj
anymore ADV advmod
. PUNCT punct


|Tag|Description|doc2[0].tag|
|:------|:------:|:------|
|`.text`|The original word text<!-- .element: style="text-align:left;" -->|`Tesla`|
|`.lemma_`|The base form of the word|`tesla`|
|`.pos_`|The simple part-of-speech tag|`PROPN`/`proper noun`|
|`.tag_`|The detailed part-of-speech tag|`NNP`/`noun, proper singular`|
|`.shape_`|The word shape – capitalization, punctuation, digits|`Xxxxx`|
|`.is_alpha`|Does the token consist of alphabetic characters?|`True`|
|`.is_stop`|Is the token part of a stop list, i.e. the most common words of the language?|`False`|

In [42]:
doc3 = nlp(u'Although commmonly attributed to John Lennon from his song "Beautiful Boy", \
the phrase "Life is what happens to us while we are making other plans" was written by \
cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.')
type(doc3)

spacy.tokens.doc.Doc

In [41]:
life_quote = doc3[16:30]
print(life_quote)
print(type(life_quote))

"Life is what happens to us while we are making other plans"
<class 'spacy.tokens.span.Span'>


#  Play with sentences

In [53]:
doc4 = nlp(u"Hello, My name is Jarvis. I am a program. I don't have any shape.")

In [54]:
for sentence in doc4.sents:
    print(sentence)

Hello, My name is Jarvis.
I am a program.
I don't have any shape.


In [59]:
print(doc4[0].is_sent_start)
print(doc4[7].is_sent_start)
print(doc4[6].is_sent_start)

True
True
None


# Tokenization

In [60]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [61]:
line = '"We\'re moving to L.A.!"'

In [63]:
print(line)
doc = nlp(line)

"We're moving to L.A.!"


In [64]:
for token in doc:
    print(token.text)

"
We
're
moving
to
L.A.
!
"


In [67]:
doc2 = nlp(u"We're here to help! Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com!")
for t in doc2:
    print(t)


We
're
here
to
help
!
Send
snail
-
mail
,
email
support@oursite.com
or
visit
us
at
http://www.oursite.com
!


In [68]:
doc3 = nlp(u'A 5km NYC cab ride costs $10.30')

for t in doc3:
    print(t)

A
5
km
NYC
cab
ride
costs
$
10.30


In [69]:
doc4 = nlp(u'Google to build a Hong Kong factory for $6 million')

In [71]:
for token in doc4:
    print(token.text,end="|")

Google|to|build|a|Hong|Kong|factory|for|$|6|million|

In [73]:
# For every named entity: its text, its label, a human-readable explanation
# of the label, then two blank lines as a separator.
for entity in doc4.ents:
    print(entity, entity.label_, spacy.explain(entity.label_), '\n', sep='\n')

Google
ORG
Companies, agencies, institutions, etc.


Hong Kong
GPE
Countries, cities, states


$6 million
MONEY
Monetary values, including unit




In [76]:
doc10 = nlp(u"Red cars do not carry higher insurance rates.")

for chunk in doc10.noun_chunks:
    print(chunk.text)

Red cars
higher insurance rates


In [77]:
from spacy import displacy

doc = nlp(u'Apple is going to build a U.K. factory for $6 million.')
displacy.render(doc, style='dep', jupyter=True, options={'distance': 110})

doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.')
displacy.render(doc, style='ent', jupyter=True)

#  Stemming

In [2]:
import nltk
from nltk.stem.porter import PorterStemmer
p_stemmer = PorterStemmer()

In [9]:
words = ["run","runner","runs","ran","easily","fairly","fairness"]

In [10]:
# Show each word next to the stem produced by the Porter stemmer
for word in words:
    print(f"{word}----->{p_stemmer.stem(word)}")

run----->run
runner----->runner
runs----->run
ran----->ran
easily----->easili
fairly----->fairli
fairness----->fair


In [11]:
from nltk.stem.snowball import SnowballStemmer
s_stemmer = SnowballStemmer(language="english")

In [12]:
# Show each word next to the stem produced by the Snowball stemmer
for word in words:
    print(f"{word}----->{s_stemmer.stem(word)}")

run----->run
runner----->runner
runs----->run
ran----->ran
easily----->easili
fairly----->fair
fairness----->fair


In [13]:
words = ["generous","generation","generously","generate"]
for word in words:
    print(word+"----->"+s_stemmer.stem(word))

generous----->generous
generation----->generat
generously----->generous
generate----->generat


# Lemmatization 

In [14]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [16]:
doc1 = nlp(u"I am a runner running in a race because I love to run since I ran today")

In [17]:
# One row per token: text, part of speech, lemma hash, lemma string
for token in doc1:
    print(token.text, token.pos_, token.lemma, token.lemma_, sep=" \t ")

I 	 PRON 	 561228191312463089 	 -PRON-
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 561228191312463089 	 -PRON-
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 561228191312463089 	 -PRON-
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


In [26]:
def show_lemmas(text):
    """Print an aligned table of lemma information, one row per token.

    Args:
        text: An iterable of token-like objects exposing ``text``, ``pos_``,
            ``lemma`` and ``lemma_`` attributes (here, a processed document).

    Each row holds the token text (width 12), the part-of-speech tag
    (width 6), the left-aligned ``lemma`` value (width 22) and the
    ``lemma_`` string.
    """
    row_layout = '{:12} {:6} {:<22} {}'
    for tok in text:
        print(row_layout.format(tok.text, tok.pos_, tok.lemma, tok.lemma_))

In [27]:
doc2 = nlp(u"I saw ten mice today!")
show_lemmas(doc2)

I            PRON   561228191312463089     -PRON-
saw          VERB   11925638236994514241   see
ten          NUM    7970704286052693043    ten
mice         NOUN   1384165645700560590    mouse
today        NOUN   11042482332948150395   today
!            PUNCT  17494803046312582752   !


# Stop Words

In [30]:
import spacy
nlp = spacy.load("en_core_web_sm")
print(nlp.Defaults.stop_words,"\n",len(nlp.Defaults.stop_words))

{'anything', 'seems', 'much', 'move', 'elsewhere', 'sometime', 'however', 'well', "'d", 'least', '’m', 'neither', 'this', 'do', 'someone', 'put', 'you', 'becomes', 'nobody', 'an', 'those', 'onto', 'under', 'give', 'get', 'herself', 'n’t', 'less', 'had', 'besides', 'were', 'towards', 'where', 'via', '‘re', 'whatever', 'anyone', 'often', 'beside', '‘s', 'latter', 'n‘t', 'ourselves', 'forty', 'whereafter', 'everything', 'too', 'about', 'used', 'of', 'she', '‘ll', 'see', 'he', 'we', 'still', 'during', 'except', 'and', 'now', 'them', 'due', 'on', 'whose', 'somewhere', 'myself', 'am', 'regarding', 'among', 'rather', 'across', 'along', 'thru', 'noone', 'serious', 'thereby', 'ours', 'because', 'for', "'re", 'either', 'ten', 'wherever', 'been', "'s", 'themselves', 'five', 'seeming', 'beforehand', 'amongst', 'at', 'quite', 'moreover', 'another', 'no', 'something', 'front', 'here', 'yourself', 'whereby', "'m", 'sometimes', 'ca', 'former', 'as', 'fifteen', 'also', 'our', 'almost', 'whom', 'meanwhi

In [31]:
nlp.vocab['is'].is_stop

True

In [32]:
nlp.vocab['mystery'].is_stop

False

In [33]:
# To add your own stop word
nlp.Defaults.stop_words.add("btw")

In [34]:
nlp.vocab['btw'].is_stop =True

In [35]:
# Confirm that the newly added word is now flagged as a stop word
nlp.vocab['btw'].is_stop

True

In [36]:
# To remove a stop word. discard() does nothing if the word is already absent,
# so this cell can be re-run without raising KeyError (remove() would raise).
nlp.Defaults.stop_words.discard("beyond")

In [37]:
nlp.vocab['beyond'].is_stop = False

In [38]:
nlp.vocab['beyond'].is_stop

False

#  Phrase Matching and Vocabulary

In [151]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [152]:
from spacy.matcher import Matcher

In [153]:
matcher = Matcher(nlp.vocab)

In [154]:
# "SolarPower" / "solarpower": a single token, matched case-insensitively
pattern1 = [{'LOWER':'solarpower'}]
# "Solar Power": two adjacent tokens with nothing between them
pattern2 = [{'LOWER': 'solar'}, {'LOWER': 'power'}]
# "Solar-Power": the hyphen is its own punctuation token between the two words
pattern3 = [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}]

In [155]:
matcher.add("SolarPower",None,pattern1,pattern2,pattern3)

In [156]:
doc = nlp(u"The Solar Power industry continue to grow a solarpower increases. Solar-power is amazing technology")

In [157]:
found_matches = matcher(doc)
print(found_matches) # id,token_start index, end index

[(8656102463236116519, 1, 3), (8656102463236116519, 8, 9), (8656102463236116519, 11, 14)]


## Other token attributes
Besides lemmas, there are a variety of token attributes we can use to determine matching rules:
<table><tr><th>Attribute</th><th>Description</th></tr>

<tr ><td><span >`ORTH`</span></td><td>The exact verbatim text of a token</td></tr>
<tr ><td><span >`LOWER`</span></td><td>The lowercase form of the token text</td></tr>
<tr ><td><span >`LENGTH`</span></td><td>The length of the token text</td></tr>
<tr ><td><span >`IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`</span></td><td>Token text consists of alphabetic characters, ASCII characters, digits</td></tr>
<tr ><td><span >`IS_LOWER`, `IS_UPPER`, `IS_TITLE`</span></td><td>Token text is in lowercase, uppercase, titlecase</td></tr>
<tr ><td><span >`IS_PUNCT`, `IS_SPACE`, `IS_STOP`</span></td><td>Token is punctuation, whitespace, stop word</td></tr>
<tr ><td><span >`LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`</span></td><td>Token text resembles a number, URL, email</td></tr>
<tr ><td><span >`POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE`</span></td><td>The token's simple and extended part-of-speech tag, dependency label, lemma, shape</td></tr>
<tr ><td><span >`ENT_TYPE`</span></td><td>The token's entity label</td></tr>

</table>

In [158]:
# Each match is (rule hash, start token index, end token index).
for match_id,start,end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # rule name looked up from its hash
    span = doc[start:end]                    # the matched tokens
    # Reuse the span built above instead of slicing the doc a second time.
    print(match_id,string_id,start,end,span.text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 8 9 solarpower
8656102463236116519 SolarPower 11 14 Solar-power


In [159]:
# removing the matcher
matcher.remove("SolarPower")

This found both two-word patterns, with and without the hyphen!

The following quantifiers can be passed to the `'OP'` key:
<table><tr><th>OP</th><th>Description</th></tr>

<tr ><td><span >\!</span></td><td>Negate the pattern, by requiring it to match exactly 0 times</td></tr>
<tr ><td><span >?</span></td><td>Make the pattern optional, by allowing it to match 0 or 1 times</td></tr>
<tr ><td><span >\+</span></td><td>Require the pattern to match 1 or more times</td></tr>
<tr ><td><span >\*</span></td><td>Allow the pattern to match zero or more times</td></tr>
</table>


In [160]:
# Single-token spelling: "solarpower" / "SolarPower"
pattern1 = [{'LOWER': 'solarpower'}]
# "solar" and "power" separated by zero or more punctuation tokens,
# e.g. "solar power", "solar-power", "solar--power".
# Attribute keys use the upper-case names, consistent with the patterns defined earlier.
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP': '*'}, {'LOWER': 'power'}]


In [161]:
matcher.add('SolarPower',None,pattern1,pattern2)

In [162]:
doc2 = nlp(u"Solar--power is solarpower yay!")

In [163]:
found_matches = matcher(doc2)
print(found_matches)

[(8656102463236116519, 0, 3), (8656102463236116519, 4, 5)]


In [164]:
from spacy.matcher import PhraseMatcher

In [165]:
matcher = PhraseMatcher(nlp.vocab)

In [166]:
with open('../TextFiles/reaganomics.txt') as f:
    doc3 = nlp(f.read())

In [172]:
phrase_list = ['voodoo economics','supply-side economics','trickle-down economics','free-market economics']

In [173]:
phrase_patterns = [nlp(text) for text in phrase_list]

In [174]:
print(type(phrase_patterns[0]))
print(phrase_patterns)

<class 'spacy.tokens.doc.Doc'>
[voodoo economics, supply-side economics, trickle-down economics, free-market economics]


In [175]:
matcher.add('EconMatcher',None,*phrase_patterns)
found_matches = matcher(doc3)

In [178]:
# Print every phrase match together with up to five tokens of context on each side.
for match_id,start,end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # rule name looked up from its hash
    # Clamp the left edge at 0: for a match within the first five tokens,
    # start-5 is negative and would be read as an offset from the end of the
    # document, giving the wrong span.
    span = doc3[max(start-5, 0):end+5] # to get little more info
    print(match_id,string_id,start,end,span.text)

3680293220734633682 EconMatcher 41 45 policies are commonly associated with supply-side economics, referred to as trickle
3680293220734633682 EconMatcher 49 53 economics, referred to as trickle-down economics or voodoo economics by political
3680293220734633682 EconMatcher 54 56 trickle-down economics or voodoo economics by political opponents, and
3680293220734633682 EconMatcher 61 65 by political opponents, and free-market economics by political advocates.


3680293220734633682 EconMatcher 673 677 attracted a following from the supply-side economics movement, which formed in
3680293220734633682 EconMatcher 2987 2991 became widely known as "trickle-down economics", due to the
