In [79]:
import spacy

In [80]:
# Load the 'en_core_web_sm' model; the resulting `nlp` object is what later
# cells call on raw text to obtain parsed documents.
nlp = spacy.load('en_core_web_sm')

In [33]:
# Parse a sample sentence; the resulting `doc` is iterated token by token below.
doc = nlp(u'Tesla is looking at buying a U.S. startup for $6 million')

In [34]:
# One line per token: surface text, part-of-speech tag (`pos_`) and
# dependency label (`dep_`), separated by single spaces.
for tok in doc:
    fields = (tok.text, tok.pos_, tok.dep_)
    print(*fields)

Tesla PROPN nsubj
is VERB aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
a DET det
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [10]:
# Display the processing pipeline as a list of (name, component) pairs.
nlp.pipeline

[('tagger', <spacy.pipeline.Tagger at 0x7feb384206d0>),
 ('parser', <spacy.pipeline.DependencyParser at 0x7feb98228950>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x7feb98228ef0>)]

In [11]:
# Display only the names of the pipeline components.
nlp.pipe_names

['tagger', 'parser', 'ner']

In [18]:
# The run of extra spaces is deliberate: in the token dump that follows it
# comes out as a separate token tagged SPACE.
doc2 = nlp(u"Tesla isn't     looking for startups anymore.")

In [19]:
# Same dump as for the first sentence: text, `pos_` and `dep_` per token.
for tok in doc2:
    fields = (tok.text, tok.pos_, tok.dep_)
    print(*fields)

Tesla PROPN nsubj
is VERB aux
n't ADV neg
     SPACE 
looking VERB ROOT
for ADP prep
startups NOUN pobj
anymore ADV advmod
. PUNCT punct


In [20]:
# A longer passage, written as adjacent string literals that Python joins
# into one string (no newlines are inserted between the pieces).
doc3 = nlp(
    u'Although commmonly attributed to John Lennon from his song "Beautiful Boy", '
    u'the phrase "Life is what happens to us while we are making other plans" was written by '
    u"cartoonist Allen Saunders and published in Reader's Digest in 1957, when Lennon was 17."
)

In [21]:
# Slice the parsed document by token position to pull out the quoted sentence
# (the opening and closing quote marks are included, as the print below shows).
life_quote = doc3[16:30]

In [22]:
# Printing the slice shows its text.
print(life_quote)

"Life is what happens to us while we are making other plans"


In [23]:
# Show the type of the object returned by calling `nlp` on text.
type(doc3)

spacy.tokens.doc.Doc

In [24]:
# Three sentences; note the last one has no closing period.
doc4 = nlp(u"This is the first sentence. This is another sentence. This is the last sentence")

In [26]:
# `doc4.sents` yields the document one sentence at a time; print each on its
# own line.
for sent in doc4.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence


In [29]:
# Check whether the token at position 6 is flagged as starting a sentence.
# NOTE(review): position 6 should be the "This" opening the second sentence,
# assuming the first period is tokenized on its own — confirm.
doc4[6].is_sent_start

True

In [35]:
# Tokenization demo: the output shows the contraction and the hyphenated word
# being split, while the e-mail address and the URL each stay a single token.
# Note that this rebinds `doc2`.
doc2 = nlp(
    u"We're here to help! Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com!"
)

for token in doc2:
    print(token)

We
're
here
to
help
!
Send
snail
-
mail
,
email
support@oursite.com
or
visit
us
at
http://www.oursite.com
!


In [36]:
# Tokenization demo: the output shows "5km" split into number and unit, and
# the currency symbol separated from the amount. Note that this rebinds `doc3`.
doc3 = nlp(
    u'A 5km NYC cab ride costs $10.30'
)

for token in doc3:
    print(token)

A
5
km
NYC
cab
ride
costs
$
10.30


In [37]:
from spacy import displacy

In [38]:
# Sentence used for the displaCy visualisation in the next cell (rebinds `doc`).
doc = nlp(u"Apple is going to build a U.K. factory for $6 million.")

In [45]:
# Render the entity ('ent') visualisation of `doc` inline in the notebook.
# `jupyter` is a boolean flag, so pass the literal True: the string 'True'
# only worked because any non-empty string is truthy ('False' would too).
displacy.render(doc, style='ent', jupyter=True)

In [46]:
import nltk

  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'


In [47]:
from nltk.stem.porter import PorterStemmer

In [48]:
# Porter stemmer instance, applied to the word list in the cells below.
p_stemmer = PorterStemmer()

In [57]:
# Sample vocabulary for comparing stemmers, grouped by word family.
words = [
    'runner', 'run', 'ran', 'runs',
    'easily', 'fairly', 'fairness',
    'generous', 'generation', 'generously', 'generate',
]

In [58]:
# Show each word next to its Porter stem.
for word in words:
    stem = p_stemmer.stem(word)
    print(f'{word} ---->{stem}')

runner ---->runner
run ---->run
ran ---->ran
runs ---->run
easily ---->easili
fairly ---->fairli
fairness ---->fair
generous ---->gener
generation ---->gener
generously ---->gener
generate ---->gener


In [51]:
from nltk.stem.snowball import SnowballStemmer

In [52]:
# Snowball stemmer configured for English, for comparison with the Porter
# stemmer on the same word list.
s_stemmer = SnowballStemmer(language='english')

In [59]:
# Show each word next to its Snowball stem.
for word in words:
    stem = s_stemmer.stem(word)
    print(f'{word} ---->{stem}')

runner ---->runner
run ---->run
ran ---->ran
runs ---->run
easily ---->easili
fairly ---->fair
fairness ---->fair
generous ---->generous
generation ---->generat
generously ---->generous
generate ---->generat


In [60]:
# Sentence containing several forms of "run" (runner, running, run, ran) to
# show how each one is lemmatized.
doc1 = nlp(u"I am a runner running in a race because I love to run since I ran today")

In [64]:
# Per token: text, `pos_`, `lemma` (prints as a large integer) and `lemma_`
# (prints as text), with " <tab> " between the columns.
for tok in doc1:
    print(tok.text, tok.pos_, tok.lemma, tok.lemma_, sep=' \t ')

I 	 PRON 	 561228191312463089 	 -PRON-
am 	 VERB 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 ADP 	 16950148841647037698 	 because
I 	 PRON 	 561228191312463089 	 -PRON-
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 ADP 	 10066841407251338481 	 since
I 	 PRON 	 561228191312463089 	 -PRON-
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


In [65]:
def show_lemmas(text):
    """Print one aligned row per token of ``text``.

    ``text`` is any iterable of token-like objects exposing ``text``,
    ``pos_``, ``lemma`` and ``lemma_``. The columns are: token text padded to
    12 characters, ``pos_`` padded to 6, ``lemma`` left-aligned in 22, then
    ``lemma_`` unpadded, each separated by one space. Returns ``None``.
    """
    row_template = '{:12} {:6} {:<22} {}'
    for tok in text:
        print(row_template.format(tok.text, tok.pos_, tok.lemma, tok.lemma_))

In [66]:
# Same data as the tab-separated dump of `doc1`, now in aligned columns.
show_lemmas(doc1)

I            PRON   561228191312463089     -PRON-
am           VERB   10382539506755952630   be
a            DET    11901859001352538922   a
runner       NOUN   12640964157389618806   runner
running      VERB   12767647472892411841   run
in           ADP    3002984154512732771    in
a            DET    11901859001352538922   a
race         NOUN   8048469955494714898    race
because      ADP    16950148841647037698   because
I            PRON   561228191312463089     -PRON-
love         VERB   3702023516439754181    love
to           PART   3791531372978436496    to
run          VERB   12767647472892411841   run
since        ADP    10066841407251338481   since
I            PRON   561228191312463089     -PRON-
ran          VERB   12767647472892411841   run
today        NOUN   11042482332948150395   today


In [69]:
from spacy.matcher import Matcher
# Token-pattern matcher built on the loaded pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

In [70]:
# Three token patterns covering different spellings of the same term.
# Each pattern is a list with one dict per token to match.

# "SolarPower": a single token whose lowercase form is "solarpower"
pattern1 = [dict(LOWER='solarpower')]

# "Solar-power": "solar", then any punctuation token, then "power"
pattern2 = [dict(LOWER='solar'), dict(IS_PUNCT=True), dict(LOWER='power')]

# "Solar power": "solar" immediately followed by "power"
pattern3 = [dict(LOWER='solar'), dict(LOWER='power')]

In [71]:
# Register all three patterns under the single key 'SolarPower'.
# NOTE(review): the second positional argument (None) is presumably the
# optional on-match callback slot of this spaCy version's Matcher.add; newer
# spaCy releases use a different call signature — confirm before upgrading.
matcher.add('SolarPower', None, pattern1, pattern2, pattern3)

In [72]:
# Test sentence containing all three spellings: "Solar Power", "solarpower"
# and "Solar-power" (rebinds `doc`).
doc = nlp(u"The Solar Power industry continues to grow as solarpower increases. Solar-power is amazing")

In [73]:
# Run the matcher over the document. The result printed in the next cell is a
# list of (match_id, start, end) tuples.
found_matches = matcher(doc)

In [74]:
# All three hits share one match_id because they were registered under the
# same key; start/end are token positions with `end` exclusive.
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 8, 9), (8656102463236116519, 11, 14)]


In [75]:
# Unregister the patterns stored under the key 'SolarPower'.
matcher.remove('SolarPower')

In [81]:
from spacy.matcher import PhraseMatcher
# Rebinds `matcher` to a PhraseMatcher built on the same vocabulary; from here
# on `matcher` no longer refers to the token-pattern Matcher.
matcher = PhraseMatcher(nlp.vocab)

In [84]:
# Read the whole article as latin1 text, then parse it. The parse happens
# after the `with` block so the file handle is released as soon as the read
# finishes instead of staying open for the whole spaCy parse.
# Note that this rebinds `doc3`.
with open('../TextFiles/reaganomics.txt', encoding='latin1') as fptr:
    stringg = fptr.read()
doc3 = nlp(stringg)

In [85]:
# Phrases to search for in the article, one per line.
phrase_list = [
    'voodoo economics',
    'supply-side economics',
    'trickle-down economics',
    'free-market economics',
]

In [88]:
# Run every phrase through `nlp` so each one becomes a parsed document that
# can be handed to the phrase matcher.
phrase_patterns = list(map(nlp, phrase_list))

In [89]:
# Register every parsed phrase under the single key 'EconMatcher'.
# NOTE(review): the None in second position is presumably the optional
# on-match callback slot of this spaCy version's add(); newer spaCy releases
# use a different call signature — confirm before upgrading.
matcher.add('EconMatcher', None, *phrase_patterns)

In [90]:
# Run the phrase matcher over the parsed article (rebinds `found_matches`).
# The loop in the next cell unpacks each hit as (match_id, start, end).
found_matches = matcher(doc3)

In [92]:
# Report every hit: numeric id, the key it was registered under, the token
# range, and the matched text.
for match_id, start, end in found_matches:
    # the slice of the article covered by this hit
    span = doc3[start:end]
    # look the numeric id up in the vocabulary's string store to get the key
    string_id = nlp.vocab.strings[match_id]
    print(match_id, string_id, start, end, span.text)

3680293220734633682 EconMatcher 41 45 supply-side economics
3680293220734633682 EconMatcher 49 53 trickle-down economics
3680293220734633682 EconMatcher 54 56 voodoo economics
3680293220734633682 EconMatcher 61 65 free-market economics
3680293220734633682 EconMatcher 673 677 supply-side economics
3680293220734633682 EconMatcher 2984 2988 trickle-down economics
