Découverte de la bibliotèque Spacy

In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')

In [4]:
for token in doc:
    print(token.text, token.pos, token.pos_, token.dep_)

Tesla 92 NOUN nsubj
is 87 AUX aux
looking 100 VERB ROOT
at 85 ADP prep
buying 100 VERB pcomp
U.S. 96 PROPN compound
startup 92 NOUN dobj
for 85 ADP prep
$ 99 SYM quantmod
6 93 NUM compound
million 93 NUM pobj


In [5]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x1adc05929a0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x1adc07590a0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x1adc05a8350>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x1adc0734580>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x1adc072dc00>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x1adc05a82e0>)]

In [6]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [7]:
doc2 = nlp(u"Tesla isn't looking into startups anymore.")

In [8]:
for token in doc2:
    print(f"{token.text:{8}}, {token.pos:{5}}, {token.pos_:{5}}, {token.dep_:{10}}")

Tesla   ,    92, NOUN , nsubj     
is      ,    87, AUX  , aux       
n't     ,    94, PART , neg       
looking ,   100, VERB , ROOT      
into    ,    85, ADP  , prep      
startups,    92, NOUN , pobj      
anymore ,    86, ADV  , advmod    
.       ,    97, PUNCT, punct     


In [9]:
doc2[0]

Tesla

In [10]:
doc2[0].pos_

'NOUN'

In [11]:
doc2[0].dep_

'nsubj'

In [12]:
spacy.explain('PROPN')

'proper noun'

In [13]:
spacy.explain('nsubj')

'nominal subject'

In [14]:
print(doc2[4].text)
print(doc2[4].lemma_)

into
into


In [15]:
print(doc2[4].pos_)
print(doc2[4].tag_ + ' / ' + spacy.explain(doc2[4].tag_))

ADP
IN / conjunction, subordinating or preposition


In [16]:
print(doc2[0].text+': '+doc2[0].shape_)
print(doc[5].text+' : '+doc[5].shape_)

Tesla: Xxxxx
U.S. : X.X.


In [17]:
print(doc2[0].is_alpha)
print(doc2[0].is_stop)

True
False


In [18]:
doc3 = nlp(u'Although commmonly attributed to John Lennon from his song "Beautiful Boy", \
the phrase "Life is what happens to us while we are making other plans" was written by \
cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.')

In [20]:
life_quote = doc3[16:30]
print(life_quote)

"Life is what happens to us while we are making other plans"


In [21]:
type(life_quote)

spacy.tokens.span.Span

In [22]:
type(doc3)

spacy.tokens.doc.Doc

In [26]:
doc4 = nlp(u"This is the first sentence. This is another sentence. This is the last sentence.")

In [27]:
for sentence in doc4.sents:
    print(sentence)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [28]:
doc4[6]

This

In [29]:
doc4[6].is_sent_start

True

In [30]:
doc4[7].is_sent_start

False

Tokenization :

In [32]:
mystring = '"We\'re moving to L.A.!"'

In [33]:
print(mystring)

"We're moving to L.A.!"


In [34]:
doc = nlp(mystring)

In [35]:
for token in doc:
    print(token.text)

"
We
're
moving
to
L.A.
!
"


In [36]:
doc2 = nlp(u"We're here to help! Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com")

In [37]:
for token in doc2:
    print(token.text)

We
're
here
to
help
!
Send
snail
-
mail
,
email
support@oursite.com
or
visit
us
at
http://www.oursite.com


In [38]:
doc3 = nlp(u"A 5km NYC cab ride costs $10.30!")

In [39]:
for token in doc3:
    print(token.text)

A
5
km
NYC
cab
ride
costs
$
10.30
!


In [40]:
doc4 = nlp(u"Let's visit St. Louis in the U.S. next year.")

In [41]:
for token in doc4:
    print(token.text)

Let
's
visit
St.
Louis
in
the
U.S.
next
year
.


In [42]:
len(doc4)

11

In [44]:
len(doc4.vocab)

843

In [45]:
doc5 = nlp(u"It is better to give than receive.")

In [46]:
doc5[0]

It

In [47]:
doc5[2:5]

better to give

In [48]:
doc5[0] = "test"

TypeError: 'spacy.tokens.doc.Doc' object does not support item assignment

In [49]:
doc8 = nlp(u'Apple to build a Hong Kong factory for $6 million.')

In [50]:
for token in doc8:
    print(token.text,end=' | ')

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | . | 

In [53]:
for entity in doc8.ents:
    print(entity)
    print(entity.label_)
    print(str(spacy.explain(entity.label_)))
    print("\n")

Hong Kong
GPE
Countries, cities, states


$6 million
MONEY
Monetary values, including unit




In [54]:
nlp = spacy.load('fr_core_news_sm')

In [55]:
doc9 = nlp(u'Je souhaite aller de Nantes à Paris en train de nuit.')

In [56]:
for entity in doc9.ents:
    print(entity)
    print(entity.label_)
    print(str(spacy.explain(entity.label_)))
    print("\n")

Nantes
LOC
Non-GPE locations, mountain ranges, bodies of water


Paris
LOC
Non-GPE locations, mountain ranges, bodies of water




In [57]:
doc10 = nlp(u'Je souhaite aller de Nantes à Saint Herblain en train de nuit.')

In [59]:
for entity in doc10.ents:
    print(entity)
    print(entity.label_)
    print(str(spacy.explain(entity.label_)))
    print("\n")

Nantes
LOC
Non-GPE locations, mountain ranges, bodies of water


Saint Herblain
LOC
Non-GPE locations, mountain ranges, bodies of water




In [62]:
nlp = spacy.load('en_core_web_sm')
doc11 = nlp(u"Autonomous cars shift insurance liability toward manufacturers at L.A..")

In [63]:
for chunk in doc11.noun_chunks:
    print(chunk)

Autonomous cars
insurance liability
manufacturers
L.A


In [64]:
from spacy import displacy

In [66]:
displacy.render(doc11,style='dep',jupyter=True,options={'distance':60})

In [67]:
doc12 = nlp(u"Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.")

In [70]:
displacy.render(doc12,style='ent',jupyter=True)

In [None]:
displacy.serve(doc12,style='dep')




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...



127.0.0.1 - - [17/Sep/2022 20:09:52] "GET / HTTP/1.1" 200 14245
127.0.0.1 - - [17/Sep/2022 20:09:53] "GET /favicon.ico HTTP/1.1" 200 14245


Stemming :

In [2]:
import nltk

In [3]:
from nltk.stem.porter import PorterStemmer

In [4]:
p_stemmer = PorterStemmer()

In [5]:
words = ['courir','coureur','courait','cours','facilement','dificilement']

In [6]:
for word in words:
    print(word + "---->" + p_stemmer.stem(word))

courir---->courir
coureur---->coureur
courait---->courait
cours---->cour
facilement---->facil
dificilement---->dificil


In [7]:
from nltk.stem.snowball import FrenchStemmer

In [8]:
f_stemmer = FrenchStemmer()

In [9]:
for word in words:
    print(word + "---->" + f_stemmer.stem(word))

courir---->cour
coureur---->coureur
courait---->cour
cours---->cour
facilement---->facil
dificilement---->dificil


Lemmatization :

In [10]:
import spacy

In [11]:
nlp = spacy.load('en_core_web_sm')

In [12]:
doc1 = nlp(u"I am a runner running in a race because I love to run since I ran today.")

In [13]:
for token in doc1:
    print(token.text,'\t',token.pos_,'\t',token.lemma,'\t',token.lemma_)

I 	 PRON 	 4690420944186131903 	 I
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 4690420944186131903 	 I
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 4690420944186131903 	 I
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today
. 	 PUNCT 	 12646065887601541794 	 .


In [14]:
def show_lemmas(text):
    for token in text:
        print(f'{token.text:{12}} {token.pos_:{6}} {token.lemma:<{22}} {token.lemma_}')

In [15]:
doc2 = nlp(u"I saw ten mice today!")

In [16]:
show_lemmas(doc2)

I            PRON   4690420944186131903    I
saw          VERB   11925638236994514241   see
ten          NUM    7970704286052693043    ten
mice         NOUN   1384165645700560590    mouse
today        NOUN   11042482332948150395   today
!            PUNCT  17494803046312582752   !


In [17]:
show_lemmas(doc1)

I            PRON   4690420944186131903    I
am           AUX    10382539506755952630   be
a            DET    11901859001352538922   a
runner       NOUN   12640964157389618806   runner
running      VERB   12767647472892411841   run
in           ADP    3002984154512732771    in
a            DET    11901859001352538922   a
race         NOUN   8048469955494714898    race
because      SCONJ  16950148841647037698   because
I            PRON   4690420944186131903    I
love         VERB   3702023516439754181    love
to           PART   3791531372978436496    to
run          VERB   12767647472892411841   run
since        SCONJ  10066841407251338481   since
I            PRON   4690420944186131903    I
ran          VERB   12767647472892411841   run
today        NOUN   11042482332948150395   today
.            PUNCT  12646065887601541794   .


Stop Words :

In [18]:
print(nlp.Defaults.stop_words)

{'your', '’m', 'last', 'every', 'someone', 'towards', "'d", 'are', 'part', 'can', 'another', 'indeed', 'down', 'which', 'during', 'per', 'too', 'both', 'off', 'everyone', 'over', 'same', 'each', 'nor', 'formerly', 'at', 'herself', '‘d', 'few', 'this', 'seem', 'being', 'therein', 'wherein', 'on', 'hundred', 'yet', 'therefore', 'whereby', 'be', 'rather', 'across', 'how', 'ourselves', 'less', "'s", 'very', 'due', 'something', 'just', 'meanwhile', 'that', 'herein', 'after', 'once', 'three', 'while', 'what', 'me', 'though', 'up', 'anyone', 'nine', 'will', 'upon', 'afterwards', 'him', 'before', '’ll', '’ve', 'often', 'call', "n't", 'again', 'top', 'keep', 'least', 'unless', 'thereby', 'neither', 'n’t', 'amongst', 'or', 'have', 'hereupon', 'regarding', 'former', 'with', 'twenty', 'n‘t', 'noone', 'almost', 'otherwise', 'move', 'above', 'had', 'us', 'became', 'namely', 'none', '‘ll', 'most', 'anywhere', 'anything', 'it', 'about', 'somewhere', 'its', 'the', 'thereupon', 'some', 'here', 'than', '

In [19]:
len(nlp.Defaults.stop_words)

326

In [20]:
nlp.vocab['is'].is_stop

True

In [21]:
nlp.vocab['mystery'].is_stop

False

In [22]:
nlp.Defaults.stop_words.add('btw')

In [23]:
nlp.vocab['btw'].is_stop = True

In [24]:
len(nlp.Defaults.stop_words)

327

In [25]:
nlp.vocab['btw'].is_stop

True

In [26]:
nlp.Defaults.stop_words.remove('btw')
nlp.vocab['btw'].is_stop = False

In [27]:
nlp.vocab['btw'].is_stop

False

In [28]:
len(nlp.Defaults.stop_words)

326

Phrase matching and vocabulary :

In [29]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [30]:
from spacy.matcher import Matcher

In [31]:
matcher = Matcher(nlp.vocab)

In [32]:
# SolarPower
# Solar-power
# Solar power

pattern1 = [{'LOWER':'solarPower'}]
pattern2 = [{'LOWER':'solar'},{'IS_PUNCT':True},{'LOWER':'power'}]
pattern3 = [{'LOWER':'solar'},{'LOWER':'power'}]

In [34]:
matcher.add('SolarPower',[pattern1,pattern2,pattern3])

In [38]:
doc = nlp(u'The Solar Power industry continues to grow as demand \
for solarpower increases. Solar-power cars are gaining popularity.')

In [39]:
found_matches = matcher(doc)

In [40]:
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 13, 16)]


In [41]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    span = doc[start:end]                    # get the matched span
    print(match_id, string_id, start, end, span.text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 13 16 Solar-power


In [42]:
# Redefine the patterns:
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP':'*'}, {'LOWER': 'power'}]

# Remove the old patterns to avoid duplication:
matcher.remove('SolarPower')

# Add the new set of patterns to the 'SolarPower' matcher:
matcher.add('SolarPower', [pattern1, pattern2])

In [43]:
found_matches = matcher(doc)
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 10, 11), (8656102463236116519, 13, 16)]


In [44]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    span = doc[start:end]                    # get the matched span
    print(match_id, string_id, start, end, span.text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 10 11 solarpower
8656102463236116519 SolarPower 13 16 Solar-power


In [45]:
doc2 = nlp(u"Solar--power is solarpower yay !")

In [46]:
found_matches = matcher(doc2)
print(found_matches)

[(8656102463236116519, 0, 3), (8656102463236116519, 4, 5)]


In [47]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    span = doc[start:end]                    # get the matched span
    print(match_id, string_id, start, end, span.text)

8656102463236116519 SolarPower 0 3 The Solar Power
8656102463236116519 SolarPower 4 5 continues


In [48]:
from spacy.matcher import PhraseMatcher

In [49]:
matcher = PhraseMatcher(nlp.vocab)

In [50]:
with open('./UPDATED_NLP_COURSE/TextFiles/reaganomics.txt') as f:
    doc3 = nlp(f.read())

In [53]:
phrase_list = ['voodoo economics','supply-side economics','trickle-down economics','free-market economics']

In [55]:
phrase_patterns = [nlp(text) for text in phrase_list]

In [58]:
matcher.add('EconMatcher', [*phrase_patterns])

In [59]:
found_matches = matcher(doc3)

In [60]:
found_matches

[(3680293220734633682, 41, 45),
 (3680293220734633682, 49, 53),
 (3680293220734633682, 54, 56),
 (3680293220734633682, 61, 65),
 (3680293220734633682, 673, 677),
 (3680293220734633682, 2987, 2991)]

In [63]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    span = doc3[start-5:end+5]                    # get the matched span
    print(match_id, string_id, start, end, span.text)

3680293220734633682 EconMatcher 41 45 policies are commonly associated with supply-side economics, referred to as trickle
3680293220734633682 EconMatcher 49 53 economics, referred to as trickle-down economics or voodoo economics by political
3680293220734633682 EconMatcher 54 56 trickle-down economics or voodoo economics by political opponents, and
3680293220734633682 EconMatcher 61 65 by political opponents, and free-market economics by political advocates.


3680293220734633682 EconMatcher 673 677 attracted a following from the supply-side economics movement, which formed in
3680293220734633682 EconMatcher 2987 2991 became widely known as "trickle-down economics", due to the
