In [1]:
import spacy

In [2]:
nlp = spacy.load('fr_core_news_lg')

In [3]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [4]:
print(doc.text)

The quick brown fox jumped over the lazy dog's back.


In [5]:
print(doc[4])

jumped


In [6]:
print(doc[4].pos_)

VERB


In [7]:
print(doc[4].tag_)

VBD


In [10]:
for token in doc:
    print(f"{token.text:{8}} {token.pos_:{5}} {token.tag_:{5}} {spacy.explain(token.tag_)}")

The      DET   DT    determiner
quick    ADJ   JJ    adjective (English), other noun-modifier (Chinese)
brown    ADJ   JJ    adjective (English), other noun-modifier (Chinese)
fox      NOUN  NN    noun, singular or mass
jumped   VERB  VBD   verb, past tense
over     ADP   IN    conjunction, subordinating or preposition
the      DET   DT    determiner
lazy     ADJ   JJ    adjective (English), other noun-modifier (Chinese)
dog      NOUN  NN    noun, singular or mass
's       PART  POS   possessive ending
back     NOUN  NN    noun, singular or mass
.        PUNCT .     punctuation mark, sentence closer


In [11]:
doc = nlp(u"I read books on NLP.")

In [12]:
word = doc[1]

In [13]:
word.text

'read'

In [14]:
token = word
print(f"{token.text:{8}} {token.pos_:{5}} {token.tag_:{5}} {spacy.explain(token.tag_)}")

read     VERB  VBD   verb, past tense


In [15]:
doc = nlp(u"I read a book on NLP.")

In [16]:
word = doc[1]
token = word
print(f"{token.text:{8}} {token.pos_:{5}} {token.tag_:{5}} {spacy.explain(token.tag_)}")

read     VERB  VBD   verb, past tense


In [17]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [18]:
POS_counts = doc.count_by(spacy.attrs.POS)

In [19]:
POS_counts

{90: 2, 84: 3, 92: 3, 100: 1, 85: 1, 94: 1, 97: 1}

In [20]:
doc.vocab[90].text

'DET'

In [26]:
doc[0].pos

90

In [27]:
doc[0].pos_

'DET'

In [28]:
for k,v in sorted(POS_counts.items()):
    print(f"{k}. {doc.vocab[k].text:{5}} {v}")

84. ADJ   3
85. ADP   1
90. DET   2
92. NOUN  3
94. PART  1
97. PUNCT 1
100. VERB  1


In [29]:
TAG_counts = doc.count_by(spacy.attrs.TAG)

for k,v in sorted(TAG_counts.items()):
    print(f"{k}. {doc.vocab[k].text:{5}} {v}")

74. POS   1
1292078113972184607. IN    1
10554686591937588953. JJ    3
12646065887601541794. .     1
15267657372422890137. DT    2
15308085513773655218. NN    3
17109001835818727656. VBD   1


In [31]:
DEP_counts = doc.count_by(spacy.attrs.DEP)

for k,v in sorted(DEP_counts.items()):
    print(f"{k}. {doc.vocab[k].text:{5}} {v}")

402. amod  3
415. det   2
429. nsubj 1
439. pobj  1
440. poss  1
443. prep  1
445. punct 1
8110129090154140942. case  1
8206900633647566924. ROOT  1


In [59]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [33]:
doc = nlp(u"The quick brown fox jumped over the lazy dog.")

In [34]:
from spacy import displacy

In [35]:
displacy.render(doc,style='dep',jupyter=True)

In [36]:
options = {'distance':110,'compact':'True','color':'yellow','bg':'#09a3d5','font':'Times'}

In [37]:
displacy.render(doc,style='dep',jupyter=True,options=options)

In [38]:
doc2 = nlp(u"This is a sentence. This is another sentence, possibly longer than the other. ")

In [39]:
spans = list(doc2.sents)

In [None]:
displacy.serve(spans,style='dep',options=options)




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...



127.0.0.1 - - [18/Sep/2022 21:07:59] "GET / HTTP/1.1" 200 10875
127.0.0.1 - - [18/Sep/2022 21:07:59] "GET /favicon.ico HTTP/1.1" 200 10875


Named entity recognition :

In [8]:
import spacy
nlp = spacy.load("fr_core_news_sm")

In [9]:
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text + ' - ' + ent.label_ + ' - ' + str(spacy.explain(ent.label_)))
    else:
        print("No entities found")

In [10]:
doc = nlp(u"Hi how are you ?")

In [11]:
show_ents(doc)

No entities found


In [12]:
doc = nlp(u"Je vais peut être aller à Paris, le moisprochain pour voir les monuments Parisiens.")

In [13]:
show_ents(doc)

Paris - LOC - Non-GPE locations, mountain ranges, bodies of water
Parisiens - LOC - Non-GPE locations, mountain ranges, bodies of water


In [16]:
doc = nlp(u"Recherche le trajet le plus rapide entre Nantes et Paris")

In [17]:
show_ents(doc)

Nantes - LOC - Non-GPE locations, mountain ranges, bodies of water
Paris - LOC - Non-GPE locations, mountain ranges, bodies of water


In [18]:
doc = nlp(u"Puis-je avoir s'il vous plaît 500 euros d'actions Microsoft ?")

In [19]:
show_ents(doc)

Microsoft - ORG - Companies, agencies, institutions, etc.


In [22]:
doc = nlp(u"Tesla a construit une usine en Angleterre pour $6 million.")

In [23]:
show_ents(doc)

Tesla - PER - Named person or family.
Angleterre - LOC - Non-GPE locations, mountain ranges, bodies of water


In [24]:
from spacy.tokens import Span

In [25]:
ORG = doc.vocab.strings[u"ORG"]

In [26]:
ORG

383

In [27]:
new_ent = Span(doc,0,1,label=ORG)

In [30]:
new_ent.label_

'ORG'

In [53]:
doc.ents = list(doc.spans) +  [new_ent]

In [54]:
show_ents(doc)

Tesla - ORG - Companies, agencies, institutions, etc.


In [60]:
doc = nlp(u"Our company create a brand new vacuum cleaner."
         u"This new vacuum-cleaner is the best in show.")

In [61]:
show_ents(doc)

No entities found


In [62]:
from spacy.matcher import PhraseMatcher

In [63]:
matcher = PhraseMatcher(nlp.vocab)

In [64]:
phrase_list = ['vacuum cleaner','vacuum-cleaner']

In [65]:
phrase_patterns = [nlp(text) for text in phrase_list]

In [66]:
matcher.add('newproduct',None,*phrase_patterns)

In [67]:
found_matches = matcher(doc)

In [68]:
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [69]:
from spacy.tokens import Span

In [70]:
PROD = doc.vocab.strings[u"PRODUCT"]

In [72]:
new_ents = [Span(doc,match[1],match[2],label=PROD) for match in found_matches]

In [73]:
doc.ents = list(doc.ents) + new_ents

In [74]:
show_ents(doc)


vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vacuum-cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)


In [77]:
doc = nlp(u"Originally I paid $29.95 for this car toy, but now itis marked down by 10 dollars.")

In [79]:
[ent for ent in doc.ents if ent.label_ == "MONEY"]

[29.95, 10 dollars]

In [80]:
doc = nlp(u"I go to New York in may.")

In [85]:
ent = [ent for ent in doc.ents if ent.label_ == "GPE" or ent.label_ == "LOC"]

In [88]:
ent[0]

New York

In [89]:
import spacy

In [91]:
nlp = spacy.load('en_core_web_sm')

In [92]:
from spacy import displacy

In [95]:
doc = nlp(u"Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million."
         u"By contrast, Sony only sold 8 thousand Walkman music players.")

In [96]:
displacy.render(doc,style='ent',jupyter=True)

In [97]:
for sent in doc.sents:
    displacy.render(nlp(sent.text),style='ent',jupyter=True)

In [107]:
colors = {'ORG':'red'}
options = {'ent':['PRODUCT','ORG'],'colors':colors}

In [108]:
displacy.render(doc,style='ent',jupyter=True,options=options)

In [None]:
displacy.serve(doc,style='ent',options=options)




Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...



127.0.0.1 - - [19/Sep/2022 21:32:58] "GET / HTTP/1.1" 200 2653
127.0.0.1 - - [19/Sep/2022 21:32:58] "GET /favicon.ico HTTP/1.1" 200 2653


In [3]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [6]:
doc = nlp(u"This is the first sentence. This is another sentence.This it the last sentence.")

In [7]:
for sent in doc.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This it the last sentence.


In [8]:
doc.sents[0]

TypeError: 'generator' object is not subscriptable

In [10]:
doc[0]

This

In [11]:
doc.sents[0]

TypeError: 'generator' object is not subscriptable

In [12]:
list(doc.sents)[0]

This is the first sentence.

In [15]:
doc = nlp(u'"Management is doing the right things; leadership is doing the right things." -Peter Drucker')

In [16]:
doc.text

'"Management is doing the right things; leadership is doing the right things." -Peter Drucker'

In [17]:
for sent in doc.sents:
    print(sent)
    print('\n')

"Management is doing the right things; leadership is doing the right things."


-Peter Drucker




In [None]:
#ADD A SEGMENTATION RULE
