In [1]:
# import the library
import spacy

In [2]:
# Create a blank English pipeline: it carries the language-specific ("en")
# tokenization rules and exceptions that the processing pipeline is built on.
nlp = spacy.blank("en")

In [3]:
# Create a Doc object: calling nlp passes the text to the tokenizer, and no
# information from the original text is removed.
doc = nlp("this is a test sentences!")

In [4]:
# Iterating over the Doc yields its tokens in order; print each token's text.
for token in doc:
    print(token.text)

this
is
a
test
sentences
!


In [5]:
# Access a specific token by its (0-based) index in the Doc.
print(doc[4])

sentences


In [6]:
# A Span is a slice of the Doc that contains one or more tokens.
# doc[1:4] covers tokens 1, 2 and 3 (the end index is exclusive).
span = doc[1:4]
print(span.text)

is a test


In [7]:
# More token attributes: position in the Doc (token.i), the text, and the
# lexical flags is_alpha, is_punct and like_num.
# The template is split into adjacent f-strings; the four leading spaces in the
# second and third pieces are part of the printed line.
for token in doc:
    print(
        f"Token index is {token.i} \t {token.text:10} "
        f"    alpha: {token.is_alpha} \t  punct: {token.is_punct} \t "
        f"    number: {token.like_num}"
    )

Token index is 0 	 this           alpha: True 	  punct: False 	     number: False
Token index is 1 	 is             alpha: True 	  punct: False 	     number: False
Token index is 2 	 a              alpha: True 	  punct: False 	     number: False
Token index is 3 	 test           alpha: True 	  punct: False 	     number: False
Token index is 4 	 sentences      alpha: True 	  punct: False 	     number: False
Token index is 5 	 !              alpha: False 	  punct: True 	     number: False


In [8]:
# Process a longer text. The two adjacent string literals are concatenated
# into a single string before being passed to nlp.
doc = nlp("In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are.")

In [9]:
# Print every token of the Doc, one per line, to inspect the tokenization.
for token in doc:
    print(token.text)

In
1990
,
more
than
60
%
of
people
in
East
Asia
were
in
extreme
poverty
.
Now
less
than
4
%
are
.


In [10]:
# Find percentages: a number-like token that is immediately followed by "%".
for token in doc:
    if not token.like_num:
        continue
    # Guard the look-ahead: a number-like token at the very end of the Doc has
    # no successor, and indexing doc[len(doc)] would raise IndexError.
    if token.i + 1 >= len(doc):
        continue
    next_token = doc[token.i + 1]
    if next_token.text == '%':
        print('percentage found: {}'.format(token.text))

percentage found: 60
percentage found: 4


In [11]:
# Download and install the small English pipeline package (en_core_web_sm).
# NOTE(review): `!python` runs whichever interpreter the shell resolves first,
# which may not be this kernel's environment — confirm the package is installed
# where the kernel can import it.
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl (12.8 MB)
     --------------------------------------- 12.8/12.8 MB 13.9 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.3.0
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [12]:
# Load a pretrained spaCy pipeline to discover more about the words; the
# following cells read part-of-speech tags, dependency labels and named
# entities from the Docs it produces.
nlp = spacy.load("en_core_web_sm")

In [13]:
# Run the pretrained pipeline on an example sentence.
doc = nlp("She ate a big slice of the pizza.")

In [15]:
# Print each token next to its part-of-speech tag (token.pos_), left-aligned
# in columns of width 10 and 5.
for token in doc:
    print(f'{token.text:10} {token.pos_:5}')

She        PRON 
ate        VERB 
a          DET  
big        ADJ  
slice      NOUN 
of         ADP  
the        DET  
pizza      NOUN 
.          PUNCT


In [16]:
# For each token print: text, part-of-speech tag (pos_), dependency label
# (dep_) and the text of its head token (head.text), in width-10 columns.
for token in doc:
    print(f'{token.text:10} {token.pos_:10} {token.dep_:10} {token.head.text:10}')

She        PRON       nsubj      ate       
ate        VERB       ROOT       ate       
a          DET        det        slice     
big        ADJ        amod       slice     
slice      NOUN       dobj       ate       
of         ADP        prep       slice     
the        DET        det        pizza     
pizza      NOUN       pobj       of        
.          PUNCT      punct      ate       


In [17]:
# Find the named entities in a text, such as companies, places and amounts.
doc = nlp("Apple is looking at buying U.K. startup for $1 billion.")

In [21]:
# doc.ents iterates over the named-entity spans found in the Doc; print each
# entity's text (padded to width 15) and its label string (label_).
for ent in doc.ents:
    print(f'{ent.text:15} {ent.label_}')

Apple           ORG
U.K.            GPE
$1 billion      MONEY


In [22]:
# Look up the description of a tag or label abbreviation with spacy.explain.
print(spacy.explain('GPE'))

Countries, cities, states


In [24]:
# Description of the 'amod' dependency label.
print(spacy.explain('amod'))

adjectival modifier


In [46]:
# Description of the 'det' dependency label.
print(spacy.explain('det'))

determiner


In [49]:
# Description of the 'op' label.
# NOTE(review): the dependency parse printed earlier in this notebook uses the
# label 'pobj', not 'op' — confirm which label was meant to be explained here.
print(spacy.explain('op'))

prepositional object


In [26]:
# rule based matching to find in tokenized text
from spacy.matcher import Matcher

In [28]:
# Load the pipeline (the same en_core_web_sm package loaded earlier in the
# notebook) and create a Matcher that shares the pipeline's vocabulary.
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

In [29]:
# A pattern is a list of dicts, one dict per token to match. This one matches
# a token whose exact text is "iPhone" followed by a token whose text is "X".
pattern = [{"TEXT": "iPhone"}, {"TEXT":"X"}]
# Register the pattern under a name; add() takes a list of patterns.
matcher.add("IPHONE_PATTERN", [pattern])

In [30]:
# Text to run the matcher on.
doc = nlp("Upcoming iPhone X release date leaked")

In [35]:
# Apply the matcher to the Doc; each result is unpacked below as
# (match_id, start, end).
matches = matcher(doc)

In [36]:
# start and end are token indices into the Doc; slice the Doc to get the
# matched span and print its text.
for match_id,start,end in matches:
    matched_span = doc[start:end]
    print(matched_span.text)

iPhone X


In [38]:
# Match on lexical attributes: a digit token, then tokens whose lowercase form
# is "fifa", "world" and "cup", then a punctuation token.
pattern = [
    {"IS_DIGIT": True},
    {"LOWER": "fifa"},
    {"LOWER": "world"},
    {"LOWER": "cup"},
    {"IS_PUNCT": True},
]
# The shared matcher keeps every pattern added in earlier cells as well.
matcher.add("FIFA_WORLDCUP", [pattern])

doc = nlp("2018 FIFA World Cup: France won!")
matches = matcher(doc)

# Each match is (match_id, start, stop); slice the Doc to recover the span.
for match_id, start, stop in matches:
    matched_span = doc[start:stop]
    print(matched_span)

2018 FIFA World Cup:


In [44]:
# Match on other token attributes: a verb whose lemma is "love", followed by
# a noun.
pattern = [
    {"LEMMA": "love", "POS": "VERB"},
    {"POS": "NOUN"},
]
# The shared matcher keeps every pattern added in earlier cells as well.
matcher.add("LOVECATDOGS", [pattern])

doc = nlp("I loved dogs but now I love cats more.")
matches = matcher(doc)

# Each match is (match_id, start, stop); slice the Doc to recover the span.
for match_id, start, stop in matches:
    matched_span = doc[start:stop]
    print(matched_span)

loved dogs
love cats


In [45]:
# Match using operators and quantifiers. "OP": "?" makes a token optional
# (match 0 or 1 times), so this finds a token with lemma "buy", an optional
# determiner, and then a noun.
pattern = [
    {"LEMMA": "buy"},
    {"POS": "DET", "OP": "?"},
    {"POS": "NOUN"},
]
# The shared matcher keeps every pattern added in earlier cells as well.
matcher.add("BUYSTUFF", [pattern])

doc = nlp("I bought a smartphone. Now I'm buying apps.")
matches = matcher(doc)

# Each match is (match_id, start, stop); slice the Doc to recover the span.
for match_id, start, stop in matches:
    matched_span = doc[start:stop]
    print(matched_span)


bought a smartphone
buying apps


In [None]:
# Some of the operators and quantifiers available in Matcher patterns:
# {"OP": "!"}	Negation: match 0 times
# {"OP": "?"}	Optional: match 0 or 1 times
# {"OP": "+"}	Match 1 or more times
# {"OP": "*"}	Match 0 or more times