In [1]:
!pip install spacy



In [2]:
!python -m spacy download en_core_web_sm
!python -m spacy validate

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 25.5 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[2K[38;5;2m✔ Loaded compatibility table[0m
[1m
[38;5;4mℹ spaCy installation: /usr/local/lib/python3.7/dist-packages/spacy[0m

TYPE      NAME             MODEL            VERSION                            
package   en-core-web-sm   en_core_web_sm   [38;5;2m2.2.5[0m   [38;5;2m✔[0m
link      en               en_core_web_sm   [38;5;2m2.2.5[0m   [38;5;2m✔[0m



In [3]:
import spacy
# Load the small English model downloaded above; `nlp` is the object later cells call on raw text.
nlp = spacy.load("en_core_web_sm")

In [4]:
# Run the loaded pipeline on a short sentence and keep the processed document.
doc = nlp("This is a example text")
# Bare last expression: the notebook displays the document (shown as its text in the recorded output).
doc

This is a example text

In [5]:
from spacy.lang.en import English

# Create the nlp object from the English language class.
# NOTE: this rebinds `nlp`, replacing the en_core_web_sm pipeline loaded in an earlier cell.
nlp = English()

# Process a text
doc = nlp("Progress to Contributor to make your voice count!")

# Print the document text
print(doc.text)

Progress to Contributor to make your voice count!


In [7]:
# NOTE: English was already imported and instantiated in an earlier cell; repeating
# both lines here lets this cell run without that one.
from spacy.lang.en import English
nlp = English()

# Process the text
doc = nlp("I like tree kangaroos and narwhals.")

# Select the first token (the doc is indexed by token position)
first_token = doc[0]
# Print the first token's text
print(first_token.text)

I


In [8]:
# Walk the doc from the previous cell and print each token's text on its own line.
for token in doc:
    print(token.text)

I
like
tree
kangaroos
and
narwhals
.


In [9]:
doc = nlp("In 1990, more than 60% of people in East Asia were in extreme poverty. Now less than 4% are.")

# Report every number-like token that is immediately followed by a '%' token.
for token in doc:
    # A number-like token at the very end of the doc has no successor;
    # skip it instead of letting doc[token.i + 1] raise IndexError.
    if not token.like_num or token.i + 1 >= len(doc):
        continue
    # Get the next token in the document
    next_token = doc[token.i + 1]
    # Check if the next token's text equals '%'
    if next_token.text == '%':
        print('Percentage found:', token.text)

Percentage found: 60
Percentage found: 4


<h1>POS-tagging and Lemmatization</h1>

You can look up the lemma of each token as well as its part of speech. Use the `token.lemma_` attribute for lemmas and the `token.pos_` attribute for parts of speech.

In [10]:
# Reload the en_core_web_sm model: `nlp` was rebound to English() in earlier cells.
nlp = spacy.load("en_core_web_sm")
doc = nlp("This is an another example text.")
# Coarse-grained part-of-speech tags, one per token (displayed as the cell output)
[token.pos_ for token in doc]

['DET', 'AUX', 'DET', 'DET', 'NOUN', 'NOUN', 'PUNCT']

In [11]:
doc = nlp("Microsoft News delivers news from the most popular and trusted publishers.")
# One line per token: surface text, lemma, and coarse part-of-speech tag.
row_format = "{0} – {1} – {2}"
for token in doc:
    print(row_format.format(token.text, token.lemma_, token.pos_))

Microsoft – Microsoft – PROPN
News – News – PROPN
delivers – deliver – VERB
news – news – NOUN
from – from – ADP
the – the – DET
most – most – ADV
popular – popular – ADJ
and – and – CCONJ
trusted – trusted – ADJ
publishers – publisher – NOUN
. – . – PUNCT


<h1>Named Entity Recognition</h1>

In [12]:
doc = nlp("Steve Jobs founded Apple")
# Text and label of each named-entity span in the doc (displayed as the cell output)
[(ent.text, ent.label_) for ent in doc.ents]

[('Steve Jobs', 'PERSON'), ('Apple', 'ORG')]

In [13]:
doc = nlp("It’s official: Apple is the first U.S. public company to reach a $1 trillion market value")
# Show each recognized entity as "<text> ==== <label>".
for ent in doc.ents:
    print(' ==== '.join((ent.text, ent.label_)))

Apple ==== ORG
first ==== ORDINAL
U.S. ==== GPE
$1 trillion ==== MONEY


In [14]:
text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

# Run the pipeline over the sentence
doc = nlp(text)

# Tabulate every token: its text, part-of-speech tag and dependency label.
for token in doc:
    token_text, token_pos, token_dep = token.text, token.pos_, token.dep_
    # Left-aligned fixed-width columns (12, 10, 10) purely for readable output
    print('{:<12}{:<10}{:<10}'.format(token_text, token_pos, token_dep))

It          PRON      nsubj     
’s          VERB      punct     
official    NOUN      ccomp     
:           PUNCT     punct     
Apple       PROPN     nsubj     
is          AUX       ROOT      
the         DET       det       
first       ADJ       amod      
U.S.        PROPN     nmod      
public      ADJ       amod      
company     NOUN      attr      
to          PART      aux       
reach       VERB      relcl     
a           DET       det       
$           SYM       quantmod  
1           NUM       compound  
trillion    NUM       nummod    
market      NOUN      compound  
value       NOUN      dobj      


In [15]:
text = "New iPhone X release date leaked as Apple reveals pre-orders by mistake"

# Run the pipeline over the headline
doc = nlp(text)

# List every entity found in the doc together with its label
for entity in doc.ents:
    print(entity.text, entity.label_)

New iPhone EVENT
Apple ORG


In [16]:
text = "New iPhone X release date leaked as Apple reveals pre-orders by mistake"

# Process the text
doc = nlp(text)

# Iterate over the entities
for ent in doc.ents:
    # print the entity text and label
    print(ent.text, ent.label_)

# Get the span for "iPhone X": tokens 1 and 2 (the slice end, 3, is exclusive).
# The recorded entities above cover "New iPhone" instead, so the span is sliced by hand.
iphone_x = doc[1:3]

# Print the span text
print('Missing entity:', iphone_x.text)

New iPhone EVENT
Apple ORG
Missing entity: iPhone X
