# Spacy: Chapter 1

In [1]:
import spacy
from spacy.lang.en import English

#### 1. Print the first letter

In addition to printing the first letter, we will print every letter in the form of a list

In [2]:
# nlp object
nlp = English()

In [3]:
doc = nlp("Hi, my name is viraj5 55")

In [4]:
for token in doc:
    print (token.text)

Hi
,
my
name
is
viraj5
55


In [5]:
first_letter = doc[1]

In [6]:
print (first_letter.text)

,


#### 2. Index position and word for each word in the sentence.

In [7]:
[token.i for token in doc]

[0, 1, 2, 3, 4, 5, 6]

In [8]:
[token.text for token in doc]

['Hi', ',', 'my', 'name', 'is', 'viraj5', '55']

#### 3. True or False - Alpha or not, punct or not, number or not

In [9]:
[token.is_alpha for token in doc]

[True, False, True, True, True, False, False]

In [10]:
[token.is_punct for token in doc]

[False, True, False, False, False, False, False]

In [11]:
[token.like_num for token in doc]

[False, False, False, False, False, False, True]

Summary so far:
- Wrap text into nlp to start processing pipeline.
- Tokens can be accessed from the text you sumbitted to the nlp object
- slice/index the object, even puncuation is identified as a seperate object
- Check if tokens are alpha, punctuation or a number

#### Lexial attributes 
In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are.
    
How can you extract the % values from the text?


In [12]:
doc = nlp("In 1990, more than 60% of people in East Asia were in extreme poverty. " "Now less than 4% are.")

In [13]:
for token in doc:
    #Applying the rules below token by token
    if token.like_num:
        #checking the token
        next_token = doc[token.i+1]
        if next_token.text == "%":
            print(str(token) + str(next_token))

60%
4%


#### Statistical models - Predicting Linguistic annotations




In [34]:
nlp = spacy.load("en_core_web_sm")

In [35]:
text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

# Process the text
doc = nlp(text)

for token in doc:
    # Get the token text, part-of-speech tag and dependency label
    token_text = token.text
    token_pos = token.pos_
    token_dep = token.dep_
    # This is for formatting only
    print(f"{token_text:<12}{token_pos:<10}{token_dep:<10}")

It          PRON      nsubj     
’s          VERB      ccomp     
official    ADJ       dobj      
:           PUNCT     punct     
Apple       PROPN     nsubj     
is          AUX       ROOT      
the         DET       det       
first       ADJ       amod      
U.S.        PROPN     nmod      
public      ADJ       amod      
company     NOUN      attr      
to          PART      aux       
reach       VERB      relcl     
a           DET       det       
$           SYM       quantmod  
1           NUM       compound  
trillion    NUM       nummod    
market      NOUN      compound  
value       NOUN      dobj      


It's fine if you don't know what any of those terms mean as you can explain them as seen below

In [36]:
spacy.explain("PROPN")

'proper noun'

In [37]:
spacy.explain("DET")

'determiner'

In [38]:
spacy.explain("dobj")

'direct object'

In [39]:
spacy.explain("compound")

'compound'

In [40]:
spacy.explain("ROOT")

In [41]:
spacy.explain("nmod")

'modifier of nominal'

In [42]:
spacy.explain("quantmod")

'modifier of quantifier'

In [43]:
spacy.explain("amod")

'adjectival modifier'

#### Statistical models - Predicting Entities

In [48]:
text = "It’s official: Lloyds is the first U.S. public company to reach a $1 trillion market value"

# Process the text
doc = nlp(text)

# Iterate over the predicted entities
for ent in doc.ents:
    # Print the entity text and its label
    print(ent.text, ent.label_)

Lloyds ORG
first ORDINAL
U.S. GPE
$1 trillion MONEY


This example below shows the model couldn't predict IPhone X

In [25]:
text = "Upcoming iPhone X release date leaked as Apple reveals pre-orders"

# Process the text
doc = nlp(text)

# Iterate over the entities
for ent in doc.ents:
    # Print the entity text and label
    print(ent.text, ent.label_)

# Get the span for "iPhone X"
iphone_x = doc[1:3]

# Print the span text
print("Missing entity:", iphone_x.text)

Upcoming iPhone X PERSON
Apple ORG
Missing entity: iPhone X


#### Rule Based Matchning

Rule based method to finding words and phrases in text. <br>

Here we will see how to search for iPhone X in the doc by adding it to the matcher

Match patterns are a list of dictionaries. Each dictionary describes one token. The keys are the name of token attributes , mapped to their expected value..


In [26]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

# Add the pattern to the matcher
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]
matcher.add("IPHONE_PATTERN", None, pattern)

# Process some text
doc = nlp("Upcoming iPhone X release date leaked")

# Call the matcher on the doc
matches = matcher(doc)

In [27]:
doc = nlp("Upcoming iPhone X release date leaked")
matches = matcher(doc)

# Iterate over the matches
for match_id, start, end in matches:
    # Get the matched span
    matched_span = doc[start:end]
    print(matched_span.text)

iPhone X


#### Matching examples

Let's try this again with a few more examples

In [28]:
matcher = Matcher(nlp.vocab)

pattern = [
    {"IS_DIGIT": True}, #Digit is present
    {"LOWER": "fifa"},  #followed by fifa in any case
    {"LOWER": "world"}, #followed by world
    {"LOWER": "cup"},   #followed by cup
    {"IS_PUNCT": True}  #followed by a puncuation
]                       #In this particular order

matcher.add("IPHONE_PATTERN", None, pattern)

doc = nlp("2018 FIFA World Cup: France won! and 2018 FIFA World Cup")
matches = matcher(doc)

for match_id, start, end in matches:
    # Get the matched span
    matched_span = doc[start:end]
    print(matched_span.text)


2018 FIFA World Cup:


In [29]:
pattern = [
    {"LEMMA": "love", "POS": "VERB"},
    {"POS": "NOUN"}
]

matcher.add("love pattern",None, pattern)

doc = nlp("I loved dogs but now I love cats more.")
matches = matcher(doc)

for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span.text)

loved dogs
love cats


In [30]:
def matcher_func(text):
    doc = nlp(text)
    matches = matcher(doc)
    
    for match_id, start,end in matches:
        matched_span = doc[start:end]
        print(matched_span.text)
    

In [31]:
pattern = [
    {"LEMMA": "buy"},
    {"POS": "DET", "OP": "?"},  # optional: match 0 or 1 times, so if there is a determinter like the or a, include it
    {"POS": "NOUN"}
]
matcher.add("App pattern", None, pattern)

matcher_func("I bought a smartphone. Now I'm buying apps.")

bought a smartphone
buying apps


Relevant documentation can be found here
https://spacy.io/usage/rule-based-matching 


In [32]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

doc = nlp(
    "After making the iOS update you won't notice a radical system-wide "
    "redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of "
    "iOS 11's furniture remains the same as in iOS 10. But you will discover "
    "some tweaks once you delve a little deeper."
)

# Write a pattern for full iOS versions ("iOS 7", "iOS 11", "iOS 10")
pattern = [{"TEXT": "iOS"}, {"IS_DIGIT": True}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("IOS_VERSION_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)

Total matches found: 3
Match found: iOS 7
Match found: iOS 11
Match found: iOS 10


In [33]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

doc = nlp(
    "Features of the app include a beautiful design, smart search, automatic "
    "labels and optional voice responses."
)

# Write a pattern for adjective plus one or two nouns
pattern = [{"POS": "ADJ"}, {"POS": "NOUN"}, {"POS": "NOUN", "OP": "?"}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("ADJ_NOUN_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)

Total matches found: 5
Match found: beautiful design
Match found: smart search
Match found: automatic labels
Match found: optional voice
Match found: optional voice responses
