## [Chapter 1](https://course.spacy.io/chapter1)

In [1]:
import spacy
# $ python -m spacy download en_core_web_sm
# $ python -m spacy download en_core_web_md
# $ python -m spacy download en_core_web_lg

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
# explain element
spacy.explain('PROPN')

'proper noun'

In [4]:
# Process the text
doc = nlp(
    "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are."
    )

#### Lexical Attr

In [5]:
# Iterate over the tokens in the doc
for token in doc:

    # Check if the token resembles a number
    if token.like_num:
        print(f'Numeric token: {token}')
        
        # Get the next token in the document
        next_token = doc[token.i + 1]
        
        # Check if the next token's text equals '%'
        if next_token.text == "%":
            print(f"is percentage: {token.text} %\n")
        else:
            print(f"not percentage\n")

Numeric token: 1990
not percentage

Numeric token: 60
is percentage: 60 %

Numeric token: 4
is percentage: 4 %



#### Predicting Part-of-speech Tags

In [6]:
# Process a text
doc = nlp("She ate the pizza")

# Iterate over the tokens
for token in doc:
    # Print the text and the predicted part-of-speech tag
    print(token.text, token.pos_, token.dep_, token.head.text)

She PRON nsubj ate
ate VERB ROOT ate
the DET det pizza
pizza NOUN dobj ate


#### NER in context

In [7]:
text = "New iPhone X release date leaked as Apple reveals pre-orders by mistake"

# Process the text
doc = nlp(text)

# Iterate over the entities
for ent in doc.ents:
    # Print the entity text and label
    print(ent.text, ent.label_)

# Get the span for "iPhone X"
iphone_x = doc[1:3]

# Print the span text
print("Missing entity:", iphone_x.text)

Apple ORG
Missing entity: iPhone X


#### Rule-based matching

In [8]:
# Import the Matcher
from spacy.matcher import Matcher

# Load a model and create the nlp object
nlp = spacy.load('en_core_web_sm')

# Initialize the matcher with the shared vocab
matcher = Matcher(nlp.vocab)

# Add the pattern to the matcher
pattern = [{'TEXT': 'iPhone'}, {'TEXT': 'X'}]
matcher.add('IPHONE_PATTERN', None, pattern)

# Process some text
doc = nlp("New iPhone X release date leaked")

# Call the matcher on the doc
matches = matcher(doc)

# Iterate over the matches
for match_id, start, end in matches:
    # Get the matched span
    matched_span = doc[start:end]
    print(matched_span.text)

New
iPhone
X
release
date
leaked


In [9]:
# add custome patterns to matcher
pattern = [
    {'IS_DIGIT': True},
    {'LOWER': 'fifa'},
    {'LOWER': 'world'},
    {'LOWER': 'cup'},
    {'IS_PUNCT': True}
]
matcher.add('FIFA_PATTERN', None, pattern)

pattern = [
    {'LEMMA': 'love', 'POS': 'VERB'},
    {'POS': 'NOUN'}
]
matcher.add('LOVE_PATTERN', None, pattern)

doc = nlp("New iPhone X release date leaked. I loved dogs but now I love cats more. 2018 FIFA World Cup: France won!")

# Call the matcher on the doc
matches = matcher(doc)

# Iterate over the matches
for match_id, start, end in matches:
    # Get the matched span
    matched_span = doc[start:end]
    print(matched_span.text)

New
iPhone
X
release
date
leaked
.
I
loved
loved dogs
dogs
but
now
I
love
cats
more
.
2018
FIFA
World
Cup
2018 FIFA World Cup:
:
France
won
!
