# Introduction to Advanced NLP with spaCy

In [1]:
# import the English language class from spacy
import spacy
from spacy.lang.en import English

The __English__ class object includes language-specific rules for tokenization: words, numbers, and punctuation.

In [2]:
# Instantiate an English NLP object from the imported language class.
nlp = English()

# Run a sample sentence through `nlp`; the processed result is kept in `doc`.
doc = nlp("This is an introductory lesson to spaCy")

## Documents, spans, and tokens

In [3]:
# Run the sentence through the pipeline to obtain a Doc.
doc = nlp("I like tree kangaroos and narwhals.")

# Token slice 2..3 of the Doc: "tree kangaroos"
tree_kangaroos = doc[2:4]

# Token slice 2..5 of the Doc: "tree kangaroos and narwhals" (the final "." is excluded)
tree_kangaroos_and_narwhals = doc[2:6]

# Show the text of each slice, shortest first.
for span in (tree_kangaroos, tree_kangaroos_and_narwhals):
    print(span.text)

tree kangaroos
tree kangaroos and narwhals


## Tokens and Characters
Accessing words in variable `doc`.

In [4]:
# Access the entire string of the document using the `.text` attribute.
# As the last expression in the cell, its value is shown as the cell output.
doc.text

'I like tree kangaroos and narwhals.'

In [5]:
# Index the document with a single integer to access its first token.
doc[0]

I

## Span Object
Using slice notation on a `Doc` object returns a `Span` object. A span is merely a __view__ of the document; it does not hold a copy of the data.

In [6]:
# Use slice notation to view the first 3 tokens of the document (indices 0, 1 and 2).
doc[0:3]

I like tree

In [7]:
# `.text` is a plain string, so slicing it works on characters rather than tokens:
# this gives the first 4 characters of the document.
doc.text[0:4]

'I li'

## Import other languages

In [8]:
# Import the German and Spanish language classes.
from spacy.lang.de import German
from spacy.lang.es import Spanish

# Instantiate the German and Spanish nlp objects.
nlp_german, nlp_spanish = German(), Spanish()

# Process one short text with each language's nlp object.
doc_german = nlp_german("Liebe Grüße!")
doc_spanish = nlp_spanish("¿Cómo estás?")

# Print the full text of each doc, German first.
for greeting in (doc_german, doc_spanish):
    print(greeting.text)

Liebe Grüße!
¿Cómo estás?


## Lexical Attributes

In [9]:
# Print the index of each token in the document (`token.i`).
print("Index:   ", [token.i for token in doc])

# Print the text of each token in the document (`token.text`).
# The label is "Text:" (padded to the same width as "Index:") so the two
# printed lists are not both announced as indices.
print("Text:    ", [token.text for token in doc])

Index:    [0, 1, 2, 3, 4, 5, 6]
Index:    ['I', 'like', 'tree', 'kangaroos', 'and', 'narwhals', '.']


## Lexical Conditional Operators
- `token.is_alpha` returns True if the token consists only of alphabetic characters, False otherwise.
- `token.is_punct` returns True if the token is punctuation, False otherwise.
- `token.like_num` returns True if the token resembles a number, e.g. "10" or the numeric word "ten", False otherwise.

In [10]:
# Create a separate document, `doc_lexi`, for the lexical-attribute examples.
# The text mixes ordinary words, a number word, digits, a currency amount and punctuation.
doc_lexi = nlp("There are ten houses priced at $10,000,000 dollars. You bought 5. How much did it cost?")

In [11]:
# Flag, token by token, whether the token consists only of alphabetic characters.
alpha_flags = [tok.is_alpha for tok in doc_lexi]
print("is_alpha:", alpha_flags)

is_alpha: [True, True, True, True, True, True, False, False, True, False, True, True, False, False, True, True, True, True, True, False]


In [12]:
# Flag, token by token, whether the token is a punctuation character.
punct_flags = [tok.is_punct for tok in doc_lexi]
print("is_punct:", punct_flags)

is_punct: [False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, True]


In [13]:
# Flag, token by token, whether the token resembles a number (digits or a number word).
num_flags = [tok.like_num for tok in doc_lexi]
print("like_num:", num_flags)

like_num: [False, False, True, False, False, False, False, True, False, False, False, False, True, False, False, False, False, False, False, False]


In [14]:
# Example from spaCy docs
nlp = English()

# Process the text
doc = nlp(
    "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are."
)

# Iterate over the tokens in the doc
for token in doc:
    # Only a token that resembles a number can be the numeric part of a percentage.
    if not token.like_num:
        continue
    # A number-like token at the very end of the doc has no following token;
    # skip it rather than indexing past the end of the doc (IndexError).
    if token.i + 1 >= len(doc):
        continue
    # Get the next token in the document
    next_token = doc[token.i + 1]
    # If the number-like token is directly followed by "%", report the numeric token.
    if next_token.text == "%":
        print("Percentage found:", token.text)

Percentage found: 60
Percentage found: 4


# Statistical Models

__`spaCy`__'s statistical models allow a user to predict linguistic attributes in _context_.
- Part-of-speech tags `token.pos_`
- Syntactic dependencies
- Named entities

These models are trained on large datasets of labeled example texts and can be updated with more examples to fine-tune predictions, e.g. on your specific data.

`en_core_web_sm` is a small English model trained on web text.
- Contains binary weights of the model
- Vocabulary, language, and pipeline information built into the model.

To install the model use the following command in Terminal:
```bash
$ python -m spacy download en_core_web_sm
```

In [16]:
# Load the spaCy small English model (`en_core_web_sm`; it must be installed first).
# This rebinds `nlp`, replacing the `English()` object used in the earlier cells.
nlp = spacy.load('en_core_web_sm')

## Predicting Part-of-speech Tags

In [20]:
# Example from spacy docs

# Run the sentence through the loaded small English model.
doc = nlp("She ate the pineapple pizza")

# Show every token next to its predicted part-of-speech tag.
# Attributes ending in an underscore (`pos_`) give the readable string label;
# the same attribute without the underscore (`pos`) gives that label's integer ID.
for token in doc:
    print(f"{token.text} {token.pos_}")

She PRON
ate VERB
the DET
pineapple NOUN
pizza NOUN


## Predicting Syntactic Dependencies
- `.text` returns the text of a token
- `.pos_` returns the part of speech of a word: noun, verb, etc.
- `.dep_` returns the dependency label of the token.
- `.head.text` returns the text of the parent (head) token, i.e. the word that the token is attached to/dependent on.

In [21]:
# For every token, show its text, part of speech, dependency label and the text of its head token.
for token in doc:
    print(f"{token.text} {token.pos_} {token.dep_} {token.head.text}")

She PRON nsubj ate
ate VERB ROOT ate
the DET det pizza
pineapple NOUN compound pizza
pizza NOUN dobj ate


Dependency label meanings
- nsubj: nominal subject
- det: determiner
- dobj: direct object

## Predicting Named Entities
Named entities are real-world objects that are assigned a name, e.g. Apple.

__`spaCy`__ allows you to access named entities from a doc by using the `.ents` attribute.

In [23]:
# Run the text through the loaded small English model.
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Show each predicted named entity together with its label.
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")

Apple ORG
U.K. GPE
$1 billion MONEY


# spacy.explain method

The `spacy.explain` method allows a user to get quick definitions of the most common tags and labels.

Docstring: Get a description for a given POS tag, dependency label or entity type.

In [29]:
# Look up the description of the `GPE` (geopolitical entity) named-entity label.
spacy.explain('GPE')

'Countries, cities, states'

In [26]:
# Look up the description of the `NNP` tag.
spacy.explain('NNP')

'noun, proper singular'

In [28]:
# Look up the description of the `compound` dependency label.
spacy.explain('compound')

'compound'

In [31]:
# Look up the description of the `dobj` dependency label.
spacy.explain('dobj')

'direct object'