# Introduction to Advanced NLP with spaCy

In [1]:
# import the English language class from spacy
from spacy.lang.en import English

The __English__ class object includes language-specific rules for tokenization: words, numbers, and punctuation.

In [2]:
# Instantiate an English NLP object; calling it on a string returns the processed document.
nlp = English()

# Process a sample sentence. The cells further down index and slice this `doc`.
doc = nlp("This is an introductory lesson to spaCy")

## Documents, spans, and tokens

In [13]:
# Process the text into its own variable. Reusing the name `doc` here would
# overwrite the document created earlier, which the cells below still inspect,
# so a top-to-bottom run would no longer reproduce their outputs.
doc_animals = nlp("I like tree kangaroos and narwhals.")

# A slice of the Doc for "tree kangaroos" (tokens 2 and 3; the end index is exclusive)
tree_kangaroos = doc_animals[2:4]
print(tree_kangaroos.text)

# A slice of the Doc for "tree kangaroos and narwhals" (without the ".")
tree_kangaroos_and_narwhals = doc_animals[2:6]
print(tree_kangaroos_and_narwhals.text)

tree kangaroos
tree kangaroos and narwhals


## Tokens and Characters
Accessing words in variable `doc`.

In [3]:
# Access the document's entire original string using the `.text` attribute.
# As the last expression in the cell, its value is displayed as the cell output.
doc.text

'This is an introductory lesson to spaCy'

In [4]:
# Index the document with a single integer to access one token; position 0 is the first token.
doc[0]

This

## Span Object
Using slice notation on a `Doc` object (for example `doc[0:3]`) returns a `Span` object. This is merely a __view__ of the document.

In [5]:
# Slice the document to view its first 3 tokens (positions 0, 1 and 2; the end index is exclusive).
doc[0:3]

This is an

In [6]:
# Slice the `.text` string rather than the document itself: this works on
# characters, not tokens, and yields the first 4 characters.
doc.text[0:4]

'This'

## Lexical Attributes

In [7]:
# Print the index of each token in the document.
print("Index:   ", [token.i for token in doc])

# Print the text of each token in the document.
# The label previously repeated "Index:", which mislabelled this line of output.
print("Text:    ", [token.text for token in doc])

Index:    [0, 1, 2, 3, 4, 5, 6]
Text:     ['This', 'is', 'an', 'introductory', 'lesson', 'to', 'spaCy']


## Lexical Conditional Operators
- `token.is_alpha` returns True if the token consists only of alphabetic characters, False otherwise.
- `token.is_punct` returns True if the token is punctuation, False otherwise.
- `token.like_num` returns True if the token resembles a number, i.e. it is made of digits or is a numeric word (e.g. "ten"), False otherwise.

In [8]:
# Process a separate example into `doc_lexi`. The sentence mixes plain words, a number word,
# digits, a currency symbol and punctuation, so the lexical attributes below give varied results.
doc_lexi = nlp("There were ten houses priced at $10,000,000 dollars. You bought 5. How much did it cost?")

In [9]:
# Flag, token by token, whether each one is made up solely of alphabetic characters.
alpha_flags = [tok.is_alpha for tok in doc_lexi]
print("is_alpha:", alpha_flags)

is_alpha: [True, True, True, True, True, True, False, False, True, False, True, True, False, False, True, True, True, True, True, False]


In [10]:
# Flag, token by token, whether each one is a punctuation mark.
punct_flags = [tok.is_punct for tok in doc_lexi]
print("is_punct:", punct_flags)

is_punct: [False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, True]


In [11]:
# Flag, token by token, whether each one looks like a number (digits or a number word).
numeric_flags = [tok.like_num for tok in doc_lexi]
print("like_num:", numeric_flags)

like_num: [False, False, True, False, False, False, False, True, False, False, False, False, True, False, False, False, False, False, False, False]


## Import other languages

In [12]:
# Import the German and Spanish language classes from spacy.lang.de and spacy.lang.es.
from spacy.lang.de import German
from spacy.lang.es import Spanish

# Instantiate one NLP object per language.
nlp_spanish = Spanish()
nlp_german = German()

# Process a short greeting with each language's NLP object.
doc_spanish = nlp_spanish("¿Cómo estás?")
doc_german = nlp_german("Liebe Grüße!")

# Print the full text of each document, German first.
for processed in (doc_german, doc_spanish):
    print(processed.text)

Liebe Grüße!
¿Cómo estás?
