In [1]:
# https://spacy.io/
# https://spacy.io/usage
# https://spacy.io/models/en
# https://spacy.io/api/doc
# https://spacy.io/api/token
# https://spacy.io/usage/processing-pipelines
# https://spacy.io/usage/spacy-101
#
#  - spaCy is a free, open-source library for advanced industrial-strength Natural Language Processing (NLP) in Python.
#
# - When you call spaCy on a text, spaCy first tokenizes the text (i.e. segments it into words, punctuation and so on) to produce a Doc object. 
#    spaCy uses rules specific to each language for tokenization.
# 
#  - The Doc object is then processed in several different steps (also referred to as the processing pipeline). 
#    The pipeline used by the default models consists of a (pos) tagger, a (dependency) parser and a (named) entity recognizer (ner). 
#    spaCy uses statistical models to predict pos tags, syntactic dependencies, and named entities.
#    Each pipeline component returns the processed Doc, which is then passed on to the next component.
#    You can pick and choose the stages you want spaCy to load.
#
# - Here is a list of features and capabilities of spaCy: 
#   https://spacy.io/usage/spacy-101#features
#

In [2]:
##### installation #####

# https://spacy.io/usage
# pip install spacy

# https://spacy.io/models/en
# you can download these general-purpose pretrained models to predict 
# pos tags (tagger), named entities (ner), and syntactic dependencies (parser).
#     note: en_core_web_sm does not include word vectors, but en_core_web_md and en_core_web_lg do.
# python -m spacy download en_core_web_sm
# python -m spacy download en_core_web_md
# python -m spacy download en_core_web_lg

In [3]:
import spacy

# once you've downloaded and installed a model, you can load it via spacy.load().
# spacy.load() returns a Language object containing all components and data needed to process text.
# the Language object is typically called nlp.
nlp = spacy.load("en_core_web_md")

In [4]:
# calling the nlp object on a string of text returns a processed Doc object, typically called doc.
# even though a Doc object is processed (for instance, split into individual words and annotated),
# it still holds all the information of the original text.
# once the Doc object has been created, we can use it to access the various spaCy features.
doc = nlp("Hi Emma Watson! How are you?")

In [5]:
# for instance, the document can be walked one sentence at a time;
# each item of doc.sents exposes its verbatim text via .text.
for sentence in doc.sents:
    print(sentence.text)

Hi Emma Watson!
How are you?


In [6]:
# the named entities found by the ner component are available on doc.ents.
# a named entity is a "real-world object" that's assigned a name - for example,
# a person, a country, a product or a book title.
for entity in doc.ents:
    label = entity.label_
    # spacy.explain() turns the short label into a human-readable description
    print(entity.text, label, spacy.explain(label))

# the same entities can be rendered inline in the notebook
spacy.displacy.render(doc, style="ent", jupyter=True)

Emma Watson PERSON People, including fictional


In [7]:
# the dependency parse (from parser) can be rendered inline in the notebook as well
spacy.displacy.render(doc, style="dep", jupyter=True)

In [8]:
# doc.noun_chunks yields the base noun chunks of the document.
# noun chunks are "base noun phrases" - a noun plus the words describing the noun,
# for instance "the lavish green grass" or "the world's largest tech fund".
for chunk in doc.noun_chunks:
    print(chunk.text)

Hi Emma Watson
you


In [9]:
# you can iterate over the linguistic annotations associated with tokens in the document (from tagger)
# https://spacy.io/api/annotation
# https://spacy.io/api/token#attributes
doc = nlp("Hi Emma Watson! How are you?")
for token in doc:
    # gather every annotation first, then print them space-separated on one line per token
    annotations = (
        token.i,                         # index of the token within the parent document
        token,                           # the token object itself
        token.text,                      # verbatim text
        token.ent_type_,                 # named entity type
        spacy.explain(token.ent_type_),  # description of the entity type
        token.lemma_,                    # base form of the token, with no inflectional suffixes
        token.pos_,                      # coarse-grained part-of-speech
        spacy.explain(token.pos_),       # description of the coarse-grained tag
        token.tag_,                      # fine-grained part-of-speech
        spacy.explain(token.tag_),       # description of the fine-grained tag
        token.dep_,                      # syntactic dependency relation
        token.like_url,                  # does the token resemble a URL
        token.like_num,                  # does the token represent a number? e.g. "10.9", "10", "ten", etc
        token.like_email,                # does the token resemble an email address
        token.is_stop,                   # is the token part of a "stop list"
        # remaining boolean flags exposed on the token
        token.is_alpha,
        token.is_ascii,
        token.is_digit,
        token.is_lower,
        token.is_upper,
        token.is_title,
        token.is_punct,
        token.is_space,
        token.is_currency,
    )
    print(*annotations)

0 Hi Hi  None hi INTJ interjection UH interjection compound False False False False True True False False False True False False False
1 Emma Emma PERSON People, including fictional Emma PROPN proper noun NNP noun, proper singular compound False False False False True True False False False True False False False
2 Watson Watson PERSON People, including fictional Watson PROPN proper noun NNP noun, proper singular ROOT False False False False True True False False False True False False False
3 ! !  None ! PUNCT punctuation . punctuation mark, sentence closer punct False False False False False True False False False False True False False
4 How How  None how ADV adverb WRB wh-adverb advmod False False False True True True False False False True False False False
5 are are  None be AUX auxiliary VBP verb, non-3rd person singular present ROOT False False False True True True False True False False False False False
6 you you  None -PRON- PRON pronoun PRP pronoun, personal nsubj False Fal

In [10]:
# you can make semantic similarity estimates based on word vectors.
# the default estimate is cosine similarity, using an average of word vectors for the document.
# similarity() returns a scalar score (higher is more similar).
doc1 = nlp("I like oranges that are sweet.")
# print(doc1.vector)  # uncomment to inspect: the doc vector is the average of the token vectors
doc2 = nlp("I like apples that are sour.")
# print(doc2.vector)  # uncomment to inspect: the doc vector is the average of the token vectors
doc1.similarity(doc2)

0.9621542455456396

In [11]:
# processing large corpora with nlp.pipe

# imagine you had a very large corpus of text
# (illustrated here with a very small one :)
data = [
    "Amy is going to class now.",
    "Matt is having lunch.",
]

# first, only apply the pipeline components you need:
# getting predictions from the model that you don't actually need adds up and
# becomes very inefficient at scale, so pass the disable keyword argument to
# switch off the components you don't need.
# NOTE: this rebinds nlp to the small model without parser and ner; re-run the
# spacy.load("en_core_web_md") cell before going back to cells that rely on them.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# second, work on batches of texts:
# nlp.pipe takes an iterable of texts and yields processed Doc objects,
# with the batching done internally.
corpus = nlp.pipe(data)

# now we can clean the corpus efficiently
def custom_tokenizer(doc):
    """Reduce a processed document to a string of lower-cased lemmas.

    Stop words, punctuation and whitespace-only tokens are dropped; the
    lemmas of the remaining tokens are lower-cased and joined in order.

    Args:
        doc: an iterable of tokens exposing ``lemma_``, ``is_stop``,
            ``is_punct`` and ``is_space`` (e.g. a spaCy Doc).

    Returns:
        str: the kept lemmas separated by single spaces, or an empty
        string when no token is kept.
    """
    lemmas = [
        token.lemma_.lower()
        for token in doc
        # whitespace-only tokens (e.g. from newlines or repeated spaces) are
        # neither stop words nor punctuation, so they must be filtered
        # explicitly or they leak into the joined string as stray blanks
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return " ".join(lemmas)

# run every processed document through the cleaner and collect the resulting strings
clean_corpus = list(map(custom_tokenizer, corpus))
clean_corpus

['amy go class', 'matt have lunch']