In [1]:
# Tokenization - Segmenting text into words, punctuation marks etc.

# Part-of-speech (POS) Tagging - Assigning word types to tokens, like verb or noun.

# Dependency Parsing - Assigning syntactic dependency labels, describing the relations
# between individual tokens, like subject or object.

# Lemmatization - Assigning the base forms of words. For example, the lemma of 'was'
# is 'be', and the lemma of 'rats' is 'rat'.

# Sentence Boundary Detection (SBD) - Finding and segmenting individual sentences.

# Named Entity Recognition (NER) - Labelling named 'real-world' objects, like persons,
# companies or locations.

# Entity Linking (EL) - Disambiguating textual entities to unique identifiers in a
# knowledge base.

# Similarity - Comparing words, text spans and documents and how similar they are to each
# other.

# Text Classification - Assigning categories or labels to a whole document, or parts of a
# document.

# Rule-based Matching - Finding sequences of tokens based on their texts and linguistic
# annotations, similar to regular expressions.

# Training - Updating and improving a statistical model’s predictions.

# Serialization - Saving objects to files or byte strings.

In [2]:
# Text: The original word text.
# Lemma: The base form of the word.
# POS: The simple Universal POS part-of-speech tag.
# Tag: The detailed part-of-speech tag.
# Dep: Syntactic dependency, i.e. the relation between tokens.
# Shape: The word shape – capitalization, punctuation, digits.
# is alpha: Does the token consist of alphabetic characters?
# is stop: Is the token part of a stop list, i.e. the most common words of the language?

import spacy

# Load the 'en_core_web_sm' pipeline and run it over one sample sentence.
nlp = spacy.load('en_core_web_sm')
doc = nlp('Apple is looking at buying U.K. startup for $1 billion')

# Print one space-separated row of annotations per token.
for token in doc:
    annotations = (
        token.text,      # original word text
        token.lemma_,    # base form of the word
        token.pos_,      # simple part-of-speech tag
        token.tag_,      # detailed part-of-speech tag
        token.dep_,      # syntactic dependency label
        token.shape_,    # word shape
        token.is_alpha,  # alphabetic characters only?
        token.is_stop,   # part of the stop list?
    )
    print(*annotations)

Apple Apple PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP dobj X.X. False False
startup startup NOUN NN advcl xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


In [3]:
# ent: named entity

# Walk the entity spans of `doc` (created in the previous cell) and print each
# one's text, start/end character offsets and entity label.
for ent in doc.ents:
    entity_row = (ent.text, ent.start_char, ent.end_char, ent.label_)
    print(*entity_row)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [4]:
# Text: The original token text.
# has vector: Does the token have a vector representation?
# Vector norm: The L2 norm of the token’s vector (the square root of the sum of the squared values).
# OOV: Out-of-vocabulary

import spacy

# Rebind `nlp` to the 'en_core_web_md' pipeline and process four test words,
# the last of which is a made-up string.
nlp = spacy.load('en_core_web_md')
tokens = nlp('dog cat banana afskfsd')

# Attributes printed for each token, in output-column order.
VECTOR_FIELDS = ('text', 'has_vector', 'vector_norm', 'is_oov')

for token in tokens:
    print(*(getattr(token, field) for field in VECTOR_FIELDS))

dog True 7.0336733 False
cat True 6.6808186 False
banana True 6.700014 False
afskfsd False 0.0 True


In [6]:
import spacy

# Make sure to use the larger 'md' package here (not 'sm').
nlp = spacy.load('en_core_web_md')

doc1 = nlp('I like salty fries and hamburgers.')
doc2 = nlp('Fast food tastes very good.')

# Document-to-document similarity.
doc_similarity = doc1.similarity(doc2)
print(doc1, '<->', doc2, doc_similarity)

# Span-to-token similarity: tokens 2-3 of doc1 ("salty fries") compared with
# token 5 ("hamburgers").
french_fries = doc1[2:4]
burgers = doc1[5]
span_similarity = french_fries.similarity(burgers)
print(french_fries, '<->', burgers, span_similarity)

I like salty fries and hamburgers. <-> Fast food tastes very good. 0.7799485853415737
salty fries <-> hamburgers 0.7304624


In [8]:
import spacy

# Load the pipeline and process a short text containing the word 'coffee'.
nlp = spacy.load('en_core_web_sm')
doc = nlp('I love coffee')

# The string store can be indexed in both directions: string -> hash and
# hash -> string. The hash is looked up once and reused for the reverse
# lookup instead of pasting in the literal value, so the round trip stays
# correct if the example word is changed.
coffee_hash = doc.vocab.strings['coffee']
coffee_text = doc.vocab.strings[coffee_hash]

print(coffee_hash)  # 3197928453018144401
print(coffee_text)  # coffee

3197928453018144401
coffee


In [11]:
# Text: The original text of the lexeme.
# Orth: The hash value of the lexeme.
# Shape: The abstract word shape of the lexeme.
# Prefix: By default, the first letter of the word string.
# Suffix: By default, the last three letters of the word string.
# is alpha: Does the lexeme consist of alphabetic characters?
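# is digit: Does the lexeme consist of digits?
# is title: Is the lexeme in title case?
# Lang: The language of the lexeme's vocabulary (e.g. 'en').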

import spacy

# Load the 'en_core_web_sm' pipeline and process a three-word text.
nlp = spacy.load('en_core_web_sm')
doc = nlp('I love coffee')

# Attributes printed for each lexeme, in output-column order.
LEXEME_FIELDS = ('text', 'orth', 'shape_', 'prefix_', 'suffix_',
                 'is_alpha', 'is_digit', 'is_title', 'lang_')

for word in doc:
    # Look the token's text up in the vocabulary to obtain its lexeme.
    lexeme = doc.vocab[word.text]
    print(*(getattr(lexeme, field) for field in LEXEME_FIELDS))

I 4690420944186131903 X I I True False True en
love 3702023516439754181 xxxx l ove True False False en
coffee 3197928453018144401 xxxx c fee True False False en
