In [2]:
!pip install -q spacy
import spacy
nlp = spacy.load("en_core_web_sm")
from spacy import displacy
!python -m spacy download en_core_web_md
!python -m spacy download en_core_web_lg

Collecting en_core_web_md==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.2.5/en_core_web_md-2.2.5.tar.gz (96.4 MB)
[K     |████████████████████████████████| 96.4 MB 2.7 MB/s 
Building wheels for collected packages: en-core-web-md
  Building wheel for en-core-web-md (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-md: filename=en_core_web_md-2.2.5-py3-none-any.whl size=98051301 sha256=62f46dc16a1f7dbe98ba0bd8da68fc6e01dbbca73f7c3fb358c0a8159755f4cc
  Stored in directory: /tmp/pip-ephem-wheel-cache-i92tkf2r/wheels/69/c5/b8/4f1c029d89238734311b3269762ab2ee325a42da2ce8edb997
Successfully built en-core-web-md
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')
Collecting en_core_web_lg==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_

# spaCy 101: Everything you need to know
**spaCy is a free, open-source library for advanced Natural Language 
Processing (NLP) in Python.**
*   spaCy is designed specifically for production use 
*   helps you build applications that process and “understand” large volumes of text. 
*  It can be used to build information extraction or natural language understanding systems, or to pre-process text for deep learning.






## Features:

NAME | DESCRIPTION
--- | ---
Tokenization | Segmenting text into words, punctuations marks etc.
Part-of-speech (POS) Tagging | Assigning word types to tokens, like verb or noun.
Dependency Parsing | Assigning syntactic dependency labels, describing the relations between individual tokens, like subject or object.
Lemmatization |Assigning the base forms of words. For example, the lemma of “was” is “be”, and the lemma of “rats” is “rat”.
Sentence Boundary Detection (SBD) | Finding and segmenting individual sentences.
Named Entity Recognition (NER) | Labelling named “real-world” objects, like persons, companies or locations.
Entity Linking (EL) | Disambiguating textual entities to unique identifiers in a knowledge base.
Similarity |Comparing words, text spans and documents and how similar they are to each other.
Text Classification | Assigning categories or labels to a whole document, or parts of a document.
Rule-based Matching |Finding sequences of tokens based on their texts and linguistic annotations, similar to regular expressions.
Training |Updating and improving a statistical model’s predictions.
Serialization | Saving objects to files or byte strings.


## Tokenization

In [1]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
    print(token.text)

NameError: ignored

### Reference: https://spacy.io/usage/linguistic-features#language-data

## Part-of-speech (POS) Tagging

In [None]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

Apple Apple PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP compound X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


## Named Entity Recognition (NER)

In [None]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


## Word vectors and similarity

In [None]:
nlp = spacy.load("en_core_web_sm")
tokens = nlp("dog cat banana afskfsd")

for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

dog True 19.266302 True
cat True 19.220264 True
banana True 17.748499 True
afskfsd True 20.882006 True


In [None]:
import spacy
from spacy.tokens import Doc
from spacy.vocab import Vocab

nlp = spacy.load("en_core_web_sm")
doc = nlp("I love coffee")  # Original Doc
print(doc.vocab.strings["coffee"])  # 3197928453018144401
print(doc.vocab.strings[3197928453018144401])  # 'coffee' 

empty_doc = Doc(Vocab())  # New Doc with empty Vocab
# empty_doc.vocab.strings[3197928453018144401] will raise an error :(

empty_doc.vocab.strings.add("coffee")  # Add "coffee" and generate hash
print(empty_doc.vocab.strings[3197928453018144401])  # 'coffee' 

new_doc = Doc(doc.vocab)  # Create new doc with first doc's vocab
print(new_doc.vocab.strings[3197928453018144401])  # 'coffee' 

3197928453018144401
coffee
coffee
coffee


In [None]:
#!python -m spacy download en_core_web_lg
nlp = spacy.load('en_core_web_lg')

#nlp = spacy.load('en_core_web_lg')  # make sure to use larger package!
doc1 = nlp("I like salty fries and hamburgers.")
doc2 = nlp("Fast food tastes very good.")

# Similarity of two documents
print(doc1, "<->", doc2, doc1.similarity(doc2))
# Similarity of tokens and spans
french_fries = doc1[2:4]
burgers = doc1[5]
print(french_fries, "<->", burgers, french_fries.similarity(burgers))

I like salty fries and hamburgers. <-> Fast food tastes very good. 0.7687607012190486
salty fries <-> hamburgers 0.6949788


## Rule-based Matching

In [None]:
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
# Add match ID "HelloWorld" with no callback and one pattern
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
matcher.add("HelloWorld", [pattern])

doc = nlp("Hello, world! Hello world!")
matches = matcher(doc)
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    span = doc[start:end]  # The matched span
    print(match_id, string_id, start, end, span.text)

15578876784678163569 HelloWorld 0 3 Hello, world


# Reference: https://course.spacy.io/en/