## spaCy Installation
## Model: en_core_web_md

In [None]:
!pip install spacy
!python -m spacy download en_core_web_md

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en_core_web_md==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.2.5/en_core_web_md-2.2.5.tar.gz (96.4 MB)
[K     |████████████████████████████████| 96.4 MB 1.4 MB/s 
Building wheels for collected packages: en-core-web-md
  Building wheel for en-core-web-md (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-md: filename=en_core_web_md-2.2.5-py3-none-any.whl size=98051301 sha256=36d89b75ec19de87664b5e87138178ff20cd7b8f61351506f5806cf566ff01fa
  Stored in directory: /tmp/pip-ephem-wheel-cache-4wzplpgl/wheels/69/c5/b8/4f1c029d89238734311b3269762ab2ee325a42da2ce8edb997
Successfully built en-core-web-md
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-2.2.5
[38;5;2m✔ Download and installat

## Initializing Model and Imports

In [None]:
import spacy
import en_core_web_md  # model package; not referenced by name in the cells shown here

# Build the English pipeline; the cells that follow call it as `nlp`.
nlp = spacy.load("en_core_web_md")

## Regular vs Customized Tokenization

In [None]:
from spacy.symbols import ORTH

# First pass: tokenize with the tokenizer as it currently stands.
doc = nlp("lemme that")
print([tok.text for tok in doc])

# Register a special case that splits "lemme" into "lem" + "me", then tokenize
# the same text again to show the difference.
# The rule is added to the shared `nlp.tokenizer`, so it stays active afterwards.
special_case = [{ORTH: piece} for piece in ("lem", "me")]
nlp.tokenizer.add_special_case("lemme", special_case)
print([tok.text for tok in nlp("lemme that")])

['lemme', 'that']
['lem', 'me', 'that']


## Sentence Segmentation

In [None]:
text = "I flied to N.Y yesterday. It was around 5 pm."
doc = nlp(text)

# doc.sents yields one span per detected sentence; print each on its own line.
sentences = [sentence.text for sentence in doc.sents]
print("\n".join(sentences))

I flied to N.Y yesterday.
It was around 5 pm.


## Lemmatization

In [None]:

doc = nlp("I went there for working and worked for 3 years.")

# Print every token next to the lemma the pipeline assigned to it.
for tok in doc:
    print(f"{tok.text} {tok.lemma_}")

I -PRON-
went go
there there
for for
working working
and and
worked work
for for
3 3
years year
. .


## Customized Normalization (NORM)

In [None]:
from spacy.symbols import NORM, ORTH

# Special case for the tokenizer: keep "Angeltown" as written (ORTH) but give
# it the norm "Los Angeles" (NORM).
special_case = [{ORTH: "Angeltown", NORM: "Los Angeles"}]
nlp.tokenizer.add_special_case("Angeltown", special_case)

doc = nlp("I am flying to Angeltown")
# Show each token's surface text beside its norm.
for tok in doc:
    print(f"{tok.text} {tok.norm_}")

I i
am am
flying flying
to to
Angeltown Los Angeles


## JSON Serialization

In [None]:
# Serialize a one-word Doc with to_json() and print the result.
doc = nlp("Hi")
json_doc = doc.to_json()
print(json_doc)

{'text': 'Hi', 'ents': [], 'sents': [{'start': 0, 'end': 2}], 'tokens': [{'id': 0, 'start': 0, 'end': 2, 'pos': 'INTJ', 'tag': 'UH', 'dep': 'ROOT', 'head': 0}]}


In [None]:
# Same serialization on a longer text; the printed result also carries
# entity, sentence and per-token entries.
doc = nlp("Hi Sir, how are you? First meeting in two weeks")
json_doc = doc.to_json()
print(json_doc)

{'text': 'Hi Sir, how are you? First meeting in two weeks', 'ents': [{'start': 21, 'end': 26, 'label': 'ORDINAL'}, {'start': 38, 'end': 47, 'label': 'DATE'}], 'sents': [{'start': 0, 'end': 20}, {'start': 21, 'end': 47}], 'tokens': [{'id': 0, 'start': 0, 'end': 2, 'pos': 'INTJ', 'tag': 'UH', 'dep': 'intj', 'head': 4}, {'id': 1, 'start': 3, 'end': 6, 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'npadvmod', 'head': 0}, {'id': 2, 'start': 6, 'end': 7, 'pos': 'PUNCT', 'tag': ',', 'dep': 'punct', 'head': 4}, {'id': 3, 'start': 8, 'end': 11, 'pos': 'ADV', 'tag': 'WRB', 'dep': 'advmod', 'head': 4}, {'id': 4, 'start': 12, 'end': 15, 'pos': 'AUX', 'tag': 'VBP', 'dep': 'ROOT', 'head': 4}, {'id': 5, 'start': 16, 'end': 19, 'pos': 'PRON', 'tag': 'PRP', 'dep': 'nsubj', 'head': 4}, {'id': 6, 'start': 19, 'end': 20, 'pos': 'PUNCT', 'tag': '.', 'dep': 'punct', 'head': 4}, {'id': 7, 'start': 21, 'end': 26, 'pos': 'ADJ', 'tag': 'JJ', 'dep': 'amod', 'head': 8}, {'id': 8, 'start': 27, 'end': 34, 'pos': 'NOUN', 'ta

# Dir Function
#### Lists all the attributes and methods of an object

In [None]:
# Show the names of the attributes and methods available on `doc`, i.e. the
# Doc object bound by the most recently executed cell.
dir(doc)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '_bulk_merge',
 '_py_tokens',
 '_realloc',
 '_vector',
 '_vector_norm',
 'cats',
 'char_span',
 'count_by',
 'doc',
 'ents',
 'extend_tensor',
 'from_array',
 'from_bytes',
 'from_disk',
 'get_extension',
 'get_lca_matrix',
 'has_extension',
 'has_vector',
 'is_nered',
 'is_parsed',
 'is_sentenced',
 'is_tagged',
 'lang',
 'lang_',
 'mem',
 'merge',
 'noun_chunks',
 'noun_chunks_iterator',
 'print_tree',
 'remove_extension',
 'retokenize',
 'sentiment',
 'sents',
 'set_extension',
 'similarity',
 'tensor',
 'text',
 'text_with_ws',
 'to_array',
 'to_byte

# Out of Vocabulary (OOV)

In [None]:
doc = nlp("I visited pneumoultramicroscopic at AMTDC internship")

# Print each token together with its out-of-vocabulary flag.
for tok in doc:
    print(f"{tok.text} {tok.is_oov}")

I False
visited False
pneumoultramicroscopic True
at False
AMTDC True
internship False


# General POS Tagging
## Not very relevant

In [None]:
doc = nlp("Alicia and me went to the school by bus.")

# For every token print two lines: first the text with its pos_ and tag_
# labels, then spacy.explain()'s description of each label.
for tok in doc:
    pos_label, tag_label = tok.pos_, tok.tag_
    print(tok.text, pos_label, tag_label)
    print(spacy.explain(pos_label), spacy.explain(tag_label))

Alicia PROPN NNP
proper noun noun, proper singular
and CCONJ CC
coordinating conjunction conjunction, coordinating
me PRON PRP
pronoun pronoun, personal
went VERB VBD
verb verb, past tense
to ADP IN
adposition conjunction, subordinating or preposition
the DET DT
determiner determiner
school NOUN NN
noun noun, singular or mass
by ADP IN
adposition conjunction, subordinating or preposition
bus NOUN NN
noun noun, singular or mass
. PUNCT .
punctuation punctuation mark, sentence closer


## General Context Identification

In [None]:
# The sentence uses "fish" twice plus "fishy", so the printed labels show how
# each occurrence was tagged.
doc = nlp("My cat will fish for a fish tomorrow in a fishy way.")

for tok in doc:
    pos_label, tag_label = tok.pos_, tok.tag_
    print(tok.text, pos_label, tag_label)
    print(spacy.explain(pos_label), spacy.explain(tag_label))

My DET PRP$
determiner pronoun, possessive
cat NOUN NN
noun noun, singular or mass
will VERB MD
verb verb, modal auxiliary
fish VERB VB
verb verb, base form
for ADP IN
adposition conjunction, subordinating or preposition
a DET DT
determiner determiner
fish NOUN NN
noun noun, singular or mass
tomorrow NOUN NN
noun noun, singular or mass
in ADP IN
adposition conjunction, subordinating or preposition
a DET DT
determiner determiner
fishy ADJ JJ
adjective adjective
way NOUN NN
noun noun, singular or mass
. PUNCT .
punctuation punctuation mark, sentence closer


# Dependency Parsing

In [None]:
doc = nlp("We are trying to understand the difference.")

# One line per token: text, tag_, dependency label (dep_), and the text of
# its head token.
for tok in doc:
    print(f"{tok.text} {tok.tag_} {tok.dep_} {tok.head.text}")

We PRP nsubj trying
are VBP aux trying
trying VBG ROOT trying
to TO aux understand
understand VB xcomp trying
the DT det difference
difference NN dobj understand
. . punct trying


### Matching with spaCy

In [None]:
from spacy.matcher import Matcher

doc = nlp("I have a grinding machine related query. What are the types of grinding machines?")
matcher = Matcher(nlp.vocab)

# One single-token pattern per rule name: "TEXT" compares the exact token text,
# "LOWER" compares its lowercase form.
single_token_rules = {
    "p1": [{"TEXT": "I"}],
    "p2": [{"LOWER": "query"}],
    "p3": [{"LOWER": "grinding"}],
    "p4": [{"LOWER": "types"}],
}
for rule_name, pattern in single_token_rules.items():
    # Each rule takes a list of patterns; here every list holds one pattern.
    matcher.add(rule_name, [pattern])

# Each match is a (match_id, start, end) triple; print the token span it covers.
matches = matcher(doc)
for _match_id, start, end in matches:
    print(start, end, doc[start:end])

0 1 I
3 4 grinding
6 7 query
11 12 types
13 14 grinding


### Phrase Matcher

In [None]:
from spacy.matcher import PhraseMatcher

# Rebind `nlp` to a freshly loaded pipeline and build the matcher on its vocab.
nlp = spacy.load("en_core_web_md")
matcher = PhraseMatcher(nlp.vocab)

# Build one pattern Doc per term with nlp.make_doc.
terms = ["Angela Merkel", "Donald Trump", "Alexis Tsipras"]
patterns = [nlp.make_doc(term) for term in terms]

# Pass the patterns as a single list (second argument), the same call form the
# Matcher cell uses. The older `add(key, None, *patterns)` form is rejected by
# spaCy v3; the list form is accepted from spaCy v2.2.2 onwards.
matcher.add("politiciansList", patterns)

doc = nlp("3 EU leaders met in Berlin. German chancellor Angela Merkel first welcomed the US president Donald Trump. The following day Alexis Tsipras joined them in Brandenburg.")

# Each match is a (match_id, start, end) triple; print the token span it covers.
matches = matcher(doc)
for _match_id, start, end in matches:
    print(start, end, doc[start:end])

9 11 Angela Merkel
16 18 Donald Trump
22 24 Alexis Tsipras
