## Install [spaCy](https://spacy.io/). Tutorial taken from [here](https://nlpforhackers.io/complete-guide-to-spacy/) (en)

In [None]:
!python3 -m spacy download en

In [None]:
import spacy
from spacy import displacy
# Load the "en_core_web_sm" model once; the cells below call this `nlp`
# object to turn raw text into a processed Doc.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Token Text: the run of extra spaces between the words shows up as its own token

doc = nlp('Hello     World!')
print("doc contains the string is:", doc)

# Wrap each token's text in double quotes so whitespace tokens are visible
for tok in doc:
    print(f'"{tok.text}"')

doc contains the string is: Hello     World!
"Hello"
"    "
"World"
"!"


In [None]:
# Token Text on a real sentence: the contraction, the colon and the currency
# symbol are split into separate tokens, while "U.S." stays in one piece

doc = nlp("It’s official: Apple is the first U.S. public company to reach a $1 trillion market value")
print("doc contains the string is:", doc)

# Show every token in quotes, one per line
for tok in doc:
    print('"{}"'.format(tok.text))


doc contains the string is: It’s official: Apple is the first U.S. public company to reach a $1 trillion market value
"It"
"’s"
"official"
":"
"Apple"
"is"
"the"
"first"
"U.S."
"public"
"company"
"to"
"reach"
"a"
"$"
"1"
"trillion"
"market"
"value"


In [None]:
# Token text and token.idx, the character offset at which the token starts
# in the original string (not a sequential token id)

doc = nlp('Hello     World!')
for tok in doc:
    print(f'"{tok.text}"', tok.idx)
 

"Hello" 0
"    " 6
"World" 10
"!" 15


In [None]:
# Token class exposes a lot of word-level attributes

doc = nlp("Next week I'll   be in Madrid.")

# Header and rows are both built with a plain tab separator so the columns
# line up. Passing the labels as separate print() arguments made print()
# insert an extra space before every tab in the header.
# Note: the "id" column shows token.idx, the token's character offset.
columns = ("text", "id", "lemma", "punct?", "space?", "shape", "pos", "tag", "dep")
print("\t".join(columns))
print("--------------------------------------------------------------------------")
for token in doc:
    row = (
        token.text,
        token.idx,
        token.lemma_,
        token.is_punct,
        token.is_space,
        token.shape_,
        token.pos_,
        token.tag_,
        token.dep_,
    )
    print("\t".join(str(value) for value in row))

text 	id 	lemma 	punct? 	space? 	shape 	pos 	tag 	dep
--------------------------------------------------------------------------
Next	0	next	False	False	Xxxx	ADJ	JJ	amod
week	5	week	False	False	xxxx	NOUN	NN	npadvmod
I	10	-PRON-	False	False	X	PRON	PRP	nsubj
'll	11	will	False	False	'xx	VERB	MD	aux
  	15	  	False	True	  	SPACE	_SP	
be	17	be	False	False	xx	AUX	VB	ROOT
in	20	in	False	False	xx	ADP	IN	prep
Madrid	23	Madrid	False	False	Xxxxx	PROPN	NNP	pobj
.	29	.	True	False	.	PUNCT	.	punct


In [None]:
# Sentence Detection: doc.sents yields one span per detected sentence

doc = nlp(
    "Natural language (NL) refers to the language spoken/written by humans. "
    "NL is the primary mode of communication for humans. "
    "With the growth of the world wide web, data in the form of text has grown exponentially. "
    "It calls for the development of algorithms and techniques for processing natural language for the automation and development of intelligent machines."
)

# Print each sentence on its own line
for sentence in doc.sents:
    print(sentence)

Natural language (NL) refers to the language spoken/written by humans.
NL is the primary mode of communication for humans.
With the growth of the world wide web, data in the form of text has grown exponentially.
It calls for the development of algorithms and techniques for processing natural language for the automation and development of intelligent machines.


In [None]:
# POS Tagging: pair every token's text with its tag_ attribute

doc = nlp("Next week I'll be in Madrid.")
pos_pairs = [(tok.text, tok.tag_) for tok in doc]
print(pos_pairs)
 


[('Next', 'JJ'), ('week', 'NN'), ('I', 'PRP'), ("'ll", 'MD'), ('be', 'VB'), ('in', 'IN'), ('Madrid', 'NNP'), ('.', '.')]


In [None]:
# NER Named Entity Recognition

doc = nlp(
    "A UN review of national plans to cut carbon says they are well short of "
    "the levels needed to keep the rise in global temperatures under 2C. "
    "Many scientists say that technology to remove carbon from the air will "
    "now be needed to meet the Paris targets."
)

# Print each recognised entity span followed by its label
for entity in doc.ents:
    print(entity.text, entity.label_)


UN ORG
2C. CARDINAL
Paris GPE


In [None]:
# spaCy Entity Types: numbers, times, percentages, dates and organisations

doc = nlp("I just bought 2 shares at 9 a.m. because the stock went up 30% in just 2 days according to the WSJ")

# One line per entity: "<text> <label>"
for entity in doc.ents:
    print(f"{entity.text} {entity.label_}")


2 CARDINAL
9 a.m. TIME
30% PERCENT
just 2 days DATE
WSJ ORG


In [None]:
# Iterate over the doc.ents and print the entity text, its label_ attribute
# and a human-readable description of the label.

text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value. I just bought 2 shares at 9 a.m. because the stock went up 30% in just 2 days according to the WSJ"

# Process the text
doc = nlp(text)

# Iterate over the predicted entities
for ent in doc.ents:
    # spacy.explain() returns None when it has no description for a label;
    # formatting None with a width spec raises TypeError, so fall back to "".
    description = spacy.explain(ent.label_) or ""
    # Print the entity text, its label and the label description in columns
    print(f"{ent.text:<15}{ent.label_:<12}{description:<12}")

Apple          ORG         Companies, agencies, institutions, etc.
first          ORDINAL     "first", "second", etc.
U.S.           GPE         Countries, cities, states
$1 trillion    MONEY       Monetary values, including unit
2              CARDINAL    Numerals that do not fall under another type
9 a.m.         TIME        Times smaller than a day
30%            PERCENT     Percentage, including "%"
just 2 days    DATE        Absolute or relative dates or periods
WSJ            ORG         Companies, agencies, institutions, etc.


In [None]:
# displaCy: highlight the named entities of the sentence inline in the notebook.
# `displacy` comes from the import cell at the top of the notebook, so the
# dependency is declared once instead of being re-imported mid-notebook.

doc = nlp('I just bought 2 shares at 9 a.m. because the stock went up 30% in just 2 days according to the WSJ')
displacy.render(doc, style='ent', jupyter=True)

In [None]:
# Chunking: list each noun chunk with its label and the text of its root token

doc = nlp("Wall Street Journal just published an interesting piece on crypto currencies")
for noun_chunk in doc.noun_chunks:
    root_token = noun_chunk.root
    print(noun_chunk.text, noun_chunk.label_, root_token.text)
 

Wall Street Journal NP Journal
an interesting piece NP piece
crypto currencies NP currencies


In [None]:
# Dependency Parsing: for every token show "token/tag <--relation-- head/tag"

doc = nlp('Wall Street Journal just published an interesting piece on crypto currencies')

for tok in doc:
    head = tok.head  # the token this one is attached to in the parse
    print(f"{tok.text}/{tok.tag_} <--{tok.dep_}-- {head.text}/{head.tag_}")

Wall/NNP <--compound-- Street/NNP
Street/NNP <--compound-- Journal/NNP
Journal/NNP <--nsubj-- published/VBD
just/RB <--advmod-- published/VBD
published/VBD <--ROOT-- published/VBD
an/DT <--det-- piece/NN
interesting/JJ <--amod-- piece/NN
piece/NN <--dobj-- published/VBD
on/IN <--prep-- piece/NN
crypto/NNP <--compound-- currencies/NNS
currencies/NNS <--pobj-- on/IN


In [None]:
# Visualizing Dependency Parsing
# A short phrase is used so the rendered arc diagram stays compact.
# `displacy` comes from the import cell at the top of the notebook; the
# duplicate mid-notebook import and the commented-out alternative sentence
# were removed.

doc = nlp('Wall Street Journal crypto currencies')
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})

# Print the same relations as text: "token/tag <--relation-- head/tag"
for token in doc:
    print("{0}/{1} <--{2}-- {3}/{4}".format(
        token.text, token.tag_, token.dep_, token.head.text, token.head.tag_))

Wall/NNP <--compound-- Street/NNP
Street/NNP <--compound-- Journal/NNP
Journal/NNP <--compound-- currencies/NNS
crypto/NN <--compound-- currencies/NNS
currencies/NNS <--ROOT-- currencies/NNS
