In [1]:
import spacy
import pandas as pd
import sys
import re
from collections import Counter
import warnings
warnings.filterwarnings("ignore")
from spacy import displacy

In [16]:
nlp= spacy.load("en_core_web_sm")
s= """The economic situation of the 
country is on edge , as the stock 
market crashed causing loss of millions. name : Shashank age: 54 email: shank@rediffmail.com Citizens who had their main 19% investment 
in the share-market are facing a great loss. Many 2000 companies might lay off 
thousands of people to 2009 reduce labor cost, name : pratham parmar age: 15 email : parmar15@yahoo.com
                 """

In [17]:
doc=nlp(s)

doc.has_annotation("DEP") #which checks whether the attribute Token.dep has been set returns a boolean value.

True

In [4]:
#Dependency Parsing :It defines the dependency relationship between headwords and their dependents.

# the process of generating the relationship among different words of a sentence and describing their syntactic roles
# The head of a sentence has no dependency and is called the root of the sentence.The verb is usually the head of the 
# sentence. All other words are linked to the headword.

# Dependency parsing helps you know what role a word plays in the text and how different words relate to each other. 
# It’s also used in "shallow parsing" and "named entity recognition".
    
print ("{:<15} | {:<8} | {:<15} | {:<10} | {:<10}".format('Token','Relation','Head','Tag', 'Children'))
print ("-" * 70)

for token in doc[:5]:
  # Print the token, dependency nature, head and all dependents of the token
  print ("{:<15} | {:<8} | {:<15} | {:<10} | {:<10}"
         .format(str(token.text), str(token.dep_), str(token.head.text),str(token.tag_),str([child for child in token.children])))

Token           | Relation | Head            | Tag        | Children  
----------------------------------------------------------------------
The             | det      | situation       | DT         | []        
economic        | amod     | situation       | JJ         | []        
situation       | nsubj    | is              | NN         | [The, economic, of]
of              | prep     | situation       | IN         | [country] 
the             | det      | country         | DT         | [
]       


In [5]:
# Shallow parsing, or chunking

#noun_chunks : You can think of noun chunks as a noun plus the words describing the noun 
rows=[]
cols = ("text","Root text","Root dep","Root_head_text")
for chunk in doc.noun_chunks:
    row=chunk.text, chunk.root.text, chunk.root.dep_,chunk.root.head.text
    rows.append(row)
df = pd.DataFrame(rows,columns=cols)
print(df[:10])

                     text  Root text Root dep Root_head_text
0  The economic situation  situation    nsubj             is
1           the \ncountry    country     pobj             of
2                    edge       edge     pobj             on
3      the stock \nmarket     market    nsubj        crashed
4                    loss       loss     dobj        causing
5                millions   millions     pobj             of
6                    name       name     ROOT           name
7            Shashank age        age     ROOT            age
8                54 email      email    appos            age
9                Citizens   Citizens    nsubj         facing


In [6]:
# Visualizing

# options = {"compact": True, "bg": "#09a3d5",
#            "color": "red", "font": "Source Sans Pro"}

# displacy.serve(doc, style="dep", options=options)
# displacy.render(doc, style="dep")



# Serve arguements:
# Docs	list, doc, Span	It represents the document to visualize.	
# Style	Unicode	We have two visualization style namely ‘dep’, or ‘ent’.	The default value is ‘dep’.
# Page	bool	It will render the markup as full HTML page.	The default value is true.
# minify	bool	This argument will minify the HTML markup.	The default value is false.
# options	dict	It represents the visualizers-specific options. For example, colors.	{}
# manual	bool	This argument will not parse Doc and instead, expect a dict or list of dicts.	The default value is false.
# Port	int	It is the port number to serve visualization.	5000
# Host	unicode	It is the Host number to serve visualization.	'0.0.0.0'



# Render arguements:(excluding port and host from serve arguments + below one)
# jupyter	Bool	To return markup ready to be rendered in a notebook, this argument will explicitly enable or disable the Jupyter mode. If we will not provide this argument, it will automatically detect.	None



# Dependency Visualizer options

# fine_grained    	bool	Put the value of this argument True, if you want to use fine-grained part-of-speech tags (Token.tag_), instead of coarse-grained tags (Token.pos_).	The default value is False.
# add_lemma       	bool	Introduced in version 2.2.4, this argument prints the lemma’s in a separate row below the token texts.	The default value is False.
# collapse_punct  	bool	It attaches punctuation to the tokens.	The default value is True.
# collapse_phrases	bool	This argument merges the noun phrases into one token.	The default value is False.
# compact         	bool	If you will take this argument as true, you will get the “Compact mode” with square arrows that takes up less space.	The default value is False.
# color           	unicode 	As name implies, this argument is for the text color (HEX, RGB or color names).	'#000000'
# bg              	unicode	As name implies, this argument is for the Background color (HEX, RGB or color names).	'#ffffff'
# font            	unicode	It is for the font name.	Default value is 'Arial'.
# offset_x        	int	This argument is used for spacing on left side of the SVG in px.	The default value of this argument is 50.
# arrow_stroke    	int	This argument is used for adjusting the width of arrow path in px.	The default value of this argument is 2.
# arrow_width     	int	This argument is used for adjusting the width of arrow head in px.	The default value of this argument is 10 / 8 (compact).
# arrow_spacing   	int	This argument is used for adjusting the spacing between arrows in px to avoid overlaps.	The default value of this argument is 20 / 12 (compact).
# word_spacing    	int	This argument is used for adjusting the vertical spacing between words and arcs in px.	The default value of this argument is 45.
# distance        	int	This argument is used for adjusting the distance between words in px.	The default value of this argument is 175 / 150 (compact).



# Named Entity Visualizer options:

# ents	list	It represents the entity types to highlight. Put None for all types.	The default value is None.
# colors	Dict	As name implies, it is use for color overrides. The entity types in uppercase must mapped to color name.	{}

In [7]:
colors = {"ORG": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
options = {"ents": ["ORG"], "colors": colors}
displacy.render(doc, style="ent", options=options)

In [8]:
doc = nlp("This is krishna.com.")
options = {"compact": True, "bg": "#09a3d5",
           "color": "red", "font": "Source Sans Pro"}
displacy.render(doc, style="dep", options=options)

In [9]:
spacy.explain("AUX")

'auxiliary'

In [44]:
# Named Entity Recognition
print ("{:<15} | {:<8} | {:<15} | {:<20} | {:<10}".format("entity","start character","end character","specifies the category","Explanation"))
print ("-" * 90)

for ent in doc.ents:
    print ("{:<15} | {:<15} | {:<15} | {:<20} | {:15}"
         .format(str(ent.text), str(ent.start_char), str(ent.end_char),str(ent.label_),str(spacy.explain(ent.label_))))

entity          | start character | end character   | specifies the category | Explanation
------------------------------------------------------------------------------------------
millions        | 97              | 105             | CARDINAL             | Numerals that do not fall under another type
Shashank        | 114             | 122             | PERSON               | People, including fictional
54              | 128             | 130             | CARDINAL             | Numerals that do not fall under another type
Citizens        | 159             | 167             | PERSON               | People, including fictional
19%             | 187             | 190             | PERCENT              | Percentage, including "%"
2000            | 253             | 257             | DATE                 | Absolute or relative dates or periods
thousands       | 283             | 292             | CARDINAL             | Numerals that do not fall under another type
2009            | 306   

In [46]:
for ent in doc.ents: 
    print(ent.text, ent.start, ent.end, ent.start_char, ent.end_char, ent.label_)

millions 20 21 97 105 CARDINAL
Shashank 24 25 114 122 PERSON
54 27 28 128 130 CARDINAL
Citizens 31 32 159 167 PERSON
19% 36 38 187 190 PERCENT
2000 52 53 253 257 DATE
thousands 58 59 283 292 CARDINAL
2009 62 63 306 310 DATE
pratham parmar age 69 72 337 355 ORG
15 73 74 357 359 CARDINAL


In [39]:
displacy.render(doc, style="ent")

In [52]:
# token level
ent_san = [doc[0].text, doc[0].ent_iob_, doc[0].ent_type_]
ent_francisco = [doc[1].text, doc[1].ent_iob_, doc[1].ent_type_]
print(ent_san)
print(ent_francisco)

['The', 'O', '']
['economic', 'O', '']


economic