In [29]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [5]:
text = "Switzerland as a whole is now effectively a high-risk area for coronavirus after infection rates in the country passed the Swiss government's own safety threshold. When Switzerland put in place its quarantine requirement for foreign travellers, the government established a threshold of 60 infections per 100,000 inhabitants over the previous 14 days to determine which countries were ‘high risk’."

## Create SpaCy object

In [30]:
doc = nlp(text)

## Lowercasing 

Convert text to lowercase ("Infection" and "infection" should be counted as the same word).

Lowercasing text is a normalization technique. This is not always useful, for example NER (Named Entity Recognition) wouldn't work for some words that are not capitalized. 

In [31]:
print(text.lower())


switzerland as a whole is now effectively a high-risk area for coronavirus after infection rates in the country passed the swiss government's own safety threshold. when switzerland put in place its quarantine requirement for foreign travellers, the government established a threshold of 60 infections per 100,000 inhabitants over the previous 14 days to determine which countries were ‘high risk’.


## Stop words
Very common words that do not carry a lot of semantic meaning. 

In [32]:
print(nlp.Defaults.stop_words)

{'myself', 'name', 'then', 'various', 'formerly', 'always', 'ca', 'already', 'has', 'nevertheless', 'across', '’re', 'doing', 'any', 'fifteen', 'enough', 'it', 'under', 'unless', 'ten', 'seem', 'while', 'yours', 'her', 'become', 'put', 'what', '‘m', 'as', 'made', '’m', 'moreover', 'own', 'both', 'these', 'many', 'amongst', 'beside', 'becomes', 'for', 'used', 'such', 'did', 'whatever', 'up', 'are', 'beforehand', 'whom', 'therefore', 'me', 'their', 'regarding', 'since', 'mine', 'fifty', 'indeed', 'show', 'have', 'can', "'d", 'very', 'behind', 'among', 'had', 'before', 'your', 'three', 'whereby', '‘re', 'go', 'n’t', 'less', 'via', 'several', 'something', 'using', 'front', '‘ve', 'hereafter', 'well', 'latterly', 'five', "n't", 'keep', 'please', 'twenty', 'about', 'first', 'thereafter', 'next', 'where', 'he', 'be', 'anything', 'really', 'latter', 'seems', 'only', 'otherwise', 'in', 'been', 'am', 'anyhow', 'quite', 'whither', 'either', 'why', 'hereupon', 'due', 'here', 'thence', 'ever', 'her

## Tokenize vs Lemmatize
Tokenization and Lemmatization are also text normalization techniques. 
Tokenization breaks down the text into individual units or tokens as shown below.  Lemmatization goes one step further to transform each token into it's root form. 


### Tokenize

In [33]:
tokens_all=[token.text for token in doc]
tokens= [token.text for token in doc if not token.is_stop]
print(tokens_all)
print(tokens)

['Switzerland', 'as', 'a', 'whole', 'is', 'now', 'effectively', 'a', 'high', '-', 'risk', 'area', 'for', 'coronavirus', 'after', 'infection', 'rates', 'in', 'the', 'country', 'passed', 'the', 'Swiss', 'government', "'s", 'own', 'safety', 'threshold', '.', 'When', 'Switzerland', 'put', 'in', 'place', 'its', 'quarantine', 'requirement', 'for', 'foreign', 'travellers', ',', 'the', 'government', 'established', 'a', 'threshold', 'of', '60', 'infections', 'per', '100,000', 'inhabitants', 'over', 'the', 'previous', '14', 'days', 'to', 'determine', 'which', 'countries', 'were', '‘', 'high', 'risk', '’', '.']
['Switzerland', 'effectively', 'high', '-', 'risk', 'area', 'coronavirus', 'infection', 'rates', 'country', 'passed', 'Swiss', 'government', 'safety', 'threshold', '.', 'Switzerland', 'place', 'quarantine', 'requirement', 'foreign', 'travellers', ',', 'government', 'established', 'threshold', '60', 'infections', '100,000', 'inhabitants', 'previous', '14', 'days', 'determine', 'countries', 

### Lemmatize

In [34]:
lemmas= [token.lemma_ for token in doc if not token.is_stop]
print(lemmas)

['Switzerland', 'effectively', 'high', '-', 'risk', 'area', 'coronavirus', 'infection', 'rate', 'country', 'pass', 'swiss', 'government', 'safety', 'threshold', '.', 'Switzerland', 'place', 'quarantine', 'requirement', 'foreign', 'traveller', ',', 'government', 'establish', 'threshold', '60', 'infection', '100,000', 'inhabitant', 'previous', '14', 'day', 'determine', 'country', "'", 'high', 'risk', "'", '.']


## Part of Speech (POS, POS Tagging)

Tags the words as noun, verb, adjective, adverb etc.

In [35]:
for token in doc:
    if not token.is_stop:
        print(token, token.pos_)

Switzerland PROPN
effectively ADV
high ADJ
- PUNCT
risk NOUN
area NOUN
coronavirus NOUN
infection NOUN
rates NOUN
country NOUN
passed VERB
Swiss ADJ
government NOUN
safety NOUN
threshold NOUN
. PUNCT
Switzerland PROPN
place NOUN
quarantine ADJ
requirement NOUN
foreign ADJ
travellers NOUN
, PUNCT
government NOUN
established VERB
threshold NOUN
60 NUM
infections NOUN
100,000 NUM
inhabitants NOUN
previous ADJ
14 NUM
days NOUN
determine VERB
countries NOUN
‘ PUNCT
high ADJ
risk NOUN
’ PUNCT
. PUNCT


## Dependencies 
Analyzing the structure of a sentence to determine the relationships between the tokens.

In [36]:
for token in doc:
    if not token.is_stop:
        print(token, token.pos_, token.dep_)

Switzerland PROPN nsubj
effectively ADV advmod
high ADJ amod
- PUNCT punct
risk NOUN compound
area NOUN attr
coronavirus NOUN pobj
infection NOUN compound
rates NOUN nsubj
country NOUN pobj
passed VERB advcl
Swiss ADJ amod
government NOUN poss
safety NOUN compound
threshold NOUN dobj
. PUNCT punct
Switzerland PROPN nsubj
place NOUN pobj
quarantine ADJ compound
requirement NOUN dobj
foreign ADJ amod
travellers NOUN pobj
, PUNCT punct
government NOUN nsubj
established VERB ROOT
threshold NOUN dobj
60 NUM nummod
infections NOUN pobj
100,000 NUM nummod
inhabitants NOUN pobj
previous ADJ amod
14 NUM nummod
days NOUN pobj
determine VERB relcl
countries NOUN nsubj
‘ PUNCT punct
high ADJ amod
risk NOUN attr
’ PUNCT punct
. PUNCT punct


In [28]:
from spacy import displacy
displacy.render(doc, style="dep")

## Named Entity Recognition (NER)

In [37]:
for ent in doc.ents: 
        print(token, ent, ent.label_)
#GPE -->Geopolitical entity, i.e. countries, cities, states.
#NORP -->Nationalities or religious or political groups.
#https://explosion.ai/demos/displacy-ent

. Switzerland GPE
. Swiss NORP
. Switzerland GPE
. 60 CARDINAL
. 100,000 CARDINAL
. the previous 14 days DATE


In [38]:
displacy.render(doc, style="ent")