### Using NLTK

In [3]:
#!pip3 install nltk

In [6]:
#import our libraries
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [27]:
#!pip3 install spacy
#!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.0.0/en_core_web_md-2.0.0.tar.gz
#!pip3 install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz

In [28]:
#!python3 -m spacy validate

In [26]:
#!python3 -m spacy download en_core_web_sm

## Information Extraction

In [54]:
# Example sentence fed to the NLTK tokenization / POS-tagging cells below.
# Alternative example sentence, kept for reference:
#text = 'The Mona Lisa is a 16th century oil painting created by Leonardo. It is held at the Louvre in Paris.'
text = (
    'The best of the New South Wales diamonds are harder and much whiter '
    'than the South African diamonds, and are classified as on a par with '
    'the best Brazilian gems, but no large specimens have yet been found.'
)

In [55]:
#we apply word tokenization and part of speech
def preprocess(sent):
    """Tokenize a sentence and tag each token with its part of speech.

    Args:
        sent: The raw sentence text.

    Returns:
        The result of ``nltk.pos_tag`` applied to the word tokens of
        ``sent`` (a list of ``(token, tag)`` pairs).
    """
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)

In [56]:
# Tokenize and POS-tag the example sentence; the bare name on the last line
# makes the notebook display the tagged result.
sent = preprocess(text)
sent

[('The', 'DT'),
 ('best', 'JJS'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('New', 'NNP'),
 ('South', 'NNP'),
 ('Wales', 'NNP'),
 ('diamonds', 'NNS'),
 ('are', 'VBP'),
 ('harder', 'JJR'),
 ('and', 'CC'),
 ('much', 'RB'),
 ('whiter', 'JJR'),
 ('than', 'IN'),
 ('the', 'DT'),
 ('South', 'JJ'),
 ('African', 'JJ'),
 ('diamonds', 'NNS'),
 (',', ','),
 ('and', 'CC'),
 ('are', 'VBP'),
 ('classified', 'VBN'),
 ('as', 'IN'),
 ('on', 'IN'),
 ('a', 'DT'),
 ('par', 'NN'),
 ('with', 'IN'),
 ('the', 'DT'),
 ('best', 'JJS'),
 ('Brazilian', 'JJ'),
 ('gems', 'NN'),
 (',', ','),
 ('but', 'CC'),
 ('no', 'DT'),
 ('large', 'JJ'),
 ('specimens', 'NNS'),
 ('have', 'VBP'),
 ('yet', 'RB'),
 ('been', 'VBN'),
 ('found', 'VBN'),
 ('.', '.')]

## Using Spacy

In [None]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the en_core_web_sm spaCy pipeline once; `nlp` is reused by the cells below.
nlp = en_core_web_sm.load()

In [65]:
# Run the spaCy pipeline over the example sentence.
# Note: unlike `text` above, "Diamonds" is capitalised in this version.
doc = nlp(
    'The best of the New South Wales Diamonds are harder and much whiter '
    'than the South African Diamonds, and are classified as on a par with '
    'the best Brazilian gems, but no large specimens have yet been found.'
)

In [66]:
# Entity level: print each entity span's text together with its label.
print([
    (span.text, span.label_)
    for span in doc.ents
])

[('the New South Wales Diamonds', 'ORG'), ('the South African Diamonds', 'LOC'), ('Brazilian', 'NORP')]


In [67]:
# Token level: print every token with its IOB entity marker and entity type
# (the type is an empty string for tokens outside any entity).
print([
    (tok, tok.ent_iob_, tok.ent_type_)
    for tok in doc
])

[(The, 'O', ''), (best, 'O', ''), (of, 'O', ''), (the, 'B', 'ORG'), (New, 'I', 'ORG'), (South, 'I', 'ORG'), (Wales, 'I', 'ORG'), (Diamonds, 'I', 'ORG'), (are, 'O', ''), (harder, 'O', ''), (and, 'O', ''), (much, 'O', ''), (whiter, 'O', ''), (than, 'O', ''), (the, 'B', 'LOC'), (South, 'I', 'LOC'), (African, 'I', 'LOC'), (Diamonds, 'I', 'LOC'), (,, 'O', ''), (and, 'O', ''), (are, 'O', ''), (classified, 'O', ''), (as, 'O', ''), (on, 'O', ''), (a, 'O', ''), (par, 'O', ''), (with, 'O', ''), (the, 'O', ''), (best, 'O', ''), (Brazilian, 'B', 'NORP'), (gems, 'O', ''), (,, 'O', ''), (but, 'O', ''), (no, 'O', ''), (large, 'O', ''), (specimens, 'O', ''), (have, 'O', ''), (yet, 'O', ''), (been, 'O', ''), (found, 'O', ''), (., 'O', '')]


## Extracting entities from a blog

In [82]:
from bs4 import BeautifulSoup
import requests
import re

def url_to_string(url, timeout=30):
    """Download a web page and return its text content as a single string.

    The page is fetched with ``requests`` and parsed by BeautifulSoup with the
    ``html5lib`` parser. Every ``script``, ``style`` and ``aside`` element is
    removed before the text is extracted, and each run of newlines/tabs in
    the extracted text is replaced by a single space.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The page text as one string.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with a 4xx/5xx status.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of silently analysing an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Drop elements whose content is not part of the readable article text.
    for element in soup(["script", "style", 'aside']):
        element.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

# Download the MD Anderson blog post and run the spaCy pipeline over its text.
# NOTE(review): the name `ny_bb` does not describe this URL; it simply holds
# the page text returned by url_to_string.
ny_bb = url_to_string('https://www.mdanderson.org/publications/cancerwise/my-proton-therapy-treatment-for-prostate-cancer-during-the-coronavirus-covid-19-pandemic.h00-159381945.html')
article = nlp(ny_bb)
# Number of entity spans found in the whole post.
len(article.ents)

107

There are 107 entities in the blog post, and they fall under 8 unique labels.

In [84]:
# Tally how many entities were assigned each label.
labels = [ent.label_ for ent in article.ents]
Counter(labels)

Counter({'PERSON': 30,
         'ORG': 36,
         'DATE': 14,
         'CARDINAL': 15,
         'GPE': 8,
         'PERCENT': 1,
         'FAC': 2,
         'MONEY': 1})

In [85]:
# Show the two most frequent entity strings in the post.
items = [ent.text for ent in article.ents]
Counter(items).most_common(2)

[('MD Anderson', 7), ('David Maddox', 4)]

In [157]:
# Collect the sentence spans and print one of them (fixed index 55) to study in detail.
sentences = list(article.sents)
print(sentences[55])


As part of our mission to eliminate cancer, MD Anderson researchers conduct hundreds of clinical trials to test new treatments for both common and rare cancers.


In [158]:
# Re-parse sentence 55 on its own and render its named entities with
# displaCy's entity visualizer (style='ent') inside the notebook.
displacy.render(nlp(str(sentences[55])), jupyter=True, style='ent')

In [169]:
# Show what the sentence and its dependencies look like (style='dep').
# The 'distance' option is passed to displaCy to control spacing in the diagram.
displacy.render(nlp(str(sentences[55])), style='dep', jupyter = True, options = {'distance': 43})

In [160]:
# For every token that is neither a stop word nor punctuation, list its
# verbatim text, its part of speech and its lemma.
[
    (tok.orth_, tok.pos_, tok.lemma_)
    for tok in nlp(str(sentences[55]))
    if not (tok.is_stop or tok.pos_ == 'PUNCT')
]

[('mission', 'NOUN', 'mission'),
 ('eliminate', 'VERB', 'eliminate'),
 ('cancer', 'NOUN', 'cancer'),
 ('MD', 'PROPN', 'MD'),
 ('Anderson', 'PROPN', 'Anderson'),
 ('researchers', 'NOUN', 'researcher'),
 ('conduct', 'VERB', 'conduct'),
 ('hundreds', 'NOUN', 'hundred'),
 ('clinical', 'ADJ', 'clinical'),
 ('trials', 'NOUN', 'trial'),
 ('test', 'VERB', 'test'),
 ('new', 'ADJ', 'new'),
 ('treatments', 'NOUN', 'treatment'),
 ('common', 'ADJ', 'common'),
 ('rare', 'ADJ', 'rare'),
 ('cancers', 'NOUN', 'cancer')]

In [161]:
# Map each entity string in sentence 55 to its label.
{str(ent): ent.label_ for ent in nlp(str(sentences[55])).ents}

{'MD Anderson': 'PERSON', 'hundreds': 'CARDINAL'}

In [163]:
# Token-level entity annotation (IOB marker and entity type) for sentence 55.
print([
    (tok, tok.ent_iob_, tok.ent_type_)
    for tok in sentences[55]
])

[(As, 'O', ''), (part, 'O', ''), (of, 'O', ''), (our, 'O', ''), (mission, 'O', ''), (to, 'O', ''), (eliminate, 'O', ''), (cancer, 'O', ''), (,, 'O', ''), (MD, 'B', 'PERSON'), (Anderson, 'I', 'PERSON'), (researchers, 'O', ''), (conduct, 'O', ''), (hundreds, 'B', 'CARDINAL'), (of, 'O', ''), (clinical, 'O', ''), (trials, 'O', ''), (to, 'O', ''), (test, 'O', ''), (new, 'O', ''), (treatments, 'O', ''), (for, 'O', ''), (both, 'O', ''), (common, 'O', ''), (and, 'O', ''), (rare, 'O', ''), (cancers, 'O', ''), (., 'O', '')]


In [164]:
# Visualize the entities of the entire blog post with displaCy's entity style.
displacy.render(article, jupyter=True, style='ent')