In [1]:
import nltk, re, pprint

In [2]:
from nltk import word_tokenize

Accessing Text from the Web

In [3]:
from urllib import request

In [4]:
# Download the article page and decode the body to a str.
# The context manager closes the HTTP connection as soon as the body has been
# read; previously the response object was left open for the life of the kernel.
url = 'https://www.medicalnewstoday.com/articles/242842#1'
with request.urlopen(url) as response:
    raw = response.read().decode('utf8')

In [5]:
# Number of characters in the decoded page source.
len(raw)

73189

In [6]:
# Peek at the first 60 characters to confirm that the download is HTML markup.
raw[:60]

'<!DOCTYPE html><html lang="en"><head>\n<link rel=preload href'

The raw HTML content includes meta tags, an image map, JavaScript, forms, and tables.

In [7]:
from bs4 import BeautifulSoup

In [8]:
# Parse the page with the 'html.parser' backend and keep only what get_text()
# returns. NOTE: despite its name, html_raw holds the extracted text, not HTML.
html_raw = BeautifulSoup(raw, 'html.parser').get_text()

In [9]:
# Split the extracted page text into word and punctuation tokens.
tokens = word_tokenize(html_raw)

In [10]:
# Wrap the full token list in an nltk.Text; later cells slice it and run
# concordance() on a Text built the same way.
text = nltk.Text(tokens)

In [11]:
# Show the first nine tokens; in the recorded output this is the article headline.
text[:9]

['Daily',
 'Sugary',
 'Drinks',
 'Raise',
 'Heart',
 'Disease',
 'Risk',
 'In',
 'Males']

In [12]:
# Keep a 108-token window of the page for closer inspection.
# The slice is taken from `text` (which still wraps the full token list) rather
# than from `tokens` itself, so re-running this cell always yields the same
# window instead of slicing an already-sliced list down to nothing.
# NOTE(review): 5360:5468 is a hand-picked offset for this particular download;
# it will drift if the page content changes - confirm before reuse.
tokens = text[5360:5468]

In [13]:
# Build a Text over the sliced tokens only, so the concordance below is
# restricted to that window.
main_text = nltk.Text(tokens)

In [14]:
# List every occurrence of "heart" with its surrounding context; the recorded
# output shows that "Heart" is matched too.
main_text.concordance("heart")

Displaying 5 of 5 matches:
d raise a man ’ s risk of developing heart disease by 20 % if he drinks one per
nce portion . Risk factors linked to heart disease include poor diet , obesity 
r diet , obesity , family history of heart disease , regular tobacco smoking , 
diabetes . According to the American Heart Association , heart disease is the l
 to the American Heart Association , heart disease is the largest cause of deat


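Note: `main_text = text[5343:5468]` would select a slightly wider window, but slicing an `nltk.Text` returns a plain list, so the result would have no `concordance` method.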

This is the raw content of the web page, including many details we are not interested in. For language processing, we want to break the string up into words and punctuation; this step is called tokenization.

In [15]:
# Take the first nine tokens of the full text as input for the POS tagger.
pos_text = text[:9]

In [16]:
# Tag each token with a part of speech. In the recorded output most of the
# title-cased headline words come back as NNP.
nltk.pos_tag(pos_text)

[('Daily', 'JJ'),
 ('Sugary', 'NNP'),
 ('Drinks', 'NNP'),
 ('Raise', 'NNP'),
 ('Heart', 'NNP'),
 ('Disease', 'NNP'),
 ('Risk', 'NNP'),
 ('In', 'IN'),
 ('Males', 'NNP')]

In [17]:
# Hand-corrected (token, tag) pairs for the nine headline tokens.
# Compared with the nltk.pos_tag output for the same tokens, two tags differ:
# 'Sugary' (NNP -> JJ) and 'Raise' (NNP -> VB).
pos_tag_text = [
    ('Daily', 'JJ'),
    ('Sugary', 'JJ'),      # corrected by hand
    ('Drinks', 'NNP'),
    ('Raise', 'VB'),       # corrected by hand
    ('Heart', 'NNP'),
    ('Disease', 'NNP'),
    ('Risk', 'NNP'),
    ('In', 'IN'),
    ('Males', 'NNP'),
]

Regular-expression quantifiers used in the chunk grammar below: `*` (zero or more), `?` (zero or one).

In [48]:
# NP chunk rule: an optional determiner or possessive pronoun, any number of
# adjectives, then one or more proper nouns.
# - `PRP\$` is the Penn Treebank possessive-pronoun tag (the tagset that
#   nltk.pos_tag is expected to emit); the original `PP` alternative is kept
#   so inputs that use that tag still match.
# - `<NNP>+` is the shorter, equivalent form of `<NNP><NNP>*`.
grammar = r""" NP: {<DT|PP|PRP\$>?<JJ>*<NNP>+}"""

In [49]:
# Build a regular-expression chunk parser from the grammar string.
cp = nltk.RegexpParser(grammar)

In [50]:
# Chunk the hand-corrected tagged tokens; the result is an nltk.tree.Tree.
chunk_result = cp.parse(pos_tag_text)

In [51]:
# Print the bracketed text form of the chunk tree (NP groups nested under S).
print(chunk_result)

(S
  (NP Daily/JJ Sugary/JJ Drinks/NNP)
  Raise/VB
  (NP Heart/NNP Disease/NNP Risk/NNP)
  In/IN
  (NP Males/NNP))


In [52]:
# Confirm the parser returned a tree object rather than a flat list.
type(chunk_result)

nltk.tree.Tree

In [None]:
# NOTE(review): per the NLTK docs, draw() opens a separate GUI window, so it
# needs a local display. This cell has no execution count, i.e. it was not run
# in the saved session.
chunk_result.draw()

In [24]:
# Take the tagged sentence at index 22 from NLTK's treebank corpus.
# NOTE(review): presumably requires the 'treebank' corpus to be downloaded
# beforehand - confirm in a fresh environment.
sent = nltk.corpus.treebank.tagged_sents()[22]

In [25]:
# Print the sentence as a list of (word, tag) pairs.
print(sent)

[('The', 'DT'), ('U.S.', 'NNP'), ('is', 'VBZ'), ('one', 'CD'), ('of', 'IN'), ('the', 'DT'), ('few', 'JJ'), ('industrialized', 'VBN'), ('nations', 'NNS'), ('that', 'WDT'), ('*T*-7', '-NONE-'), ('does', 'VBZ'), ("n't", 'RB'), ('have', 'VB'), ('a', 'DT'), ('higher', 'JJR'), ('standard', 'NN'), ('of', 'IN'), ('regulation', 'NN'), ('for', 'IN'), ('the', 'DT'), ('smooth', 'JJ'), (',', ','), ('needle-like', 'JJ'), ('fibers', 'NNS'), ('such', 'JJ'), ('as', 'IN'), ('crocidolite', 'NN'), ('that', 'WDT'), ('*T*-1', '-NONE-'), ('are', 'VBP'), ('classified', 'VBN'), ('*-5', '-NONE-'), ('as', 'IN'), ('amphobiles', 'NNS'), (',', ','), ('according', 'VBG'), ('to', 'TO'), ('Brooke', 'NNP'), ('T.', 'NNP'), ('Mossman', 'NNP'), (',', ','), ('a', 'DT'), ('professor', 'NN'), ('of', 'IN'), ('pathlogy', 'NN'), ('at', 'IN'), ('the', 'DT'), ('University', 'NNP'), ('of', 'IN'), ('Vermont', 'NNP'), ('College', 'NNP'), ('of', 'IN'), ('Medicine', 'NNP'), ('.', '.')]


In [26]:
# A tagged sentence is a plain Python list, the same shape as pos_tag_text.
type(sent)

list

In [27]:
# Named-entity chunking on the headline tokens (not on `sent`). With
# binary=True every detected entity is labelled simply NE, with no entity type.
print(nltk.ne_chunk(pos_tag_text, binary=True))

(S
  (NE Daily/JJ)
  Sugary/JJ
  Drinks/NNP
  Raise/VB
  (NE Heart/NNP Disease/NNP Risk/NNP)
  In/IN
  (NE Males/NNP))


In [28]:
# Same input without binary=True: each entity gets a type label (GPE,
# ORGANIZATION, PERSON in the recorded output). The labels on these title-cased
# headline words look unreliable, e.g. "Heart Disease Risk" is tagged PERSON.
print(nltk.ne_chunk(pos_tag_text))

(S
  (GPE Daily/JJ)
  (ORGANIZATION Sugary/JJ Drinks/NNP)
  Raise/VB
  (PERSON Heart/NNP Disease/NNP Risk/NNP)
  In/IN
  (GPE Males/NNP))
