### Week 4 Topic 3: Extracting Information using BeautifulSoup and prettify

#### Web page scraping tutorial with BeautifulSoup

##### HTML

In [10]:
# Step 1: view the website
# Step 2: download the website
import requests

# Seconds to wait for the server. Without a timeout, requests.get() can block
# indefinitely if the server never answers.
REQUEST_TIMEOUT = 10

page = requests.get("https://www.smh.com.au/", timeout=REQUEST_TIMEOUT)
print(page)

# 200 means the website was successfully downloaded; a status code starting
# with 4 or 5 generally indicates an error
print(page.status_code)
#print(page.content) # prints the page content

# Step 3: parsing a page with BeautifulSoup
from bs4 import BeautifulSoup

soup = BeautifulSoup(page.content, 'html.parser')
#print(soup.prettify()) # prints the page content nicely formatted

# soup.children yields the elements at the top level of the page lazily, so it
# is wrapped in list() once and the list is reused below
children = list(soup.children)
#print(children)

print([type(item) for item in children])

html = children[1]
#print(html)

head = list(html.children)[0]
head_items = list(head.children)
#print(head_items)
print(len(head_items))

# NOTE: positional lookup - index 23 depends on the current layout of the page
# header and will need adjusting if the site changes
title_text = head_items[23].attrs['content']
print(title_text)

# Step 4: finding all instances of a tag at once
# (the soup parsed in step 3 is reused; it has not been modified)
#soup.find_all('p')

print(soup.find_all('p')[4].get_text())

# or only the first instance
print(soup.find('p').get_text())

# Steps 5-7 demonstration
# Step 7: extract all headlines
page = requests.get('https://www.smh.com.au/', timeout=REQUEST_TIMEOUT)
soup = BeautifulSoup(page.text, 'html.parser')
# we notice that news headlines are kept in the h3 tag
items = soup.select('h3') # get all h3 tags
for item in items:
    print(item.get_text())


<Response [200]>
200
[<class 'bs4.element.Doctype'>, <class 'bs4.element.Tag'>]
98
Australian Breaking News Headlines & World News Online | SMH.com.au
After six weeks, the Morrison government is reeling from shocking allegations of rape, sexual misconduct and more. Does it have a problem with women? And can it recover?
A weekly newsletter for book lovers from books editor Jason Steger.
The Booklist newsletter
Daily Crosswords
Greater Good newsletter
‘I intend to own those mistakes’: Andrew Laming won’t recontest after bad behaviour put under spotlight
Coalition hit by escalating series of missteps and scandals relating to women, sex and power
PM Scott Morrison admits Coalition failings on sexism, bad behaviour in leaked audio
‘Hacks, stacks and freaks’: Why do political staffers behave so badly?
Father asked school to report teen to police. Months later, his daughter was allegedly raped
At present rates, the gender pay gap will vanish in 268 years
Queensland finds missing link to clust

In [None]:
# Part 1: access a website
# Everything this cell uses is imported here so that it also runs on a fresh kernel
from urllib import request

import nltk
from nltk import word_tokenize

url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
# the with-block closes the HTTP response once the body has been read
with request.urlopen(url) as response:
    html = response.read().decode('utf8')
print(html[:60])

# Part 2: extract text with 'BeautifulSoup'
from bs4 import BeautifulSoup

raw = BeautifulSoup(html, 'html.parser').get_text()
tokens = word_tokenize(raw)
print(tokens[:60])

# Part 3: clean the text and transform into a NLTK object
tokens = tokens[110:390]
text = nltk.Text(tokens)

# concordance() prints its matches itself and returns None, so it is not
# wrapped in print() (that would add a stray "None" line)
text.concordance('gene')

##### RSS feeds

In [None]:
# Imports are local to the cell so it does not depend on names left behind by
# other cells
import feedparser
from bs4 import BeautifulSoup
from nltk import word_tokenize

llog = feedparser.parse("http://languagelog.ldc.upenn.edu/nll/?feed=atom")
print(llog['feed']['title'])
print(len(llog.entries))

post = llog.entries[2]
print(post.title)

# the entry body is HTML: show its raw start first, then strip the markup
content = post.content[0].value
print(content[:70])

raw = BeautifulSoup(content, 'html.parser').get_text()
print(word_tokenize(raw)[:60])

##### Local files

In [None]:
# Part 1: load the text file
#f = open('document.txt')
#raw = f.read()

# Part 2: if errors appear, check all the files in the directory where IDLE is running
#import os
#print(os.listdir('.'))

# Part 3: read in file line by line
#f = open('document.txt', 'r')

#for line in f:
#    print(line.strip())

# Part 4: access NLTK's corpus files
import nltk

path = nltk.data.find('corpora/gutenberg/melville-moby_dick.txt')
# The 'U' mode flag was removed in Python 3.11 (open() now raises ValueError
# for it); plain text mode already applies universal newlines. The with-block
# closes the file after reading.
with open(path, 'r') as f:
    raw = f.read()

##### Prompting user input

In [None]:
# word_tokenize is imported here because no earlier cell in the notebook imports it
from nltk import word_tokenize

s = input("Enter some text: ")

print("You typed", len(word_tokenize(s)), "words.")

#### The NLP Pipeline

In [None]:
# download webpage, strip HTML if necessary and trim to desired content
import nltk
from urllib import request
from bs4 import BeautifulSoup

url = "https://www.abc.net.au/news/2021-03-19/bom-weather-forecast-dangerous-nsw-rain-floods-over-weekend/100017410"

# the with-block closes the HTTP response once the body has been read
with request.urlopen(url) as response:
    html = response.read().decode('utf-8')
# name the parser explicitly so the result does not depend on which parsers
# happen to be installed (and to avoid the "no parser specified" warning)
raw = BeautifulSoup(html, 'html.parser').get_text()

# Trim to the article body using the marker phrases themselves rather than
# hard-coded character offsets, which silently go stale when the page changes
start_marker = "The Bureau of Meteorology (BOM)"
end_marker = 'the road."'
start = raw.find(start_marker)
end = raw.find(end_marker, start)
if start == -1 or end == -1:
    raise ValueError("article boundary markers not found; the page content may have changed")
raw = raw[start:end + len(end_marker)]

# tokenise the text, select tokens of interest (if applicable) and create a NLTK text
tokens = nltk.wordpunct_tokenize(raw)
print(type(tokens))

#tokens = tokens[:1000]

text = nltk.Text(tokens)
print(type(text))

# normalise words and build the vocabulary
words = [w.lower() for w in text]
print(type(words))

vocab = sorted(set(words))
print(type(vocab))

#### Unicode

In [None]:
# Part 1: locate the file
path = nltk.data.find('corpora/unicode_samples/polish-lat2.txt')

# Part 2: open and inspect the file contents
#with open(path, encoding='latin2') as f:
#    for line in f:
#        line = line.strip()
#        print(line)

# Part 3: convert all non-ASCII characters into their two- and four-digit representations if contents don't display
#           correctly or wish to see the underlying numerical values
# (the with-block closes the file when the loop finishes)
with open(path, encoding='latin2') as f:
    for line in f:
        line = line.strip()
        print(line.encode('unicode_escape'))

# Part 4-6
print(ord('ń')) # locate the integer ordinal of a character

print(hex(324)) # define strings with their appropriate escape sequence
nacute = '\u0144'
print(nacute)

print(nacute.encode('utf8')) # determine how a character is represented as a sequence of bytes inside a text file

# Part 7: inspect the properties of unicode characters
import unicodedata

with open(path, encoding='latin2') as f:
    lines = f.readlines()
line = lines[2]
print(line.encode('unicode_escape'))

# report every non-ASCII character: its UTF-8 bytes, code point and Unicode name
for c in line:
    if ord(c) > 127:
        print('{} U+{:04x} {}'.format(c.encode('utf8'), ord(c), unicodedata.name(c)))

In [None]:
#### Regular Expressions

In [None]:
## Using basic meta-characters
import re

wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]

# search for words ending in 'ed'
print([w for w in wordlist if re.search('ed$', w)][:20])

# count the number of occurrences of a word
# NOTE(review): `text` is not defined in this cell - it is whatever an earlier
# cell left in the kernel; confirm that is the text meant to be searched
print(sum(1 for w in text if re.search('^e-?mail$', w)))

## Useful applications of regular expressions
# 1. Extracting word pieces
wsj = sorted(set(nltk.corpus.treebank.words()))
# re.findall() finds all non-overlapping matches of a given regex
fd = nltk.FreqDist(vs for word in wsj
                   for vs in re.findall(r'[aeiou]{2,}', word))
# printed explicitly: a bare expression in the middle of a cell displays nothing
print(fd.most_common(12))

# 2. Finding word stems
# function to strip anything looking like a suffix from a word
def stem_word(word):
    """Strip the first matching suffix from ``word``.

    Candidate suffixes are tried in a fixed order and only the first one that
    ``word`` ends with is removed. A word ending in none of them is returned
    unchanged.
    """
    suffixes = ('ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment')
    matched = next((ending for ending in suffixes if word.endswith(ending)), None)
    if matched is None:
        return word
    # keep everything before the matched ending
    return word[:len(word) - len(matched)]

# build a disjunction of all suffixes, refining the pattern one step at a time
suffix_demos = [
    # a capturing group: only the suffix is returned
    (r'^.*(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing'),
    # (?:...) specifies the scope of the disjunction but not the material to be output
    (r'^.*(?:ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing'),
    # split the word into stem and suffix
    (r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing'),
    # but there's a problem...
    (r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes'),
    # need to use the non-greedy version of *
    (r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes'),
    # works even with a non-existent suffix when ? is included at the end of the second parentheses
    (r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$', 'language'),
]
for demo_pattern, demo_word in suffix_demos:
    print(re.findall(demo_pattern, demo_word))

# application to a whole text
def stem(word):
    """Return ``word`` with one recognised suffix removed, if it has one.

    The prefix group is non-greedy, so the shortest possible stem is kept, and
    the suffix group is optional, so a word without a listed suffix comes back
    whole.
    """
    suffix_pattern = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    # findall yields (stem, suffix) tuples; only the first match is used
    first_match = re.findall(suffix_pattern, word)[0]
    return first_match[0]

# pprint and word_tokenize are imported here because no earlier cell in the
# notebook imports them
import pprint

from nltk import word_tokenize

raw = """DENNIS: Listen, strange women lying in ponds distributing swords is no basis for a system of government. 
         Supreme executive power derives from a mandate from the masses, not from some farcical aquatic ceremony."""
tokens = word_tokenize(raw)
print([stem(t) for t in tokens])

## Lemmatisation
# will only remove affixes if the word is in its dictionary
wnl = nltk.WordNetLemmatizer()
print([wnl.lemmatize(t) for t in tokens])

## Regular expression tokeniser
text = 'That U.S.A. poster-print costs $12.40...'

# The groups are non-capturing, (?:...), because the tokeniser collects matches
# with findall(), which returns the captured groups instead of the whole match
# as soon as the pattern contains a capturing group.
# In the punctuation class the hyphen is placed last so that it is a literal
# '-' rather than the start of a ':' to '_' character range.
pattern = r'''(?x)          # set flag to allow verbose regexps
     (?:[A-Z]\.)+           # abbreviations, e.g. U.S.A.
   | \w+(?:-\w+)*           # words with optional internal hyphens
   | \$?\d+(?:\.\d+)?%?     # currency and percentages, e.g. $12.40, 82%
   | \.\.\.                 # ellipsis
   | [\]\[.,;"'?():_`-]     # these are separate tokens; includes ], [
 '''

print(nltk.regexp_tokenize(text, pattern))

## Sentence segmentation
# average sentence length (in words) of the Brown corpus
print(len(nltk.corpus.brown.words()) / len(nltk.corpus.brown.sents()))

text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')
sents = nltk.sent_tokenize(text)
pprint.pprint(sents[79:89])

#### Learning to Classify Text

##### Supervised Classification

In [None]:
## Gender identification
# Part 1: define a function to build a dictionary containing relevant information about a given name
def gender_features(word):
    """Build the feature dictionary for a name: its final character, stored under 'last_letter'."""
    final_letter = word[len(word) - 1]
    return {'last_letter': final_letter}
# printed explicitly: a bare expression in the middle of a cell displays nothing
print(gender_features('Shrek'))

# Part 2: prepare a list of examples and corresponding class labels
from nltk.corpus import names
labelled_names = ([(name, 'male') for name in names.words('male.txt')] +
                 [(name, 'female') for name in names.words('female.txt')])

import random
random.shuffle(labelled_names)

# Part 3: use a feature extractor to process the data and split it into a training and test set
featuresets = [(gender_features(n), gender) for (n, gender) in labelled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

# Part 4: test it on some names in the test set
print(classifier.classify(gender_features('Neo')))
print(classifier.classify(gender_features('Trinity')))

# Part 5: evaluate it on a larger test set
print(nltk.classify.accuracy(classifier, test_set))

# Part 6: determine which features the classifier found most useful for distinguishing gender
# show_most_informative_features() prints its own table and returns None, so
# it is not wrapped in print()
classifier.show_most_informative_features(5)

## Document classification
# Part 1: choose a corpus
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

# Part 2: define a feature extractor that checks whether one of the 2,000 most-frequent words from the corpus are in a
#           particular document
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
# most_common() orders by frequency; iterating the FreqDist directly is not
# guaranteed to, so list(all_words)[:2000] would not be the top 2,000 words
word_features = [word for (word, _count) in all_words.most_common(2000)]

def document_features(document):
    """Record, for every word in ``word_features``, whether ``document`` contains it.

    Returns a dictionary whose keys have the form 'contains(<word>)' and whose
    values are booleans.
    """
    # a set makes each membership test cheap regardless of document length
    vocabulary = set(document)
    return {'contains({})'.format(candidate): candidate in vocabulary
            for candidate in word_features}

# Preview the first few features of one review. The result is not named `dict`,
# which would shadow the builtin for the rest of the session.
sample_features = document_features(movie_reviews.words('pos/cv957_8737.txt'))
for key in list(sample_features)[:10]:
    print(key, sample_features[key])

# Part 3: use feature extractor to train classifier to label new movie reviews
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

print(nltk.classify.accuracy(classifier, test_set))
# show_most_informative_features() prints its own table and returns None, so
# it is not wrapped in print()
classifier.show_most_informative_features(5)

##### Evaluation

In [None]:
# Option 1: create training and test sets by randomly assigning sentences from a data source reflecting a single genre
# training and test sets will be very similar, and will not be able to confidently generalise results to other genres
import random

import nltk
from nltk.corpus import brown

tagged_sents = list(brown.tagged_sents(categories='news'))
random.shuffle(tagged_sents)
size = int(len(tagged_sents) * 0.1)
train_set, test_set = tagged_sents[size:], tagged_sents[:size]

# Option 2: create training and test sets from different documents
file_ids = brown.fileids(categories='news')
size = int(len(file_ids) * 0.1)
train_set = brown.tagged_sents(file_ids[size:])
test_set = brown.tagged_sents(file_ids[:size])

# Option 3: create test set from documents less-closely related to those in training set
# more stringent evaluation of classifier. If it performs well here, confident that classifier will perform well on data
#   very different to that it was trained on
train_set = brown.tagged_sents(categories='news')
test_set = brown.tagged_sents(categories='fiction')

# Accuracy - simplest metric used to evaluate a classifier
# The classifier is trained on (featureset, label) pairs, so the tagged
# sentences are first flattened into one pair per word: simple suffix features
# as the featureset and the word's tag as the label.
def tagged_word_featuresets(tagged_sentences):
    """Convert tagged sentences into a list of (suffix-feature dict, tag) pairs, one per word."""
    return [({'suffix(1)': word[-1:], 'suffix(2)': word[-2:], 'suffix(3)': word[-3:]}, tag)
            for sentence in tagged_sentences
            for (word, tag) in sentence]

train_features = tagged_word_featuresets(train_set)
test_features = tagged_word_featuresets(test_set)

classifier = nltk.NaiveBayesClassifier.train(train_features)
print(nltk.classify.accuracy(classifier, test_features))