### Week 3 Topic 3: Text Processing with NLTK Part 2

#### Accessing Text From the Web and From Disk

In [1]:
import nltk, re, pprint
from nltk import word_tokenize

##### Electronic books

In [2]:
# Part 1: access text 2554 ("Crime and Punishment")
# the read() process can take a couple of seconds as it's downloading a large book. If an internet proxy is being used that
# isn't correctly detected by Python, the proxy may need to be specified manually before using urlopen()

from urllib import request

# proxies = {'http': 'http://www.someproxy.com:3128'}
# request.ProxyHandler(proxies)

url = "http://www.gutenberg.org/files/2554/2554-0.txt"
response = request.urlopen(url)
raw = response.read().decode('utf8')

print(type(raw))
print(len(raw))
print(raw[:75])

# Part 2: text tokenisation
tokens = word_tokenize(raw)
print(type(tokens))
print(len(tokens))
print(tokens[:10])

# Part 3: create an NLTK text from this list
text = nltk.Text(tokens)
print(type(text))
print(text[1024:1062])
print(text.collocations())

# Part 4: locate unique strings marking the beginning and end before trimming
print(raw.find("PART I"))
print(raw.rfind("End of Project Gutenberg's Crime")) # reverse find

raw = raw[5336:-1]
print(raw.find("PART I"))

<class 'str'>
1176967
﻿The Project Gutenberg EBook of Crime and Punishment, by Fyodor Dostoevsky
<class 'list'>
257085
['\ufeffThe', 'Project', 'Gutenberg', 'EBook', 'of', 'Crime', 'and', 'Punishment', ',', 'by']
<class 'nltk.text.Text'>
['I', 'CHAPTER', 'I', 'On', 'an', 'exceptionally', 'hot', 'evening', 'early', 'in', 'July', 'a', 'young', 'man', 'came', 'out', 'of', 'the', 'garret', 'in', 'which', 'he', 'lodged', 'in', 'S.', 'Place', 'and', 'walked', 'slowly', ',', 'as', 'though', 'in', 'hesitation', ',', 'towards', 'K.', 'bridge']
Katerina Ivanovna; Pyotr Petrovitch; Pulcheria Alexandrovna; Avdotya
Romanovna; Rodion Romanovitch; Marfa Petrovna; Sofya Semyonovna; old
woman; Project Gutenberg-tm; Porfiry Petrovitch; Amalia Ivanovna;
great deal; young man; Nikodim Fomitch; Ilya Petrovitch; Project
Gutenberg; Andrey Semyonovitch; Hay Market; Dmitri Prokofitch; Good
heavens
None
5336
-1
0


##### HTML

In [3]:
# Part 1: access a website
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
html = request.urlopen(url).read().decode('utf8')
print(html[:60])

# Part 2: extract text with 'BeautifulSoup'
from bs4 import BeautifulSoup

raw = BeautifulSoup(html, 'html.parser').get_text()
tokens = word_tokenize(raw)
print(tokens[:60])

# Part 3: clean the text and transform into a NLTK object
tokens = tokens[110:390]
text = nltk.Text(tokens)

print(text.concordance('gene'))

<!doctype html public "-//W3C//DTD HTML 4.0 Transitional//EN
['BBC', 'NEWS', '|', 'Health', '|', 'Blondes', "'to", 'die', 'out', 'in', '200', "years'", 'NEWS', 'SPORT', 'WEATHER', 'WORLD', 'SERVICE', 'A-Z', 'INDEX', 'SEARCH', 'You', 'are', 'in', ':', 'Health', 'News', 'Front', 'Page', 'Africa', 'Americas', 'Asia-Pacific', 'Europe', 'Middle', 'East', 'South', 'Asia', 'UK', 'Business', 'Entertainment', 'Science/Nature', 'Technology', 'Health', 'Medical', 'notes', '--', '--', '--', '--', '--', '--', '-', 'Talking', 'Point', '--', '--', '--', '--', '--', '--', '-']
Displaying 5 of 5 matches:
hey say too few people now carry the gene for blondes to last beyond the next 
blonde hair is caused by a recessive gene . In order for a child to have blond
 have blonde hair , it must have the gene on both sides of the family in the g
ere is a disadvantage of having that gene or by chance . They do n't disappear
des would disappear is if having the gene was a disadvantage and I do not thin
None


##### RSS feeds

In [4]:
import feedparser

llog = feedparser.parse("http://languagelog.ldc.upenn.edu/nll/?feed=atom")
print(llog['feed']['title'])
print(len(llog.entries))

post = llog.entries[2]
print(post.title)

content = post.content[0].value
print(content[:70])

raw = BeautifulSoup(content, 'html.parser').get_text()
print(word_tokenize(raw)[:60])

Language Log
13
Equal representation in the halls of quackery
<p><a href="https://www.smbc-comics.com/comic/pp" rel="noopener" targe
['Today', "'s", 'SMBC', 'starts', 'this', 'way', ':', 'The', 'rant', 'continues', 'at', 'some', 'length', '—', 'I', 'think', 'my', 'favorite', 'parts', 'are', 'the', 'digital', 'chakra-detoxification', 'algorithms', 'and', 'the', 'Turing-complete', 'microbiome', '(', 'with', 'its', 'obvious', 'connections', 'to', 'nanotechnology', ',', 'edge', 'computing', ',', 'message-passing', 'via', 'mRNA', ',', 'etc', '.', ')', ':', 'The', 'mouseover', 'title', '[', 'link', 'added', ']', ':', '``', 'As', 'I', 'post', 'this']


##### Local files

In [5]:
# Part 1: load the text file
#f = open('document.txt')
#raw = f.read()

# Part 2: if errors appear, check all the files in the directory where IDLE is running
#import os
#print(os.listdir('.'))

# Part 3: read in file line by line
#f = open('document.txt', 'rU')

#for line in f:
#    print(line.strip())

# Part 4: access NLTK's corpus files
path = nltk.data.find('corpora/gutenberg/melville-moby_dick.txt')
raw = open(path, 'rU').read()

  raw = open(path, 'rU').read()


##### Prompting user input

In [6]:
s = input("Enter some text: ")

print("You typed", len(word_tokenize(s)), "words.")

Enter some text: helo
You typed 1 words.


#### The NLP Pipeline

In [None]:
# download webpage, strip HTML if necessary and trim to desired content
import nltk
from urllib import request

url = "https://www.abc.net.au/news/2021-03-19/bom-weather-forecast-dangerous-nsw-rain-floods-over-weekend/100017410"

html = request.urlopen(url).read().decode('utf-8')
raw = BeautifulSoup(html).get_text()

raw.find("The Bureau of Meteorology (BOM)")
raw.find('the road."') + len('the road."')
raw = raw[1127:4011]

# tokenise the text, select tokens of interest (if applicable) and create a NLTK text
tokens = nltk.wordpunct_tokenize(raw)
print(type(tokens))

#tokens = tokens[:1000]

text = nltk.Text(tokens)
print(type(text))

# normalise words and build the vocabulary
words = [w.lower() for w in text]
print(type(words))

vocab = sorted(set(words))
print(type(vocab))

#### Unicode

In [9]:
# Part 1: locate the file
path = nltk.data.find('corpora/unicode_samples/polish-lat2.txt')

# Part 2: open and inspect the file contents
f = open(path, encoding = 'latin2')
#for line in f:
#    line = line.strip()
#   print(line)
    
# Part 3: convert all non-ASCII characters into their two- and four-digit representations if contents don't display
#           correctly or wish to see the underlying numerical values
for line in f:
    line = line.strip()
    print(line.encode('unicode_escape'))

# Part 4-6
print(ord('ń')) # locate the integer ordinal of a character

print(hex(324)) # define strings with their appropriate escape sequence
nacute = '\u0144'
print(nacute)

print(nacute.encode('utf8')) # determine how a character is represented as a sequence of bites inside a text file

# Part 7: inspect the properties of unicode characters
import unicodedata

lines = open(path, encoding = 'latin2').readlines()
line = lines[2]
print(line.encode('unicode_escape'))

for c in line:
    if ord(c) > 127:
        print('{} U+{:04x} {}'.format(c.encode('utf8'), ord(c), unicodedata.name(c)))

b'Pruska Biblioteka Pa\\u0144stwowa. Jej dawne zbiory znane pod nazw\\u0105'
b'"Berlinka" to skarb kultury i sztuki niemieckiej. Przewiezione przez'
b'Niemc\\xf3w pod koniec II wojny \\u015bwiatowej na Dolny \\u015al\\u0105sk, zosta\\u0142y'
b'odnalezione po 1945 r. na terytorium Polski. Trafi\\u0142y do Biblioteki'
b'Jagiello\\u0144skiej w Krakowie, obejmuj\\u0105 ponad 500 tys. zabytkowych'
b'archiwali\\xf3w, m.in. manuskrypty Goethego, Mozarta, Beethovena, Bacha.'
324
0x144
ń
b'\xc5\x84'
b'Niemc\\xf3w pod koniec II wojny \\u015bwiatowej na Dolny \\u015al\\u0105sk, zosta\\u0142y\\n'
b'\xc3\xb3' U+00f3 LATIN SMALL LETTER O WITH ACUTE
b'\xc5\x9b' U+015b LATIN SMALL LETTER S WITH ACUTE
b'\xc5\x9a' U+015a LATIN CAPITAL LETTER S WITH ACUTE
b'\xc4\x85' U+0105 LATIN SMALL LETTER A WITH OGONEK
b'\xc5\x82' U+0142 LATIN SMALL LETTER L WITH STROKE


In [None]:
#### Regular Expressions

In [10]:
## Using basic meta-characters
import re

wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]

# search for words ending in 'ed'
print([w for w in wordlist if re.search('ed$', w)][:20])

# count the number of occurences of a word
print(sum(1 for w in text if re.search('^e-?mail$', w)))

## Useful applications of regular expressions
# 1. Extracting word pieces
wsj = sorted(set(nltk.corpus.treebank.words()))
fd = nltk.FreqDist(vs for word in wsj
# re.findall() finds all non-overlapping matches of a given regex
                  for vs in re.findall(r'[aeiou]{2,}', word))
fd.most_common(12)

# 2. Finding word stems
# function to strip anything looking like a suffix from a word
def stem_word(word):
    for suffix in ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word

# build a disjunction of all suffixes
print(re.findall(r'^.*(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing'))
print(re.findall(r'^.*(?:ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')) # specifies scope of disjunction but not
                                                                             #  material to be output
print(re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')) # split word into stem and suffix
print(re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes'))  # but there's a problem...
print(re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')) # need to use the non-greedy version of *
print(re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$', 'language')) # works even with a non-existent suffix when ?
                                                                             #  included at end of second parentheses

# application to a whole text
def stem(word):
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    stem, suffix = re.findall(regexp, word)[0]
    return stem

raw = """DENNIS: Listen, strange women lying in ponds distributing swords is no basis for a system of government. 
         Supreme executive power derives from a mandate from the masses, not from some farcical aquatic ceremony."""
tokens = word_tokenize(raw)
print([stem(t) for t in tokens])

## Lemmatisation
# will only remove affixes if the word is in its dictionary
wnl = nltk.WordNetLemmatizer()
print([wnl.lemmatize(t) for t in tokens])

## Regular expression tokeniser
text = 'That U.S.A. poster-print costs $12.40...'

pattern = r'''(?x)     # set flag to allow verbose regexps
     ([A-Z]\.)+        # abbreviations, e.g. U.S.A.
   | \w+(-\w+)*        # words with optional internal hyphens
   | \$?\d+(\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
   | \.\.\.            # ellipsis
   | [][.,;"'?():-_`]  # these are separate tokens; includes ], [
 '''

print(nltk.regexp_tokenize(text, pattern))

## Sentence segmentation
print(len(nltk.corpus.brown.words()) / len(nltk.corpus.brown.sents()))

text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')
sents = nltk.sent_tokenize(text)
pprint.pprint(sents[79:89])

['abaissed', 'abandoned', 'abased', 'abashed', 'abatised', 'abed', 'aborted', 'abridged', 'abscessed', 'absconded', 'absorbed', 'abstracted', 'abstricted', 'accelerated', 'accepted', 'accidented', 'accoladed', 'accolated', 'accomplished', 'accosted']
0
['ing']
['processing']
[('process', 'ing')]
[('processe', 's')]
[('process', 'es')]
[('language', '')]
['DENNIS', ':', 'Listen', ',', 'strange', 'women', 'ly', 'in', 'pond', 'distribut', 'sword', 'i', 'no', 'basi', 'for', 'a', 'system', 'of', 'govern', '.', 'Supreme', 'execut', 'power', 'deriv', 'from', 'a', 'mandate', 'from', 'the', 'mass', ',', 'not', 'from', 'some', 'farcical', 'aquatic', 'ceremony', '.']
['DENNIS', ':', 'Listen', ',', 'strange', 'woman', 'lying', 'in', 'pond', 'distributing', 'sword', 'is', 'no', 'basis', 'for', 'a', 'system', 'of', 'government', '.', 'Supreme', 'executive', 'power', 'derives', 'from', 'a', 'mandate', 'from', 'the', 'mass', ',', 'not', 'from', 'some', 'farcical', 'aquatic', 'ceremony', '.']
[('', '',

#### Learning to Classify Text

##### Supervised Classification

In [11]:
## Gender identification
# Part 1: define a function to build a dictionary containing relevant information about a given name
def gender_features(word):
    return {'last_letter': word[-1]}
gender_features('Shrek')

# Part 2: prepare a list of examples and corresponding class labels
from nltk.corpus import names
labelled_names = ([(name, 'male') for name in names.words('male.txt')] +
                 [(name, 'female') for name in names.words('female.txt')])

import random
random.shuffle(labelled_names)

# Part 3: use a feature extractor to process the data and split it into a training and test set
featuresets = [(gender_features(n), gender) for (n, gender) in labelled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

# Part 4: test it on some names in the test set
print(classifier.classify(gender_features('Neo')))
print(classifier.classify(gender_features('Trinity')))

# Part 5: evaluate it on a larger test set
print(nltk.classify.accuracy(classifier, test_set))

# Part 6: determine which features the classifier found most useful for distinguishing gender
print(classifier.show_most_informative_features(5))

## Document classification
# Part 1: choose a corpus
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

# Part 2: define a feature extractor that checks whether one of the 2,000 most-frequent words from the corpus are in a
#           particular document
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

dict = document_features(movie_reviews.words('pos/cv957_8737.txt'))
for key in list(dict)[:10]:
    print(key, dict[key])

# Part 3: use feature extractor to train classifier to label new movie reviews
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

print(nltk.classify.accuracy(classifier, test_set))
print(classifier.show_most_informative_features(5))

male
female
0.808
Most Informative Features
             last_letter = 'k'              male : female =     44.5 : 1.0
             last_letter = 'a'            female : male   =     35.5 : 1.0
             last_letter = 'f'              male : female =     17.1 : 1.0
             last_letter = 'p'              male : female =     12.4 : 1.0
             last_letter = 'd'              male : female =      9.9 : 1.0
None
contains(,) True
contains(the) True
contains(.) True
contains(a) True
contains(and) True
contains(of) True
contains(to) True
contains(') True
contains(is) True
contains(in) True
0.84
Most Informative Features
   contains(outstanding) = True              pos : neg    =     11.6 : 1.0
         contains(mulan) = True              pos : neg    =      8.4 : 1.0
        contains(seagal) = True              neg : pos    =      7.4 : 1.0
   contains(wonderfully) = True              pos : neg    =      6.4 : 1.0
         contains(damon) = True              pos : neg    =      6.

##### Evaluation

In [21]:
# Option 1: create training and test sets by randomly assigning sentences from a data source reflecting a single genre
# training and test sets will be very similar, and will not be able to confidently generalise results to other genres
import random
from nltk.corpus import brown

tagged_sents = list(brown.tagged_sents(categories='news'))
random.shuffle(tagged_sents)
size = int(len(tagged_sents) * 0.1)
train_set, test_set = tagged_sents[size:], tagged_sents[:size]

# Option 2: create training and test sets from different documents 
file_ids = brown.fileids(categories='news')
size = int(len(file_ids) * 0.1)
train_set = brown.tagged_sents(file_ids[size:])
test_set = brown.tagged_sents(file_ids[:size])

# Option 3: create test set from documents less-closely related to those in training set
# more stringent evaluation of classifier. If it performs well here, confident that classifier will perform well on data
#   very different to that it was trained on 
train_set = brown.tagged_sents(categories='news')
test_set = brown.tagged_sents(categories='fiction')

# Accuracy - simplest metric used to evaluate a classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))