# NLTK for Python Test Bed
## Chapter 3:  Processing Raw Text
### Using examples from http://www.nltk.org
##### Whitney King  (1/10/2018)

In [1]:
import nltk
import pandas as pd
import re
import pprint as pp
from nltk import word_tokenize
#nltk.download()

In [2]:
#Define custom functions

def line():
    print("------------------------------------------------------------------")

## _Accessing Text from the Web and from Disk_

### _Electronic Books_

Resources such as *Project Gutenberg* have text versions of books online for free. There are over 25,000 to choose from on the PG website, in over 50 languages, all of which can be downloaded in ASCII.

In [3]:
from urllib import request
url = "http://www.gutenberg.org/files/19033/19033.txt"
response = request.urlopen(url)
raw = response.read().decode('utf8')
print(len(raw), ' | ', type(raw), ' | ',  raw[:69])
line()
tokens = word_tokenize(raw)
print(type(tokens), len(tokens), tokens[:10])

74726  |  <class 'str'>  |  The Project Gutenberg EBook of Alice in Wonderland, by Lewis Carroll
------------------------------------------------------------------
<class 'list'> 15758 ['The', 'Project', 'Gutenberg', 'EBook', 'of', 'Alice', 'in', 'Wonderland', ',', 'by']


Since a lot of descriptive information about the book is appearing in the collocations, they should be trimmed from the raw data

In [4]:
text = nltk.Text(tokens)
print(type(text))
pp.pprint(text[976:991])
pp.pprint(text.collocations())

<class 'nltk.text.Text'>
['alas',
 '!',
 'either',
 'the',
 'locks',
 'were',
 'too',
 'large',
 ',',
 'or',
 'the',
 'key',
 'was',
 'too',
 'small']
Project Gutenberg-tm; Project Gutenberg; said Alice; Literary Archive;
White Rabbit; Archive Foundation; Gutenberg-tm electronic; Gutenberg
Literary; electronic works; United States; March Hare; public domain;
set forth; golden key; electronic work; white kid-gloves; Gutenberg-tm
License; play croquet; Mary Ann; thought Alice
None


In [5]:
raw = raw[raw.find("I--DOWN THE RABBIT-HOLE"):raw.rfind("End of the Project Gutenberg EBook")]

raw.find("I--DOWN THE RABBIT-HOLE") #Trimmed book text

0

In [6]:
tokens = word_tokenize(raw)
text = nltk.Text(tokens)
text.collocations()
pp.pprint(text.collocations())

said Alice; White Rabbit; March Hare; golden key; white kid-gloves;
play croquet; Mary Ann; thought Alice; inches high; little golden;
feet high; cool fountains; yer honor; good deal; low voice; asking
riddles; right size; trembling voice; shrinking rapidly; came upon
said Alice; White Rabbit; March Hare; golden key; white kid-gloves;
play croquet; Mary Ann; thought Alice; inches high; little golden;
feet high; cool fountains; yer honor; good deal; low voice; asking
riddles; right size; trembling voice; shrinking rapidly; came upon
None


### _HTML_

Packages such as BeautifulSoup enable the ability to wrange text from HTML web pages.

In [8]:
from bs4 import BeautifulSoup
url = "http://sailormoon.wikia.com/wiki/Sailor_Galaxia"
html = request.urlopen(url).read().decode('utf8')
print(html[:1000])
line()

html_raw = BeautifulSoup(html[554:126566], "lxml")
tokens = word_tokenize(html_raw.get_text())
print(sorted(list(set([w.lower() for w in tokens if w.isalpha() and len(w) > 3]))[:100]))
line()

html_text = nltk.Text([w.lower() for w in tokens if w.isalpha() and len(w) > 3])
html_text.concordance('sailor')
line()

html_text.collocations()

<!doctype html>
<html lang="en" dir="ltr" class="">
<head>

<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
	<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
<meta name="generator" content="MediaWiki 1.19.24" />
<meta name="keywords" content="Sailor Moon Wiki,sailormoon,Sailor Galaxia,Chaos,Sailor Chaos,Sailor Chi and Sailor Phi,Sailor Lethe,Sailor Mnemosyne,Sailor Iron Mouse,Sailor Aluminum Siren,Sailor Lead Crow,Sailor Tin Nyanko,Sailor Heavy Metal Papillon" />
<meta name="description" content="Sailor Galaxia is one of the main antagonists in the final arc of Sailor Moon. She is well known..." />
<meta name="twitter:card" content="summary" />
<meta name="twitter:site" content="@getfandom" />
<meta name="twitter:url" content="http://sailormoon.wikia.com/wiki/Sailor_Galaxia" />
<meta name="twitter:title" content="Sailor Galaxia | Sailor Moon Wiki | FANDOM powered by Wikia" />
<meta name="twitter:description" content="Sailor G

### _Reading Local Files_

In addition to the ```open()``` function built into Pythonthat read flat files, third party packages such as ```pypdf``` and ```pywin32``` will read PDF and Word files.

* Corpus files can also be read in this same manner
* Any string, including ```input()``` can be tokenized

In [9]:
f = 'Pokemon.txt'
with open(f, 'r') as fraw:
    line = fraw.readline()
    while line:
        print("{}".format(line.strip()))
        line = fraw.readline()

Key	Number	Name	Type 1	Type 2	Total	HP	Attack	Defense	Sp. Atk	Sp. Def	Speed	Generation	Legendary
1	1	Bulbasaur	Grass	Poison	318	45	49	49	65	65	45	1	FALSE
2	2	Ivysaur	Grass	Poison	405	60	62	63	80	80	60	1	FALSE
3	3	Venusaur	Grass	Poison	525	80	82	83	100	100	80	1	FALSE
4	3	VenusaurMega Venusaur	Grass	Poison	625	80	100	123	122	120	80	1	FALSE
5	4	Charmander	Fire		309	39	52	43	60	50	65	1	FALSE
6	5	Charmeleon	Fire		405	58	64	58	80	65	80	1	FALSE
7	6	Charizard	Fire	Flying	534	78	84	78	109	85	100	1	FALSE
8	6	CharizardMega Charizard X	Fire	Dragon	634	78	130	111	130	85	100	1	FALSE
9	6	CharizardMega Charizard Y	Fire	Flying	634	78	104	78	159	115	100	1	FALSE
10	7	Squirtle	Water		314	44	48	65	50	64	43	1	FALSE
11	8	Wartortle	Water		405	59	63	80	65	80	58	1	FALSE
12	9	Blastoise	Water		530	79	83	100	85	105	78	1	FALSE
13	9	BlastoiseMega Blastoise	Water		630	79	103	120	135	115	78	1	FALSE
14	10	Caterpie	Bug		195	45	30	35	20	20	45	1	FALSE
15	11	Metapod	Bug		205	50	20	55	25	25	30	1	FALSE
16	12	Butterfree	Bug	F

![NLPPipeline](http://whitneyontheweb.com/images/pipeline1.png "NLP Pipeline")

When we tokenize a string we produce a list (of words), and this is Python's ```<list>``` type

|```Method``` | Functionality | 
|-----|-----|-----|
|```s.find(t)``` | index of first instance of string t inside s (-1 if not found) | 
|```s.rfind(t)``` | index of last instance of string t inside s (-1 if not found) | 
|```s.index(t)``` | like s.find(t) except it raises ValueError if not found | 
|```s.rindex(t)``` | like s.rfind(t) except it raises ValueError if not found | 
|```s.join(text)``` | combine the words of the text into a string using s as the glue | 
|```s.split(t)``` | split s into a list wherever a t is found (whitespace by default) | 
|```s.splitlines()``` | split s into a list of strings, one per line | 
|```s.lower()``` | a lowercased version of the string s | 
|```s.upper()``` | an uppercased version of the string s | 
|```s.title()``` | a titlecased version of the string s | 
|```s.strip()``` | a copy of s without leading or trailing whitespace | 
|```s.replace(t, u)``` | replace instances of t with u inside s | 

## _Regular Expressions_

Regexes are pattern matching expressions.

In [17]:
import re
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]
#[w for w in wordlist if re.search('ed$', w)]
print([w for w in wordlist if re.search('^..j..t..$', w)])

['abjectly', 'adjuster', 'dejected', 'dejectly', 'injector', 'majestic', 'objectee', 'objector', 'rejecter', 'rejector', 'unjilted', 'unjolted', 'unjustly']


The T9 system is used for entering text on mobile phones

* Two or more words that are entered with the same sequence of keystrokes are known as **textonyms**
 - Note that the + and * symbols are sometimes referred to as **Kleene closures**, or simply **closures**

In [14]:
[w for w in wordlist if re.search('^[abc][abc][def]$', w)]

['ace', 'bad', 'bae', 'cad']

Regular expressions are great for extracting parts of words, amoung other things.

|```Operator``` | Behavior | 
|-----|-----|-----|
|```.``` | Wildcard, matches any character | 
|```^abc``` | Matches some pattern abc at the start of a string | 
|```abc$``` | Matches some pattern abc at the end of a string | 
|```[abc]``` | Matches one of a set of characters | 
|```[A-Z0-9]``` | Matches one of a range of characters | 
|```ed\ing\s``` | Matches one of the specified strings (disjunction) | 
|```*``` | Zero or more of previous item, e.g. a*, [a-z]* (also known as Kleene Closure) | 
|```+``` | One or more of previous item, e.g. a+, [a-z]+ | 
|```?``` | Zero or one of the previous item (i.e. optional), e.g. a?, [a-z]? | 
|```{n}``` | Exactly n repeats where n is a non-negative integer | 
|```{n,}``` | At least n repeats | 
|```{,n}``` | No more than n repeats | 
|```{m,n}``` | At least m and no more than n repeats | 
|```a(b\c)+``` | Parentheses that indicate the scope of the operators | 

### _Extracting Word Pieces_ 
The re.findall() ("find all") method finds all (non-overlapping) matches of the given regular expression

In [16]:
word = 'supercalifragilisticexpialidocious'
print(re.findall(r'[aeiou]', word))

['u', 'e', 'a', 'i', 'a', 'i', 'i', 'i', 'e', 'i', 'a', 'i', 'o', 'i', 'o', 'u']


Let's look for all sequences of two or more vowels in some text, and determine their relative frequency

In [18]:
wsj = sorted(set(nltk.corpus.treebank.words()))
fd = nltk.FreqDist(vs for word in wsj
                   for vs in re.findall(r'[aeiou]{2,}', word))

fd.most_common(12)

[('io', 549),
 ('ea', 476),
 ('ie', 331),
 ('ou', 329),
 ('ai', 261),
 ('ia', 253),
 ('ee', 217),
 ('oo', 174),
 ('ua', 109),
 ('au', 106),
 ('ue', 105),
 ('ui', 95)]

In [40]:
[int(n) for n in re.findall(r'\d+', '2009-12-31')]

[2009, 12, 31]

In [17]:
regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'

def compress(word):
    pieces = re.findall(regexp, word)
    return ''.join(pieces)

print(nltk.tokenwrap(compress(w) for w in tokens[10:85]))

vry trd of sttng by hr sstr on the bnk , and of hvng nthng to do .
Once or twce she hd ppd into the bk hr sstr ws rdng , bt it hd no
pctrs or cnvrstns in it , `` and wht is the use of a bk , '' thght
Alce , `` wtht pctrs or cnvrstns ? '' So she ws cnsdrng in hr own mnd
( as wll as


In [41]:
print(re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes'))
line()

def stem(word):
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    stem, suffix = re.findall(regexp, word)[0]
    return stem

print([stem(t) for t in tokens[10:85]])


[('process', 'es')]
------------------------------------------------------------------
['very', 'tir', 'of', 'sitt', 'by', 'her', 'sister', 'on', 'the', 'bank', ',', 'and', 'of', 'hav', 'noth', 'to', 'do', '.', 'Once', 'or', 'twice', 'she', 'had', 'peep', 'into', 'the', 'book', 'her', 'sister', 'wa', 'read', ',', 'but', 'it', 'had', 'no', 'pictur', 'or', 'conversation', 'in', 'it', ',', '``', 'and', 'what', 'i', 'the', 'use', 'of', 'a', 'book', ',', "''", 'thought', 'Alice', ',', '``', 'without', 'pictur', 'or', 'conversation', '?', "''", 'So', 'she', 'wa', 'consider', 'in', 'her', 'own', 'mind', '(', 'a', 'well', 'a']


It is easy to build search patterns when the linguistic phenomenon we're studying is tied to particular words. In some cases, a little creativity will go a long way. For instance, searching a large text corpus for expressions of the form x and other ys allows us to discover hypernyms

In [42]:
from nltk.corpus import brown
hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
hobbies_learned.findall(r"<\w*> <and> <other> <\w*s>")

speed and other activities; water and other liquids; tomb and other
landmarks; Statues and other monuments; pearls and other jewels;
charts and other items; roads and other features; figures and other
objects; military and other areas; demands and other factors;
abstracts and other compilations; iron and other metals


There are also some prebuilt word stemmers that handle this work more as expected

In [45]:
porter = nltk.PorterStemmer()
print([porter.stem(t) for t in tokens[10:200]])

['veri', 'tire', 'of', 'sit', 'by', 'her', 'sister', 'on', 'the', 'bank', ',', 'and', 'of', 'have', 'noth', 'to', 'do', '.', 'onc', 'or', 'twice', 'she', 'had', 'peep', 'into', 'the', 'book', 'her', 'sister', 'wa', 'read', ',', 'but', 'it', 'had', 'no', 'pictur', 'or', 'convers', 'in', 'it', ',', '``', 'and', 'what', 'is', 'the', 'use', 'of', 'a', 'book', ',', "''", 'thought', 'alic', ',', '``', 'without', 'pictur', 'or', 'convers', '?', "''", 'So', 'she', 'wa', 'consid', 'in', 'her', 'own', 'mind', '(', 'as', 'well', 'as', 'she', 'could', ',', 'for', 'the', 'day', 'made', 'her', 'feel', 'veri', 'sleepi', 'and', 'stupid', ')', ',', 'whether', 'the', 'pleasur', 'of', 'make', 'a', 'daisy-chain', 'would', 'be', 'worth', 'the', 'troubl', 'of', 'get', 'up', 'and', 'pick', 'the', 'daisi', ',', 'when', 'suddenli', 'a', 'white', 'rabbit', 'with', 'pink', 'eye', 'ran', 'close', 'by', 'her', '.', 'there', 'wa', 'noth', 'so', 'veri', 'remark', 'in', 'that', ',', 'nor', 'did', 'alic', 'think', 'it

### Lemmatization

* The WordNet lemmatizer only removes affixes if the resulting word is in its dictionary
* The WordNet lemmatizer is a good choice if you want to compile the vocabulary of some texts and want a list of valid lemmas (or lexicon headwords)

In [46]:
wnl = nltk.WordNetLemmatizer()
print([wnl.lemmatize(t)  for t in tokens[10:200]])

['very', 'tired', 'of', 'sitting', 'by', 'her', 'sister', 'on', 'the', 'bank', ',', 'and', 'of', 'having', 'nothing', 'to', 'do', '.', 'Once', 'or', 'twice', 'she', 'had', 'peeped', 'into', 'the', 'book', 'her', 'sister', 'wa', 'reading', ',', 'but', 'it', 'had', 'no', 'picture', 'or', 'conversation', 'in', 'it', ',', '``', 'and', 'what', 'is', 'the', 'use', 'of', 'a', 'book', ',', "''", 'thought', 'Alice', ',', '``', 'without', 'picture', 'or', 'conversation', '?', "''", 'So', 'she', 'wa', 'considering', 'in', 'her', 'own', 'mind', '(', 'a', 'well', 'a', 'she', 'could', ',', 'for', 'the', 'day', 'made', 'her', 'feel', 'very', 'sleepy', 'and', 'stupid', ')', ',', 'whether', 'the', 'pleasure', 'of', 'making', 'a', 'daisy-chain', 'would', 'be', 'worth', 'the', 'trouble', 'of', 'getting', 'up', 'and', 'picking', 'the', 'daisy', ',', 'when', 'suddenly', 'a', 'White', 'Rabbit', 'with', 'pink', 'eye', 'ran', 'close', 'by', 'her', '.', 'There', 'wa', 'nothing', 'so', 'very', 'remarkable', 'in

## Tokenizing Text

The very simplest method for tokenizing text is to split on whitespace

 - When using a regular expression, you must also account for tabs and new lines
 - This can be further refined into splitting the text on anything other than a word character

In [54]:
re_split_raw = re.split(r'[ \W]+', raw)

print(re_split_raw[:75])

['I', 'DOWN', 'THE', 'RABBIT', 'HOLE', 'Alice', 'was', 'beginning', 'to', 'get', 'very', 'tired', 'of', 'sitting', 'by', 'her', 'sister', 'on', 'the', 'bank', 'and', 'of', 'having', 'nothing', 'to', 'do', 'Once', 'or', 'twice', 'she', 'had', 'peeped', 'into', 'the', 'book', 'her', 'sister', 'was', 'reading', 'but', 'it', 'had', 'no', 'pictures', 'or', 'conversations', 'in', 'it', 'and', 'what', 'is', 'the', 'use', 'of', 'a', 'book', 'thought', 'Alice', 'without', 'pictures', 'or', 'conversations', 'So', 'she', 'was', 'considering', 'in', 'her', 'own', 'mind', 'as', 'well', 'as', 'she', 'could']


| Symbol | Function | 
|-----|-----|-----|
| \b | Word boundary (zero width) | 
| \d | Any decimal digit (equivalent to [0-9]) | 
| \D | Any non-digit character (equivalent to [^0-9]) | 
| \s | Any whitespace character (equivalent to [ \t\n\r\f\v]) | 
| \S | Any non-whitespace character (equivalent to [^ \t\n\r\f\v]) | 
| \w | Any alphanumeric character (equivalent to [a-zA-Z0-9_]) | 
| \W | Any non-alphanumeric character (equivalent to [^a-zA-Z0-9_]) | 
| \t | The tab character | 
| \n | The newline character | 
