# NLTK
NLTK is a Python module

In [3]:
import nltk

Let's do some simple statistics on the Gutenberg corpus

In [4]:
nltk.download('gutenberg')
nltk.corpus.gutenberg.fileids()

[nltk_data] Downloading package gutenberg to /Users/noah/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [5]:
emma = nltk.corpus.gutenberg.words('austen-emma.txt')
len(emma)
# all the words in one of the book

192427

In [6]:
emma[:20]
# first 20 words from emma

['[',
 'Emma',
 'by',
 'Jane',
 'Austen',
 '1816',
 ']',
 'VOLUME',
 'I',
 'CHAPTER',
 'I',
 'Emma',
 'Woodhouse',
 ',',
 'handsome',
 ',',
 'clever',
 ',',
 'and',
 'rich']

In [7]:
nltk.download('punkt')
from nltk.corpus import gutenberg
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
#     print(num_chars)
    num_words = len(gutenberg.words(fileid))
#     print(num_words)
    num_sents = len(gutenberg.sents(fileid))
#     print(num_sents)
    num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
    print(round(num_chars/num_words), round(num_words/num_sents), round(num_words/num_vocab), fileid)
# for austen-emma.txt, 5 characters per words, 25 words per sentence, 26 words per vocabulary

[nltk_data] Downloading package punkt to /Users/noah/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


5 25 26 austen-emma.txt
5 26 17 austen-persuasion.txt
5 28 22 austen-sense.txt
4 34 79 bible-kjv.txt
5 19 5 blake-poems.txt
4 19 14 bryant-stories.txt
4 18 12 burgess-busterbrown.txt
4 20 13 carroll-alice.txt
5 20 12 chesterton-ball.txt
5 23 11 chesterton-brown.txt
5 19 11 chesterton-thursday.txt
4 21 25 edgeworth-parents.txt
5 26 15 melville-moby_dick.txt
5 52 11 milton-paradise.txt
4 12 9 shakespeare-caesar.txt
4 12 8 shakespeare-hamlet.txt
4 12 7 shakespeare-macbeth.txt
5 36 12 whitman-leaves.txt


### Counting Words

In [8]:
import collections
emma_counter = collections.Counter(emma)
emma_counter.most_common(10)

[(',', 11454),
 ('.', 6928),
 ('to', 5183),
 ('the', 4844),
 ('and', 4672),
 ('of', 4279),
 ('I', 3178),
 ('a', 3004),
 ('was', 2385),
 ('her', 2381)]

In [9]:
emma_counter['Emma']
# frequency of the words 'emma'

865

### Count Bigrams
A bigram is a sequence of two words.

In [10]:
list(nltk.bigrams([1,2,3,4,5,6]))

[(1, 2), (2, 3), (3, 4), (4, 5), (5, 6)]

In [11]:
list(nltk.bigrams(emma))[:5]
#the first five bigrams

[('[', 'Emma'),
 ('Emma', 'by'),
 ('by', 'Jane'),
 ('Jane', 'Austen'),
 ('Austen', '1816')]

* A bigram is an ngram where n is 2
* A trigram is an ngram where n is 3

In [12]:
list(nltk.ngrams(emma,4))[:7]
#the first seven 4grams in emma

[('[', 'Emma', 'by', 'Jane'),
 ('Emma', 'by', 'Jane', 'Austen'),
 ('by', 'Jane', 'Austen', '1816'),
 ('Jane', 'Austen', '1816', ']'),
 ('Austen', '1816', ']', 'VOLUME'),
 ('1816', ']', 'VOLUME', 'I'),
 (']', 'VOLUME', 'I', 'CHAPTER')]

### Exercises
1. Find the most frequent bigram in Austin's Emma.
2. Find the most frequent bigram that begins with 'the'.

In [13]:
from collections import Counter
bigrams=list(nltk.bigrams(emma))
counts = Counter(bigrams)
print(counts.most_common(5))

[((',', 'and'), 1879), (('Mr', '.'), 1153), (("'", 's'), 932), ((';', 'and'), 866), (('."', '"'), 757)]


In [14]:
from collections import Counter
bigrams=list(nltk.bigrams(emma))
bigrams_the=[]
for b in bigrams:
    if b[0]=='the':
        bigrams_the.append(b)
  
counts = Counter(bigrams_the)
print(counts.most_common(5))

[(('the', 'same'), 98), (('the', 'very'), 92), (('the', 'world'), 76), (('the', 'other'), 73), (('the', 'first'), 69)]


# Text Processing in Python

### Sorting
* The function `sorted()` returns a sorted copy.
* Sequences can be sorted in place with the `sort()` method.
* Python 3 does not support sorting of lists with mixed contents.

In [15]:
foo = [2,5,9,1,11]
sorted(foo)

[1, 2, 5, 9, 11]

In [16]:
foo

[2, 5, 9, 1, 11]

In [17]:
foo.sort()

In [18]:
foo

[1, 2, 5, 9, 11]

In [19]:
foo2 = [2,5,6,1,'a']
sorted(foo2)

TypeError: '<' not supported between instances of 'str' and 'int'

### Sorting with a custom sorting criterion

In [19]:
l = ['a','abc','b','c','aa','bb','cc']

In [20]:
sorted(l)

['a', 'aa', 'abc', 'b', 'bb', 'c', 'cc']

In [21]:
sorted(l,key=len)

['a', 'b', 'c', 'aa', 'bb', 'cc', 'abc']

In [22]:
sorted(l,key=len,reverse=True)

['abc', 'aa', 'bb', 'cc', 'a', 'b', 'c']

In [23]:
def my_len(x):
    return -len(x)

In [24]:
sorted(l,key=my_len)

['abc', 'aa', 'bb', 'cc', 'a', 'b', 'c']

In [25]:
sorted(l,key = lambda x: -len(x))

['abc', 'aa', 'bb', 'cc', 'a', 'b', 'c']

### Exercises
You're given data of the following form:

```python
namedat = dict()
namedat['mc'] = ('Madonna', 45)
namedat['sc'] = ('Steve', 41)
```

1. How would you print a list ordered by name?
2. How would you print a list ordered by age?

In [132]:
namedat = dict()
namedat['mc'] = ('Madonna', 45)
namedat['sc'] = ('Steve', 41)


dict(sorted(namedat.items(), key=lambda x: x[1][1]))

{'sc': ('Steve', 41), 'mc': ('Madonna', 45)}

### Strings in Python
* String is a base type.
* Strings are sequences and can use operations like lists or tuples.

In [26]:
foo = "A string"
len(foo)

8

In [27]:
foo[0]

'A'

In [28]:
foo[0:3]

'A s'

In [40]:
multifoo = """A multiline 
string"""

In [41]:
multifoo

'A multiline \nstring'

In [42]:
"my string".capitalize()

'My string'

In [43]:
capitalize("my string")

NameError: name 'capitalize' is not defined

In [44]:
"my string".upper()

'MY STRING'

In [45]:
"My String".lower()

'my string'

In [46]:
a = "my string with my other text"
a.count("my")

2

In [47]:
a.find("with")

10

In [48]:
a.find("nothing")

-1

### Split
* `split(sep)` is a central string operation.
* It splits a string wherever `sep` occurs (blank space by default)

In [49]:
foo = "one :: two :: three"
foo.split()

['one', '::', 'two', '::', 'three']

In [50]:
foo.split('::')

['one ', ' two ', ' three']

In [51]:
foo.split(' :: ')

['one', 'two', 'three']

In [52]:
"this is a test".split()

['this', 'is', 'a', 'test']

### Join
* Join is another useful function/method in the string module.
* It takes a list and joins the elements using some delimiter.


In [53]:
text="this is some text to analyse"
words=text.split()
print(words)
words.sort()
print(words)
print(", ".join(words))

['this', 'is', 'some', 'text', 'to', 'analyse']
['analyse', 'is', 'some', 'text', 'this', 'to']
analyse, is, some, text, this, to


### Replace

In [54]:
def censor(text):
   'replace bad words in a text with XXX'
   badwords = ['poo', 'bottom']
   for b in badwords:
      text = text.replace(b, 'XXX')
   return text

In [55]:
censor("this is all poo and more poo")

'this is all XXX and more XXX'

### Text Preprocessing with NLTK
#### Tokenisation

In [56]:
import nltk
nltk.download("punkt")
text = "This is a sentence. This is another sentence."
nltk.sent_tokenize(text)
# first tokenize the text and tokenize the words with for loop

[nltk_data] Downloading package punkt to /Users/noah/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['This is a sentence.', 'This is another sentence.']

In [69]:
for s in nltk.sent_tokenize(text):
    for w in nltk.word_tokenize(s):
        print(w)
    print()

This
is
a
sentence
.

This
is
another
sentence
.



#### Part of speech tagging

* Often it is useful to know whether a word is a noun, or an adjective, etc. These are called **parts of speech**.
* NLTK has a part of speech tagger that tags a list of tokens.
* The default list of parts of speech is fairly detailed but we can set a simplified version (called `universal` by NLTK).

List of universal tagsets:

| Tag | Meaning | English Examples |
| --- | --- | --- |
| `ADJ` | adjective | new, good, high, special, big, local |
| `ADP` | adposition | on, of, at, with, by, into, under |
| `ADV` | adverb | really, already, still, early, now |
| `CONJ` | conjunction | and, or, but, if, while, although |
| `DET` | determiner, article | the, a, some, most, every, no, which |
| `NOUN` | noun | year, home, costs, time, Africa |
| `NUM` | numeral | twenty-four, fourth, 1991, 14:24 |
| `PRT` | particle | at, on, out, over per, that, up, with |
| `PRON` | pronoun | he, their, her, its, my, I, us |
| `VERB` | verb | is, say, told, given, playing, would |
| `.` | punctuation marks | . , ; ! |
| `X` | other | ersatz, esprit, dunno, gr8, univeristy |


![WordPosPipeline](WordPosPipeline.png)

In [57]:
nltk.download("averaged_perceptron_tagger")
nltk.pos_tag(["this", "is", "a", "test"])

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/noah/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


[('this', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('test', 'NN')]

In [58]:
nltk.download("universal_tagset")
nltk.pos_tag(["this", "is", "a", "test"], tagset="universal")

[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/noah/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


[('this', 'DET'), ('is', 'VERB'), ('a', 'DET'), ('test', 'NOUN')]

In [72]:
nltk.pos_tag(nltk.word_tokenize("this is a test"), tagset="universal")

[('this', 'DET'), ('is', 'VERB'), ('a', 'DET'), ('test', 'NOUN')]

![SentPosPipeline](SentPosPipeline.png)

In [73]:
text = "This is a sentence. This is another sentence."
text_sent_tokens = [nltk.word_tokenize(s) for s in nltk.sent_tokenize(text)]
text_sent_tokens

[['This', 'is', 'a', 'sentence', '.'],
 ['This', 'is', 'another', 'sentence', '.']]

In [74]:
nltk.pos_tag_sents(text_sent_tokens, tagset="universal")

[[('This', 'DET'),
  ('is', 'VERB'),
  ('a', 'DET'),
  ('sentence', 'NOUN'),
  ('.', '.')],
 [('This', 'DET'),
  ('is', 'VERB'),
  ('another', 'DET'),
  ('sentence', 'NOUN'),
  ('.', '.')]]

Below is an implementation that has the same behaviour as `pos_tag_sents`. Hopefully this can help you understand how it works:

In [75]:
def my_pos_tag_sents(text_sent_tokens, tagset="universal"):
    return [nltk.pos_tag(s, tagset=tagset) for s in text_sent_tokens]

In [76]:
my_pos_tag_sents(text_sent_tokens, tagset="universal")

[[('This', 'DET'),
  ('is', 'VERB'),
  ('a', 'DET'),
  ('sentence', 'NOUN'),
  ('.', '.')],
 [('This', 'DET'),
  ('is', 'VERB'),
  ('another', 'DET'),
  ('sentence', 'NOUN'),
  ('.', '.')]]

#### Stemming

* Often it is useful to remove information such as verb form, or the difference between singular and plural.
* NLTK offers stemming, which removes suffixes.
    * The Porter stemmer is a popular stemmer.
* The remaining stem is not a word but can be used, for example, by search engines (we'll see more of this in another lecture).

In [59]:
s = nltk.PorterStemmer()

In [60]:
s.stem("books")

'book'

In [61]:
s.stem("running")

'run'

In [62]:
s.stem("run")

'run'

In [63]:
s.stem("goes")

'goe'

In [64]:
[s.stem(w) for w in nltk.word_tokenize("I'm running and he goes")]

['i', "'m", 'run', 'and', 'he', 'goe']

### Exercises
1.  What is the sentence with the largest number of tokens
    in Austen's "Emma"?
2. What is the number of distinct stems in Austen's "Emma"?
3. What is the most ambiguous stem in Austen's "Emma"?
    (meaning, which stem in Austen's "Emma" maps to the
    largest number of distinct tokens?)

In [148]:
# 1) sentence with the largest number of tokens
emmaa=gutenberg.raw('austen-emma.txt')
# nltk.sent_tokenize(emmaa)
emmaa_sent_tokens = [nltk.word_tokenize(s) for s in nltk.sent_tokenize(emmaa)]
longestLen=0
longestSen=''
for e in emmaa_sent_tokens:
    if len(e)>longestLen:
        longestSen=e
        longestLen=len(e)
print(longestSen,longestLen)

['While', 'he', 'lived', ',', 'it', 'must', 'be', 'only', 'an', 'engagement', ';', 'but', 'she', 'flattered', 'herself', ',', 'that', 'if', 'divested', 'of', 'the', 'danger', 'of', 'drawing', 'her', 'away', ',', 'it', 'might', 'become', 'an', 'increase', 'of', 'comfort', 'to', 'him.', '--', 'How', 'to', 'do', 'her', 'best', 'by', 'Harriet', ',', 'was', 'of', 'more', 'difficult', 'decision', ';', '--', 'how', 'to', 'spare', 'her', 'from', 'any', 'unnecessary', 'pain', ';', 'how', 'to', 'make', 'her', 'any', 'possible', 'atonement', ';', 'how', 'to', 'appear', 'least', 'her', 'enemy', '?', '--', 'On', 'these', 'subjects', ',', 'her', 'perplexity', 'and', 'distress', 'were', 'very', 'great', '--', 'and', 'her', 'mind', 'had', 'to', 'pass', 'again', 'and', 'again', 'through', 'every', 'bitter', 'reproach', 'and', 'sorrowful', 'regret', 'that', 'had', 'ever', 'surrounded', 'it.', '--', 'She', 'could', 'only', 'resolve', 'at', 'last', ',', 'that', 'she', 'would', 'still', 'avoid', 'a', 'meet

In [153]:
# 2) number of different stems in emma
emmaa_sent_tokens2=nltk.word_tokenize(emmaa)
stems=[s.stem(w) for w in emmaa_sent_tokens2]
stems

['[',
 'emma',
 'by',
 'jane',
 'austen',
 '1816',
 ']',
 'volum',
 'i',
 'chapter',
 'i',
 'emma',
 'woodhous',
 ',',
 'handsom',
 ',',
 'clever',
 ',',
 'and',
 'rich',
 ',',
 'with',
 'a',
 'comfort',
 'home',
 'and',
 'happi',
 'disposit',
 ',',
 'seem',
 'to',
 'unit',
 'some',
 'of',
 'the',
 'best',
 'bless',
 'of',
 'exist',
 ';',
 'and',
 'had',
 'live',
 'nearli',
 'twenty-on',
 'year',
 'in',
 'the',
 'world',
 'with',
 'veri',
 'littl',
 'to',
 'distress',
 'or',
 'vex',
 'her',
 '.',
 'she',
 'wa',
 'the',
 'youngest',
 'of',
 'the',
 'two',
 'daughter',
 'of',
 'a',
 'most',
 'affection',
 ',',
 'indulg',
 'father',
 ';',
 'and',
 'had',
 ',',
 'in',
 'consequ',
 'of',
 'her',
 'sister',
 "'s",
 'marriag',
 ',',
 'been',
 'mistress',
 'of',
 'hi',
 'hous',
 'from',
 'a',
 'veri',
 'earli',
 'period',
 '.',
 'her',
 'mother',
 'had',
 'die',
 'too',
 'long',
 'ago',
 'for',
 'her',
 'to',
 'have',
 'more',
 'than',
 'an',
 'indistinct',
 'remembr',
 'of',
 'her',
 'caress'

In [156]:
set_stems=set(stems)
len(set_stems)

5369

In [154]:
# 3)
import collections
emma_stems_counter = collections.Counter(stems)
emma_stems_counter.most_common(10)

[(',', 12016),
 ('.', 6346),
 ('the', 5201),
 ('to', 5181),
 ('and', 4877),
 ('of', 4284),
 ('i', 3177),
 ('a', 3124),
 ('--', 3100),
 ('it', 2625)]