# Choosing the data
Choose existing large documents from NLTK or the Gutenberg collection on the web.

In [135]:
# Getting started to process a text example
import nltk
from nltk import FreqDist
nltk.download('gutenberg')
nltk.download('punkt')

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\Patrick\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Patrick\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [136]:
nltk.corpus.gutenberg.fileids()[7]

'carroll-alice.txt'

In [137]:
nltk.corpus.gutenberg.fileids()[3]

'bible-kjv.txt'

# Analyze the text
Examine the text in the documents that you chose and decide how to process the words, i.e. decide on tokenization and whether to use all lowercase, stopwords, or lemmatization.

In [138]:
file1 = nltk.corpus.gutenberg.fileids()[7]
alice = nltk.corpus.gutenberg.raw(file1)
alicetokens = nltk.word_tokenize(alice) 
alicewords = [w.lower() for w in alicetokens] 
# show some of the words
print(len(alicewords))
print(alicewords[:110])

33494
['[', 'alice', "'s", 'adventures', 'in', 'wonderland', 'by', 'lewis', 'carroll', '1865', ']', 'chapter', 'i', '.', 'down', 'the', 'rabbit-hole', 'alice', 'was', 'beginning', 'to', 'get', 'very', 'tired', 'of', 'sitting', 'by', 'her', 'sister', 'on', 'the', 'bank', ',', 'and', 'of', 'having', 'nothing', 'to', 'do', ':', 'once', 'or', 'twice', 'she', 'had', 'peeped', 'into', 'the', 'book', 'her', 'sister', 'was', 'reading', ',', 'but', 'it', 'had', 'no', 'pictures', 'or', 'conversations', 'in', 'it', ',', "'and", 'what', 'is', 'the', 'use', 'of', 'a', 'book', ',', "'", 'thought', 'alice', "'without", 'pictures', 'or', 'conversation', '?', "'", 'so', 'she', 'was', 'considering', 'in', 'her', 'own', 'mind', '(', 'as', 'well', 'as', 'she', 'could', ',', 'for', 'the', 'hot', 'day', 'made', 'her', 'feel', 'very', 'sleepy', 'and', 'stupid', ')', ',']


In [139]:
file2 = nltk.corpus.gutenberg.fileids()[3]
bible = nltk.corpus.gutenberg.raw(file2)
bibletokens = nltk.word_tokenize(bible) 
biblewords = [w.lower() for w in bibletokens] 
# show some of the words
print(len(biblewords))
print(biblewords[:110])

946812
['[', 'the', 'king', 'james', 'bible', ']', 'the', 'old', 'testament', 'of', 'the', 'king', 'james', 'bible', 'the', 'first', 'book', 'of', 'moses', ':', 'called', 'genesis', '1:1', 'in', 'the', 'beginning', 'god', 'created', 'the', 'heaven', 'and', 'the', 'earth', '.', '1:2', 'and', 'the', 'earth', 'was', 'without', 'form', ',', 'and', 'void', ';', 'and', 'darkness', 'was', 'upon', 'the', 'face', 'of', 'the', 'deep', '.', 'and', 'the', 'spirit', 'of', 'god', 'moved', 'upon', 'the', 'face', 'of', 'the', 'waters', '.', '1:3', 'and', 'god', 'said', ',', 'let', 'there', 'be', 'light', ':', 'and', 'there', 'was', 'light', '.', '1:4', 'and', 'god', 'saw', 'the', 'light', ',', 'that', 'it', 'was', 'good', ':', 'and', 'god', 'divided', 'the', 'light', 'from', 'the', 'darkness', '.', '1:5', 'and', 'god', 'called', 'the', 'light']


# List the top 50 words by frequency (normalized by the length of the document)

In [140]:
# Creating a frequency distribution of words
ndist = FreqDist(alicewords)

# print the top 50 tokens by frequency
nitems_alice = ndist.most_common(50)
print('Alice in Wonderland top 50 words:')
for item in nitems_alice:
    print (item[0], '\t', item[1])

Alice in Wonderland top 50 words:
, 	 2418
the 	 1616
' 	 1309
. 	 975
and 	 810
to 	 720
a 	 631
she 	 544
it 	 539
i 	 533
of 	 499
said 	 462
! 	 450
alice 	 396
was 	 366
in 	 359
you 	 356
that 	 284
-- 	 264
as 	 256
her 	 248
: 	 233
at 	 209
n't 	 204
? 	 202
's 	 194
; 	 194
on 	 191
had 	 184
with 	 179
all 	 178
be 	 148
for 	 146
so 	 144
very 	 139
not 	 135
they 	 135
but 	 131
this 	 131
little 	 128
do 	 125
he 	 117
is 	 113
out 	 113
what 	 103
down 	 102
one 	 99
up 	 97
his 	 95
about 	 94


In [141]:
# Creating a frequency distribution of words
ndist = FreqDist(biblewords)

# print the top 50 tokens by frequency
nitems = ndist.most_common(50)
print('KJV Bible top 50 words:')
for item in nitems:
    print (item[0], '\t', item[1])

KJV Bible top 50 words:
, 	 70573
the 	 64023
and 	 51696
of 	 34670
. 	 26202
to 	 13580
that 	 12912
: 	 12706
in 	 12667
he 	 10419
; 	 10139
shall 	 9838
unto 	 8997
for 	 8971
i 	 8854
his 	 8473
a 	 8177
lord 	 7944
they 	 7376
be 	 7013
is 	 6989
not 	 6780
him 	 6659
them 	 6430
it 	 6129
with 	 6012
all 	 5620
thou 	 5474
thy 	 4600
was 	 4522
god 	 4467
which 	 4413
my 	 4368
me 	 4096
said 	 3999
but 	 3992
ye 	 3983
their 	 3942
have 	 3904
will 	 3836
thee 	 3826
from 	 3642
as 	 3520
? 	 3297
are 	 2950
when 	 2834
this 	 2785
out 	 2775
were 	 2772
upon 	 2748


# Additional preprocessing - remove stopwords and non-alphabetical characters

In [142]:
import re
from nltk.collocations import *

def remove_non_alpha(word_list):
    
    # pattern to match word of non-alphabetical characters
    pattern = re.compile('^[^a-z]+$')
    new_word_list = []
    for word in word_list:
        if not (pattern.match(word)):
            new_word_list.append(word)
    return new_word_list


def remove_stopwords(word_list):
    # get a list of stopwords from nltk
    nltk.download('stopwords')
    nltkstopwords = nltk.corpus.stopwords.words('english')

    morestopwords = ['could','would','might','must','need','sha','wo','y',"'s","'d","'ll","'t","'m","'re","'ve", "n't"]

    stopwords = nltkstopwords + morestopwords
    new_word_list = []
    for word in word_list:
        if word not in stopwords:
            new_word_list.append(word)
    return new_word_list

In [143]:
alice_processed = remove_non_alpha(alicewords)
alice_processed = remove_stopwords(alice_processed)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Patrick\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [144]:
bible_processed = remove_non_alpha(biblewords)
bible_processed = remove_stopwords(bible_processed)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Patrick\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Top 50 words after additional preprocessing

In [145]:
# Creating a frequency distribution of words
ndist_processed = FreqDist(alice_processed)

# print the top 50 tokens by frequency
nitems = ndist_processed.most_common(50)
print('Alice in Wonderland top 50 words (after additional preprocessing):')
for item in nitems:
    print (item[0], '\t', item[1])

Alice in Wonderland top 50 words (after additional preprocessing):
said 	 462
alice 	 396
little 	 128
one 	 99
know 	 88
like 	 85
went 	 83
queen 	 75
thought 	 74
time 	 68
see 	 67
king 	 62
began 	 58
turtle 	 58
'and 	 56
hatter 	 56
mock 	 56
quite 	 55
'it 	 55
gryphon 	 54
think 	 53
way 	 53
much 	 51
say 	 51
first 	 50
head 	 50
'you 	 50
thing 	 49
go 	 48
voice 	 48
rabbit 	 47
looked 	 45
never 	 45
got 	 45
get 	 44
mouse 	 42
duchess 	 42
round 	 41
came 	 40
tone 	 40
dormouse 	 40
great 	 39
'but 	 39
'what 	 38
well 	 37
back 	 37
two 	 37
cat 	 36
march 	 34
large 	 33


In [146]:
# Creating a frequency distribution of words
ndist_processed = FreqDist(bible_processed)

# print the top 50 tokens by frequency
nitems = ndist_processed.most_common(50)
print('Bible top 50 words (after additional preprocessing):')
for item in nitems:
    print (item[0], '\t', item[1])

Bible top 50 words (after additional preprocessing):
shall 	 9838
unto 	 8997
lord 	 7944
thou 	 5474
thy 	 4600
god 	 4467
said 	 3999
ye 	 3983
thee 	 3826
upon 	 2748
man 	 2728
israel 	 2571
king 	 2506
son 	 2388
hath 	 2264
people 	 2143
came 	 2093
house 	 2024
come 	 1971
one 	 1969
children 	 1818
also 	 1769
day 	 1740
land 	 1718
men 	 1673
shalt 	 1616
let 	 1511
go 	 1492
hand 	 1466
us 	 1448
saying 	 1445
made 	 1405
went 	 1400
even 	 1393
behold 	 1326
saith 	 1262
therefore 	 1237
every 	 1236
things 	 1162
father 	 1111
sons 	 1090
hast 	 1070
david 	 1058
make 	 1056
say 	 1056
may 	 1027
earth 	 987
jesus 	 983
great 	 962
name 	 955


# List the top 50 bigrams by frequencies

In [147]:
# Bigrams and Bigram frequency distribution
alicebigrams = list(nltk.bigrams(alicewords))
print(alicewords[:21])
print(alicebigrams[:20])

['[', 'alice', "'s", 'adventures', 'in', 'wonderland', 'by', 'lewis', 'carroll', '1865', ']', 'chapter', 'i', '.', 'down', 'the', 'rabbit-hole', 'alice', 'was', 'beginning', 'to']
[('[', 'alice'), ('alice', "'s"), ("'s", 'adventures'), ('adventures', 'in'), ('in', 'wonderland'), ('wonderland', 'by'), ('by', 'lewis'), ('lewis', 'carroll'), ('carroll', '1865'), ('1865', ']'), (']', 'chapter'), ('chapter', 'i'), ('i', '.'), ('.', 'down'), ('down', 'the'), ('the', 'rabbit-hole'), ('rabbit-hole', 'alice'), ('alice', 'was'), ('was', 'beginning'), ('beginning', 'to')]


In [148]:
# Bigrams and Bigram frequency distribution
biblebigrams = list(nltk.bigrams(biblewords))
print(biblewords[:21])
print(biblebigrams[:20])

['[', 'the', 'king', 'james', 'bible', ']', 'the', 'old', 'testament', 'of', 'the', 'king', 'james', 'bible', 'the', 'first', 'book', 'of', 'moses', ':', 'called']
[('[', 'the'), ('the', 'king'), ('king', 'james'), ('james', 'bible'), ('bible', ']'), (']', 'the'), ('the', 'old'), ('old', 'testament'), ('testament', 'of'), ('of', 'the'), ('the', 'king'), ('king', 'james'), ('james', 'bible'), ('bible', 'the'), ('the', 'first'), ('first', 'book'), ('book', 'of'), ('of', 'moses'), ('moses', ':'), (':', 'called')]


In [149]:
# setup for bigrams and bigram measures
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()

In [150]:
# create the bigram finder and score the bigrams by frequency
finder_alice = BigramCollocationFinder.from_words(alicewords)
scored_alice = finder_alice.score_ngrams(bigram_measures.raw_freq)

# scored is a list of bigram pairs with their score
print(type(scored_alice))

# scores are sorted in decreasing frequency
for bscore in scored_alice[:50]:
    print(bscore)

<class 'list'>
((',', 'and'), 0.013733803069206425)
((',', "'"), 0.012808264166716427)
(("'", 'said'), 0.009852510897474175)
(('!', "'"), 0.008449274496924822)
(('.', "'"), 0.00782229653072192)
(('said', 'the'), 0.006180211381142891)
(("'", 'i'), 0.005045679823251926)
(('?', "'"), 0.00468740669970741)
(('of', 'the'), 0.0038215799844748314)
(('said', 'alice'), 0.003433450767301606)
(("'", 'the'), 0.003224458111900639)
(('in', 'a'), 0.0028960410819848332)
((',', 'i'), 0.002418343583925479)
(('and', 'the'), 0.00235863139666806)
(('alice', ','), 0.0023287753030393505)
(('in', 'the'), 0.0023287753030393505)
(('it', 'was'), 0.0021794948348958024)
(('the', 'queen'), 0.0020600704603809636)
(('to', 'the'), 0.0020600704603809636)
((',', 'but'), 0.0018510778049799965)
((',', 'as'), 0.0018212217113512867)
(('as', 'she'), 0.0018212217113512867)
(('the', 'king'), 0.0018212217113512867)
(("'", 'she'), 0.0017913656177225771)
(('at', 'the'), 0.0017913656177225771)
(('she', 'had'), 0.0017913656177225771

In [151]:
# create the bigram finder and score the bigrams by frequency
finder_bible = BigramCollocationFinder.from_words(biblewords)
scored_bible = finder_bible.score_ngrams(bigram_measures.raw_freq)

# scored is a list of bigram pairs with their score
print(type(scored_bible))

# scores are sorted in decreasing frequency
for bscore in scored_bible[:50]:
    print(bscore)

<class 'list'>
((',', 'and'), 0.026345251221995495)
(('of', 'the'), 0.01218932586405749)
(('the', 'lord'), 0.007410129994127662)
(('and', 'the'), 0.006616941906101739)
(('in', 'the'), 0.005312564690772825)
((';', 'and'), 0.0033966616392694642)
((':', 'and'), 0.0031991567491751268)
((',', 'that'), 0.0031590220656265446)
(('and', 'he'), 0.002946730713172203)
((',', 'the'), 0.0026013611994778266)
(('shall', 'be'), 0.0025992488477121116)
(('to', 'the'), 0.002272890499909169)
(('all', 'the'), 0.002258104037549165)
(('and', 'they'), 0.0022031828916405792)
(('him', ','), 0.0021514302733805653)
(('unto', 'the'), 0.0021461493939662784)
(('i', 'will'), 0.0020225768156719604)
((',', 'which'), 0.0018937233579633549)
(('lord', ','), 0.0018050045838033317)
(('of', 'israel'), 0.0017902181214433277)
(('said', ','), 0.0017743754832004663)
(('for', 'the'), 0.001765926076137607)
(('said', 'unto'), 0.0017363531514175993)
((':', 'for'), 0.0017236790408233103)
(('the', 'king'), 0.001713117281994736)
((',', 

# Remove non-alphabetical characters and list top 50 bigrams

In [152]:
def alpha_filter(w):
  # pattern to match word of non-alphabetical characters
  pattern = re.compile('^[^a-z]+$')
  if (pattern.match(w)):
    return True
  else:
    return False

In [153]:
# apply a filter to remove non-alphabetical tokens from the bigram finder
finder_alice.apply_word_filter(alpha_filter)
scored_alice = finder_alice.score_ngrams(bigram_measures.raw_freq)
for bscore in scored_alice[:50]:
    print(bscore)

(('said', 'the'), 0.006180211381142891)
(('of', 'the'), 0.0038215799844748314)
(('said', 'alice'), 0.003433450767301606)
(('in', 'a'), 0.0028960410819848332)
(('and', 'the'), 0.00235863139666806)
(('in', 'the'), 0.0023287753030393505)
(('it', 'was'), 0.0021794948348958024)
(('the', 'queen'), 0.0020600704603809636)
(('to', 'the'), 0.0020600704603809636)
(('as', 'she'), 0.0018212217113512867)
(('the', 'king'), 0.0018212217113512867)
(('at', 'the'), 0.0017913656177225771)
(('she', 'had'), 0.0017913656177225771)
(('a', 'little'), 0.0017615095240938676)
(('i', "'m"), 0.0016719412432077386)
(('she', 'was'), 0.0016719412432077386)
(('mock', 'turtle'), 0.001642085149579029)
(('and', 'she'), 0.0015823729623216098)
(('the', 'mock'), 0.0015823729623216098)
(('do', "n't"), 0.0015525168686929003)
(('the', 'gryphon'), 0.0015525168686929003)
(('the', 'hatter'), 0.0015525168686929003)
(('to', 'be'), 0.0015226607750641907)
(('went', 'on'), 0.0014330924941780617)
(('to', 'herself'), 0.001343524213291933

In [154]:
# apply a filter to remove non-alphabetical tokens from the bigram finder
finder_bible.apply_word_filter(alpha_filter)
scored_bible = finder_bible.score_ngrams(bigram_measures.raw_freq)
for bscore in scored_bible[:50]:
    print(bscore)

(('of', 'the'), 0.01218932586405749)
(('the', 'lord'), 0.007410129994127662)
(('and', 'the'), 0.006616941906101739)
(('in', 'the'), 0.005312564690772825)
(('and', 'he'), 0.002946730713172203)
(('shall', 'be'), 0.0025992488477121116)
(('to', 'the'), 0.002272890499909169)
(('all', 'the'), 0.002258104037549165)
(('and', 'they'), 0.0022031828916405792)
(('unto', 'the'), 0.0021461493939662784)
(('i', 'will'), 0.0020225768156719604)
(('of', 'israel'), 0.0017902181214433277)
(('for', 'the'), 0.001765926076137607)
(('said', 'unto'), 0.0017363531514175993)
(('the', 'king'), 0.001713117281994736)
(('son', 'of'), 0.0016909375884547303)
(('out', 'of'), 0.0015863761760518456)
(('the', 'son'), 0.0015842638242861307)
(('the', 'children'), 0.0014997697536575372)
(('children', 'of'), 0.0014680844771718146)
(('the', 'land'), 0.00133078161240035)
(('and', 'i'), 0.0013255007329860628)
(('thou', 'shalt'), 0.0013202198535717756)
(('the', 'people'), 0.0012906469288517677)
(('the', 'house'), 0.001237838134708

# Remove stopwords and list top 50 bigrams

In [155]:
# apply a filter to remove stop words
finder_alice.apply_word_filter(lambda w: w in stopwords)
scored_alice = finder_alice.score_ngrams(bigram_measures.raw_freq)
for bscore in scored_alice[:50]:
    print(bscore)

(('said', 'alice'), 0.003433450767301606)
(('mock', 'turtle'), 0.001642085149579029)
(('march', 'hare'), 0.0009255389024899983)
(('thought', 'alice'), 0.0007762584343464501)
(('white', 'rabbit'), 0.0006568340598316116)
(('alice', 'thought'), 0.00035827312354451543)
(("'of", 'course'), 0.0003284170299158058)
(('alice', 'said'), 0.0003284170299158058)
(('poor', 'alice'), 0.0003284170299158058)
(('alice', 'replied'), 0.00026870484265838657)
(('alice', 'looked'), 0.00023884874902967696)
(('king', 'said'), 0.00023884874902967696)
(('little', 'thing'), 0.00023884874902967696)
(('poor', 'little'), 0.00023884874902967696)
(('alice', 'began'), 0.00020899265540096732)
(('cried', 'alice'), 0.00020899265540096732)
(('good', 'deal'), 0.00020899265540096732)
(('oh', 'dear'), 0.00020899265540096732)
(('beautiful', 'soup'), 0.00017913656177225771)
(('golden', 'key'), 0.00017913656177225771)
(('great', 'hurry'), 0.00017913656177225771)
(('little', 'door'), 0.00017913656177225771)
(('said', 'nothing'), 

In [156]:
# apply a filter to remove stop words
finder_bible.apply_word_filter(lambda w: w in stopwords)
scored_bible = finder_bible.score_ngrams(bigram_measures.raw_freq)
for bscore in scored_bible[:50]:
    print(bscore)

(('said', 'unto'), 0.0017363531514175993)
(('thou', 'shalt'), 0.0013202198535717756)
(('thou', 'hast'), 0.000811143078034499)
(('ye', 'shall'), 0.0007995251433230673)
(('lord', 'god'), 0.0005756158561572942)
(('unto', 'thee'), 0.0005291441173115676)
(('thus', 'saith'), 0.0004689420919886947)
(('say', 'unto'), 0.00042035800137725336)
(('shall', 'come'), 0.00041824564961153853)
(('thy', 'god'), 0.0003770547901800991)
(('thou', 'art'), 0.0003358639307486597)
(('every', 'one'), 0.0003326954031000875)
(('lord', 'thy'), 0.00032318982015437067)
(('every', 'man'), 0.0003094595336772242)
(('lord', 'hath'), 0.0003041786542629371)
(('spake', 'unto'), 0.00028622366425436093)
(('shalt', 'thou'), 0.00027249337777721447)
(('lord', 'said'), 0.00023341487011148992)
(('let', 'us'), 0.0002270778148143454)
(('unto', 'moses'), 0.0002165160559857712)
(('jesus', 'christ'), 0.00020912282480576926)
(('burnt', 'offering'), 0.00019433636244576538)
(('lord', 'shall'), 0.00019222401068005052)
(('came', 'unto'), 0.

# List the top 50 bigrams by their Mutual Information scores (using min frequency 5)

In [157]:
### pointwise mutual information
pmi_alice = BigramCollocationFinder.from_words(alicewords)
scored_alice = pmi_alice.score_ngrams(bigram_measures.pmi)
for bscore in scored_alice[:50]:
    print(bscore)

(("'any", 'shrimp'), 15.03161505883576)
(("'cheshire", 'puss'), 15.03161505883576)
(("'orange", 'marmalade'), 15.03161505883576)
(("'ou", 'est'), 15.03161505883576)
(("'rule", 'forty-two'), 15.03161505883576)
(("'seven", 'jogged'), 15.03161505883576)
(("'than", 'waste'), 15.03161505883576)
(("'with", 'extras'), 15.03161505883576)
(('abide', 'figures'), 15.03161505883576)
(('barking', 'hoarsely'), 15.03161505883576)
(('bathing', 'machines'), 15.03161505883576)
(('bright-eyed', 'terrier'), 15.03161505883576)
(('buttered', 'toast'), 15.03161505883576)
(('canvas', 'bag'), 15.03161505883576)
(('carroll', '1865'), 15.03161505883576)
(('crocodile', 'improve'), 15.03161505883576)
(('daresay', "it's"), 15.03161505883576)
(('deepest', 'contempt'), 15.03161505883576)
(('draggled', 'feathers'), 15.03161505883576)
(('edgar', 'atheling'), 15.03161505883576)
(('energetic', 'remedies'), 15.03161505883576)
(('exact', 'shape'), 15.03161505883576)
(('feather', 'flock'), 15.03161505883576)
(('graceful', '

In [158]:
### pointwise mutual information
pmi_bible = BigramCollocationFinder.from_words(biblewords)
scored_bible = pmi_bible.score_ngrams(bigram_measures.pmi)
for bscore in scored_bible[:50]:
    print(bscore)

(('anathema', 'maranatha'), 19.852718465501717)
(('appii', 'forum'), 19.852718465501717)
(('ashteroth', 'karnaim'), 19.852718465501717)
(('barbed', 'irons'), 19.852718465501717)
(('changeable', 'suits'), 19.852718465501717)
(('dromedary', 'traversing'), 19.852718465501717)
(('equally', 'distant'), 19.852718465501717)
(('eubulus', 'greeteth'), 19.852718465501717)
(('fared', 'sumptuously'), 19.852718465501717)
(('grandmother', 'lois'), 19.852718465501717)
(("herod's", 'jurisdiction'), 19.852718465501717)
(('infallible', 'proofs'), 19.852718465501717)
(('je', 'hoshaphat'), 19.852718465501717)
(('monthly', 'prognosticators'), 19.852718465501717)
(('narrowed', 'rests'), 19.852718465501717)
(('sergius', 'paulus'), 19.852718465501717)
(('sticketh', 'closer'), 19.852718465501717)
(('talitha', 'cumi'), 19.852718465501717)
(('tottering', 'fence'), 19.852718465501717)
(('unequally', 'yoked'), 19.852718465501717)
(('warreth', 'entangleth'), 19.852718465501717)
(('119:9', 'wherewithal'), 18.8527184

# Add frequency filter to PMI results

In [159]:
# to get good results, must first apply frequency filter
pmi_alice.apply_freq_filter(5)
scored_alice = pmi_alice.score_ngrams(bigram_measures.pmi)
for bscore in scored_alice[:50]:
    print(bscore)

(('play', 'croquet'), 11.768580653001967)
(('golden', 'key'), 11.639297636056998)
(('kid', 'gloves'), 11.572183440198463)
(('few', 'minutes'), 10.987220939477307)
(("'of", 'course'), 10.262227986977177)
(('white', 'kid'), 10.124724463227242)
(('beautiful', 'soup'), 9.987220939477307)
(('three', 'gardeners'), 9.972721369782192)
(('march', 'hare'), 9.94415221758542)
(('good', 'deal'), 9.669044979451051)
(('any', 'rate'), 9.613762543949864)
(('cheshire', 'cat'), 9.598655651559655)
(('trembling', 'voice'), 9.183618152280811)
(('their', 'slates'), 9.166544638921868)
(("'d", 'better'), 9.165366447724587)
(('mock', 'turtle'), 9.147638855175243)
(('next', 'witness'), 9.124724463227242)
(('feet', 'high'), 9.105615640279538)
(('*', '*'), 9.050723881783464)
(('white', 'rabbit'), 9.029567230186903)
(('your', 'majesty'), 8.999193581143384)
(('great', 'hurry'), 8.87174372205737)
(('your', 'pardon'), 8.86169005739345)
(('right', 'size'), 8.746212839973513)
(('beg', 'your'), 8.709686963948398)
(('offe

In [160]:
# to get good results, must first apply frequency filter
pmi_bible.apply_freq_filter(5)
scored_bible = pmi_bible.score_ngrams(bigram_measures.pmi)
for bscore in scored_bible[:50]:
    print(bscore)

(('untempered', 'morter'), 16.39328684686442)
(('cock', 'crew'), 16.26775596478056)
(('gray', 'hairs'), 15.945827869893197)
(('cock', 'crow'), 15.782329137610319)
(('filthy', 'lucre'), 15.502221218417583)
(('skins', 'dyed'), 14.782329137610319)
(('ill', 'favoured'), 14.72343544855675)
(('judas', 'iscariot'), 14.518817728948278)
(('curious', 'girdle'), 14.28286285717077)
(('brook', 'kidron'), 14.277809629444484)
(('measuring', 'reed'), 14.156247649563568)
(('divers', 'colours'), 14.058302599151611)
(('mary', 'magdalene'), 13.972300081254389)
(('wreathen', 'chains'), 13.643265099872766)
(('dyed', 'red'), 13.639371183768276)
(('fiery', 'furnace'), 13.623899775005835)
(('committeth', 'adultery'), 13.604790952058131)
(('earthen', 'vessel'), 13.592190915278495)
(('golden', 'spoon'), 13.545289940309468)
(('bright', 'spot'), 13.520806282041733)
(('tenth', 'deals'), 13.512868462617092)
(('bottomless', 'pit'), 13.393286846864417)
(('familiar', 'spirits'), 13.329156509444704)
(('solemn', 'feasts'

# Comparison of writing styles between the two texts
Hypothesis: one aspect of the writing styles of Alice in Wonderland and the KJV Bible is words ending in 'eth', 'est', 'ath', and 'eth'. Alice in Wonderland, being a more contemporary piece of literature, will contain more common words ending in these suffixes such as "best", "smallest", or "last", while the KJV Bible will contain more archaic terms such as "ruleth", "hast", and "lovest".

In [165]:
pattern = r'\b\w+(?:ast|eth|ath|est)\b'

In [166]:
matches = re.findall(pattern, ' '.join(alicewords))
results_alice = []

for match in matches:
    results_alice.append(match)
print(set(results_alice))

{'last', 'breath', 'best', 'twentieth', 'coast', 'least', 'conquest', 'driest', 'death', 'lest', 'nest', 'honest', 'slightest', 'highest', 'roast', 'stupidest', 'rest', 'toast', 'largest', 'deepest', 'fast', 'queerest', 'oldest', 'loveliest', 'interest', 'teeth', 'past', 'beast', 'smallest', 'underneath'}


In [167]:
matches = re.findall(pattern, ' '.join(biblewords))
results_bible = []

for match in matches:
    results_bible.append(match)
print(set(results_bible))

{'harosheth', 'afflictest', 'withholdeth', 'shaketh', 'counteth', 'dasheth', 'jerubbesheth', 'mayest', 'reproveth', 'wanderest', 'mustereth', 'committeth', 'scarest', 'hindereth', 'twentieth', 'rejoiceth', 'holdeth', 'callest', 'weakeneth', 'shameth', 'consulteth', 'riphath', 'tikvath', 'happeneth', 'drewest', 'selleth', 'findeth', 'watereth', 'possessest', 'rescueth', 'boasteth', 'pleaseth', 'spoileth', 'tilleth', 'meetest', 'requireth', 'provideth', 'bewrayeth', 'settlest', 'satest', 'scaleth', 'repentest', 'appertaineth', 'dishonourest', 'partakest', 'lackest', 'bringeth', 'slumbereth', 'eldest', 'turneth', 'readest', 'fadeth', 'earneth', 'baptizest', 'croucheth', 'tabbath', 'searchest', 'writest', 'delayeth', 'burneth', 'rewardeth', 'saluteth', 'feareth', 'meddleth', 'prevaileth', 'upholdest', 'gathereth', 'shihorlibnath', 'striveth', 'believest', 'broughtest', 'sewest', 'restest', 'causeth', 'walketh', 'markest', 'trembleth', 'delivereth', 'grieveth', 'sufferest', 'paweth', 'chinn