Workbook

In [112]:
import nltk
import re
import os

In [113]:
from nltk.corpus import PlaintextCorpusReader     #need to separate text into words and sentences
from nltk.probability import FreqDist     #need to do frequency calculations
from nltk.collocations import BigramCollocationFinder     #need to find collocations
from nltk.stem.snowball import SnowballStemmer     #used to stem

In [114]:
#load texts
corpusDirectory = "./Corpus_Data"
corpus = PlaintextCorpusReader(corpusDirectory, ".*")

#each file id is named exactly once so the word view and the sentence view of a
#text can never point at differently spelled file names
#NOTE(review): this cell previously asked for "WOZScript.txt" (words) and
#"WOzScript.txt" (sentences); that only works on a case-insensitive file system.
#The spelling below is assumed to match the file on disk - TODO confirm.
chomskyFile = "ChomskyInterview.txt"
eurogamerFile = "Eurogamer_Corpus.txt"
wozFile = "WOZScript.txt"

#load texts into nltk word list, this includes punctuation 
#punctuation is treated as a word in these lists!

chomskyWords = corpus.words(chomskyFile)
eurogamerWords = corpus.words(eurogamerFile)
wozWords = corpus.words(wozFile)

#text divided into sentences
chomskySents = corpus.sents(chomskyFile)
euroSents = corpus.sents(eurogamerFile)
wozSents = corpus.sents(wozFile)

In [115]:
#frequency dist for words beginning with vowels

def startvowelfreq(corpus):
    """Print the ten most frequent tokens starting with each vowel.

    Every token of ``corpus`` whose first character is a, e, i, o or u (either
    case) is lower-cased and filed under that vowel. One line is then printed
    per vowel, in the order a, e, i, o, u, holding the ``most_common(10)``
    list of ``(token, count)`` pairs for that vowel.

    Args:
        corpus: iterable of string tokens.

    Returns:
        None; the results are only printed.
    """
    # (first-letter pattern, tokens collected for that letter), kept in a-e-i-o-u order
    vowel_buckets = [
        ("^[aA]", []),
        ("^[eE]", []),
        ("^[iI]", []),
        ("^[oO]", []),
        ("^[uU]", []),
    ]

    for token in corpus:
        for pattern, collected in vowel_buckets:
            if re.search(pattern, token):
                collected.append(token.lower())
                break     # a token has one first letter, so stop at the first hit

    for _, collected in vowel_buckets:
        print(FreqDist(collected).most_common(10))

In [116]:
#top-10 vowel-initial tokens in the Chomsky interview, one printed list per vowel (a, e, i, o, u)
startvowelfreq(chomskyWords)

[('a', 267), ('and', 217), ('are', 72), ('about', 68), ('as', 46), ('at', 29), ('all', 28), ('an', 27), ('any', 19), ('american', 15)]
[('ezra', 35), ('even', 15), ('example', 7), ('every', 7), ('end', 6), ('early', 6), ('economic', 6), ('everything', 6), ('ever', 5), ('essentially', 5)]
[('it', 261), ('in', 188), ('is', 158), ('i', 153), ('if', 54), ('into', 19), ('industry', 14), ('idea', 8), ('international', 7), ('its', 6)]
[('of', 356), ('on', 55), ('or', 31), ('one', 31), ('out', 31), ('our', 24), ('other', 18), ('over', 16), ('own', 12), ('others', 7)]
[('up', 18), ('us', 16), ('united', 8), ('used', 7), ('under', 7), ('understand', 6), ('use', 5), ('u', 5), ('using', 4), ('unfortunately', 4)]


In [117]:
#top-10 vowel-initial tokens in the Eurogamer corpus, one printed list per vowel (a, e, i, o, u)
startvowelfreq(eurogamerWords)

[('a', 183), ('and', 175), ('an', 37), ('as', 33), ('are', 30), ('at', 20), ('all', 15), ('army', 10), ('about', 9), ('also', 8)]
[('engage', 18), ('even', 15), ('each', 14), ('every', 13), ('estelle', 10), ('emblem', 10), ('enemy', 6), ('end', 5), ('entirely', 4), ('enemies', 4)]
[('in', 78), ('it', 70), ('i', 69), ('is', 56), ('its', 21), ('into', 18), ('if', 18), ('inkulinati', 13), ('instead', 7), ('isn', 7)]
[('of', 125), ('or', 24), ('on', 19), ('one', 18), ('off', 16), ('out', 15), ('over', 13), ('other', 10), ('own', 8), ('only', 7)]
[('up', 16), ('unbound', 11), ('units', 9), ('unique', 5), ('until', 4), ('unit', 3), ('unless', 2), ('underneath', 2), ('used', 2), ('upon', 2)]


In [118]:
#top-10 vowel-initial tokens in the WOZ script, one printed list per vowel (a, e, i, o, u)
startvowelfreq(wozWords)

[('and', 768), ('as', 344), ('a', 331), ('at', 167), ('all', 115), ('aunt', 69), ('about', 66), ('are', 64), ('away', 52), ('around', 40)]
[('em', 101), ('els', 43), ('exits', 32), ('enters', 26), ('ever', 23), ('enter', 23), ('eyes', 22), ('exit', 19), ('emerald', 14), ('even', 12)]
[('i', 474), ('in', 360), ('it', 261), ('is', 102), ('into', 72), ('if', 58), ('int', 30), ('isn', 9), ('images', 6), ('incubator', 5)]
[('of', 449), ('oh', 278), ('on', 233), ('o', 225), ('out', 158), ('oz', 128), ('over', 59), ('one', 40), ('others', 37), ('old', 32)]
[('up', 181), ('uncle', 45), ('us', 27), ('under', 17), ('use', 6), ('uh', 5), ('upon', 3), ('until', 3), ('used', 2), ('ugly', 2)]


In [119]:
#number of words

def num_words(corpus):      #returns the number of words without punctuation
    """Count the tokens of ``corpus`` that begin with an ASCII letter.

    Tokens starting with anything else (punctuation, digits, the empty
    string) are left out of the count.

    Args:
        corpus: iterable of string tokens.

    Returns:
        int: how many tokens start with A-Z or a-z.
    """
    starts_with_letter = re.compile("^[A-Za-z]")
    return sum(1 for token in corpus if starts_with_letter.search(token))



#count each corpus once, keep the totals for later use, then show them
#these are integer word counts (tokens that do not start with a letter are left out), not word lists
chomskyWordsactual = num_words(chomskyWords)
euroWordsactual = num_words(eurogamerWords)
wozWordsactual = num_words(wozWords)

print(chomskyWordsactual)

print(euroWordsactual)

print(wozWordsactual)


11165
5858
29996


In [120]:
#lexical diversity

def lexical_diversity(text):
    """Return the type/token ratio of ``text``.

    The ratio is the number of distinct tokens divided by the total number of
    tokens. Tokens are compared exactly as given: nothing is lower-cased and
    nothing is filtered out, so whatever the caller passes in (including any
    punctuation tokens) takes part in both counts.

    Args:
        text: sized collection of hashable tokens (must support ``len``).

    Returns:
        float: distinct tokens / total tokens, or 0.0 when ``text`` is empty.
    """
    total = len(text)
    if total == 0:
        # an empty text has no tokens to diversify; avoid dividing by zero
        return 0.0
    return len(set(text)) / total

In [121]:
#type/token ratio of the Chomsky interview; chomskyWords still contains punctuation tokens
lexical_diversity(chomskyWords)

0.1660031548110869

In [122]:
#type/token ratio of the Eurogamer corpus; eurogamerWords still contains punctuation tokens
lexical_diversity(eurogamerWords)

0.29085392284351974

In [123]:
#type/token ratio of the WOZ script; wozWords still contains punctuation tokens
lexical_diversity(wozWords)

0.07287356905348123

In [124]:
#collocation finder

def collocationfinder(corpus):
    """Print the ten bigrams of ``corpus`` with the highest likelihood-ratio score.

    Bigrams are built from adjacent tokens of ``corpus``. A fixed set of
    punctuation marks and contraction fragments is registered as a word
    filter on the finder before scoring.

    Args:
        corpus: sequence of string tokens.

    Returns:
        None; the list of bigram tuples is only printed.
    """
    # tokens handed to the finder's word filter (punctuation and contraction fragments)
    ignored_tokens = ("'", ',',".","re","?","!","nt","’",":","--","S")

    def is_ignored(token):
        return token in ignored_tokens

    finder = BigramCollocationFinder.from_words(corpus)
    finder.apply_word_filter(is_ignored)

    # association measure used to rank the candidate bigrams
    score_fn = nltk.collocations.BigramAssocMeasures().likelihood_ratio
    top_bigrams = finder.nbest(score_fn, 10)
    print (top_bigrams)

In [125]:
#ten highest-scoring bigrams (likelihood ratio) in the Chomsky interview
collocationfinder(chomskyWords)

[('EZRA', 'KLEIN'), ('NOAM', 'CHOMSKY'), ('going', 'to'), ('I', 'think'), ('in', 'the'), ('of', 'the'), ('a', 'lot'), ('I', 'mean'), ('United', 'States'), ('I', 'don')]


In [126]:
#ten highest-scoring bigrams (likelihood ratio) in the Eurogamer corpus
collocationfinder(eurogamerWords)

[('A', 'Space'), ('Fire', 'Emblem'), ('Space', 'for'), ('the', 'Unbound'), ('This', 'is'), ('even', 'if'), ('Three', 'Houses'), ('for', 'the'), ('Emblem', 'Engage'), ('you', 'can')]


In [127]:
#ten highest-scoring bigrams (likelihood ratio) in the WOZ script
collocationfinder(wozWords)

[('Tin', 'Man'), ('TIN', 'MAN'), ('CAMERA', 'PANS'), ('CAMERA', 'TRUCKS'), ('DOROTHY', 'Oh'), ('Aunt', 'Em'), ('LAP', 'DISSOLVE'), ('DISSOLVE', 'TO'), ('shooting', 'past'), ('Dorothy', 'and')]


In [128]:
#longest sentence

def _letters_only(sent):
    """Return a new list with the tokens of ``sent`` made up solely of A-Z/a-z."""
    # Build a fresh list instead of calling sent.remove() inside a loop over
    # sent: removing while iterating skips the token that follows each removed
    # one (so runs of punctuation survived) and also altered the caller's list.
    return [word for word in sent if not re.search("[^A-Za-z]", word)]


def longsent(sentcorpus):     #prints info about longsent, num and joined longsent
    """Print the longest sentence of ``sentcorpus`` after dropping non-letter tokens.

    Three lines are printed: the token list of the longest cleaned sentence,
    its length, and its tokens joined with single spaces.

    Args:
        sentcorpus: iterable of sentences, each an iterable of string tokens.
            The sentences are not modified.

    Returns:
        None; the results are only printed.
    """
    longest = longsentvar(sentcorpus)

    print(longest)
    print(len(longest))

    longest_joined = " ".join(longest)
    print(longest_joined)



def longsentvar(sentcorpus):     #used for stemming function, only returns longsent, does not print
    """Return the longest sentence of ``sentcorpus`` after dropping non-letter tokens.

    A token is kept only when every character in it is an ASCII letter.
    Length is measured in kept tokens; when several sentences tie, the first
    one wins.

    Args:
        sentcorpus: iterable of sentences, each an iterable of string tokens.
            The sentences are not modified.

    Returns:
        list: tokens of the longest cleaned sentence, or ``[]`` when
        ``sentcorpus`` has no sentences.
    """
    cleaned = [_letters_only(sent) for sent in sentcorpus]
    return max(cleaned, key=len, default=[])     #creates a list

#longest cleaned sentence of each text, kept for the stemming cells
chomskylong, eurolong, wozlong = map(longsentvar, (chomskySents, euroSents, wozSents))

In [129]:
#longest Chomsky-interview sentence: token list, token count, then the joined text
longsent(chomskySents)

['If', 'I', 'go', 'back', 'to', 'early', 'childhood', 'some', 'of', 'the', 'reading', 'that', 'had', 'a', 'lasting', 'impact', 'which', 'I', 'don', 't', 'even', 'think', 'it', 's', 'in', 'English', 'was', 'the', 'Hebrew', 'essays', 'of', 'a', 'turn', 'of', 'the', 'century', 'essayist', 'that', 'went', 'by', 'the', 'name', 'of', 'Ahad', 'Ha', 'am', 'writing', 'about', 'partly', 'intellectual', 'contributions', 'which', 'were', 'significant', 'partly', 'talking', 'about', 'the', 'developing', 'situation', 'in', 'what', 'was', 'then', 'Palestine', 'which', 'had', 'a', 'large', 'effect', 'on', 'my', 'thinking', 'ever', 'since']
75
If I go back to early childhood some of the reading that had a lasting impact which I don t even think it s in English was the Hebrew essays of a turn of the century essayist that went by the name of Ahad Ha am writing about partly intellectual contributions which were significant partly talking about the developing situation in what was then Palestine which had 

In [130]:
#longest Eurogamer sentence: token list, token count, then the joined text
longsent(euroSents)

['As', 'a', 'surprisingly', 'linear', 'adventure', 'it', 's', 'difficult', 'to', 'talk', 'much', 'about', 'Estelle', 's', 'journey', 'without', 'giving', 'something', 'important', 'away', 'and', 'as', 'her', 'story', 'is', 'essentially', 'as', 'long', 'or', 'as', 'brief', 'as', 'you', 'want', 'it', 'to', 'be', 'you', 'can', 'belt', 'through', 'it', 'and', 'be', 'done', 'within', 'a', 'few', 'hours', 'or', 'savour', 'each', 'new', 'environment', 'and', 'scour', 'every', 'inch', 'of', 'it', 'for', 'days', 'I', 'imagine', 'there', 'may', 'be', 'some', 'people', 'or', 'places', 'you', 'could', 'miss', 'entirely', 'and', 'complete', 'the', 'game', 'never', 'knowing', 'that', 'they', 'existed']
84
As a surprisingly linear adventure it s difficult to talk much about Estelle s journey without giving something important away and as her story is essentially as long or as brief as you want it to be you can belt through it and be done within a few hours or savour each new environment and scour eve

In [131]:
#longest WOZ-script sentence: token list, token count, then the joined text
longsent(wozSents)

['Gale', 'Sitting', 'room', 'Aunt', 'Em', 'and', 'Miss', 'Gulch', 'seated', 'Dorothy', 'enters', 'carrying', 'Toto', 'in', 'her', 'arms', 'CAMERA', 'TRUCKS', 'forward', 'on', 'them', 'PANS', 'to', 'right', 'with', 'Dorothy', 'to', 'Uncle', 'Henry', 'then', 'Pans', 'her', 'left', 'to', 'Aunt', 'Em', 'and', 'Miss', 'Gulch', 'Miss', 'Gulch', 'shows', 'order', 'to', 'Aunt', 'Em', 'Uncle', 'Henry', 'enters', 'looks', 'at', 'the', 'order', 'Miss', 'Gulch', 'picks', 'up', 'basket', 'rises', 'Dorothy', 'screams', 'at', 'Miss', 'Gulch', 'Miss', 'Gulch', 'tries', 'to', 'take', 'Toto', 'away', 'from', 'Dorothy', 'Uncle', 'Henry', 'takes', 'Toto', 'puts', 'him', 'into', 'basket']
81
Gale Sitting room Aunt Em and Miss Gulch seated Dorothy enters carrying Toto in her arms CAMERA TRUCKS forward on them PANS to right with Dorothy to Uncle Henry then Pans her left to Aunt Em and Miss Gulch Miss Gulch shows order to Aunt Em Uncle Henry enters looks at the order Miss Gulch picks up basket rises Dorothy s

In [132]:
#stemmer

# one shared English Snowball stemmer, reused by every stemming() call
stemmer = SnowballStemmer("english")

def stemming(longsent):
    """Print the stem of each token in ``longsent``, one stem per line.

    Args:
        longsent: iterable of string tokens (e.g. a sentence token list).

    Returns:
        None; the stems are only printed.
    """
    for stem in map(stemmer.stem, longsent):
        print(stem)

In [133]:
#stems of the longest Chomsky-interview sentence, one per line
stemming(chomskylong)

if
i
go
back
to
earli
childhood
some
of
the
read
that
had
a
last
impact
which
i
don
t
even
think
it
s
in
english
was
the
hebrew
essay
of
a
turn
of
the
centuri
essayist
that
went
by
the
name
of
ahad
ha
am
write
about
part
intellectu
contribut
which
were
signific
part
talk
about
the
develop
situat
in
what
was
then
palestin
which
had
a
larg
effect
on
my
think
ever
sinc


In [134]:
#stems of the longest Eurogamer sentence, one per line
stemming(eurolong)

as
a
surpris
linear
adventur
it
s
difficult
to
talk
much
about
estell
s
journey
without
give
someth
import
away
and
as
her
stori
is
essenti
as
long
or
as
brief
as
you
want
it
to
be
you
can
belt
through
it
and
be
done
within
a
few
hour
or
savour
each
new
environ
and
scour
everi
inch
of
it
for
day
i
imagin
there
may
be
some
peopl
or
place
you
could
miss
entir
and
complet
the
game
never
know
that
they
exist


In [135]:
#stems of the longest WOZ-script sentence, one per line
stemming(wozlong)

gale
sit
room
aunt
em
and
miss
gulch
seat
dorothi
enter
carri
toto
in
her
arm
camera
truck
forward
on
them
pan
to
right
with
dorothi
to
uncl
henri
then
pan
her
left
to
aunt
em
and
miss
gulch
miss
gulch
show
order
to
aunt
em
uncl
henri
enter
look
at
the
order
miss
gulch
pick
up
basket
rise
dorothi
scream
at
miss
gulch
miss
gulch
tri
to
take
toto
away
from
dorothi
uncl
henri
take
toto
put
him
into
basket
