# rlueck
# IST664
# Homework1

In [1]:
#import the necessary libraries
import re
import pandas as pd
import nltk
from nltk import FreqDist

#setup for bigrams and bigram measures
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()

# Functions to minimize program length

In [2]:
#regex for keeping abbreviated and apostrophised words together prior to text cleanup
def custom_tokenizer(text):
    pattern = r''' (?x) 		# set flag to allow verbose regexps
        [\w+]+\'[\w+]\b 	# words with apostrophes
        |(?:[A-Za-z\.])+    	# abbreviations, e.g. U.S.A.
        | \$?\d+(?:\.\d+)?%?  # currency and percentages, $12.40, 50%
        | \w+(?:-\w+)*  	# words with internal hyphens
        | \.\.\.        	# ellipsis
        | [][.,;"'?():-_%#']  # separate tokens
        '''
    tokenized_text = nltk.regexp_tokenize(text, pattern)
    return tokenized_text

In [3]:
# function that takes a word and returns true if it consists only
#   of non-alphabetic characters (borrowed from class lab)

def alpha_filter(w):
  # pattern to match a word of non-alphabetical characters
    pattern = re.compile('^[^a-z]+$')
    if (pattern.match(w)):
        return True
    else:
        return False

In [4]:
#transform selected text documents into tokenized format and change all letters to lowercase
def transform_text(text):
    text_file = open(text, encoding = 'utf-8')
    raw_text = text_file.read()
    #tokens = custom_tokenizer(raw_text)
    tokens = custom_tokenizer(raw_text)
    
    #change all words to lowercase
    doc_words = [w.lower() for w in tokens]
    return doc_words

In [5]:
#find the top tokens of a text and save them in an array
def toptokens(freqdist):
    toptokenlist = []
    items = freqdist.most_common(50)
    for item in items:
        #save each word and count as a list
        toptokenlist.append([item[0], item[1]])
    return toptokenlist

In [6]:
#removes stopwords from a given text
def stoppedwords(text):
    nltkstopwords = nltk.corpus.stopwords.words('english')
    addl_stopwords = ['could', 'would', 'said', 'gregor', ':', '[', ']', '\'s', '\'ll', 'e', 'n\'t', 'n', 'h', 'f', 'l', 'c']
    stopwords = nltkstopwords + addl_stopwords
    
    word_removal = [w for w in text if not w in stopwords]
    return word_removal

In [7]:
#function to try different stemmers
def stem(tokens):
    stemmer = nltk.PorterStemmer()
    stemmed_text = [stemmer.stem(t) for t in tokens]
    return stemmed_text

In [8]:
#set up some bigrams
def bigram_generator(text):
    bigrams= list(nltk.bigrams(text))
    return bigrams

In [9]:
#create the bigram finder and score the bigrams by frequency
def bigram_score(text, pmi = False, filternum = None):
    finder = BigramCollocationFinder.from_words(text)
    if pmi == True:
        pmi_scores = []
        finder.apply_freq_filter(filternum)
        pmi_score = finder.score_ngrams(bigram_measures.pmi)
        for bscore in pmi_score:
            pmi_scores.append(bscore)
        return pmi_scores
    else:
        scores = finder.score_ngrams(bigram_measures.raw_freq)
        return scores

In [10]:
#print texts in side by side table for easy comparison
def print_table(text1, text2, list_length = None):
    df = pd.DataFrame({'metamorphosis':pd.Series(text1[:list_length]), 'death':pd.Series(text2[:list_length])})
    return df

# Import, transform, and analyze the two texts

In [11]:
#display the length of the texts using the "transform_text" function
metamorphosis = transform_text('metamorphosis-kafka.txt')
death = transform_text('death_of_a_salesman-miller.txt')

#print text lengths in readable format
print('metamorphosis len: {}\n death len: {}'.format(len(metamorphosis), len(death)))

metamorphosis len: 23970
 death len: 41511


In [12]:
# filter both texts using the alpha-filter
alphamm = [w for w in metamorphosis if not alpha_filter(w)]
alphadeath = [w for w in death if not alpha_filter(w)]
print_table(alphamm, alphadeath, 10)

Unnamed: 0,metamorphosis,death
0,metamorphosis,the
1,franz,action
2,kafka,takes
3,translated,place
4,by,in
5,david,willy
6,wyllie,loman
7,i,s
8,one,house
9,morning,and


# Remove stopwords & lemmitize using Porter 

In [15]:
#lemmitization
newmm = stem(alphamm)
newdeath = stem(alphadeath)

#remove stop words
newmm = stoppedwords(newmm)
newdeath = stoppedwords(newdeath)

#print text lengths in readable format
print('metamorphosis len: {}\n death len: {}'.format(len(newmm), len(newdeath)))

metamorphosis len: 11017
 death len: 17216


In [21]:
#change type to freq dist
newmm = FreqDist(newmm)
newdeath = FreqDist(newdeath)

#save the toptokens information to a dataframe to compare the two texts
newmm_toptokens = toptokens(newmm)
newdeath_toptokens = toptokens(newdeath)
print_table(newmm_toptokens, newdeath_toptokens)

Unnamed: 0,metamorphosis,death
0,"[hi, 550]","[willi, 799]"
1,"[wa, 406]","[biff, 523]"
2,"[room, 110]","[linda, 318]"
3,"[even, 99]","[happi, 282]"
4,"[gregor', 99]","[hi, 223]"
5,"[sister, 94]","[go, 208]"
6,"[father, 93]","[wa, 156]"
7,"[door, 92]","[get, 149]"
8,"[thi, 89]","[ben, 144]"
9,"[mother, 83]","[come, 138]"


In [31]:
#check top 30 bigrams for both works after removing stop words
mm_bg_scores = bigram_score(newmm)
death_bg_scores = bigram_score(newdeath)
print_table(mm_bg_scores, death_bg_scores, 50)

Unnamed: 0,metamorphosis,death
0,"((abandon, family.), 0.00044014084507042255)","((abl, welcom), 0.00031308703819661864)"
1,"((abil, iii), 0.00044014084507042255)","((about., foundat), 0.00031308703819661864)"
2,"((abl, cover), 0.00044014084507042255)","((abov, unseen), 0.00031308703819661864)"
3,"((about., irritably.), 0.00044014084507042255)","((abrupt, convers), 0.00031308703819661864)"
4,"((abov, hung), 0.00044014084507042255)","((abso, lute), 0.00031308703819661864)"
5,"((abruptli, support), 0.00044014084507042255)","((absolut, forgot), 0.00031308703819661864)"
6,"((absolut, household), 0.00044014084507042255)","((absolutely., points.), 0.00031308703819661864)"
7,"((accept, doctor'), 0.00044014084507042255)","((accent, dread), 0.00031308703819661864)"
8,"((accumulated., accumulating.), 0.000440140845...","((accept, s.), 0.00031308703819661864)"
9,"((accumulating., nod), 0.00044014084507042255)","((accid, accidents.), 0.00031308703819661864)"


In [30]:
#check top 50 bigrams by pmi
mm_pmi_scores = bigram_score(metamorphosis, pmi = True, filternum = 5)
death_pmi_scores = bigram_score(death, pmi = True, filternum = 5)
print_table(mm_pmi_scores, death_pmi_scores, 50)

Unnamed: 0,metamorphosis,death
0,"((mrs., samsa), 9.50454816917804)","((ebbets, field), 12.756243565712321)"
1,"((chief, clerk), 9.339488922907545)","((con, ﬁdence), 12.171281064991167)"
2,"((mr., samsa), 9.302914308008393)","((terri, ﬁc), 12.171281064991167)"
3,"((three, gentlemen), 9.188879064762363)","((miss, forsythe), 11.640766348292384)"
4,"((o, '), 8.96397978781534)","((high, school), 10.88177444779618)"
5,"((', ll), 8.963979787815338)","((wall, line), 10.586318564270009)"
6,"((', ve), 8.963979787815337)","((slight, pause.), 10.503262824542452)"
7,"((', re), 8.83844890573148)","((mrs., loman), 10.483225071305904)"
8,"((some, kind), 8.74158736647889)","((slight, pause), 10.296811947075025)"
9,"((two, women), 8.20909228565187)","((y, know.), 10.09327855298989)"
