In [1]:
# imports
import re
import nltk
from nltk.corpus import wordnet as wn
import os.path as p
import statistics as stats
import string
import re
import random

In [2]:
# neccessary downloads (uncomment if you need any of them, then recomment the code and run the notebook again)
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('tagsets')

# Part 1

## Part A (distance between speeches)

In [3]:
# full speech excerpts
trump_speech = "from a young age my parents impressed on me the values that you work hard for what you want in life that your word is your bond and you do what you say and keep your promise that you treat people with respect they taught and showed me values and morals in their daily life"
obama_speech = "and barack and i were raised with so many of the same values that you work hard for what you want in life that your word is your bond and you do what you say youre going to do that you treat people with dignity and respect even if you dont know them and even if you dont agree with them"
print("Trump speech excerpt (punctuation removed):", trump_speech)
print("Obama speech excerpt (punctuation removed):", obama_speech)

Trump speech excerpt (punctuation removed): from a young age my parents impressed on me the values that you work hard for what you want in life that your word is your bond and you do what you say and keep your promise that you treat people with respect they taught and showed me values and morals in their daily life
Obama speech excerpt (punctuation removed): and barack and i were raised with so many of the same values that you work hard for what you want in life that your word is your bond and you do what you say youre going to do that you treat people with dignity and respect even if you dont know them and even if you dont agree with them


In [4]:
# distance between full speech excerpts
print("Distance between speech excerpts is",nltk.edit_distance(trump_speech, obama_speech))

Distance between speech excerpts is 118


In [5]:
# shortened speech excerpts
trump_speech_shortened = re.search('that you work hard(.+?)respect', trump_speech)
trump_speech_shortened = 'that you work hard'+ trump_speech_shortened.group(1) + 'respect'
obama_speech_shortened = re.search('that you work hard(.+?)respect', obama_speech)
obama_speech_shortened = 'that you work hard'+ obama_speech_shortened.group(1) + 'respect'
print("Trump speech shortened:", trump_speech_shortened)
print("Obama speech shortened:", obama_speech_shortened)

Trump speech shortened: that you work hard for what you want in life that your word is your bond and you do what you say and keep your promise that you treat people with respect
Obama speech shortened: that you work hard for what you want in life that your word is your bond and you do what you say youre going to do that you treat people with dignity and respect


In [6]:
# distance between shortened speech excerpts
print("Distance between shorted speech excerpts is", nltk.edit_distance(trump_speech_shortened, obama_speech_shortened))

Distance between shorted speech excerpts is 29


## Part B (lemmas in each substring)

In [7]:
# init lemmatizer, make function to convert Upenn to Wordnet tags, and make function to find lemmas appearing in both
lemmatizer = nltk.WordNetLemmatizer()

def upenn_lemmatizer(tagged_samples):
    """
    Replaces upenn pos tags with wordnet pos tag objects and then lemmatizes the words
    :tagged_samples: list of pos tuples generated using nltk.pos_tag
    :return: list of lemmas
    """
    lemmas = []
    upenn_to_wordnet_dic = {  
        'J':wn.ADJ,
        'V':wn.VERB,
        'N':wn.NOUN,
        'R':wn.ADV
    }
    for word in tagged_samples:
        word = list(word)  # convert tuple to list so it's mutable
        word[1] = word[1][0]  # reduce pos string to one character to match upenn_to_wordnet_dic formatting
        if word[1] in upenn_to_wordnet_dic:  # convert from upenn to wordnet
            word[1] = upenn_to_wordnet_dic[word[1]]
        else:
            word[1] = wn.NOUN
        lemmas.append(lemmatizer.lemmatize(*word))
    return lemmas

def token_similarities(speech_a, speech_b):
    """
    Finds all the tokens in common between two documents
    :speech_a: First document for comparison (can be any iterable that is able to be converted to a set)
    :speech_b: Second document, same rules as speech_a
    :returns: a list of common tokens
    """
    tokens_in_common = []
    speech_a = set(speech_a)
    speech_b = set(speech_b)
    if (speech_a & speech_b):
        tokens_in_common.extend(speech_a)
    return tokens_in_common

In [8]:
# make pos list
trump_speech_shortened_tokenized = nltk.word_tokenize(trump_speech_shortened)
trump_speech_shortened_tagged = nltk.pos_tag(trump_speech_shortened_tokenized)
obama_speech_shortened_tokenized = nltk.word_tokenize(obama_speech_shortened)
obama_speech_shortened_tagged = nltk.pos_tag(obama_speech_shortened_tokenized)
print("Trump POS tagged:", trump_speech_shortened_tagged)
print("Obama POS tagged:", obama_speech_shortened_tagged)

Trump POS tagged: [('that', 'IN'), ('you', 'PRP'), ('work', 'VBP'), ('hard', 'RB'), ('for', 'IN'), ('what', 'WP'), ('you', 'PRP'), ('want', 'VBP'), ('in', 'IN'), ('life', 'NN'), ('that', 'IN'), ('your', 'PRP$'), ('word', 'NN'), ('is', 'VBZ'), ('your', 'PRP$'), ('bond', 'NN'), ('and', 'CC'), ('you', 'PRP'), ('do', 'VBP'), ('what', 'WP'), ('you', 'PRP'), ('say', 'VBP'), ('and', 'CC'), ('keep', 'VB'), ('your', 'PRP$'), ('promise', 'NN'), ('that', 'IN'), ('you', 'PRP'), ('treat', 'VBP'), ('people', 'NNS'), ('with', 'IN'), ('respect', 'NN')]
Obama POS tagged: [('that', 'IN'), ('you', 'PRP'), ('work', 'VBP'), ('hard', 'RB'), ('for', 'IN'), ('what', 'WP'), ('you', 'PRP'), ('want', 'VBP'), ('in', 'IN'), ('life', 'NN'), ('that', 'IN'), ('your', 'PRP$'), ('word', 'NN'), ('is', 'VBZ'), ('your', 'PRP$'), ('bond', 'NN'), ('and', 'CC'), ('you', 'PRP'), ('do', 'VBP'), ('what', 'WP'), ('you', 'PRP'), ('say', 'VBP'), ('youre', 'RB'), ('going', 'VBG'), ('to', 'TO'), ('do', 'VB'), ('that', 'IN'), ('you',

In [9]:
# feed pos list into lemmatizer and make a list of these lemmas
trump_lemmas = upenn_lemmatizer(trump_speech_shortened_tagged)
obama_lemmas = upenn_lemmatizer(obama_speech_shortened_tagged)
print("Trump's lemmas:", trump_lemmas)
print("Obama's lemmas:", obama_lemmas)

Trump's lemmas: ['that', 'you', 'work', 'hard', 'for', 'what', 'you', 'want', 'in', 'life', 'that', 'your', 'word', 'be', 'your', 'bond', 'and', 'you', 'do', 'what', 'you', 'say', 'and', 'keep', 'your', 'promise', 'that', 'you', 'treat', 'people', 'with', 'respect']
Obama's lemmas: ['that', 'you', 'work', 'hard', 'for', 'what', 'you', 'want', 'in', 'life', 'that', 'your', 'word', 'be', 'your', 'bond', 'and', 'you', 'do', 'what', 'you', 'say', 'youre', 'go', 'to', 'do', 'that', 'you', 'treat', 'people', 'with', 'dignity', 'and', 'respect']


In [10]:
# find lemmas in common
common_lemmas = token_similarities(trump_lemmas, obama_lemmas)
print("Lemmas in common in substrings:", common_lemmas)

Lemmas in common in substrings: ['be', 'bond', 'people', 'your', 'work', 'hard', 'in', 'you', 'respect', 'that', 'treat', 'want', 'life', 'what', 'with', 'do', 'keep', 'say', 'promise', 'for', 'and', 'word']


## Part C (words in both excerpts)

In [11]:
# find words in common (utilizing token_similarities from Part B)
trump_speech_full_tokenized = nltk.word_tokenize(trump_speech)
obama_speech_full_tokenized = nltk.word_tokenize(obama_speech)
common_words = token_similarities(trump_speech_full_tokenized, obama_speech_full_tokenized)
print("Words in common in excerpts:", common_words)

Words in common in excerpts: ['showed', 'the', 'bond', 'from', 'they', 'impressed', 'people', 'your', 'values', 'age', 'a', 'on', 'is', 'work', 'hard', 'in', 'young', 'you', 'respect', 'me', 'that', 'their', 'my', 'treat', 'want', 'life', 'what', 'with', 'do', 'morals', 'keep', 'say', 'taught', 'promise', 'parents', 'for', 'daily', 'and', 'word']


## Part D (analysis of results)

### Results Discussion

From these tests, the two speech excerpts do appear to be very similar. First, the distance between the two excepts and the distance between two substrings are compared. This metric gives a sense of how similar the overall structure and content of the excerpts are. The distances came out to be 118 and 29 (after the removal of punctuation). To get a sense of how small that number is, below is the number of characters each preprocessed speech has.

In [12]:
print("Character length of Trump speech:", len(trump_speech))
print("Character length of Obama speech:", len(obama_speech))
print("Character length of Trump speech substring:", len(trump_speech_shortened))
print("Character length of Obama speech substring:", len(obama_speech_shortened))

Character length of Trump speech: 272
Character length of Obama speech: 286
Character length of Trump speech substring: 153
Character length of Obama speech substring: 161


Because of the way distance between texts is calculated, this means that the full excerpts consist of over 2/3 characters that are identical. The substrings are even more similar, consisting of only about 1/5 original content each. Comparing original lemmas in the substring and words in the excerpts gives a more high level overview of the similarities, as this helps capture the similarities in meaning rather than just structure. This metric also reveals that the two speeches are very similar. From looking at the list of lemmas in common, both speeches have "work", "hard", "respect", "promise", and "bond". Considering how short this substring is, it's clear that they both must have the same message if they have these highly specific lemmas in common. The words in common in the full excerpts give further confirmation of this, as this list contains all the past lemmas and addionally has "parents", "values", and "morals". To get a numerical sense of how many lemmas and words are in common, below is a list of each speechs' number of lemmas and words.

In [13]:
print("Number of unique lemmas in Trump's substring:", len(trump_lemmas))
print("Number of unique lemmas in Obama's substring:", len(obama_lemmas))
print("Number of lemmas in common:", len(common_lemmas))

Number of unique lemmas in Trump's substring: 32
Number of unique lemmas in Obama's substring: 34
Number of lemmas in common: 22


In [14]:
print("Number of words in Trump's excerpt:", len(trump_speech_full_tokenized))
print("Number of words in Obama's excerpt:", len(obama_speech_full_tokenized))
print("Number of words in common:", len(common_words))

Number of words in Trump's excerpt: 55
Number of words in Obama's excerpt: 61
Number of words in common: 39


About 2/3 of the lemmas in the substring and 1/2 the words in the speech are identical, giving even more evidence that the two speeches are very similar.

### Min Edit Distance Algorithm Explanation

The min edit distance algorithm calculates the minimum number of edits required to convert one string to another. An edit can be the addition of a character, the deletion of a character, or the conversion of one character to another. For example, the distance between the strings "bleed" and "read" would be calculated by doing the following conversions: "bleed" -> "rleed" -> "reed" -> "read". This conversion had two conversions and one deletion, making the distance between the words 3. The algorithm is useful for finding strings that are similar, as a low distance between documents indicates that there are not many character differences between them.

# Part 2

## Part A (download and preprocess text)

In [15]:
# init regex for removing punctuation
regex = re.compile('[%s]' % re.escape(string.punctuation))

# init function to help find most frequent unigrams and bigrams (returns a list of the highest keys and their values)
def highest_dict_values(dic, num_values):
    highest_keys = ['']  # default value that is replaced
    highest_values = [0]  # ensures the default value is replaced, all words must have a frequency >= 1
    for word in dic:  # finding the highest values
        for n in highest_values:
            if n < dic[word]:
                highest_keys.insert(highest_values.index(n), word)
                highest_values.insert(highest_values.index(n), dic[word])
                highest_values = highest_values[:num_values]
                highest_keys = highest_keys[:num_values]
                break
    highest_val_and_keys = []
    for i in range(len(highest_values)):
        highest_val_and_keys.append([highest_keys[i], highest_values[i]])
    return highest_val_and_keys

In [16]:
# download text from file
document = open('island_of_dr_moreau_processed.txt', 'r')  # removed boilerplate, chapter names, and any other non-story parts
text = document.read()
text = regex.sub(' ', text)  # remove punctuation using the regex compiler
text = text.lower()
print(text)

on february the first 1887  the lady vain was lost by collision
with a derelict when about the latitude 1 degree s  and longitude
107 degrees w 

on january the fifth  1888  that is eleven months and four days after  my
uncle  edward prendick  a private gentleman  who certainly went
aboard the lady vain at callao  and who had been considered drowned 
was picked up in latitude 5 degrees 3  s  and longitude 101 degrees w 
in a small open boat of which the name was illegible  but which is
supposed to have belonged to the missing schooner ipecacuanha 
he gave such a strange account of himself that he was supposed demented 
subsequently he alleged that his mind was a blank from the moment
of his escape from the lady vain   his case was discussed among
psychologists at the time as a curious instance of the lapse
of memory consequent upon physical and mental stress 
the following narrative was found among his papers by the undersigned 
his nephew and heir  but unaccompanied by any definite re

In [17]:
# tokenize text
tokenized_text = nltk.word_tokenize(text)

## Part B (Regex and list comprehension)

### List Comprehension

In [18]:
# find the most common bigrams
bigrams = []
num_words = len(tokenized_text)
for i in range(num_words - 1):  # minus one to stop from running on the last word
    bigrams.append([tokenized_text[i], tokenized_text[i+1]])
frequency_dict_bi = {}
for bi in bigrams:
    bi_string = ' '
    bi_string = bi_string.join(bi)
    if bi_string in frequency_dict_bi:
        frequency_dict_bi[bi_string] += 1
    else:
        frequency_dict_bi[bi_string] = 1
highest_dict_values(frequency_dict_bi, 100)

[['of the', 446],
 ['in the', 226],
 ['to the', 142],
 ['i had', 131],
 ['and the', 113],
 ['i was', 107],
 ['it was', 103],
 ['on the', 86],
 ['of a', 85],
 ['said i', 81],
 ['at the', 80],
 ['and then', 77],
 ['with a', 74],
 ['in a', 74],
 ['that i', 70],
 ['with the', 69],
 ['to me', 69],
 ['into the', 68],
 ['i saw', 67],
 ['of my', 67],
 ['as i', 65],
 ['from the', 64],
 ['and i', 61],
 ['the beast', 61],
 ['the beach', 59],
 ['was a', 58],
 ['i heard', 58],
 ['out of', 57],
 ['the law', 56],
 ['he said', 53],
 ['he was', 51],
 ['in my', 51],
 ['i could', 49],
 ['at me', 48],
 ['me i', 45],
 ['beast people', 45],
 ['he had', 44],
 ['then i', 44],
 ['by the', 43],
 ['a little', 43],
 ['had been', 40],
 ['said the', 40],
 ['me and', 40],
 ['the island', 40],
 ['m ling', 40],
 ['did not', 39],
 ['it is', 39],
 ['said he', 39],
 ['there was', 39],
 ['for a', 38],
 ['that the', 37],
 ['but i', 37],
 ['and a', 37],
 ['upon the', 37],
 ['was the', 37],
 ['the thing', 36],
 ['began to', 

In [19]:
# find context for how the main character describes beasts
elevengrams = []
for i in range(len(tokenized_text)):
    if "beast" in tokenized_text[i]:
        elevengrams.append(tokenized_text[i-5:i+6])
for gram in elevengrams:
    print(gram)

['forgot', 'the', 'noise', 'of', 'the', 'beast', 'that', 'had', 'troubled', 'me', 'after']
['said', 'montgomery', 'what', 'are', 'these', 'beasts', 'for', 'merchandise', 'curios', 'does', 'the']
['you', 'agreed', 'to', 'take', 'the', 'beasts', 'i', 'wish', 'i', 'd', 'never']
['island', 'what', 'the', 'devil', 'want', 'beasts', 'for', 'on', 'an', 'island', 'like']
['did', 'he', 'want', 'with', 'the', 'beasts', 'why', 'too', 'had', 'he', 'pretended']
['captain', 'this', 'ship', 'aint', 'for', 'beasts', 'and', 'cannibals', 'and', 'worse', 'than']
['and', 'cannibals', 'and', 'worse', 'than', 'beasts', 'any', 'more', 'overboard', 'you', 'go']
['on', 'all', 'fours', 'like', 'a', 'beast', 'he', 'was', 'clothed', 'in', 'bluish']
['the', 'unmistakable', 'mark', 'of', 'the', 'beast', 'i', 'stood', 'overcome', 'by', 'this']
['earth', 'was', 'he', 'man', 'or', 'beast', 'what', 'did', 'he', 'want', 'with']
['after', 'me', 'was', 'it', 'a', 'beast', 'or', 'was', 'it', 'a', 'man']
['off', 'a', 'lost'

### Regex Expressions

In [20]:
# try to find the location of the island
re.findall("[latitude|longitude]+\s+\d+\s+degrees\s+\w+", text)

['longitude\n107 degrees w',
 'latitude 5 degrees 3',
 'longitude 101 degrees w',
 'latitude 5 degrees s',
 'longitude 105 degrees e']

In [21]:
# find the laws of the beast folk and their mentions
re.findall("(not to)(.*)(that is the law)", text) # makes the results into a 3 element tuple, with the center 
                                                  # element being unique portion of the law

[('not to', ' go on all fours  ', 'that is the law'),
 ('not to', ' go on all fours  ', 'that is the law'),
 ('not to', ' suck up drink  ', 'that is the law'),
 ('not to', ' eat fish or flesh  ', 'that is the law'),
 ('not to', ' claw the bark of trees  ', 'that is the law'),
 ('not to', ' chase other men  ', 'that is the law'),
 ('not to', ' run on all fours  ', 'that is the law'),
 ('not to', ' chase other men  ', 'that is the law'),
 ('not to', ' eat flesh or fish  ', 'that is the law'),
 ('not to', ' go on all fours  ', 'that is the law'),
 ('not to', ' suck your drink  ', 'that is the law')]

## Part C (basic corpus statistics)

In [22]:
# calculating V (found N in part B)
num_unique_words = len(set(tokenized_text))
print(num_words, num_unique_words)

44298 5266


In [23]:
# calculating mean, median, and standard deviation of list elements
token_lengths = []
for token in tokenized_text:
    token_lengths.append(len(token))
mean = sum(token_lengths)/len(token_lengths)
print('Mean word length:', mean)
median = stats.median(token_lengths)
print('Median word length:', median)
sd = stats.stdev(token_lengths)
print('Standard Deviation of the word lengths:', sd)

Mean word length: 4.212131473204208
Median word length: 4.0
Standard Deviation of the word lengths: 2.2779598008692217


In [24]:
# calculating hapaxes statistics
not_hapaxes = set()
hapaxes = list()
for token in tokenized_text:
    token = token.lower()
    if not (token in not_hapaxes):
        if token in hapaxes:
            hapaxes.remove(token)
            not_hapaxes.add(token)
        else:
            hapaxes.append(token)
print("Number of hapaxes:", len(hapaxes))
print("Percentage of the total book:", len(hapaxes) / num_words)
print("Five random hapaxes:")
random_nums = []
for i in range(0, 5):
    n = random.randint(0, len(hapaxes))
    random_nums.append(n)
for r in random_nums:
    print(hapaxes[r])

Number of hapaxes: 2612
Percentage of the total book: 0.05896428732674162
Five random hapaxes:
list
litter
sexual
glide
hemmed


In [25]:
# plot of the frequency of the 5 most used words
frequency_dict = {}
for token in tokenized_text:
    if token in frequency_dict:
        frequency_dict[token] += 1
    else:
        frequency_dict[token] = 1
most_frequent_words = highest_dict_values(frequency_dict, 30)
print(most_frequent_words)

[['the', 2976], ['and', 1634], ['i', 1579], ['of', 1406], ['a', 1040], ['to', 923], ['in', 663], ['was', 613], ['my', 562], ['that', 539], ['he', 528], ['me', 485], ['it', 483], ['with', 428], ['had', 382], ['his', 367], ['at', 344], ['said', 284], ['as', 259], ['then', 249], ['but', 237], ['you', 236], ['for', 235], ['on', 221], ['not', 206], ['man', 205], ['montgomery', 204], ['him', 202], ['this', 192], ['is', 191]]
