In [1]:
# imports
import re
import nltk
from nltk.corpus import wordnet as wn
import os.path as p
import statistics as stats
import string
import re
import random

In [2]:
# neccessary downloads (uncomment if you need any of them, then recomment the code and run the notebook again)
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('tagsets')

# Part 1

## Part A (distance between speeches)

In [3]:
# full speech excerpts
trump_speech = "from a young age my parents impressed on me the values that you work hard for what you want in life that your word is your bond and you do what you say and keep your promise that you treat people with respect they taught and showed me values and morals in their daily life"
obama_speech = "and barack and i were raised with so many of the same values that you work hard for what you want in life that your word is your bond and you do what you say youre going to do that you treat people with dignity and respect even if you dont know them and even if you dont agree with them"
print("Trump speech excerpt (punctuation removed):", trump_speech)
print("Obama speech excerpt (punctuation removed):", obama_speech)

Trump speech excerpt (punctuation removed): from a young age my parents impressed on me the values that you work hard for what you want in life that your word is your bond and you do what you say and keep your promise that you treat people with respect they taught and showed me values and morals in their daily life
Obama speech excerpt (punctuation removed): and barack and i were raised with so many of the same values that you work hard for what you want in life that your word is your bond and you do what you say youre going to do that you treat people with dignity and respect even if you dont know them and even if you dont agree with them


In [4]:
# distance between full speech excerpts
print("Distance between speech excerpts is",nltk.edit_distance(trump_speech, obama_speech))

Distance between speech excerpts is 118


In [5]:
# shortened speech excerpts
trump_speech_shortened = re.search('that you work hard(.+?)respect', trump_speech)
trump_speech_shortened = 'that you work hard'+ trump_speech_shortened.group(1) + 'respect'
obama_speech_shortened = re.search('that you work hard(.+?)respect', obama_speech)
obama_speech_shortened = 'that you work hard'+ obama_speech_shortened.group(1) + 'respect'
print("Trump speech shortened:", trump_speech_shortened)
print("Obama speech shortened:", obama_speech_shortened)

Trump speech shortened: that you work hard for what you want in life that your word is your bond and you do what you say and keep your promise that you treat people with respect
Obama speech shortened: that you work hard for what you want in life that your word is your bond and you do what you say youre going to do that you treat people with dignity and respect


In [6]:
# distance between shortened speech excerpts
print("Distance between shorted speech excerpts is", nltk.edit_distance(trump_speech_shortened, obama_speech_shortened))

Distance between shorted speech excerpts is 29


## Part B (lemmas in each substring)

In [7]:
# init lemmatizer, make function to convert Upenn to Wordnet tags, and make function to find lemmas appearing in both
lemmatizer = nltk.WordNetLemmatizer()

def upenn_lemmatizer(tagged_samples):
    """
    Replaces upenn pos tags with wordnet pos tag objects and then lemmatizes the words
    :tagged_samples: list of pos tuples generated using nltk.pos_tag
    :return: list of lemmas
    """
    lemmas = []
    upenn_to_wordnet_dic = {  
        'J':wn.ADJ,
        'V':wn.VERB,
        'N':wn.NOUN,
        'R':wn.ADV
    }
    for word in tagged_samples:
        word = list(word)  # convert tuple to list so it's mutable
        word[1] = word[1][0]  # reduce pos string to one character to match upenn_to_wordnet_dic formatting
        if word[1] in upenn_to_wordnet_dic:  # convert from upenn to wordnet
            word[1] = upenn_to_wordnet_dic[word[1]]
        else:
            word[1] = wn.NOUN
        lemmas.append(lemmatizer.lemmatize(*word))
    return lemmas

def token_similarities(speech_a, speech_b):
    """
    Finds all the tokens in common between two documents
    :speech_a: First document for comparison (can be any iterable that is able to be converted to a set)
    :speech_b: Second document, same rules as speech_a
    :returns: a list of common tokens
    """
    tokens_in_common = []
    speech_a = set(speech_a)
    speech_b = set(speech_b)
    if (speech_a & speech_b):
        tokens_in_common.extend(speech_a)
    return tokens_in_common

In [8]:
# make pos list
trump_speech_shortened_tokenized = nltk.word_tokenize(trump_speech_shortened)
trump_speech_shortened_tagged = nltk.pos_tag(trump_speech_shortened_tokenized)
obama_speech_shortened_tokenized = nltk.word_tokenize(obama_speech_shortened)
obama_speech_shortened_tagged = nltk.pos_tag(obama_speech_shortened_tokenized)

In [9]:
# feed pos list into lemmatizer and make a list of these lemmas
trump_lemmas = upenn_lemmatizer(trump_speech_shortened_tagged)
obama_lemmas = upenn_lemmatizer(obama_speech_shortened_tagged)
print("Trump's lemmas:", trump_lemmas)
print("Obama's lemmas:", obama_lemmas)

Trump's lemmas: ['that', 'you', 'work', 'hard', 'for', 'what', 'you', 'want', 'in', 'life', 'that', 'your', 'word', 'be', 'your', 'bond', 'and', 'you', 'do', 'what', 'you', 'say', 'and', 'keep', 'your', 'promise', 'that', 'you', 'treat', 'people', 'with', 'respect']
Obama's lemmas: ['that', 'you', 'work', 'hard', 'for', 'what', 'you', 'want', 'in', 'life', 'that', 'your', 'word', 'be', 'your', 'bond', 'and', 'you', 'do', 'what', 'you', 'say', 'youre', 'go', 'to', 'do', 'that', 'you', 'treat', 'people', 'with', 'dignity', 'and', 'respect']


In [10]:
# find lemmas in common
common_lemmas = token_similarities(trump_lemmas, obama_lemmas)
print("Lemmas in common in substrings:", common_lemmas)

Lemmas in common in substrings: ['that', 'in', 'be', 'respect', 'hard', 'promise', 'you', 'bond', 'with', 'your', 'work', 'people', 'want', 'word', 'and', 'keep', 'life', 'do', 'what', 'for', 'say', 'treat']


## Part C (words in both excerpts)

In [11]:
# find words in common (utilizing token_similarities from Part B)
trump_speech_full_tokenized = nltk.word_tokenize(trump_speech)
obama_speech_full_tokenized = nltk.word_tokenize(obama_speech)
common_words = token_similarities(trump_speech_full_tokenized, obama_speech_full_tokenized)
print("Words in common in excerpts:", common_words)

Words in common in excerpts: ['age', 'parents', 'that', 'in', 'young', 'from', 'a', 'respect', 'hard', 'promise', 'you', 'bond', 'with', 'their', 'your', 'on', 'work', 'me', 'daily', 'people', 'they', 'want', 'word', 'values', 'and', 'keep', 'life', 'the', 'do', 'what', 'morals', 'my', 'impressed', 'for', 'say', 'is', 'taught', 'treat', 'showed']


## Part D (analysis of results)

### Numerics used in Results Discussion

In [12]:
# character lengths of each excerpt and substring
print("Character length of Trump speech:", len(trump_speech))
print("Character length of Obama speech:", len(obama_speech))
print("Character length of Trump speech substring:", len(trump_speech_shortened))
print("Character length of Obama speech substring:", len(obama_speech_shortened))

Character length of Trump speech: 272
Character length of Obama speech: 286
Character length of Trump speech substring: 153
Character length of Obama speech substring: 161


In [13]:
# total lemmas in each compared to lemmas in common
print("Number of lemmas in Trump's substring:", len(trump_lemmas))
print("Number of lemmas in Obama's substring:", len(obama_lemmas))
print("Number of lemmas in common:", len(common_lemmas))

Number of lemmas in Trump's substring: 32
Number of lemmas in Obama's substring: 34
Number of lemmas in common: 22


In [14]:
# words in each excerpt compared to words in common
print("Number of words in Trump's excerpt:", len(trump_speech_full_tokenized))
print("Number of words in Obama's excerpt:", len(obama_speech_full_tokenized))
print("Number of words in common:", len(common_words))

Number of words in Trump's excerpt: 55
Number of words in Obama's excerpt: 61
Number of words in common: 39


# Part 2

## Part A (download and preprocess text)

In [15]:
# init regex for removing punctuation
regex = re.compile('[%s]' % re.escape(string.punctuation))

# init function to help find most frequent unigrams and bigrams (returns a list of the highest keys and their values)
def highest_dict_values(dic, num_values):
    highest_keys = ['']  # default value that is replaced
    highest_values = [0]  # ensures the default value is replaced, all words must have a frequency >= 1
    for word in dic:  # finding the highest values
        for n in highest_values:
            if n < dic[word]:
                highest_keys.insert(highest_values.index(n), word)
                highest_values.insert(highest_values.index(n), dic[word])
                highest_values = highest_values[:num_values]
                highest_keys = highest_keys[:num_values]
                break
    highest_val_and_keys = []
    for i in range(len(highest_values)):
        highest_val_and_keys.append([highest_keys[i], highest_values[i]])
    return highest_val_and_keys

In [25]:
# download text from file
document = open('ps1.2-PeeksOkafor.txt', 'r')  # removed boilerplate, chapter names, and any other non-story parts
text = document.read()
text = regex.sub(' ', text)  # remove punctuation using the regex compiler
text = text.lower()
document.close()


In [27]:
# tokenize text
tokenized_text = nltk.word_tokenize(text)


## Part B (Regex and list comprehension)

### List Comprehension

In [21]:
# find the most common bigrams
bigrams = []
num_words = len(tokenized_text)
for i in range(num_words - 1):  # minus one to stop from running on the last word
    bigrams.append([tokenized_text[i], tokenized_text[i+1]])
frequency_dict_bi = {}
for bi in bigrams:
    bi_string = ' '
    bi_string = bi_string.join(bi)
    if bi_string in frequency_dict_bi:
        frequency_dict_bi[bi_string] += 1
    else:
        frequency_dict_bi[bi_string] = 1
most_common_bigrams = highest_dict_values(frequency_dict_bi, 200)
boringbigrams = 'oftheandisinthenittoatmedidnothadbeentherewastheywerewithmy'
goodbigrams = []
for bi in most_common_bigrams:
    bilist = bi[0].split()
    if not bilist[0] in boringbigrams and not bilist[1] in boringbigrams:
        goodbigrams.append(bi)
print(goodbigrams)

[['beast people', 90], ['said montgomery', 56], ['ape man', 56], ['haired man', 42], ['his face', 40], ['his eyes', 36], ['beast folk', 36], ['leopard man', 36], ['hyena swine', 36], ['away from', 34], ['one another', 32], ['went on', 32], ['his head', 32]]


In [19]:
# find context for how the main character describes beasts
elevengrams = []
for i in range(len(tokenized_text)):
    if "beast" in tokenized_text[i]:
        elevengrams.append(tokenized_text[i-5:i+6])
for gram in elevengrams:
    print(' '.join(gram))

forgot the noise of the beast that had troubled me after
said montgomery what are these beasts for merchandise curios does the
you agreed to take the beasts i wish i d never
island what the devil want beasts for on an island like
did he want with the beasts why too had he pretended
captain this ship aint for beasts and cannibals and worse than
and cannibals and worse than beasts any more overboard you go
on all fours like a beast he was clothed in bluish
the unmistakable mark of the beast i stood overcome by this
earth was he man or beast what did he want with
after me was it a beast or was it a man
off a lost soul a beast to the rest of their
my encounters with the other beast men you he said in
a great proportion of these beast people had malformed hands lacking
and some more of these beast people lived i might perhaps
face of neither man nor beast but a mere shock of
so said one of the beasts in the doorway evil are
escape none escape said the beast folk glancing furtively at one
co

### Regex Expressions

In [20]:
# try to find the location of the island
re.findall("[latitude|longitude]+\s+\d+\s+degrees\s+\w+", text)

['longitude\n107 degrees w',
 'latitude 5 degrees 3',
 'longitude 101 degrees w',
 'latitude 5 degrees s',
 'longitude 105 degrees e',
 'longitude\n107 degrees w',
 'latitude 5 degrees 3',
 'longitude 101 degrees w',
 'latitude 5 degrees s',
 'longitude 105 degrees e']

In [21]:
# find the laws of the beast folk and their mentions
re.findall("(not to)(.*)(that is the law)", text) # makes the results into a list of 3 element tuples,
                                                  # with the center element being unique portion of 
                                                  # the law that is unique

[('not to', ' go on all fours  ', 'that is the law'),
 ('not to', ' go on all fours  ', 'that is the law'),
 ('not to', ' suck up drink  ', 'that is the law'),
 ('not to', ' eat fish or flesh  ', 'that is the law'),
 ('not to', ' claw the bark of trees  ', 'that is the law'),
 ('not to', ' chase other men  ', 'that is the law'),
 ('not to', ' run on all fours  ', 'that is the law'),
 ('not to', ' chase other men  ', 'that is the law'),
 ('not to', ' eat flesh or fish  ', 'that is the law'),
 ('not to', ' go on all fours  ', 'that is the law'),
 ('not to', ' suck your drink  ', 'that is the law'),
 ('not to', ' go on all fours  ', 'that is the law'),
 ('not to', ' go on all fours  ', 'that is the law'),
 ('not to', ' suck up drink  ', 'that is the law'),
 ('not to', ' eat fish or flesh  ', 'that is the law'),
 ('not to', ' claw the bark of trees  ', 'that is the law'),
 ('not to', ' chase other men  ', 'that is the law'),
 ('not to', ' run on all fours  ', 'that is the law'),
 ('not to'

## Part C (basic corpus statistics)

In [28]:
# calculating V (found N in part B)
num_unique_words = len(set(tokenized_text))
print('N:', num_words//2)  # for some reason, the tokenized text variable has
print('V:', num_unique_words)

N: 44297
V: 5266


In [23]:
# calculating mean, median, and standard deviation of list elements
token_lengths = []
for token in tokenized_text:
    token_lengths.append(len(token))
mean = sum(token_lengths)/len(token_lengths)
print('Mean word length:', mean)
median = stats.median(token_lengths)
print('Median word length:', median)
sd = stats.stdev(token_lengths)
print('Standard Deviation of the word lengths:', sd)

Mean word length: 4.212066143687567
Median word length: 4
Standard Deviation of the word lengths: 2.27791644410513


In [24]:
# calculating hapaxes statistics
frequency_dict = {}  # find frequency of all the words (also used to find the
                     # most common words later)
for token in tokenized_text:
    if token in frequency_dict:
        frequency_dict[token] += 1
    else:
        frequency_dict[token] = 0
hapaxes = []
for word in frequency_dict:
    if frequency_dict[word] == 1:
        hapaxes.append(word)
print("Number of hapaxes:", len(hapaxes))
print("Percentage of the total vocabulary:", len(hapaxes) / num_unique_words)
print("Five random hapaxes:")
random_nums = []
for i in range(0, 5):
    n = random.randint(0, len(hapaxes))
    random_nums.append(n)
for r in random_nums:
    print(hapaxes[r])

Number of hapaxes: 2612
Percentage of the total vocabulary: 0.49601215343714394
Five random hapaxes:
skull
grapple
peaked
courtyard
obtaining


In [25]:
# plot of the frequency of the 5 most used words
most_frequent_words = highest_dict_values(frequency_dict, 30)
print(most_frequent_words)

[['the', 5951], ['and', 3267], ['i', 3157], ['of', 2811], ['a', 2079], ['to', 1845], ['in', 1325], ['was', 1225], ['my', 1123], ['that', 1077], ['he', 1055], ['me', 969], ['it', 965], ['with', 855], ['had', 763], ['his', 733], ['at', 687], ['said', 567], ['as', 517], ['then', 497], ['but', 473], ['you', 471], ['for', 469], ['on', 441], ['not', 411], ['man', 409], ['montgomery', 407], ['him', 403], ['this', 383], ['is', 381]]
