In [2]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
import string

In [3]:
from urllib import request

url_addr = "https://www.gutenberg.org/cache/epub/73057/pg73057.txt"
# Download the book as raw bytes and decode it as UTF-8. The context manager
# closes the HTTP response as soon as the body has been read, instead of
# leaving the connection open for the lifetime of the kernel.
with request.urlopen(url_addr) as response:
    text = response.read().decode('utf8')

In [4]:
# Keep a local copy of the downloaded text, written as UTF-8.
path = "nlp_text_lab_1.txt"

# Mode "w" overwrites any file already present at `path`.
with open(path, "w", encoding="utf-8") as file:
    file.write(text)

In [5]:
# Length of the decoded book text, in characters.
print(len(text))

599919


In [6]:
# Show the first 90 characters as a quick check of what was downloaded.
print(text[:90])

The Project Gutenberg eBook of History of the United States of America, Volume 8 (of 9)



In [7]:
# Drop everything up to and including the first chapter heading so that the
# front matter does not take part in the analysis.
start_index = text.find("CHAPTER I")
if start_index == -1:
    # str.find returns -1 when the marker is absent; slicing with that value
    # would silently chop off only the first few characters of the text.
    raise ValueError("'CHAPTER I' marker not found; cannot strip the header")

# Skip the heading itself ("CHAPTER I." is 10 characters) and the whitespace after it.
text_without_header = text[start_index+len("CHAPTER I."):].lstrip()
# NOTE(review): only the beginning of the text is trimmed here; anything that
# follows the book's last chapter (presumably a licence footer) is kept — confirm.

# Preview only the beginning; printing the whole book floods the notebook.
print(text_without_header[:1000])

AT the beginning of the year 1814, the attitude of New England
pleased no one, and perhaps annoyed most the New England people
themselves, who were conscious of showing neither dignity, power,
courage, nor intelligence. Nearly one half the people of the five New
England States supported the war, but were paralyzed by the other half,
which opposed it. Of the peace party, one half wished to stop the war,
but was paralyzed by the other half, which threatened to desert their
leaders at the first overt act of treason. In this dead-lock every one
was dissatisfied, but no one seemed disposed to yield.

Such a situation could not last. In times of revolution treason might
be necessary, but inert perversity could at no time serve a useful
purpose. Yet the Massachusetts Federalists professed only a wish to
remain inert. Josiah Quincy, who fretted at restraints, and whose
instincts obliged him to act as energetically as he talked, committed
his party to the broad assertion that “a moral and relig

In [8]:
# Split the header-free text into sentences and report how many were found.
sentences = sent_tokenize(text_without_header)
no_of_sentences = len(sentences)
print(no_of_sentences)

4240


In [9]:
# Average number of tokens per sentence: tokenize each sentence separately and
# sum the token counts.
# NOTE(review): the tokens are not filtered here, while a later cell filters
# `all_words` down to alphanumeric tokens; presumably punctuation tokens are
# included in this count — confirm whether that is intended.
total_words = sum(len(word_tokenize(sentence)) for sentence in sentences)
# The conditional guards against a division by zero when no sentence was found.
average_len = total_words / no_of_sentences if no_of_sentences > 0 else 0
print(average_len)

26.993632075471698


In [10]:
# Tokenize the whole header-free text in a single call (not sentence by sentence).
all_words = word_tokenize(text_without_header)

In [11]:
# Build bigram and trigram finders over the full token stream; their `ngram_fd`
# attributes are read in the following cells.
bigram_finder = BigramCollocationFinder.from_words(all_words)
trigram_finder = TrigramCollocationFinder.from_words(all_words)

In [12]:
# (n-gram, value) pairs taken from each finder's `ngram_fd`.
# NOTE(review): the value is presumably the occurrence count — confirm in the NLTK docs.
all_bigrams = bigram_finder.ngram_fd.items()
all_trigrams = trigram_finder.ngram_fd.items()

In [13]:
# Distinct n-grams: iterating `ngram_fd` yields its keys, so each n-gram appears once.
all_unique_bigrams = set(bigram_finder.ngram_fd)
all_unique_trigrams = set(trigram_finder.ngram_fd)
# Backward-compatible alias for the original, misspelled variable name.
all_uique_trigrams = all_unique_trigrams

In [14]:
# Printing the whole set floods the notebook; report its size and a small,
# deterministic (sorted) sample instead.
print(len(all_unique_bigrams))
print(sorted(all_unique_bigrams)[:20])



In [15]:
import re

In [16]:
# Keep only tokens made up entirely of ASCII letters and digits, lower-cased.
# `fullmatch` with `+` requires at least one character and, unlike `^...*$`,
# accepts neither the empty string nor a token ending in a newline.
filtered_words = [word.lower() for word in all_words if re.fullmatch("[a-zA-Z0-9]+", word)]
# Preview only: the full list is far too long to print.
print(filtered_words[:200])



In [17]:
from collections import Counter

In [18]:
def extract_most_freq(lw, N):
    """Return the N most frequent items of `lw`.

    Args:
        lw: iterable of hashable items (here: a list of words).
        N: how many entries to return.

    Returns:
        A list of (item, count) tuples ordered from most to least frequent.
    """
    return Counter(lw).most_common(N)

In [19]:
# Five most frequent tokens, before any stop-word removal.
print(extract_most_freq(filtered_words, 5))

[('the', 7616), ('of', 3649), ('and', 2939), ('to', 2809), ('in', 1710)]


In [20]:
from nltk.corpus import stopwords

In [21]:
# English stop words, stored as a set so that the membership tests in the
# filtering cell below are hash lookups rather than list scans.
stop_words = set(stopwords.words('english'))

In [22]:
# Tokens that survive stop-word removal; `lws` is the working list for every
# later cell (stemming, lemmatisation, number conversion, n-gram windows).
lws = [word for word in filtered_words if word not in stop_words]
# Preview only: the full list is far too long to print.
print(lws[:200])



In [23]:
# Five most frequent tokens once stop words have been removed.
print(extract_most_freq(lws, 5))

[('british', 470), ('hundred', 387), ('could', 322), ('men', 318), ('1814', 312)]


In [24]:
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer, WordNetLemmatizer

In [25]:
# Porter stemmer instance, reused by the stemming cells below.
ps = PorterStemmer()

In [26]:
# Stem every remaining token with the Porter stemmer and preview the first 200 stems.
list_stemmed = [ps.stem(ws) for ws in lws]
print(list_stemmed[:200])

['begin', 'year', '1814', 'attitud', 'new', 'england', 'pleas', 'one', 'perhap', 'annoy', 'new', 'england', 'peopl', 'consciou', 'show', 'neither', 'digniti', 'power', 'courag', 'intellig', 'nearli', 'one', 'half', 'peopl', 'five', 'new', 'england', 'state', 'support', 'war', 'paralyz', 'half', 'oppos', 'peac', 'parti', 'one', 'half', 'wish', 'stop', 'war', 'paralyz', 'half', 'threaten', 'desert', 'leader', 'first', 'overt', 'act', 'treason', 'everi', 'one', 'dissatisfi', 'one', 'seem', 'dispos', 'yield', 'situat', 'could', 'last', 'time', 'revolut', 'treason', 'might', 'necessari', 'inert', 'pervers', 'could', 'time', 'serv', 'use', 'purpos', 'yet', 'massachusett', 'federalist', 'profess', 'wish', 'remain', 'inert', 'josiah', 'quinci', 'fret', 'restraint', 'whose', 'instinct', 'oblig', 'act', 'energet', 'talk', 'commit', 'parti', 'broad', 'assert', 'moral', 'religi', 'peopl', 'could', 'express', 'admir', 'heroism', 'display', 'caus', 'disapprov', 'would', 'defend', 'massachusett', 'wa

In [27]:
from nltk.corpus import wordnet

In [28]:
# Reading the stemmed list, some stems are visibly not dictionary words because
# they have lost their last letter (e.g. the 'e' of 'attitud' or 'peopl').
# To find such stems automatically, keep every stem for which WordNet returns
# no synsets.
# NOTE(review): this check also flags tokens that are not stemming artefacts;
# the saved output contains '1814', 'could', 'would' and 'upon', for example.
not_in_dictionary = [word for word in list_stemmed if not wordnet.synsets(word)]
# Preview only, in line with how list_stemmed is printed above.
print(not_in_dictionary[:200])

['1814', 'attitud', 'perhap', 'peopl', 'consciou', 'digniti', 'courag', 'intellig', 'nearli', 'peopl', 'paralyz', 'oppos', 'peac', 'parti', 'paralyz', 'everi', 'dissatisfi', 'dispos', 'situat', 'could', 'revolut', 'necessari', 'pervers', 'could', 'serv', 'purpos', 'massachusett', 'josiah', 'quinci', 'whose', 'oblig', 'energet', 'parti', 'religi', 'peopl', 'could', 'admir', 'caus', 'disapprov', 'would', 'massachusett', 'invad', 'safeti', 'would', 'refus', 'seiz', 'oblig', 'massachusett', 'territori', 'invad', 'would', 'januari', '1814', 'battl', 'arriv', 'invas', 'becam', 'immin', 'offici', 'privat', 'doctrin', 'legislatur', 'januari', '1814', 'easili', 'inflam', 'victori', 'mortif', 'occas', 'chargeabl', 'upon', 'upon', 'voluntarili', 'without', 'oblig', 'encourag', 'massachusett', 'senat', 'languag', 'emphat', 'submiss', 'agreeabl', 'constitut', 'necessari', 'oblig', 'hostil', 'invas', 'justifi', 'peopl', 'encourag', 'charact', 'without', 'becom', 'partak', 'obnoxi', 'retribut', 'divi

In [29]:
# Two further stemmers to compare against the Porter stemmer (`ps`).
ls = LancasterStemmer()
snb = SnowballStemmer('english')

In [30]:
# Stem the same word with all three stemmers, in the order Porter, Lancaster,
# Snowball, to see how their results differ.
print(ps.stem('being'))
print(ls.stem('being'))
print(snb.stem('being'))

be
being
be


In [31]:
# Maps a word to its (porter, lancaster, snowball) stems; the loop below adds
# an entry only for words on which the three stemmers disagree.
stemming_res = {}

In [32]:
# NOTE(review): NW is not read by any of the cells visible in this notebook;
# the comparison loop below iterates over all of `lws`. Presumably it was meant
# to limit the comparison to the first NW words — confirm, then use or remove it.
NW = 500

In [33]:
# Stem every token with the three stemmers and record the words for which the
# results are not all identical.
for word in lws:
    porter_stem = ps.stem(word)
    lancaster_stem = ls.stem(word)
    snowball_stem = snb.stem(word)

    # True as soon as any two of the three stems differ.
    if porter_stem != lancaster_stem or lancaster_stem != snowball_stem:
        # Keyed by word, so a repeated word simply overwrites its own entry.
        stemming_res[word] = (porter_stem, lancaster_stem, snowball_stem)


In [34]:
# Column width for the comparison table: the longest recorded stem plus two
# characters of padding. `default=0` keeps the cell from raising ValueError
# when the stemmers never disagreed and `stemming_res` is empty.
max_stem_length = max((len(stem) for stems in stemming_res.values() for stem in stems), default=0) + 2

In [35]:
# Table of the disagreeing stems, one row per recorded word.
print(f"{'Porter':<{max_stem_length}} | {'Lancaster':<{max_stem_length}} | {'Snowball':<{max_stem_length}}")
# A row is three padded columns plus two 3-character " | " separators, so the
# rule must be 3 * width + 6 characters long to span the whole table.
print('-' * (3 * max_stem_length + 6))
# Only the stems are printed, so iterate over the values directly.
for stems in stemming_res.values():
    print(f"{stems[0]:<{max_stem_length}} | {stems[1]:<{max_stem_length}} | {stems[2]:<{max_stem_length}}")

Porter           | Lancaster        | Snowball        
--------------------------------------------------
one              | on               | one             
consciou         | conscy           | conscious       
neither          | neith            | neither         
digniti          | dign             | digniti         
power            | pow              | power           
courag           | cour             | courag          
nearli           | near             | near            
five             | fiv              | five            
state            | stat             | state           
paralyz          | paralys          | paralyz         
parti            | party            | parti           
threaten         | threatened       | threaten        
leader           | lead             | leader          
everi            | every            | everi           
dissatisfi       | dissatisfy       | dissatisfi      
situat           | situ             | situat          
time          

In [36]:
# WordNet-based lemmatizer, compared against the Snowball stemmer below.
wl = WordNetLemmatizer()

In [37]:
# Maps a word to its (snowball stem, wordnet lemma) pair; filled in below only
# when the two differ.
comparison = {}

In [38]:
# Running maxima used as column widths for the stem/lemma table below.
# Note that this rebinds `max_stem_length`, which an earlier cell used for the
# three-stemmer table.
max_stem_length = 0
max_lemma_length = 0

In [39]:
# Compare the Snowball stem with the WordNet lemma of every token and record
# the words for which the two differ.
for word in lws:
    snb_word = snb.stem(word)
    # The widths are tracked over all tokens, not only the ones that end up in `comparison`.
    max_stem_length = max(max_stem_length, len(snb_word))

    # NOTE(review): lemmatize() is called without a part-of-speech argument. The
    # saved output shows inflected verbs ('pleased', 'showing') coming back
    # unchanged, so presumably every word is treated as a noun — confirm in the
    # NLTK documentation.
    wordnet_word = wl.lemmatize(word)
    max_lemma_length = max(max_lemma_length, len(wordnet_word))

    if snb_word != wordnet_word:
        comparison[word] = (snb_word, wordnet_word)

In [40]:
# Table of the words whose Snowball stem differs from their WordNet lemma.
print(f"{'Snowball':<{max_stem_length}} | {'WordNetLemmatizer':<{max_lemma_length}}")
# A row is two padded columns plus one 3-character " | " separator, so the rule
# must be stem width + lemma width + 3 characters long.
print('-' * (max_stem_length + max_lemma_length + 3))

# Only the stem/lemma pair is printed, so iterate over the values directly.
for snb_word, wordnet_word in comparison.values():
    print(f"{snb_word:<{max_stem_length}} | {wordnet_word:<{max_lemma_length}}")

Snowball       | WordNetLemmatizer
---------------------------------
begin          | beginning        
attitud        | attitude         
pleas          | pleased          
perhap         | perhaps          
annoy          | annoyed          
peopl          | people           
show           | showing          
digniti        | dignity          
courag         | courage          
intellig       | intelligence     
near           | nearly           
support        | supported        
paralyz        | paralyzed        
oppos          | opposed          
peac           | peace            
parti          | party            
wish           | wished           
threaten       | threatened       
everi          | every            
dissatisfi     | dissatisfied     
seem           | seemed           
dispos         | disposed         
situat         | situation        
revolut        | revolution       
necessari      | necessary        
pervers        | perversity       
serv           | serv

In [41]:
# Lemmatize every token and count how often each lemma occurs.
lemma_words = [wl.lemmatize(word) for word in lws]

lemma_counter = Counter(lemma_words)

N = 10  # number of most frequent lemmas to display

# most_common returns (lemma, count) pairs, most frequent first.
for lemma_word, count in lemma_counter.most_common(N):
    print(f"{lemma_word}: {count}")

british: 470
hundred: 389
state: 356
could: 322
men: 318
1814: 312
new: 289
two: 280
one: 279
army: 227


In [44]:
from num2words import num2words

In [56]:
# Replace every all-digit token of `lws` with its spelled-out form and report
# how many tokens were changed, together with the first N replacements.
#
# NOTE: `lws` is modified in place because later cells read the converted list.
# Re-running this cell therefore finds no digits left and reports 0 changes.
list_change_index = []
N = 10

for i, word in enumerate(lws):
    if word.isdigit():
        lws[i] = num2words(word)
        list_change_index.append(i)

changes = len(list_change_index)
print("Total number of changes is: ", changes)

# Slicing (instead of range(N)) avoids an IndexError when fewer than N tokens
# were changed, which is always the case on a re-run.
lws_N_changes = [lws[idx] for idx in list_change_index[:N]]
print(lws_N_changes)

Total number of changes is:  3232
['one thousand, eight hundred and fourteen', 'one', 'one thousand, eight hundred and fourteen', 'twelve', 'one thousand, eight hundred and fourteen', 'twenty-five', 'one thousand, eight hundred and fourteen', 'one', 'one thousand, seven hundred and eighty-three', 'two']


In [96]:
def find_ngrams(W, N, words=None):
    """Print every window of up to N consecutive tokens built around the word W.

    The match is case-insensitive. One window is printed (as a list) for each
    occurrence of W. A window is shorter than N when the occurrence lies too
    close to the beginning or the end of the token list.

    Args:
        W: the word to look for.
        N: the requested window size.
        words: the token list to search; when omitted, the notebook-level list
            `lws` is used, which is what the original two-argument call did.

    Returns:
        None; the windows are printed.
    """
    if words is None:
        words = lws

    target = W.lower()
    # Number of tokens taken on each side of the match. For odd N both sides get
    # N // 2 tokens. For even N the left side gets one token fewer, which puts W
    # just left of the centre (index N // 2 - 1 of the window).
    # To put W just right of the centre for even N instead, the window would
    # run from idx - N // 2 up to, but not including, idx + N // 2.
    left = (N - 1) // 2
    right = N // 2

    for idx, word in enumerate(words):
        if word.lower() != target:
            continue
        # Clamp the window to the bounds of the token list.
        s_idx = max(0, idx - left)
        e_idx = min(len(words), idx + right + 1)
        print(words[s_idx:e_idx])

In [99]:
# find_ngrams prints its windows itself and returns None, so wrapping the call
# in print() only adds a stray "None" line to the output.
find_ngrams('boundary', 5)

['rectify', 'canadian', 'boundary', 'recovering', 'portion']
['miles', 'beyond', 'boundary', 'according', 'monthly']
['mill', 'crossing', 'boundary', 'march', 'thirty']
None


In [100]:
def ngrams_in_same_sentence(W, N):
    """Print every window of up to N tokens around W that stays within one sentence.

    Each sentence of the notebook-level list `sentences` is tokenized,
    lower-cased and reduced to alphanumeric tokens that are not English stop
    words. A window is then built around each occurrence of W inside the
    sentence, so it never crosses a sentence boundary and is shorter than N
    when the sentence has too few tokens around the match.

    Args:
        W: the word to look for (case-insensitive).
        N: the requested window size.

    Returns:
        None; the windows are printed, one list per line.
    """
    # Load the stop-word list once, as a set. Evaluating
    # stopwords.words('english') inside the comprehension would rebuild the
    # list for every single token of the text.
    english_stopwords = set(stopwords.words('english'))
    target = W.lower()
    # Tokens taken on each side of the match: N // 2 on both sides for odd N,
    # one fewer on the left for even N.
    left = (N - 1) // 2
    right = N // 2

    ngrams = []
    for sentence in sentences:
        sent_words = [
            word.lower()
            for word in word_tokenize(sentence)
            if word.isalnum() and word.lower() not in english_stopwords
        ]
        for idx, word in enumerate(sent_words):
            if word != target:
                continue
            # Clamp the window to the bounds of the current sentence.
            s_index = max(0, idx - left)
            e_index = min(len(sent_words), idx + right + 1)
            ngrams.append(sent_words[s_index:e_index])

    for ngram in ngrams:
        print(ngram)

In [103]:
# ngrams_in_same_sentence prints its windows itself and returns None, so
# wrapping the call in print() only adds a stray "None" line to the output.
ngrams_in_same_sentence('dignity', 4)

['neither', 'dignity', 'power', 'courage']
['measures', 'dignity', 'liberties', 'free']
['behaved', 'dignity', 'usual', 'pride']
None
