# Preprocessing of data

Necessary imports

In [26]:
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
%store -r mortaldict
%store -r goddict

First, we import the data

In [6]:
with open("homer_text_file.txt","r",encoding='utf-8') as f:
    data = f.readlines()
    
print(data[700])

[313] “So do not thou, my friend, wander long far from home, leaving thy wealth behind thee and men in thy house so insolent, lest they divide and devour all thy wealth, and thou shalt have gone on a fruitless journey. But to Menelaus I bid and command thee to go, for he has but lately come from a strange land, from a folk whence no one would hope in his heart to return, whom the storms had once driven astray into a sea so great, whence the very birds do not fare in the space of a year, so great is it and terrible. But now go thy way with thy ship and thy comrades, or, if thou wilt go by land, here are chariot and horses at hand for thee, and here at thy service are my sons, who will be thy guides to goodly Lacedaemon, where lives fair-haired Menelaus. And do thou beseech him thyself that he may tell thee the very truth. A lie will be not utter, for he is wise indeed.”



We split the data into sentences

In [7]:
sentences = []
for par in data:
    if par == '\n':
        continue
    senttemp = sent_tokenize(par)
    sentences = sentences +senttemp

print(sentences[100:105])

['Howbeit Poseidon had gone among the far-off Ethiopians—the Ethiopians who dwell sundered in twain, the farthermost of men, some where Hyperion sets and some where he rises, there to receive a hecatomb of bulls and rams, and there he was taking his joy, sitting at the feast; but the other gods were gathered together in the halls of Olympian Zeus.', "[28] Among them the father of gods and men was first to speak, for in his heart he thought of noble Aegisthus, whom far-famed Orestes, Agamemnon's son, had slain.", 'Thinking on him he spoke among the immortals, and said: “Look you now, how ready mortals are to blame the gods.', 'It is from us, they say, that evils come, but they even of themselves, through their own blind folly, have sorrows beyond that which is ordained.', 'Even as now Aegisthus, beyond that which was ordained, took to himself the wedded wife of the son of Atreus, and slew him on his return, though well he knew of sheer destruction, seeing that we spake to him before, se

Now, we tokenize the sentences

In [8]:
words = []
for sent in sentences:
    wordstemp = word_tokenize(sent)
    words = words + wordstemp
    
print(words[3000:3100])

['my', 'heart', ',', 'and', 'as', 'I', 'think', 'it', 'shall', 'be', 'brought', 'to', 'pass', ',', 'though', 'I', 'am', 'in', 'no', 'wise', 'a', 'soothsayer', ',', 'nor', 'one', 'versed', 'in', 'the', 'signs', 'of', 'birds', '.', 'Not', 'much', 'longer', 'shall', 'he', 'be', 'absent', 'from', 'his', 'dear', 'native', 'land', ',', 'no', ',', 'not', 'though', 'bonds', 'of', 'iron', 'hold', 'him', '.', 'He', 'will', 'contrive', 'a', 'way', 'to', 'return', ',', 'for', 'he', 'is', 'a', 'man', 'of', 'many', 'devices', '.', 'But', 'come', ',', 'tell', 'me', 'this', 'and', 'declare', 'it', 'truly', ',', 'whether', 'indeed', ',', 'tall', 'as', 'thou', 'art', ',', 'thou', 'art', 'the', 'son', 'of', 'Odysseus', 'himself', '.', 'Wondrously']


Now, we PoS tag the data, so that lemmatization is easier, and names of characters can be identified more easily

In [9]:
tagged = nltk.pos_tag(words)

tagged_filtered = []
for wordtag in tagged:
    if wordtag[1] == '.':
        continue
    else: tagged_filtered.append(wordtag)
        
print(tagged_filtered[3000:3100])

[('whether', 'IN'), ('indeed', 'RB'), (',', ','), ('tall', 'JJ'), ('as', 'IN'), ('thou', 'JJ'), ('art', 'NN'), (',', ','), ('thou', 'JJ'), ('art', 'VBP'), ('the', 'DT'), ('son', 'NN'), ('of', 'IN'), ('Odysseus', 'NNP'), ('himself', 'PRP'), ('Wondrously', 'RB'), ('like', 'IN'), ('his', 'PRP$'), ('are', 'VBP'), ('thy', 'JJ'), ('head', 'NN'), ('and', 'CC'), ('beautiful', 'JJ'), ('eyes', 'NNS'), (';', ':'), ('for', 'IN'), ('full', 'JJ'), ('often', 'RB'), ('did', 'VBD'), ('we', 'PRP'), ('consort', 'VB'), ('with', 'IN'), ('one', 'CD'), ('another', 'DT'), ('before', 'IN'), ('he', 'PRP'), ('embarked', 'VBD'), ('for', 'IN'), ('the', 'DT'), ('land', 'NN'), ('of', 'IN'), ('Troy', 'NNP'), (',', ','), ('whither', 'JJR'), ('others', 'NNS'), (',', ','), ('too', 'RB'), (',', ','), ('the', 'DT'), ('bravest', 'JJS'), ('of', 'IN'), ('the', 'DT'), ('Argives', 'NNP'), (',', ','), ('went', 'VBD'), ('in', 'IN'), ('their', 'PRP$'), ('hollow', 'JJ'), ('ships', 'NNS'), ('But', 'CC'), ('since', 'IN'), ('that', '

Next, we lemmatize the dataset

In [10]:
# Reference for lemmatizing: https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python

lemmatizer = WordNetLemmatizer()

lemmatized = []
for wordtag in tagged_filtered:
    tag = ''
    if wordtag[1][0:2] ==  'NN':
        tag =  'n'
    elif wordtag[1][0:2] ==  'VB':
        tag =  'v'
    elif wordtag[1][0:2] ==  'JJ':
        tag =  'a'
    elif wordtag[1][0:2] ==  'RB':
        tag =  'r'
        
    # use postag if it exists in the wordnetlemmatizer
    if tag == '': lemma = lemmatizer.lemmatize(wordtag[0])
    else: lemma = lemmatizer.lemmatize(wordtag[0], pos=tag)
        
    lemmatized.append((lemma, wordtag[1]))
    
print(lemmatized[3000:3100])

[('whether', 'IN'), ('indeed', 'RB'), (',', ','), ('tall', 'JJ'), ('a', 'IN'), ('thou', 'JJ'), ('art', 'NN'), (',', ','), ('thou', 'JJ'), ('art', 'VBP'), ('the', 'DT'), ('son', 'NN'), ('of', 'IN'), ('Odysseus', 'NNP'), ('himself', 'PRP'), ('Wondrously', 'RB'), ('like', 'IN'), ('his', 'PRP$'), ('be', 'VBP'), ('thy', 'JJ'), ('head', 'NN'), ('and', 'CC'), ('beautiful', 'JJ'), ('eye', 'NNS'), (';', ':'), ('for', 'IN'), ('full', 'JJ'), ('often', 'RB'), ('do', 'VBD'), ('we', 'PRP'), ('consort', 'VB'), ('with', 'IN'), ('one', 'CD'), ('another', 'DT'), ('before', 'IN'), ('he', 'PRP'), ('embark', 'VBD'), ('for', 'IN'), ('the', 'DT'), ('land', 'NN'), ('of', 'IN'), ('Troy', 'NNP'), (',', ','), ('whither', 'JJR'), ('others', 'NNS'), (',', ','), ('too', 'RB'), (',', ','), ('the', 'DT'), ('brave', 'JJS'), ('of', 'IN'), ('the', 'DT'), ('Argives', 'NNP'), (',', ','), ('go', 'VBD'), ('in', 'IN'), ('their', 'PRP$'), ('hollow', 'JJ'), ('ship', 'NNS'), ('But', 'CC'), ('since', 'IN'), ('that', 'DT'), ('day

Next, we go through the data and print all the proper nouns that aren't in the dictionaries. If they are not and they refer to names of characters that are either Olympian gods or mortals, we add them to the dictionaries in the file characterdicts.ipynb. We do this only for names that occur multiple times in the book, since adding all names is not realistic.

In [30]:
non_occuring = {}

mortals = mortaldict
gods = goddict

for wordtag in lemmatized:
    
    # Only look at proper nouns
    if wordtag[1] == 'NNP':
        
        # Binary to indicate if the word has been found
        filled_in = 0
        
        # Go through mortaldict
        for idx, vals in enumerate(mortals.values()):
            if wordtag[0] in vals[0]:
                filled_in = 1
                break
                
        # If word hasn't been found in mortaldict, go through goddict
        if filled_in == 0:
            for idx, vals in enumerate(gods.values()):
                if wordtag[0] in vals[0]:
                    filled_in = 1
                    break
        
        # Print word if it hasn't been found at all
        if filled_in == 0:
            if wordtag[0] in non_occuring.keys():
                non_occuring[wordtag[0]] += 1
            else: non_occuring[wordtag[0]] = 1

for key in non_occuring.keys():
    if non_occuring[key] != 1:
        print(key, non_occuring[key])


HOMER 52
ODYSSEY 100
BOOK 650
Theoi 75
Classical 52
Texts 75
Library 127
Toggle 25
Project 50
LIBRARY 50
HOME 25
GREEK 50
MYTHOLOGY 25
GODS 25
Olympian 30
Gods 275
Primordial 25
Titan 25
Sky 25
Sea 25
Rustic 25
Underworld 50
Daemones-Spirits 25
Nymphs 27
> 125
BESTIARY 25
HEROES 25
MISCELLANY 25
Spirits 50
Monsters 50
A-Z 25
Family 25
Tree 25
Legendary 50
Tribes 25
Creatures 25
Star 25
Myths 50
Plant 25
Flower 25
GALLERY 25
Homer 77
Odyssey 56
Book 25
Ionia 2
Greeks 27
Iliad 27
C8th 2
C7th 2
B.C 2
Murray 4
A 53
T. 27
Loeb 4
Volumes 2
Cambridge 2
MA 2
Harvard 2
University 2
Press 2
London 2
William 2
Heinemann 2
Ltd. 2
Amazon.com 2
Greek 27
Dimock 2
Shewring 2
Theoi.com 2
THE 25
CONTENTS 25
Telemachus 371
Penelope 160
Departure 25
Nestor 77
Returns 50
Menelaus 90
Calypso 50
Raft 25
Naucicaa 25
Arete 39
Games 25
Feast 25
Lotus-Eaters 25
Cyclops 45
Laestrygones 25
Circe 70
Scylla 37
Helius 28
Return 50
Ithaca 129
Eumaeus 94
Odyseus 25
Beggar 50
Contest 50
Suitors 75
Slaying 25
Dead 50
BY 