# Preprocessing of data

Necessary imports

In [1]:
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
%store -r mortaldict
%store -r goddict

First, let's check what the encoding is of the text file, so that we can properly import the texts

In [2]:
with open('text_file.txt') as f:
    print(f)

<_io.TextIOWrapper name='text_file.txt' mode='r' encoding='cp1252'>


As can be seen above, this file doesn't use the standard utf-8 encoding, but cp1252 encoding. So, we will use this encoder when reading it in:

In [3]:
with open("text_file.txt","r",encoding= 'cp1252') as f:
    data = f.readlines()
    
print(data[700])

[313] “So do not thou, my friend, wander long far from home, leaving thy wealth behind thee and men in thy house so insolent, lest they divide and devour all thy wealth, and thou shalt have gone on a fruitless journey. But to Menelaus I bid and command thee to go, for he has but lately come from a strange land, from a folk whence no one would hope in his heart to return, whom the storms had once driven astray into a sea so great, whence the very birds do not fare in the space of a year, so great is it and terrible. But now go thy way with thy ship and thy comrades, or, if thou wilt go by land, here are chariot and horses at hand for thee, and here at thy service are my sons, who will be thy guides to goodly Lacedaemon, where lives fair-haired Menelaus. And do thou beseech him thyself that he may tell thee the very truth. A lie will be not utter, for he is wise indeed.”



We split the data into sentences

In [4]:
sentences = []
for par in data:
    if par == '\n':
        continue
    senttemp = sent_tokenize(par)
    sentences = sentences +senttemp

print(sentences[100:105])

['Howbeit Poseidon had gone among the far-off Ethiopians—the Ethiopians who dwell sundered in twain, the farthermost of men, some where Hyperion sets and some where he rises, there to receive a hecatomb of bulls and rams, and there he was taking his joy, sitting at the feast; but the other gods were gathered together in the halls of Olympian Zeus.', "[28] Among them the father of gods and men was first to speak, for in his heart he thought of noble Aegisthus, whom far-famed Orestes, Agamemnon's son, had slain.", 'Thinking on him he spoke among the immortals, and said: “Look you now, how ready mortals are to blame the gods.', 'It is from us, they say, that evils come, but they even of themselves, through their own blind folly, have sorrows beyond that which is ordained.', 'Even as now Aegisthus, beyond that which was ordained, took to himself the wedded wife of the son of Atreus, and slew him on his return, though well he knew of sheer destruction, seeing that we spake to him before, se

Now, we tokenize the sentences

In [5]:
words = []
for sent in sentences:
    wordstemp = word_tokenize(sent)
    words = words + wordstemp
    
print(words[3000:3100])

['my', 'heart', ',', 'and', 'as', 'I', 'think', 'it', 'shall', 'be', 'brought', 'to', 'pass', ',', 'though', 'I', 'am', 'in', 'no', 'wise', 'a', 'soothsayer', ',', 'nor', 'one', 'versed', 'in', 'the', 'signs', 'of', 'birds', '.', 'Not', 'much', 'longer', 'shall', 'he', 'be', 'absent', 'from', 'his', 'dear', 'native', 'land', ',', 'no', ',', 'not', 'though', 'bonds', 'of', 'iron', 'hold', 'him', '.', 'He', 'will', 'contrive', 'a', 'way', 'to', 'return', ',', 'for', 'he', 'is', 'a', 'man', 'of', 'many', 'devices', '.', 'But', 'come', ',', 'tell', 'me', 'this', 'and', 'declare', 'it', 'truly', ',', 'whether', 'indeed', ',', 'tall', 'as', 'thou', 'art', ',', 'thou', 'art', 'the', 'son', 'of', 'Odysseus', 'himself', '.', 'Wondrously']


Now, we PoS tag the data, so that lemmatization is easier, and names of characters can be identified more easily

In [6]:
tagged = nltk.pos_tag(words)

tagged_filtered = []
for wordtag in tagged:
    if wordtag[1] in ['.', ',', ':', '--', '$', '(', ')']:
        continue
    else: tagged_filtered.append(wordtag)
        
print(tagged_filtered[3000:3100])

[('as', 'IN'), ('that', 'DT'), ('man', 'NN'), ('was', 'VBD'), ('still', 'RB'), ('among', 'IN'), ('his', 'PRP$'), ('people', 'NNS'), ('But', 'CC'), ('now', 'RB'), ('the', 'DT'), ('gods', 'NNS'), ('have', 'VBP'), ('willed', 'VBN'), ('otherwise', 'RB'), ('in', 'IN'), ('their', 'PRP$'), ('evil', 'JJ'), ('devising', 'NN'), ('seeing', 'VBG'), ('that', 'IN'), ('they', 'PRP'), ('have', 'VBP'), ('caused', 'VBN'), ('him', 'PRP'), ('to', 'TO'), ('pass', 'VB'), ('from', 'IN'), ('sight', 'NN'), ('as', 'IN'), ('they', 'PRP'), ('have', 'VBP'), ('no', 'DT'), ('other', 'JJ'), ('man', 'NN'), ('For', 'IN'), ('I', 'PRP'), ('should', 'MD'), ('not', 'RB'), ('so', 'RB'), ('grieve', 'VB'), ('for', 'IN'), ('his', 'PRP$'), ('death', 'NN'), ('if', 'IN'), ('he', 'PRP'), ('had', 'VBD'), ('been', 'VBN'), ('slain', 'VBN'), ('among', 'IN'), ('his', 'PRP$'), ('comrades', 'NNS'), ('in', 'IN'), ('the', 'DT'), ('land', 'NN'), ('of', 'IN'), ('the', 'DT'), ('Trojans', 'NNPS'), ('or', 'CC'), ('had', 'VBD'), ('died', 'VBN'),

Next, we lemmatize the dataset

In [7]:
# Reference for lemmatizing: https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python

lemmatizer = WordNetLemmatizer()

lemmatized = []
for wordtag in tagged_filtered:
    tag = ''
    if wordtag[1][0:2] ==  'NN':
        tag =  'n'
    elif wordtag[1][0:2] ==  'VB':
        tag =  'v'
    elif wordtag[1][0:2] ==  'JJ':
        tag =  'a'
    elif wordtag[1][0:2] ==  'RB':
        tag =  'r'
        
    # use postag if it exists in the wordnetlemmatizer
    if tag == '': lemma = lemmatizer.lemmatize(wordtag[0])
    else: lemma = lemmatizer.lemmatize(wordtag[0], pos=tag)
        
    lemmatized.append((lemma, wordtag[1]))
    
print(lemmatized[3000:3100])

[('a', 'IN'), ('that', 'DT'), ('man', 'NN'), ('be', 'VBD'), ('still', 'RB'), ('among', 'IN'), ('his', 'PRP$'), ('people', 'NNS'), ('But', 'CC'), ('now', 'RB'), ('the', 'DT'), ('god', 'NNS'), ('have', 'VBP'), ('will', 'VBN'), ('otherwise', 'RB'), ('in', 'IN'), ('their', 'PRP$'), ('evil', 'JJ'), ('devising', 'NN'), ('see', 'VBG'), ('that', 'IN'), ('they', 'PRP'), ('have', 'VBP'), ('cause', 'VBN'), ('him', 'PRP'), ('to', 'TO'), ('pass', 'VB'), ('from', 'IN'), ('sight', 'NN'), ('a', 'IN'), ('they', 'PRP'), ('have', 'VBP'), ('no', 'DT'), ('other', 'JJ'), ('man', 'NN'), ('For', 'IN'), ('I', 'PRP'), ('should', 'MD'), ('not', 'RB'), ('so', 'RB'), ('grieve', 'VB'), ('for', 'IN'), ('his', 'PRP$'), ('death', 'NN'), ('if', 'IN'), ('he', 'PRP'), ('have', 'VBD'), ('be', 'VBN'), ('slay', 'VBN'), ('among', 'IN'), ('his', 'PRP$'), ('comrade', 'NNS'), ('in', 'IN'), ('the', 'DT'), ('land', 'NN'), ('of', 'IN'), ('the', 'DT'), ('Trojans', 'NNPS'), ('or', 'CC'), ('have', 'VBD'), ('die', 'VBN'), ('in', 'IN')

Next, we go through the data and print all the proper nouns that aren't in the dictionaries. If they are not and they refer to names of characters that are either Olympian gods or mortals, we add them to the dictionaries in the file characterdicts.ipynb. We do this only for names that occur at least 10 times in the book, since adding all names is not realistic.

In [8]:
# Reference for sorting dictionary: https://www.edureka.co/blog/sort-dictionary-by-value-in-python/

non_occuring = {}
non_occuring_min10 = {}

mortals = mortaldict
gods = goddict

for wordtag in lemmatized:
    
    # Only look at proper nouns
    if wordtag[1] == 'NNP':
        
        # Binary to indicate if the word has been found
        filled_in = 0
        
        # Go through mortaldict
        for idx, vals in enumerate(mortals.values()):
            if wordtag[0] in vals[0]:
                filled_in = 1
                break
                
        # If word hasn't been found in mortaldict, go through goddict
        if filled_in == 0:
            for idx, vals in enumerate(gods.values()):
                if wordtag[0] in vals[0]:
                    filled_in = 1
                    break
        
        # Print word if it hasn't been found at all
        if filled_in == 0:
            if wordtag[0] in non_occuring.keys():
                non_occuring[wordtag[0]] += 1
            else: non_occuring[wordtag[0]] = 1

for key in non_occuring.keys():
    if non_occuring[key] >= 10:
        non_occuring_min10[key] = non_occuring[key]

for key in sorted(non_occuring_min10.keys()):
       print("%s: %s" % (key, non_occuring_min10[key]))

>: 325
A: 142
A-Z: 65
A.: 67
ARTICLES: 65
Aaron: 65
Acamas: 18
Achaea: 40
Achaean: 54
Achaeans: 26
Achilleid: 65
Adrastus: 10
Aeacus: 67
Aeneid: 65
Aeolus: 10
Aeschylus: 455
Against: 67
Agenor: 23
Ah: 68
Aiantes: 18
Aias: 260
Alcman: 65
Alexander: 49
Alexandra: 65
Amazon: 20
Amazons: 15
Amphimachus: 10
Anchises: 22
Andromache: 36
Antenor: 31
Antilochus: 76
Antinous: 54
Antiphus: 11
Apollodorus: 65
Apollonius: 65
Aratus: 65
Argive: 70
Argives: 122
Argonautica: 130
Argos: 54
Armour: 25
Arms: 15
Asius: 20
Asteropaeus: 16
Astronomica: 65
Athene: 170
Atsma: 65
Autolycus: 19
Automedon: 26
Ay: 17
Aye: 64
BESTIARY: 65
BOOK: 1540
BY: 65
Barring: 25
Battefield: 25
Battle: 51
Battlefield: 350
Battles: 30
Be: 16
Bearers: 65
Beggar: 50
Beguiling: 25
Bion: 65
Body: 25
Book: 65
Bound: 65
Briseis: 17
CLASSICAL: 65
CONTENTS: 65
Calchas: 23
Callimachus: 65
Callistratus: 65
Calypso: 50
Capaneus: 10
Cassandra: 10
Catalogue: 25
Child: 14
Chiliades: 65
Chryse: 10
Chryses: 10
Circe: 70
Classical: 136
Claudia