# Preprocessing of data

Necessary imports

In [45]:
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

First, we import the data

In [29]:
with open("homer_text_file.txt","r",encoding='utf-8') as f:
    data = f.readlines()
    
print(data[700])

[313] “So do not thou, my friend, wander long far from home, leaving thy wealth behind thee and men in thy house so insolent, lest they divide and devour all thy wealth, and thou shalt have gone on a fruitless journey. But to Menelaus I bid and command thee to go, for he has but lately come from a strange land, from a folk whence no one would hope in his heart to return, whom the storms had once driven astray into a sea so great, whence the very birds do not fare in the space of a year, so great is it and terrible. But now go thy way with thy ship and thy comrades, or, if thou wilt go by land, here are chariot and horses at hand for thee, and here at thy service are my sons, who will be thy guides to goodly Lacedaemon, where lives fair-haired Menelaus. And do thou beseech him thyself that he may tell thee the very truth. A lie will be not utter, for he is wise indeed.”



We split the data into sentences

In [38]:
sentences = []
for par in data:
    if par == '\n':
        continue
    senttemp = sent_tokenize(par)
    sentences = sentences +senttemp

print(sentences[100:105])

['Howbeit Poseidon had gone among the far-off Ethiopians—the Ethiopians who dwell sundered in twain, the farthermost of men, some where Hyperion sets and some where he rises, there to receive a hecatomb of bulls and rams, and there he was taking his joy, sitting at the feast; but the other gods were gathered together in the halls of Olympian Zeus.', "[28] Among them the father of gods and men was first to speak, for in his heart he thought of noble Aegisthus, whom far-famed Orestes, Agamemnon's son, had slain.", 'Thinking on him he spoke among the immortals, and said: “Look you now, how ready mortals are to blame the gods.', 'It is from us, they say, that evils come, but they even of themselves, through their own blind folly, have sorrows beyond that which is ordained.', 'Even as now Aegisthus, beyond that which was ordained, took to himself the wedded wife of the son of Atreus, and slew him on his return, though well he knew of sheer destruction, seeing that we spake to him before, se

Now, we tokenize the sentences

In [40]:
words = []
for sent in sentences:
    wordstemp = word_tokenize(sent)
    words = words + wordstemp
    
print(words[3000:3100])

['my', 'heart', ',', 'and', 'as', 'I', 'think', 'it', 'shall', 'be', 'brought', 'to', 'pass', ',', 'though', 'I', 'am', 'in', 'no', 'wise', 'a', 'soothsayer', ',', 'nor', 'one', 'versed', 'in', 'the', 'signs', 'of', 'birds', '.', 'Not', 'much', 'longer', 'shall', 'he', 'be', 'absent', 'from', 'his', 'dear', 'native', 'land', ',', 'no', ',', 'not', 'though', 'bonds', 'of', 'iron', 'hold', 'him', '.', 'He', 'will', 'contrive', 'a', 'way', 'to', 'return', ',', 'for', 'he', 'is', 'a', 'man', 'of', 'many', 'devices', '.', 'But', 'come', ',', 'tell', 'me', 'this', 'and', 'declare', 'it', 'truly', ',', 'whether', 'indeed', ',', 'tall', 'as', 'thou', 'art', ',', 'thou', 'art', 'the', 'son', 'of', 'Odysseus', 'himself', '.', 'Wondrously']


Now, we PoS tag the data, so that lemmatization is easier, and names of characters can be identified more easily

In [43]:
tagged = nltk.pos_tag(words, tagset = 'universal')

tagged_filtered = []
for wordtag in tagged:
    if wordtag[1] == '.':
        continue
    else: tagged_filtered.append(wordtag)
        
print(tagged_filtered[3000:3100])

[('that', 'DET'), ('man', 'NOUN'), ('was', 'VERB'), ('still', 'ADV'), ('among', 'ADP'), ('his', 'PRON'), ('people', 'NOUN'), ('But', 'CONJ'), ('now', 'ADV'), ('the', 'DET'), ('gods', 'NOUN'), ('have', 'VERB'), ('willed', 'VERB'), ('otherwise', 'ADV'), ('in', 'ADP'), ('their', 'PRON'), ('evil', 'ADJ'), ('devising', 'NOUN'), ('seeing', 'VERB'), ('that', 'ADP'), ('they', 'PRON'), ('have', 'VERB'), ('caused', 'VERB'), ('him', 'PRON'), ('to', 'PRT'), ('pass', 'VERB'), ('from', 'ADP'), ('sight', 'NOUN'), ('as', 'ADP'), ('they', 'PRON'), ('have', 'VERB'), ('no', 'DET'), ('other', 'ADJ'), ('man', 'NOUN'), ('For', 'ADP'), ('I', 'PRON'), ('should', 'VERB'), ('not', 'ADV'), ('so', 'ADV'), ('grieve', 'VERB'), ('for', 'ADP'), ('his', 'PRON'), ('death', 'NOUN'), ('if', 'ADP'), ('he', 'PRON'), ('had', 'VERB'), ('been', 'VERB'), ('slain', 'VERB'), ('among', 'ADP'), ('his', 'PRON'), ('comrades', 'NOUN'), ('in', 'ADP'), ('the', 'DET'), ('land', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('Trojans', 'NOUN')

Next, we lemmatize the dataset

In [55]:
# Reference for lemmatizing: https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python

lemmatizer = WordNetLemmatizer()

lemmatized = []
for wordtag in tagged_filtered:
    tag = ''
    if wordtag[1] ==  'NOUN':
        tag =  'n'
    elif wordtag[1] ==  'VERB':
        tag =  'v'
    elif wordtag[1] ==  'ADJ':
        tag =  'a'
    elif wordtag[1] ==  'ADV':
        tag =  'r'
        
    # use postag if it exists in the wordnetlemmatizer
    if tag == '': lemma = lemmatizer.lemmatize(wordtag[0])
    else: lemma = lemmatizer.lemmatize(wordtag[0], pos=tag)
        
    lemmatized.append((lemma, wordtag[1]))
    
print(lemmatized[3000:3100])

[('that', 'DET'), ('man', 'NOUN'), ('be', 'VERB'), ('still', 'ADV'), ('among', 'ADP'), ('his', 'PRON'), ('people', 'NOUN'), ('But', 'CONJ'), ('now', 'ADV'), ('the', 'DET'), ('god', 'NOUN'), ('have', 'VERB'), ('will', 'VERB'), ('otherwise', 'ADV'), ('in', 'ADP'), ('their', 'PRON'), ('evil', 'ADJ'), ('devising', 'NOUN'), ('see', 'VERB'), ('that', 'ADP'), ('they', 'PRON'), ('have', 'VERB'), ('cause', 'VERB'), ('him', 'PRON'), ('to', 'PRT'), ('pass', 'VERB'), ('from', 'ADP'), ('sight', 'NOUN'), ('a', 'ADP'), ('they', 'PRON'), ('have', 'VERB'), ('no', 'DET'), ('other', 'ADJ'), ('man', 'NOUN'), ('For', 'ADP'), ('I', 'PRON'), ('should', 'VERB'), ('not', 'ADV'), ('so', 'ADV'), ('grieve', 'VERB'), ('for', 'ADP'), ('his', 'PRON'), ('death', 'NOUN'), ('if', 'ADP'), ('he', 'PRON'), ('have', 'VERB'), ('be', 'VERB'), ('slay', 'VERB'), ('among', 'ADP'), ('his', 'PRON'), ('comrade', 'NOUN'), ('in', 'ADP'), ('the', 'DET'), ('land', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('Trojans', 'NOUN'), ('or', 'CO