In [25]:
!pip install nltk
import nltk
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')



[nltk_data] Downloading package punkt_tab to /home/emil/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /home/emil/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/emil/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [10]:
# Names of the corpora processed by this notebook; each one is read from
# parsed_dataset/translations_<name>.txt.
corpus_names = [
    "oracc_cams",
    "oracc_dcclt",
    "oracc_ribo",
    "oracc_rinap",
    "oracc_saao",
]

In [None]:
import nltk

# Read every corpus' translation file and tokenize it line by line.
# tokenized_corpora maps corpus name -> list of token lists (one per file line).
tokenized_corpora = {}
for corpus_name in corpus_names:
    # Pass the encoding explicitly: without it open() decodes with the platform
    # locale, so the same file can load differently (or fail) on another machine.
    # NOTE(review): assumes the translation files are UTF-8 — confirm.
    with open(f"parsed_dataset/translations_{corpus_name}.txt", encoding="utf-8") as f:
        raw_lines = f.readlines()
    # The file is closed at this point; tokenization only needs the lines in memory.
    print(f"{corpus_name}: {len(raw_lines)} lines")
    tokenized_corpora[corpus_name] = [nltk.word_tokenize(line) for line in raw_lines]

oracc_cams: 19370 lines
oracc_dcclt: 0 lines
oracc_ribo: 1027 lines
oracc_rinap: 2434 lines
oracc_saao: 40064 lines


## 2

In [82]:
# Collapse each corpus from a list of per-line token lists into one flat token list.
flat_token_lists = {
    corpus_name: [token for line in tokenized_corpora[corpus_name] for token in line]
    for corpus_name in corpus_names
}

# Report how many tokens each flattened corpus holds.
for corpus_name in corpus_names:
    print(f"{corpus_name}: {len(flat_token_lists[corpus_name])} tokens")


oracc_cams: 322766 tokens
oracc_dcclt: 0 tokens
oracc_ribo: 50345 tokens
oracc_rinap: 173278 tokens
oracc_saao: 618420 tokens


In [70]:
from nltk.corpus import wordnet
from nltk.corpus import stopwords

# English stopword list materialised as a set, so membership checks are hash lookups.
stop_words = set(stopwords.words('english'))

def exists_in_wordnet(word):
    """Return True when WordNet lists at least one synset for ``word``."""
    matching_synsets = wordnet.synsets(word)
    return len(matching_synsets) > 0

def exists_in_stopwords(word):
    """Return True when the lower-cased ``word`` is a member of ``stop_words``."""
    lowered = word.lower()
    return lowered in stop_words

# Tokens treated as punctuation rather than words. Built once as a frozenset so
# the check below is a hash lookup instead of rebuilding and scanning a list on
# every call.
_PUNCTUATION_TOKENS = frozenset([
    ",", ".", ";", ":", "!", "?", "'", '"', "-", "—",
    "(", ")", "[", "]", "{", "}", "...", "`", "``", "''", "“", "”",
])


def is_valid_word(word):
    """Return True unless ``word`` is one of the known punctuation tokens.

    The comparison is exact: a token is rejected only when it equals an entry
    of ``_PUNCTUATION_TOKENS``; any other string (including the empty string
    or punctuation sequences not listed there) is accepted.

    Args:
        word: A single token string.

    Returns:
        bool: False for a listed punctuation token, True otherwise.
    """
    return word not in _PUNCTUATION_TOKENS


In [83]:
# NOTE(review): despite its name, this mapping still contains stopwords. Only
# the tokens rejected by is_valid_word are dropped, and the remaining tokens
# are lower-cased. The name is kept because the stemming cell reads it.
stopword_removed_corpora = {
    corpus_name: [
        token.lower()
        for token in flat_token_lists[corpus_name]
        if is_valid_word(token)
    ]
    for corpus_name in corpus_names
}

# Show the first 50 cleaned tokens of one corpus as a sanity check.
print(stopword_removed_corpora["oracc_cams"][:50])

['the', 'son', 'of', 'the', 'king', 'of', 'the', 'inhabited', 'world', 'the', 'resplendent', 'one', 'the', 'beloved', 'of', 'mami', 'let', 'me', 'sing', 'of', 'the', 'mighty', 'one', 'the', 'son', 'of', 'enlil', 'ninurta', 'the', 'resplendent', 'one', 'the', 'beloved', 'of', 'mami', 'let', 'me', 'praise', 'the', 'mighty', 'one', 'the', 'god', 'the', 'son', 'of', 'enlil', 'the', 'offspring', 'of']


In [84]:
from nltk.stem import PorterStemmer
# Shared Porter stemmer instance used by stem_words.
stemmer = PorterStemmer()

def stem_words(words):
    """Return the stem of every token in ``words``, preserving order.

    Each distinct token is passed to ``stemmer.stem`` only once per call;
    repeated tokens reuse the cached stem. The returned list is the same as
    stemming every occurrence individually, but a corpus with many repeated
    tokens needs far fewer stemmer calls.

    Args:
        words: Iterable of token strings (a list or a one-shot iterator).

    Returns:
        list: One stem per input token, in input order.
    """
    stem_cache = {}
    stemmed = []
    for word in words:
        stem = stem_cache.get(word)
        if stem is None:
            # First time this token is seen in the current call.
            stem = stem_cache[word] = stemmer.stem(word)
        stemmed.append(stem)
    return stemmed

# Stem the cleaned token list of every corpus; keys follow corpus_names.
stemmed_corpora = {
    corpus_name: stem_words(stopword_removed_corpora[corpus_name])
    for corpus_name in corpus_names
}

# Spot-check the first five stems of one corpus.
print(stemmed_corpora["oracc_cams"][:5])


['the', 'son', 'of', 'the', 'king']


In [86]:
# Number of tokens in each stemmed corpus, stored per corpus and then printed.
num_stemmed_tokens = {
    corpus_name: len(stemmed_corpora[corpus_name]) for corpus_name in corpus_names
}
for corpus_name in corpus_names:
    print(f"{corpus_name}: {num_stemmed_tokens[corpus_name]} stemmed tokens")

oracc_cams: 235673 stemmed tokens
oracc_dcclt: 0 stemmed tokens
oracc_ribo: 37627 stemmed tokens
oracc_rinap: 129354 stemmed tokens
oracc_saao: 484901 stemmed tokens
