In [103]:
!pip install nltk pandas
import nltk
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')

Collecting pandas
  Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.3.4-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (12.4 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m01[0m
[?25hDownloading numpy-2.3.4-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.6 MB)
[2K

[nltk_data] Downloading package punkt_tab to /home/emil/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /home/emil/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/emil/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [106]:
# Human-readable titles keyed by ORACC corpus identifier. The insertion order
# of this mapping is the order in which the corpora are processed and reported.
# corpus oracc_dcclt has been excluded due to it having no valid translations.
corpus_display_names = dict(
    oracc_cams="Corpus of Ancient Mesopotamian Scholarship",
    oracc_ribo="Royal Inscriptions of Babylonia online",
    oracc_rinap="Royal Inscriptions of the Neo-Assyrian Period",
    oracc_saao="State Archives of Assyria Online",
)

# Corpus identifiers, derived from the mapping so the two can never drift apart.
corpus_names = list(corpus_display_names)

In [94]:
import nltk

# Tokenize every translation file line by line.
# tokenized_corpora maps corpus name -> list of token lists (one per file line).
tokenized_corpora = {}
for corpus_name in corpus_names:
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    # NOTE(review): assumes the parsed translation files are UTF-8 — confirm
    # against the script that writes parsed_dataset/.
    with open(f"parsed_dataset/translations_{corpus_name}.txt", encoding="utf-8") as f:
        lines = f.readlines()
    # The file handle is released before the (comparatively slow) tokenization.
    print(f"{corpus_name}: {len(lines)} lines")
    tokenized_corpora[corpus_name] = [nltk.word_tokenize(line) for line in lines]

oracc_cams: 19370 lines
oracc_ribo: 1027 lines
oracc_rinap: 2434 lines
oracc_saao: 40064 lines


## 2. Token, vocabulary and WordNet statistics for the ORACC corpora

In [95]:
# Collapse each corpus from a list of per-line token lists into a single
# flat list of tokens, keeping the original token order.
flat_token_lists = {
    name: [token for line_tokens in tokenized_corpora[name] for token in line_tokens]
    for name in corpus_names
}

# Report the raw token count (punctuation still included) per corpus.
for corpus_name in corpus_names:
    num_tokens = len(flat_token_lists[corpus_name])
    print(f"{corpus_name}: {num_tokens} tokens")


oracc_cams: 322766 tokens
oracc_ribo: 50345 tokens
oracc_rinap: 173278 tokens
oracc_saao: 618420 tokens


In [96]:

# Punctuation, bracket and quote tokens that are not counted as words.
# Built once at definition time so each membership test is a constant-time
# set lookup instead of rebuilding and scanning a list on every call.
_NON_WORD_TOKENS = frozenset({
    ",", ".", ";", ":", "!", "?", "'", '"', "-", "—",
    "(", ")", "[", "]", "{", "}", "...", "`", "``", "''", "“", "”",
})


def is_valid_word(word):
    """Return True unless ``word`` is one of the excluded punctuation tokens.

    The comparison is exact and case-sensitive: only tokens that are
    identical to an entry in ``_NON_WORD_TOKENS`` are rejected. Tokens that
    merely contain punctuation (for example ``"'s"``) are still accepted.

    Args:
        word: A single token string.

    Returns:
        bool: False if the token is an excluded punctuation mark, else True.
    """
    return word not in _NON_WORD_TOKENS


In [97]:
# Drop punctuation tokens and lower-case what remains.
# NOTE: despite the variable name, no stopword list is applied in this cell;
# the only filter is is_valid_word, so words such as "the" and "of" are kept.
stopword_removed_corpora = {
    corpus_name: [
        word.lower()
        for word in flat_token_lists[corpus_name]
        if is_valid_word(word)  # checked on the original-case token
    ]
    for corpus_name in corpus_names
}

print(stopword_removed_corpora["oracc_cams"][:50])

['the', 'son', 'of', 'the', 'king', 'of', 'the', 'inhabited', 'world', 'the', 'resplendent', 'one', 'the', 'beloved', 'of', 'mami', 'let', 'me', 'sing', 'of', 'the', 'mighty', 'one', 'the', 'son', 'of', 'enlil', 'ninurta', 'the', 'resplendent', 'one', 'the', 'beloved', 'of', 'mami', 'let', 'me', 'praise', 'the', 'mighty', 'one', 'the', 'god', 'the', 'son', 'of', 'enlil', 'the', 'offspring', 'of']


In [98]:
from nltk.stem import PorterStemmer
# One shared Porter stemmer instance, reused by stem_words for every corpus.
stemmer = PorterStemmer()


def stem_words(words):
    """Apply the shared Porter stemmer to each word.

    Args:
        words: Iterable of word strings.

    Returns:
        list: The stem of every input word, in input order.
    """
    return list(map(stemmer.stem, words))

# Stem the punctuation-filtered, lower-cased token list of every corpus.
stemmed_corpora = {
    corpus_name: stem_words(stopword_removed_corpora[corpus_name])
    for corpus_name in corpus_names
}

# test print
print(stemmed_corpora["oracc_cams"][:5])


['the', 'son', 'of', 'the', 'king']


In [99]:
# Number of tokens left in each corpus after punctuation filtering and stemming
# (stemming maps tokens one-to-one, so only the filtering changes the count).
num_stemmed_tokens = {
    corpus_name: len(stemmed_corpora[corpus_name]) for corpus_name in corpus_names
}

for corpus_name, num_tokens in num_stemmed_tokens.items():
    print(f"{corpus_name}: {num_tokens} stemmed tokens")

oracc_cams: 235673 stemmed tokens
oracc_ribo: 37627 stemmed tokens
oracc_rinap: 129354 stemmed tokens
oracc_saao: 484901 stemmed tokens


In [119]:
from nltk.corpus import wordnet
from nltk.corpus import stopwords

from functools import lru_cache

# English stopword set from NLTK.
# NOTE(review): not referenced by any cell visible in this notebook — confirm
# whether stopword removal was intended.
stop_words = set(stopwords.words('english'))


@lru_cache(maxsize=None)
def exists_in_wordnet(word):
    """Return True if WordNet has at least one synset for ``word``.

    Results are memoized: the corpora contain far more token occurrences than
    distinct tokens, so each distinct token is looked up in WordNet only once.

    Args:
        word: Token string to look up (must be hashable, as strings are).

    Returns:
        bool: True when ``wordnet.synsets(word)`` is non-empty.
    """
    return bool(wordnet.synsets(word))


In [120]:
# calculate proportion of tokens that exist in WordNet for each corpus
# NOTE(review): the lookup is done on Porter stems, which are often not
# dictionary words, so this may understate real coverage — confirm that
# checking stems (rather than the unstemmed tokens) is what is intended.
wordnet_proportions = {}
for corpus_name in corpus_names:
    stemmed_tokens = stemmed_corpora[corpus_name]
    if stemmed_tokens:
        count_in_wordnet = sum(1 for token in stemmed_tokens if exists_in_wordnet(token))
        proportion = count_in_wordnet / len(stemmed_tokens)
    else:
        # The proportion is undefined for an empty corpus. Record NaN instead
        # of skipping the corpus, so every name in corpus_names has an entry
        # and later per-corpus lookups cannot raise KeyError.
        proportion = float("nan")
    wordnet_proportions[corpus_name] = proportion
    print(f"{corpus_name}: {proportion:.4f} proportion of tokens in WordNet")

oracc_cams: 0.5712 proportion of tokens in WordNet
oracc_ribo: 0.5377 proportion of tokens in WordNet
oracc_rinap: 0.5087 proportion of tokens in WordNet
oracc_saao: 0.5351 proportion of tokens in WordNet


In [121]:
# Vocabulary size = number of distinct stems in each corpus.
vocabulary_sizes = {
    corpus_name: len(set(stemmed_corpora[corpus_name]))
    for corpus_name in corpus_names
}

for corpus_name, vocabulary_size in vocabulary_sizes.items():
    print(f"{corpus_name}: vocabulary size: {vocabulary_size}")

oracc_cams: vocabulary size: 8051
oracc_ribo: vocabulary size: 2389
oracc_rinap: vocabulary size: 4717
oracc_saao: vocabulary size: 13192


In [122]:
# summarizing findings in a table
import pandas as pd

# Per-corpus statistics computed above, keyed by the column they populate.
per_corpus_columns = {
    "Total Tokens": num_stemmed_tokens,
    "Vocabulary Size": vocabulary_sizes,
    "Proportion in WordNet": wordnet_proportions,
}

# Column-oriented table data: one entry per corpus, in corpus_names order.
data = {"Corpus": [corpus_display_names[name] for name in corpus_names]}
for column_label, values_by_corpus in per_corpus_columns.items():
    data[column_label] = [values_by_corpus[name] for name in corpus_names]

df = pd.DataFrame(data)
df

Unnamed: 0,Corpus,Total Tokens,Vocabulary Size,Proportion in WordNet
0,Corpus of Ancient Mesopotamian Scholarship,235673,8051,0.571181
1,Royal Inscriptions of Babylonia online,37627,2389,0.537699
2,Royal Inscriptions of the Neo-Assyrian Period,129354,4717,0.508689
3,State Archives of Assyria Online,484901,13192,0.535111


## 3. The same statistics for the BBC News dataset

In [123]:
# load BBC dataset with Pandas
bbc_df = pd.read_csv("dataset/bbc_news.csv")

# extract title and description columns
# Empty CSV cells are read as NaN (a float), which a string tokenizer cannot
# process, so missing entries are dropped and the rest coerced to str.
bbc_texts = (
    bbc_df["title"].dropna().astype(str).tolist()
    + bbc_df["description"].dropna().astype(str).tolist()
)
print(bbc_texts[:5])

['Ukraine: Angry Zelensky vows to punish Russian atrocities', 'War in Ukraine: Taking cover in a town under attack', "Ukraine war 'catastrophic for global food'", "Manchester Arena bombing: Saffie Roussos's parents on hearing the truth", 'Ukraine conflict: Oil price soars to highest level since 2008']


In [124]:
# Tokenize every BBC title/description; the result is one token list per text.
tokenized_bbc = list(map(nltk.word_tokenize, bbc_texts))
print(tokenized_bbc[:2])

[['Ukraine', ':', 'Angry', 'Zelensky', 'vows', 'to', 'punish', 'Russian', 'atrocities'], ['War', 'in', 'Ukraine', ':', 'Taking', 'cover', 'in', 'a', 'town', 'under', 'attack']]


In [125]:
# flatten the tokenized list: one flat sequence of tokens across all texts
flat_bbc_tokens = [token for text_tokens in tokenized_bbc for token in text_tokens]
print(flat_bbc_tokens[:50])

['Ukraine', ':', 'Angry', 'Zelensky', 'vows', 'to', 'punish', 'Russian', 'atrocities', 'War', 'in', 'Ukraine', ':', 'Taking', 'cover', 'in', 'a', 'town', 'under', 'attack', 'Ukraine', 'war', "'catastrophic", 'for', 'global', 'food', "'", 'Manchester', 'Arena', 'bombing', ':', 'Saffie', 'Roussos', "'s", 'parents', 'on', 'hearing', 'the', 'truth', 'Ukraine', 'conflict', ':', 'Oil', 'price', 'soars', 'to', 'highest', 'level', 'since', '2008']


In [126]:
# remove special characters and lowercase
# (is_valid_word is evaluated on the original-case token, before lower-casing)
filtered_bbc_tokens = [
    token.lower() for token in flat_bbc_tokens if is_valid_word(token)
]
print(filtered_bbc_tokens[:50])

['ukraine', 'angry', 'zelensky', 'vows', 'to', 'punish', 'russian', 'atrocities', 'war', 'in', 'ukraine', 'taking', 'cover', 'in', 'a', 'town', 'under', 'attack', 'ukraine', 'war', "'catastrophic", 'for', 'global', 'food', 'manchester', 'arena', 'bombing', 'saffie', 'roussos', "'s", 'parents', 'on', 'hearing', 'the', 'truth', 'ukraine', 'conflict', 'oil', 'price', 'soars', 'to', 'highest', 'level', 'since', '2008', 'ukraine', 'war', 'pm', 'to', 'hold']


In [127]:
# Stem the filtered, lower-cased BBC tokens with the same stem_words helper
# used for the ORACC corpora, then preview the first 50 stems.
stemmed_bbc_tokens = stem_words(filtered_bbc_tokens)
print(stemmed_bbc_tokens[:50])

['ukrain', 'angri', 'zelenski', 'vow', 'to', 'punish', 'russian', 'atroc', 'war', 'in', 'ukrain', 'take', 'cover', 'in', 'a', 'town', 'under', 'attack', 'ukrain', 'war', "'catastroph", 'for', 'global', 'food', 'manchest', 'arena', 'bomb', 'saffi', 'rousso', "'s", 'parent', 'on', 'hear', 'the', 'truth', 'ukrain', 'conflict', 'oil', 'price', 'soar', 'to', 'highest', 'level', 'sinc', '2008', 'ukrain', 'war', 'pm', 'to', 'hold']


In [128]:
# calculate total number of tokens in BBC dataset
# (counted on the stemmed list, i.e. after punctuation tokens were filtered out)
num_bbc_tokens = len(stemmed_bbc_tokens)
print(f"BBC Dataset: {num_bbc_tokens} tokens")

BBC Dataset: 1165809 tokens


In [129]:
# calculate proportion of tokens that exist in WordNet
# exists_in_wordnet returns a bool, so summing the results counts the hits.
count_in_wordnet_bbc = sum(map(exists_in_wordnet, stemmed_bbc_tokens))
proportion_bbc = count_in_wordnet_bbc / len(stemmed_bbc_tokens)
print(f"BBC Dataset: {proportion_bbc:.4f} proportion of tokens in WordNet")

BBC Dataset: 0.6004 proportion of tokens in WordNet


In [130]:
# calculate vocabulary size of BBC dataset
# The vocabulary is the set of distinct stems; its size is the number of unique stems.
unique_bbc_tokens = set(stemmed_bbc_tokens)
vocabulary_size_bbc = len(unique_bbc_tokens)
print(f"BBC Dataset: vocabulary size: {vocabulary_size_bbc}")

BBC Dataset: vocabulary size: 33902


In [131]:
# summarize results in a table
# Single summary record for the BBC data, using the same column labels as the
# ORACC summary table.
bbc_summary_row = {
    "Corpus": "BBC News Dataset",
    "Total Tokens": num_bbc_tokens,
    "Vocabulary Size": vocabulary_size_bbc,
    "Proportion in WordNet": proportion_bbc,
}
bbc_data = {column: [value] for column, value in bbc_summary_row.items()}

# NOTE: this rebinds bbc_df, which previously held the raw frame loaded from
# the CSV, to the one-row summary table.
bbc_df = pd.DataFrame(bbc_data)
bbc_df

Unnamed: 0,Corpus,Total Tokens,Vocabulary Size,Proportion in WordNet
0,BBC News Dataset,1165809,33902,0.600399


The results from the two previous stages, combined into a single table:

In [138]:
# Stack the ORACC summary and the BBC summary into one table.
# ignore_index=True renumbers the rows 0..n-1 for a cleaner look, without a
# separate in-place reset_index step.
overall_df = pd.concat([df, bbc_df], ignore_index=True)
overall_df

Unnamed: 0,Corpus,Total Tokens,Vocabulary Size,Proportion in WordNet
0,Corpus of Ancient Mesopotamian Scholarship,235673,8051,0.571181
1,Royal Inscriptions of Babylonia online,37627,2389,0.537699
2,Royal Inscriptions of the Neo-Assyrian Period,129354,4717,0.508689
3,State Archives of Assyria Online,484901,13192,0.535111
4,BBC News Dataset,1165809,33902,0.600399
