In [None]:
import pandas as pd
import os
from collections import Counter
import re
import matplotlib.pyplot as plt
import string
import data
import random

In [None]:
OLD_BIBLE_CORPUS_PATH = '/home/pablo/Documents/GitHubRepos/paralleltext/bibles/corpus'
KOPLENIG_FILES = '/home/pablo/ownCloud/WordOrderBibles/Literature/ThirdRound/dataverse_files'

In [None]:
# Load the four tab-separated tables found under KOPLENIG_FILES.
# The order of the filenames fixes which table ends up in which variable.
koplenig_filenames = (
    'final_data_entropy_bible_constant.csv',
    'final_data_entropy_bible.csv',
    'final_data_entropy_bible_constant_fullshuffle.csv',
    'master.csv',
)
constant, bible, shuffled, master = (
    pd.read_csv(os.path.join(KOPLENIG_FILES, name), sep='\t')
    for name in koplenig_filenames
)

In [None]:
bible.head(2)

In [None]:
constant.head(2)

In [None]:
shuffled.head(2)

In [None]:
constant[(constant['language'].apply(lambda x: 'Auhelawa' in x)) & (constant['booktitle'] == 'Luke')]

In [None]:
shuffled[(shuffled['language'].apply(lambda x: 'Auhelawa' in x)) & (shuffled['booktitle'] == 'Luke')]

In [None]:
master.head(2)

In [None]:
len(master)

In [None]:
master['language'].nunique()

In [None]:
for lbl, grp in master.groupby('language'):
    if len(grp) > 1:
        print(lbl)
        break

In [None]:
master[master['language'] == 'Afrikaans']

In [None]:
master.groupby('ISO').count()

Get the names of the languages for which there are multiple ISOs

In [None]:
for lbl, grp in master.groupby('language'):
    if grp['ISO'].nunique() > 1:
        print(lbl, grp['ISO'].unique())

Similarly, get the ISOs for which there are multiple languages

In [None]:
for lbl, grp in master.groupby('ISO'):
    if grp['language'].nunique() > 1:
        print(lbl, grp['language'].unique())

It would probably be better to trust the ISO code

In [None]:
master.head(2)

In [None]:
shuffled.head(2)

shuffled contains books, while master contains bibles

In [None]:
assert master['translation'].nunique() == len(master), 'There are multiple bibles with the same translation code'

In [None]:
# Every translation code used in the per-book table (shuffled) must also exist
# in the per-bible table (master). Comparing sets avoids rebuilding and
# scanning a list for every row, and flags any missing code rather than only
# truthy ones.
unknown_translations = set(shuffled['translation']) - set(master['translation'])
assert not unknown_translations, \
'Some translation codes in the list of books are not in the list of bibles'

### Simple compression method

Let's test the simple compression method in Koplenig et al that is given as an example. I have downloaded a random Wikipedia page.

In [None]:
with open('randomWikipediaPage.txt', 'r') as f:
    text = f.read()

In [None]:
# Split into words. This loses information, but we could keep this information in some way
words = text.split()

In [None]:
orig_n_alpha = sum([len(wd) for wd in words])

In [None]:
orig_n_spaces = len(text) - orig_n_alpha

In [None]:
# Encode every word by the rank at which it was first seen: the first
# occurrence is written as "<rank>_<word>", later occurrences as "<rank>".
seen_words = {}
new_list = []
for wd in words:
    rank = seen_words.get(wd)
    if rank is None:
        rank = len(seen_words)
        new_list.append(f'{rank}_{wd}')
        seen_words[wd] = rank
    else:
        new_list.append(str(rank))

In [None]:
new_n_alpha = sum([len(wd) for wd in new_list])

In [None]:
old_size = orig_n_alpha + orig_n_spaces
new_size = new_n_alpha + orig_n_spaces

In [None]:
print(new_size/old_size)

So this didn't work for us. Probably it would be good to remove punctuation as well.

In [None]:
from itertools import groupby

# Split the text into maximal runs of alphabetic and non-alphabetic characters.
# Concatenating the runs gives back the original text, and an empty text
# simply yields an empty list instead of failing on text[0].
tokens = [''.join(run) for _, run in groupby(text, key=str.isalpha)]

This split the tokens differently, and now we can attempt the same process again. Note that now the number of characters in these tokens is the same as in the original text.

In [None]:
assert len(text) == sum([len(wd) for wd in tokens])

In [None]:
# Same rank encoding as for the whitespace-split words, applied to the
# alphabetic / non-alphabetic runs: "<rank>_<token>" on first sight,
# "<rank>" afterwards.
seen_words = {}
new_list = []
for wd in tokens:
    rank = seen_words.get(wd)
    if rank is None:
        rank = len(seen_words)
        new_list.append(f'{rank}_{wd}')
        seen_words[wd] = rank
    else:
        new_list.append(str(rank))

In [None]:
print(len(text))
print(sum([len(wd) for wd in new_list]))
print(sum([len(wd) for wd in new_list])/len(text))

Now this gave some compression, but still very little. A final approach would be to represent non-alpha characters without modification. But we have to add an underscore to ranks to distinguish from numbers.

In [None]:
# Rank-encode alphabetic tokens only; every other token is copied verbatim.
# A repeated token becomes "<rank>_" (the trailing underscore tells it apart
# from a literal number), a first occurrence becomes "<rank>_<token>".
seen_words = {}
new_list = []
for wd in tokens:
    if wd.isalpha():
        if wd in seen_words:
            encoded = str(seen_words[wd]) + '_'
        else:
            encoded = f'{len(seen_words)}_{wd}'
            seen_words[wd] = len(seen_words)
    else:
        encoded = wd
    new_list.append(encoded)

In [None]:
print(len(text))
print(sum([len(wd) for wd in new_list]))
print(sum([len(wd) for wd in new_list])/len(text))

This ended up hurting us. My conclusion would be that you need a bigger corpus in order for this compression algorithm to do something meaningful.

# Bible corpus

The corpus was made available by Michael Cysouw

In [None]:
bibles = os.listdir(OLD_BIBLE_CORPUS_PATH)

In [None]:
Counter([len(el.split('-')) for el in bibles])

In [None]:
[el for el in bibles if len(el.split('-')) == 5]

In [None]:
assert not [el for el in bibles if el.split('.')[1] != 'txt']

In [None]:
# Break every corpus filename of the form
# <language>-<delimiter>-<type>[-<description>].txt into its components.
# The part before the first dot doubles as the unique id of the bible.
struct_bibles = []
for bible_name in bibles:
    # Deliberately not named `bible`: that name holds the DataFrame loaded
    # from final_data_entropy_bible.csv and must not be overwritten here.
    uid = bible_name.split('.')[0]
    language, delimiter, document_type, *description_parts = uid.split('-')
    # Anything after the third dash-separated part is the free-form
    # description; joining an empty list gives the empty string.
    description = '-'.join(description_parts)
    struct_bibles.append((language, delimiter, document_type, description, uid, bible_name))

In [None]:
assert not [el for el in struct_bibles if el[2] != 'bible']

In [None]:
bible_df = pd.DataFrame(columns=['language', 'delimiter', 'type', 'description', 'uid', 'filename'], 
                        data=struct_bibles)

In [None]:
# Normalise the one irregular entry (language 'ben', type 'kerry').
# The mask has to be computed once, up front: as soon as 'type' is rewritten
# to 'bible' the condition no longer matches, so recomputing it per line would
# leave 'uid' unchanged.
kerry_mask = (bible_df['language'] == 'ben') & (bible_df['type'] == 'kerry')
bible_df.loc[kerry_mask, 'description'] = 'kerry'
bible_df.loc[kerry_mask, 'type'] = 'bible'
bible_df.loc[kerry_mask, 'uid'] = 'ben-x-bible-kerry'
assert len(struct_bibles) == len(bible_df)
assert len(struct_bibles) == len(bible_df[(bible_df['delimiter'] == 'x') & (bible_df['type'] == 'bible')])

Now we have filled a table with the names of the files. We also need to load the contents of the files.

In [None]:
def parse_bible(bible_lines: list, parse_content: bool) -> tuple:
    """Split the lines of a bible file into header comments and verse content.

    The file is assumed to start with comment lines of the form
    ``# key: value`` and then move on to content lines of the form
    ``<verse id><tab><text>``, where the verse id has 1 to 8 digits and the
    whole content line may be commented out with a leading ``#``. A header
    line that is neither a comment nor a content line is treated as a
    continuation of the previous comment's value.

    Args:
        bible_lines: The raw lines of the file (e.g. from ``readlines()``).
        parse_content: If False, stop at the first content line and return
            the header comments only.

    Returns:
        A pair ``(comment_lines, content_lines)``. ``comment_lines`` is a
        list of ``(key, value)`` tuples; ``content_lines`` is a list of
        ``(verse_id, text, is_commented_out)`` tuples, empty when
        ``parse_content`` is False.

    Raises:
        ValueError: If the first line is neither a comment nor a content
            line, or if a line after the first content line is not a
            content line.
    """
    # Raw strings: '\d', '\s' and '\w' are not valid escapes in a normal string.
    comment_pattern = re.compile(r'# ([\w\d-]+):\s+(.*)\s*')
    content_pattern = re.compile(r'#? ?(\d{1,8}) ?\t(.*)\s*')
    in_comments = True
    comment_lines, content_lines = [], []
    for line in bible_lines:
        if in_comments:
            comment_match = comment_pattern.fullmatch(line)
            if comment_match:
                comment_lines.append((comment_match.group(1), comment_match.group(2)))
                continue
            content_match = content_pattern.fullmatch(line)
            if content_match is None:
                # Continuation of a multi-line comment value.
                if not comment_lines:
                    raise ValueError(f'{line} appears before any comment and is not a content line')
                key, value = comment_lines[-1]
                comment_lines[-1] = (key, value + '\n' + line)
                continue
            if not parse_content:
                break
            # First content line: the header is over.
            in_comments = False
        else:
            content_match = content_pattern.fullmatch(line)
            if content_match is None:
                raise ValueError(f'{line} does not match an expected format')
        content_lines.append((content_match.group(1), content_match.group(2), line.startswith('#')))
    return comment_lines, content_lines

In [None]:
def open_and_parse(filename: str, parse_content: bool) -> tuple:
    """Read one file from the bible corpus directory and parse it.

    Args:
        filename: Name of the file, relative to OLD_BIBLE_CORPUS_PATH.
        parse_content: Passed through to parse_bible.

    Returns:
        Whatever parse_bible returns for the lines of the file.
    """
    # Decode explicitly instead of relying on the locale's default encoding,
    # so that the result does not depend on the machine running the notebook.
    # NOTE(review): assumes the corpus files are UTF-8 - confirm.
    with open(os.path.join(OLD_BIBLE_CORPUS_PATH, filename), encoding='utf-8') as f:
        lines = f.readlines()
    return parse_bible(lines, parse_content)

In [None]:
# Parse every bible, keeping the results aligned with the rows of bible_df.
comments_and_content = []
# Print progress about every 10% of the files. max(1, ...) prevents a modulo
# by zero when there are fewer than 10 files.
progress_step = max(1, len(bible_df) // 10)
for n_done, filename in enumerate(bible_df['filename']):
    if n_done % progress_step == 0:
        print(n_done)
    try:
        parsed = open_and_parse(filename, True)
    except Exception as e:
        print(f'Error for file {filename}', e)
        # NOTE(review): the -1 placeholder keeps the list aligned with
        # bible_df, but later cells index every entry as a
        # (comments, content) pair and will fail on it.
        parsed = -1
    comments_and_content.append(parsed)

In [None]:
# 1st index: bible
# 2nd index: comments or content
# 3rd index: line number
# 4th index: key, value, or commented (the latter in the case of content only)
zeroth_bible_content_zeroth_line_is_commented = comments_and_content[0][1][0][2]

## Some simple analytics

### Are the keys in the comments universal?

In [None]:
keys_in_comments = [[comment[0] for comment in bible[0]] for bible in comments_and_content]

In [None]:
all_comment_keys = [el for lis in keys_in_comments for el in lis]

In [None]:
comment_key_counter = Counter(all_comment_keys)

In [None]:
comment_key_counter

This tells us that all the keys are basically the same, with the exception of the notes, which for some reason are missing in two of the bibles. We can insert all of this into a dataframe. Maybe it will be the same as the dataframes we used above.

In [None]:
# Turn the (key, value) header comments of every bible into one row of a
# dataframe, adding the filename and an empty 'notes' entry where it is missing.
assert len(bible_df) == len(comments_and_content)
comments_dicts = []
for filename, parsed in zip(bible_df['filename'], comments_and_content):
    record = dict(parsed[0])
    record.setdefault('notes', '')
    record['filename'] = filename
    comments_dicts.append(record)
comments_df = pd.DataFrame(comments_dicts)

In [None]:
comments_df.head()

In [None]:
master.head()

So these two dataframes don't match exactly, although they're probably related.

### How common is it for content lines to be commented out?

In [None]:
bibles_with_commented_content = [i for i, comments_content in enumerate(comments_and_content) if any([el[2] for el in comments_content[1]])]

In [None]:
print(f'{len(bibles_with_commented_content)} bibles have commented-out content')

In [None]:
numbers_of_commented_content_lines = [len([el for el in comments_and_content[i][1] if el[2]]) for i in bibles_with_commented_content]

In [None]:
print(f'The top most commented bibles are: {sorted(numbers_of_commented_content_lines, key=lambda x: -x)[:3]}')

In [None]:
index_num_commented_content_lines = [(ix, numbers_of_commented_content_lines[i]) for i, ix in enumerate(bibles_with_commented_content)]

In [None]:
index_num_commented_content_lines = sorted(index_num_commented_content_lines, key=lambda x: -x[1])

In [None]:
print(f'The most commented bible is {bible_df["filename"].tolist()[index_num_commented_content_lines[0][0]]}')

In [None]:
comments_df[comments_df['filename'] == bible_df["filename"].tolist()[index_num_commented_content_lines[0][0]]]

In [None]:
print(f'The total number of content lines in that file is {len(comments_and_content[index_num_commented_content_lines[0][0]][1])}')

So about 10% of the content lines in that file are commented out. Now let's see what some of those lines look like.

In [None]:
two_similar_lines = [el for el in comments_and_content[index_num_commented_content_lines[0][0]][1] if el[0] == '40026071']

In [None]:
print(two_similar_lines[1][1], len(two_similar_lines[1][1]))
print(two_similar_lines[0][1], len(two_similar_lines[0][1]))

Note a very small difference in the word "jah sa" versus "jas~sa". This must be due to some difference in convention of writing. That could explain some of these commented-out lines. The best we can do is to ignore commented-out lines.

### Numbers of content lines with no key or no value

In [None]:
no_key = [i for i, comments_content in enumerate(comments_and_content) if any([el[0].strip() == '' for el in comments_content[1]])]

In [None]:
no_value = [i for i, comments_content in enumerate(comments_and_content) if any([el[1].strip() == '' for el in comments_content[1]])]

In [None]:
print(f'{len(no_key)} bibles have content lines with no key')

In [None]:
print(f'{len(no_value)} bibles have content lines with no value')

So having no value is quite common. Let's take one example.

In [None]:
welsh = 12
assert welsh in no_value

In [None]:
first_empty_line = [(i, el) for i, el in enumerate(comments_and_content[welsh][1]) if not el[1].strip()]

In [None]:
comments_and_content[welsh][1][first_empty_line[0][0]-1:first_empty_line[0][0]+2]

Now let's translate the verse before the empty line:

*Greet one another with a holy kiss. All the saints salute you.*

Let's look for the same line in an English bible

In [None]:
english = 20
empty_verse_index = [i for i in range(len(comments_and_content[english][1])) if comments_and_content[english][1][i][0] == '47013013'][0]

In [None]:
comments_and_content[english][1][empty_verse_index-1:empty_verse_index+2]

So, indeed, for some reason the Welsh bible has two verses merged into one.

### Number of content lines for different bibles/languages

In [None]:
commented_content_lines, uncommented_content_lines = [], []
for bible_index, comment_content in enumerate(comments_and_content):
    content = comment_content[1]
    non_commented_content = [el for el in content if not el[2]]
    commented_content = [el for el in content if el[2]]
    assert len(commented_content) + len(non_commented_content) == len(content)
    commented_content_lines.append(len(commented_content))
    uncommented_content_lines.append(len(non_commented_content))
    assert len(non_commented_content) == len(set([el[0] for el in non_commented_content]))

In [None]:
assert len(comments_df) == len(commented_content_lines) and len(comments_df) == len(uncommented_content_lines)

In [None]:
comments_df['n_uncommented_verses'] = uncommented_content_lines
comments_df['n_commented_verses'] = commented_content_lines

In [None]:
comments_df.head()

In [None]:
plt.hist(comments_df['n_uncommented_verses'], bins=30)
plt.show()

These two peaks look like only new testament versus full bible. Let's take some example and check that.

In [None]:
plt.hist(comments_df[comments_df['n_uncommented_verses'] < 15000]['n_uncommented_verses'], bins=30)
plt.title('Small counts')
plt.show()

In [None]:
plt.hist(comments_df[comments_df['n_uncommented_verses'] > 15000]['n_uncommented_verses'], bins=30)
plt.title('Big counts')
plt.show()

In [None]:
verse_counter = Counter(comments_df['n_uncommented_verses'])

In [None]:
verse_counter.most_common(4)

In [None]:
# Most frequent verse count, and the positions of (up to) three bibles having it.
# Iterating the column once avoids rebuilding a list for every index.
n_peak = verse_counter.most_common(1)[0][0]
in_peak = [i for i, n_verses in enumerate(comments_df['n_uncommented_verses']) if n_verses == n_peak][:3]
print(f'Exactly the peak: {in_peak}')

In [None]:
# Second most frequent verse count, and the positions of (up to) three bibles having it.
# Iterating the column once avoids rebuilding a list for every index.
n_second = verse_counter.most_common(2)[1][0]
in_second = [i for i, n_verses in enumerate(comments_df['n_uncommented_verses']) if n_verses == n_second][:3]
print(f'Exactly the secondary peak: {in_second}')

In [None]:
# Third most frequent verse count, and the positions of (up to) three bibles having it.
# Iterating the column once avoids rebuilding a list for every index.
n_almost = verse_counter.most_common(3)[2][0]
in_almost = [i for i, n_verses in enumerate(comments_df['n_uncommented_verses']) if n_verses == n_almost][:3]
print(f'Almost the peak: {in_almost}')

In [None]:
comments_df.iloc[in_peak]

In [None]:
comments_df.iloc[in_second]

In [None]:
comments_df.iloc[in_almost]

Indeed, the large peak at smaller number of verses is the new testament only, while the secondary peak at larger number of verses is the full bible. Why are the counts not exactly the same for all bibles in each peak?

In [None]:
full_new_testament = comments_and_content[in_peak[0]][1]
almost_new_testament = comments_and_content[in_almost[0]][1]

In [None]:
full_new_testament_keys = [el[0] for el in full_new_testament if not el[2]]
almost_new_testament_keys = [el[0] for el in almost_new_testament if not el[2]]

In [None]:
print(f'The full and almost full new testaments contain {len(full_new_testament_keys)} and {len(almost_new_testament_keys)} keys, respectively')

In [None]:
for keys in (full_new_testament_keys, almost_new_testament_keys):
    assert len(keys) == len(set(keys))

In [None]:
assert len([el for el in almost_new_testament_keys if el not in full_new_testament_keys]) == 0

In [None]:
# Exactly one verse key of the full new testament must be absent from the
# almost-full one. The difference is computed once, against a set, instead of
# scanning a list for every key and doing so twice.
almost_key_set = set(almost_new_testament_keys)
missing_keys = [el for el in full_new_testament_keys if el not in almost_key_set]
assert len(missing_keys) == 1
missing_key = missing_keys[0]

In [None]:
[el for el in almost_new_testament if el[0] == missing_key]

So it's fully missing, not commented out. Let's look at the raw file:

In [None]:
with open(os.path.join(OLD_BIBLE_CORPUS_PATH, comments_df.iloc[in_peak[0]]['filename']), 'r') as f:
    peak_lines = f.readlines()
with open(os.path.join(OLD_BIBLE_CORPUS_PATH, comments_df.iloc[in_almost[0]]['filename']), 'r') as f:
    almost_lines = f.readlines()
for i, lines in enumerate((peak_lines, almost_lines)):
    print(i)
    print([line for line in lines if missing_key in line])

The line is not in the second bible, even commented out or mis-parsed. It's not there at all. I'd like to see this in an English bible.

In [None]:
english_filename = comments_df[(comments_df['language_name'].apply(lambda x: 'English' == x.strip())) & (comments_df['n_uncommented_verses'] == n_peak)]['filename'].tolist()[0]
with open(os.path.join(OLD_BIBLE_CORPUS_PATH, english_filename)) as f:
    english_lines = f.readlines()
print(english_filename)
[line for line in english_lines if missing_key in line]

There does not seem to be anything special about this verse. I need some information about it. 43 is the book of John. John 7:53 is a verse that is apparently joined with the following verse in some bibles. Specifically, with the first verse in chapter 8. So let's look at this verse in the bible that is missing the verse.

In [None]:
comments_df[comments_df['n_uncommented_verses'] == n_almost]['language_name']

Malay is widely spoken and easy to translate, so let's pick that one:

In [None]:
almost_index = comments_df[comments_df.apply(lambda row: row['n_uncommented_verses'] == n_almost and row['language_name'].strip() == 'Malay', 1)].index.tolist()[0]
[el for el in comments_and_content[almost_index][1] if '43008001' == el[0]]

In Google Translate, this translates to "Then everyone went home, but Jesus went to the Mount of Olives.". Thus, indeed, this New Testament merges 7:53 with 8:1. What does 8:1 look like in the English bible referenced above?

In [None]:
[el for el in english_lines if '43008001' in el]

Indeed, this makes no reference to everyone going home. So the verses are separated. We will have to deal with this if we want to do verse matching. However, it's not clear that that is what we want.

### Data characteristics in Bentz et al

Do my simple analytics match what is reported in Bentz et al?

In [None]:
file_content = {file: comments_and_content[i][1] for i, file in enumerate(comments_df['filename'])}

Bentz et al state that only texts with at least 50 K tokens are included, so we apply the same cutoff here.

In [None]:
def merge_bible_text(bible: list) -> str:
    """Concatenate the text of all verses that are not commented out.

    Args:
        bible: Content lines as ``(verse_id, text, is_commented_out)`` tuples.

    Returns:
        The verse texts in their original order, each followed by a newline;
        the empty string if there are no uncommented verses.
    """
    # A single join instead of repeated += keeps this linear in the text size.
    return ''.join(verse[1] + '\n' for verse in bible if not verse[2])

In [None]:
texts = [merge_bible_text(el[1]) for el in comments_and_content]

In [None]:
def count_tokens_in_bible(bible: str) -> int:
    """Return the number of whitespace-separated tokens in *bible*.

    Empty or whitespace-only text has zero tokens (counting
    whitespace-to-text transitions plus one would report one).
    """
    return len(bible.split())

comments_df['n_tokens'] = [count_tokens_in_bible(el) for el in texts]

In [None]:
comments_df['n_tokens'].hist()

In [None]:
token_cutoff = 50000
    
file_text = {file: texts[i] for i, file in enumerate(comments_df['filename'])}

In [None]:
# ">=" because the criterion being replicated is "at least" 50 K tokens.
long_enough = comments_df[comments_df['n_tokens'] >= token_cutoff].reset_index(drop=True)

In [None]:
print(f'{len(long_enough)} bibles pass the token cutoff')
print(f'This represents {int(len(long_enough) / len(comments_df) * 100)}% of the bibles')
print(f'For Bentz et al, this was {int(1499/1525*100)}%')

So the number of bibles I have is much bigger than the one used by Bentz et al, and this is probably due to changes in the corpus since then. But both of us have nearly all bibles long enough.

In [None]:
print(f"The number of languages goes from {comments_df['closest_ISO_639-3'].nunique()} to {long_enough['closest_ISO_639-3'].nunique()}")

This is a very small reduction, as was also observed by Bentz et al.

In [None]:
print(f"n_languages/n_bibles={long_enough['closest_ISO_639-3'].nunique()/len(long_enough)}")

In [None]:
print(f'For Bentz et al, it was {1115/1499}')

Again, this is very similar to previous work.

In [None]:
print(f'The mean size of a bible (including short ones) is {comments_df["n_tokens"].mean()}')

This is considerably longer (though on the same order of magnitude) than reported by Bentz et al.

In [None]:
print(f'The total number of tokens is {int(comments_df["n_tokens"].sum()/1000000)}M')

That's all for the comparison with Bentz et al. The longer mean size might be due to improvements to the corpus, or due to a difference in token counting. Since it's on the same order of magnitude, I will not worry about this.

# Tokenization

In [None]:
def tokenize(text: str) -> list:
    """Return the lowercased, whitespace-separated tokens of *text*.

    Every character that is neither a word character nor whitespace is
    deleted (not replaced by a space) before splitting.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    lowercased = without_punctuation.lower()
    return lowercased.split()

In [None]:
[tokenize(el[1]) for el in comments_and_content[0][1]][0]

In [None]:
comments_and_content[0][1][0][1]

## How many bibles contain the underscore?

In [None]:
underscored_files = []
for i, filename in enumerate(bible_df['filename'].tolist()):
    with open(os.path.join(OLD_BIBLE_CORPUS_PATH, filename), 'r') as f:
        text = f.readlines()
        if any(['_' in line and line[0] != '#' for line in text]):
            print(filename)
            underscored_files.append(i)

In [None]:
[[el[1] for el in comments_and_content[i][0] if el[0] == 'language_name'] for i in underscored_files]

I looked at the two that are most familiar to me (Polish and Filipino), and found in the case of Filipino that the underscores could just be removed. In the case of Polish it is not so clear. Seeing as how these are very few bibles, I would just remove them from the corpus.

After tokenization, which bibles (if any) contain a character that is not a letter, a number, an underscore or a whitespace?

In [None]:
# Print the index of every bible that, after tokenization, still contains a
# character other than word characters and whitespace.
# Raw string: '\w' and '\s' are not valid escapes in a normal string literal.
leftover_pattern = re.compile(r'[\w\s]*')
for i, parsed in enumerate(comments_and_content):
    # Distinct names per step, so the `bible` DataFrame is not overwritten.
    verses_text = '\n'.join(verse[1] for verse in parsed[1])
    retokenized_text = ' '.join(tokenize(verses_text))
    if not leftover_pattern.fullmatch(retokenized_text):
        print(i)

Again, this seems to be related to few examples...

# Test tokenization

We follow the methodology of Bentz et al for tokenization: the PBC has spaces between punctuation marks, so we split on spaces, and then we remove punctuation by keeping only tokens that contain at least one non-punctuation character. Thus, the word "she's" should appear in the corpus as "she ' s", and be tokenized as "she s". Meanwhile, the word "q'atb'altzij" should appear in the corpus as "q'atb'altzij", and be tokenized as "q'atb'altzij". We will test those cases here, as well as 3 random verses from 3 random bibles.

In [None]:
assert ["she", "s"] == data.tokenize("she ' s", remove_punctuation=True, lowercase=True)

In [None]:
assert ["q'atb'altzij"] == data.tokenize("q'atb'altzij", remove_punctuation=True, lowercase=True)

In [None]:
sample_bibles = [os.path.join(OLD_BIBLE_CORPUS_PATH, filename) \
                 for filename in bible_df['filename'].sample(3).tolist()]

In [None]:
sample_tokenized_bibles = [data.process_bible(filename, 'PBC') \
                           for filename in sample_bibles]

In [None]:
sample_original_bibles = [data.parse_file(filename, 'PBC') \
                          for filename in sample_bibles]

In [None]:
assert all([len(sample_original_bibles[i].content) == len(sample_tokenized_bibles[i].verse_tokens) \
            for i in range(len(sample_bibles))])

In [None]:
# For each sampled bible, print one random verse before and after tokenization.
sampled = zip(sample_bibles, sample_original_bibles, sample_tokenized_bibles)
for i, (bible_path, original_bible, tokenized_bible) in enumerate(sampled):
    print(bible_path)
    # Index of the sample; the name `ix` printed here before is never defined.
    print(i)
    verse_number = random.choice(list(original_bible.content.keys()))
    print(verse_number)
    print(original_bible.content[verse_number])
    print(' '.join(tokenized_bible.verse_tokens[verse_number]))
    print('------------------------------------')

These all look good, so the tokenizer works well.

# Data splitting

Following Hahn, Degen & Futrell (2021), we want to:

* conserve 15% of the data as a held-out set for early stopping, learning curves, and hyperparameter estimation

* concatenate the sentences from each partition in random order, separated by an end-of-sentence symbol

Additionally, we are trying to estimate the entropy on the same data that we use to train the LSTM. This is a bit tricky, but on the other hand presumably we will not be minimizing the same metric during training, so it's probably OK. Still, it would be good to keep a test set for reporting. So we'll do a 75, 15, 10 split. What is a suitable end-of-sentence symbol? I think `<END>` would do, and we know it will not appear in any bible because they are lowercased.

In [None]:
sample_bibles = [os.path.join(OLD_BIBLE_CORPUS_PATH, filename) \
                 for filename in bible_df['filename'].sample(30).tolist()]

In [None]:
sample_tokenized_bibles = [data.process_bible(filename, 'PBC') \
                           for filename in sample_bibles]

In [None]:
sample_split_bibles = [bible.split(0.15, 0.1) for bible in sample_tokenized_bibles]

In [None]:
# No lines were lost
assert all([len(bible.train_data) + len(bible.hold_out_data) + len(bible.test_data) == \
            len(sample_tokenized_bibles[i].verse_tokens) \
            for i, bible in enumerate(sample_split_bibles)])

In [None]:
# No tokens were lost
for i in range(len(sample_bibles)):
    verse_tokens = sample_tokenized_bibles[i].verse_tokens
    split_data = sample_split_bibles[i]
    split_data = (split_data.train_data, split_data.hold_out_data, split_data.test_data)
    n_orig_tokens = sum([len(v) for v in verse_tokens.values()])
    n_split_tokens = sum([sum([len(verse) for verse in data_partition]) \
                          for data_partition in split_data])
    assert n_orig_tokens == n_split_tokens, (n_orig_tokens, n_split_tokens)

In [None]:
# No types were lost
for i in range(len(sample_bibles)):
    verse_tokens = sample_tokenized_bibles[i].verse_tokens
    split_data = sample_split_bibles[i]
    split_data = (split_data.train_data, split_data.hold_out_data, split_data.test_data)
    n_orig_types = set([el for lis in verse_tokens.values() for el in lis])
    n_split_types = set([ell for liss in [[el for lis in data_partition for el in lis] \
                                          for data_partition in split_data] for ell in liss])
    assert n_orig_types == n_split_types