# Number of bibles

For every bible, I should know if it was processed correctly and, if not, why. Is it because none of the books were available? And how many books did we process for word-pasting? Do we have the same bibles for word-pasting and word-splitting?

In [None]:
# Directory that is listed to obtain the bible filenames to check.
BIBLE_DIR = '/hpc/uu_ics_ads/pmosteiro/paralleltext/bibles/corpus'
# Directory searched for the word-splitting `entropies_<bible>.json` outputs.
OUTPUT_DIR = '/hpc/uu_ics_ads/pmosteiro/WordOrderBibles/output/KoplenigEtAl/WordSplitting'
# Directory whose .json files are the word-pasting outputs used for comparison.
WP_DIR = '/hpc/uu_ics_ads/pmosteiro/WordOrderBibles/output/KoplenigEtAl/WordPasting'
# Number of files expected in BIBLE_DIR; asserted once all bibles are classified.
EXPECTED_N_BIBLES = 2000
# Book codes that must be absent from every bible whose json is flagged as empty.
REQUESTED_BOOKS = [40, 41, 42, 43, 44, 66]

In [None]:
import os
import re
import json
from util import to_csv
import pandas as pd
import data

In [None]:
# Collect every file in BIBLE_DIR, failing fast on any name that does not
# follow the expected `<3 word chars>-x-bible[-<suffix>].txt` pattern.
# The pattern is a raw string so that `\w` is a regex escape rather than an
# (invalid) string escape, and the dot before `txt` is escaped so that it
# only matches a literal '.' instead of any character.
BIBLE_FILENAME_PATTERN = re.compile(r'\w\w\w-x-bible(-[\w-]*)?(\.txt)+')
bibles = []
for filename in os.listdir(BIBLE_DIR):
    if not BIBLE_FILENAME_PATTERN.fullmatch(filename):
        raise ValueError(f'{filename} not of the right format')
    bibles.append(filename)

In [None]:
def is_subset(sublist: list, superlist=(40, 41, 42, 43, 44, 66)) -> bool:
    """Return True if ``sublist`` is non-empty and all its elements are allowed.

    Args:
        sublist: The elements to check.
        superlist: The allowed elements. Defaults to the book codes
            40, 41, 42, 43, 44 and 66, the set that used to be hard-coded
            here, so existing single-argument calls behave as before.

    Returns:
        False for an empty ``sublist``; otherwise whether every element of
        ``sublist`` is contained in ``superlist``.
    """
    # An empty list is deliberately NOT treated as a (trivial) subset.
    if len(sublist) == 0:
        return False
    return all(el in superlist for el in sublist)

In [None]:
def is_processed(bible_filename: str, output_dir: str) -> bool:
    """Return whether an entropies file for the bible exists in ``output_dir``.

    The file looked for is ``entropies_<bible_filename>.json``.

    Args:
        bible_filename: Name of the bible file (without directory).
        output_dir: Directory in which the entropies file is expected.

    Returns:
        True if ``output_dir`` contains an entry with the expected name.

    Raises:
        FileNotFoundError: If ``output_dir`` is not an existing directory.
    """
    # Fail loudly on a wrong directory instead of reporting every bible as
    # unprocessed.
    if not os.path.isdir(output_dir):
        raise FileNotFoundError(f'Output directory not found: {output_dir}')
    entropies_filename = f'entropies_{bible_filename}.json'
    # Check the single path directly rather than listing the whole directory
    # on every call; lexists() mirrors "the entry is present in the listing".
    return os.path.lexists(os.path.join(output_dir, entropies_filename))

In [None]:
def is_correct(csv_filename: str) -> bool:
    """Tell whether the CSV at ``csv_filename`` holds at least one data row."""
    n_rows, _n_cols = pd.read_csv(csv_filename).shape
    return n_rows > 0

In [None]:
def is_empty(json_file: str) -> bool:
    """Return True if the JSON output at ``json_file`` has no usable content.

    A file counts as empty when its top-level object has no keys, or when
    its keys, converted to ``int``, are not all accepted by ``is_subset``.

    Args:
        json_file: Path of the JSON file to inspect.

    Returns:
        True if the file is considered empty, False otherwise.

    Raises:
        ValueError: If the file is not valid JSON or a top-level key cannot
            be converted to ``int``.
    """
    # JSON is UTF-8 by specification; do not depend on the locale's default
    # encoding.
    with open(json_file, encoding='utf-8') as f:
        my_dict = json.load(f)
    if not my_dict:
        return True
    # Key order is irrelevant to the membership test, so no sorting is needed.
    return not is_subset([int(el) for el in my_dict.keys()])

In [None]:
# Sort every bible into exactly one of three categories:
#   not_processed - no entropies json exists in OUTPUT_DIR (bible filename stored)
#   empty_json    - the json exists but is_empty() flags it (json path stored)
#   processed     - the json exists and is not empty (bible filename stored)
empty_json, processed, not_processed = [], [], []
for bible_filename in bibles:
    if not is_processed(bible_filename, OUTPUT_DIR):
        not_processed.append(bible_filename)
        continue
    json_file = os.path.join(OUTPUT_DIR, f'entropies_{bible_filename}.json')
    if is_empty(json_file):
        empty_json.append(json_file)
    else:
        processed.append(bible_filename)

In [None]:
# Sanity checks: every bible landed in exactly one category, and the corpus
# has the expected size.
n_classified = sum(len(group) for group in (empty_json, processed, not_processed))
assert n_classified == len(bibles)
assert EXPECTED_N_BIBLES == len(bibles)

In [None]:
# Report the size of each category, one line per category.
for n_items, label in (
    (len(processed), 'processed correctly'),
    (len(empty_json), 'processed but json is empty'),
    (len(not_processed), 'completely absent'),
):
    print(n_items, label)

## Processed but json is empty

- Check that the empty-json category matches the empty-json files from word-pasting

In [None]:
# Walk the word-pasting output directory once, keeping the .json files and,
# among those, the ones that is_empty() flags as empty.
wp_files = os.listdir(WP_DIR)
wp_json_files = []
empty_wp_json_files = []
for wp_name in wp_files:
    if not wp_name.endswith('.json'):
        continue
    wp_json_files.append(wp_name)
    if is_empty(os.path.join(WP_DIR, wp_name)):
        empty_wp_json_files.append(wp_name)

In [None]:
# Both pipelines must report the same, non-zero, number of empty json files.
n_empty_ws = len(empty_json)
n_empty_wp = len(empty_wp_json_files)
assert n_empty_ws != 0 and n_empty_ws == n_empty_wp

In [None]:
# The empty json files must have the same names in both output directories.
empty_ws_names = {os.path.basename(path) for path in empty_json}
assert empty_ws_names == set(empty_wp_json_files)

- for each empty-json bible, check that none of the requested books is present

In [None]:
def get_books(filename: str) -> set:
    """Return the set of two-element prefixes of the parsed bible's content keys.

    The bible is parsed with ``data.parse_pbc_bible``; each key of its
    ``content`` mapping is truncated to its first two elements.
    """
    bible = data.parse_pbc_bible(filename)
    return {key[:2] for key in bible.content.keys()}

In [None]:
# For every bible whose word-splitting json was flagged as empty, verify that
# the bible text contains none of the requested books.
#
# REQUESTED_BOOKS holds ints, while get_books() returns `key[:2]` slices of
# the parsed content keys. An int never equals such a slice, so comparing
# them directly can never match and the check would pass vacuously. Both
# sides are therefore normalised to zero-padded strings first.
# NOTE(review): assumes the content keys are strings that start with the
# two-digit book code - confirm against data.parse_pbc_bible.
requested_codes = {str(book).zfill(2) for book in REQUESTED_BOOKS}
for file in empty_json:
    json_name = os.path.basename(file)
    # Strip the 'entropies_' prefix and the '.json' suffix to recover the
    # bible filename.
    bible_name = json_name[len('entropies_'):-len('.json')]
    available_books = {
        str(code).zfill(2)
        for code in get_books(os.path.join(BIBLE_DIR, bible_name))
    }
    unexpected = requested_codes & available_books
    assert not unexpected, f'{bible_name} contains requested books: {sorted(unexpected)}'

## Completely absent

- Check that these are exactly the same files that were excluded because of an unknown symbol

In [None]:
# Load the exclusion list, one element per line (line endings are kept).
with open('bibles_to_exclude.txt') as exclusion_file:
    lines = list(exclusion_file)

In [None]:
# Keep the lines that mention "unknown character"; the bible name is the
# text before the first '#', with surrounding whitespace removed.
unknown_character_bibles = []
for entry in lines:
    if "unknown character" not in entry:
        continue
    unknown_character_bibles.append(entry.partition('#')[0].strip())

In [None]:
# The bibles without any output must be exactly those excluded for an
# unknown character: no name may appear on one side only.
assert not set(not_processed).symmetric_difference(unknown_character_bibles)

# Conclusion

The word-splitting processing is correct if this notebook runs from start to finish without raising any exception.