In [71]:
import os

In [72]:
from collections import defaultdict, Counter

In [73]:
# Root directory of the bible corpus. Set the BIBLES_DIR environment variable to
# point at a different checkout; when it is unset or empty the original local
# path is used. os.path.join(..., '') guarantees the trailing separator that the
# `bibles_dir + name` concatenations in this notebook rely on.
bibles_dir = os.path.join(
    os.environ.get('BIBLES_DIR') or '/home/pablo/Documents/GitHubRepos/paralleltext/bibles/corpus/',
    '',
)

In [74]:
# Bare entry names (not full paths) found in the corpus directory.
# os.listdir returns them in arbitrary order, so positional indices into
# `files` are only meaningful within a single run.
files = os.listdir(bibles_dir)

In [75]:
# Turn the bare names into full paths. os.path.join inserts a separator only
# when needed and returns an already-absolute entry unchanged, so re-running
# this cell does not prefix the directory twice (given an absolute bibles_dir).
files = [os.path.join(bibles_dir, file) for file in files]

In [76]:
def find_script(file):
    """Return the script code declared on a bible file's ``ISO_15924`` line.

    The file is scanned line by line; on the first line containing the text
    ``ISO_15924`` the last whitespace-separated token of that line is returned.

    Parameters
    ----------
    file : str
        Path of the text file to inspect.

    Returns
    -------
    str or None
        The last token of the first ``ISO_15924`` line, or ``None`` when no
        line of the file contains ``ISO_15924``.
    """
    # Decode explicitly instead of relying on the locale default.
    # NOTE(review): assumes the corpus files are UTF-8 encoded - confirm.
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            if 'ISO_15924' in line:
                # str.split() with no argument never yields empty or padded
                # tokens, so no extra filtering or stripping is needed.
                return line.split()[-1]
    return None

In [77]:
# Group the bible file paths by the script reported for each file.
script_files = defaultdict(list)
for bible_path in files:
    detected_script = find_script(bible_path)
    script_files[detected_script].append(bible_path)

In [78]:
# Number of bible files per script. Built directly from the group sizes rather
# than by expanding every group into a list of repeated keys and counting that.
# Empty groups are skipped so the result has exactly the same keys as before.
script_n_files = Counter({script: len(paths) for script, paths in script_files.items() if paths})

In [79]:
# Walk the scripts from most to least frequent and collect them until the
# collected scripts together cover more than 90% of all bible files.
common_scripts = []
count_so_far = 0
total_files = len(files)
for script, ct in script_n_files.most_common():
    common_scripts.append(script)
    count_so_far += ct
    covered_fraction = count_so_far / total_files
    if covered_fraction > 0.9:
        break

In [80]:
# Show the most frequent scripts that together cover more than 90% of the files.
common_scripts

['Latn', 'Cyrl']

These two scripts account for more than 90% of the bibles, so it suffices to check that text in these scripts is lowercased correctly.

I checked them, and they are correct.

However, the article hinges on comparing multiple languages. Figure 1 highlights a number of languages by ISO code: chr–Cherokee; cmn–Mandarin Chinese; deu–Standard German; eng–English; esk–Northwest Alaska Inupiatun; grc–Koine Greek; mya–Burmese; tam–Tamil; qvw–Huaylla Wanca Quechua; vie–Vietnamese; xuo–Kuo; zul–Zulu. Let's check those.

In [81]:
# ISO 639-3 codes of the languages highlighted in Figure 1: every "code–name"
# entry of the legend is cut at the en dash and only the code is kept.
koplenig_et_al_languages = [
    entry.partition('–')[0].strip()
    for entry in "chr–Cherokee; cmn–Mandarin Chinese; deu–Standard German; eng–English; esk–Northwest Alaska Inupiatun; grc–Koine Greek; mya–Burmese; tam–Tamil; qvw–Huaylla Wanca Quechua; vie–Vietnamese; xuo–Kuo; zul–Zulu".split(";")
]

In [92]:
def find_language(filename: str) -> str:
    """Return the language code declared on a file's ``closest_ISO_639-3`` line.

    The file is scanned line by line; for a line containing the text
    ``closest_ISO_639-3`` the third whitespace-separated token is returned.

    Parameters
    ----------
    filename : str
        Path of the text file to inspect.

    Returns
    -------
    str
        The third token of the first ``closest_ISO_639-3`` line that has one,
        or the sentinel ``'error'`` when no such line exists.
    """
    # Decode explicitly instead of relying on the locale default.
    # NOTE(review): assumes the corpus files are UTF-8 encoded - confirm.
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            if 'closest_ISO_639-3' in line:
                tokens = line.split()
                # A header line without a value has fewer than three tokens;
                # fall through to the 'error' sentinel instead of raising
                # IndexError.
                if len(tokens) > 2:
                    return tokens[2]
    return 'error'

In [93]:
# Language code of every bible file, in the same order as `files`.
languages = list(map(find_language, files))

In [94]:
# Script code of every bible file, in the same order as `files`.
scripts = list(map(find_script, files))

In [96]:
# Scripts used by any of the highlighted languages, leaving out the most common
# scripts that were already collected in `common_scripts`.
scripts_to_check = {
    script
    for language, script in zip(languages, scripts)
    if language in koplenig_et_al_languages and script not in common_scripts
}

In [97]:
# Show the non-common scripts used by the highlighted languages.
scripts_to_check

{'Grek', 'Mymr', 'Taml'}

These are the additional scripts we need to check. We've already checked Greek, so there are two left to be checked.

In [103]:
# Overview of the files that still need checking, as (index, script, language,
# file name) rows ordered by script.
sorted(
    (
        (idx, script, language, path.split('/')[-1])
        for idx, (script, language, path) in enumerate(zip(scripts, languages, files))
        if script in scripts_to_check and language in koplenig_et_al_languages
    ),
    key=lambda row: row[1],
)

[(234, 'Grek', 'grc', 'grc-x-bible-wescotthortVAR1.txt'),
 (509, 'Grek', 'grc', 'grc-x-bible-textusreceptusVAR2.txt'),
 (953, 'Grek', 'grc', 'grc-x-bible-accented.txt'),
 (974, 'Grek', 'grc', 'grc-x-bible-byzantine.txt'),
 (1003, 'Grek', 'grc', 'grc-x-bible-ecumenical.txt'),
 (1242, 'Grek', 'grc', 'grc-x-bible-textusreceptusVAR1.txt'),
 (1583, 'Grek', 'grc', 'grc-x-bible-wescotthortVAR2.txt'),
 (1586, 'Grek', 'grc', 'grc-x-bible-unaccented.txt'),
 (1664, 'Grek', 'grc', 'grc-x-bible-combined2005.txt'),
 (1794, 'Grek', 'grc', 'grc-x-bible-tischendorf.txt'),
 (189, 'Mymr', 'mya', 'mya-x-bible-1835.txt'),
 (269, 'Mymr', 'mya', 'mya-x-bible-common.txt'),
 (1430, 'Mymr', 'mya', 'mya-x-bible-newworld.txt'),
 (624, 'Taml', 'tam', 'tam-x-bible-newworld.txt'),
 (1051, 'Taml', 'tam', 'tam-x-bible-easy.txt')]

I need to check the Myanmar and Tamil alphabets. These do not necessarily have uppercasing, but I need to check.

I believe the Burmese script has no uppercasing. We will check whether lowering does anything to these languages.

In [105]:
# Myanmar script: assert that lowercasing leaves the text of every highlighted
# bible unchanged. Files are visited in index order; sorting is not needed
# because only a single script is examined.
for script, language, path in zip(scripts, languages, files):
    if script != 'Mymr':
        continue
    if script not in scripts_to_check or language not in koplenig_et_al_languages:
        continue
    # Decode explicitly instead of relying on the locale default.
    # NOTE(review): assumes the corpus files are UTF-8 encoded - confirm.
    with open(path, 'r', encoding='utf-8') as f:
        # Skip the '#' header lines; the text is the second tab-separated field.
        text = ' '.join([line.split('\t')[1] for line in f if line[0] != '#'])
    lowd = text.lower()
    assert text == lowd, path + ': lowercasing changed the text'

The same should hold for Tamil, so let's check it.

In [106]:
# Tamil script: assert that lowercasing leaves the text of every highlighted
# bible unchanged. Files are visited in index order; sorting is not needed
# because only a single script is examined.
for script, language, path in zip(scripts, languages, files):
    if script != 'Taml':
        continue
    if script not in scripts_to_check or language not in koplenig_et_al_languages:
        continue
    # Decode explicitly instead of relying on the locale default.
    # NOTE(review): assumes the corpus files are UTF-8 encoded - confirm.
    with open(path, 'r', encoding='utf-8') as f:
        # Skip the '#' header lines; the text is the second tab-separated field.
        text = ' '.join([line.split('\t')[1] for line in f if line[0] != '#'])
    lowd = text.lower()
    assert text == lowd, path + ': lowercasing changed the text'

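This does NOT hold for Greek, which does distinguish upper and lower case, so the assertion is expected to fail there. Instead, let's list the Greek files for which lowercasing changes nothing.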

In [112]:
# Greek script: print the file name of every highlighted bible whose text is
# left unchanged by lowercasing. Files are visited in index order, which is the
# order the stable sort by script produced for a single script.
for script, language, path in zip(scripts, languages, files):
    if script != 'Grek':
        continue
    if script not in scripts_to_check or language not in koplenig_et_al_languages:
        continue
    # Decode explicitly instead of relying on the locale default.
    # NOTE(review): assumes the corpus files are UTF-8 encoded - confirm.
    with open(path, 'r', encoding='utf-8') as f:
        # Skip the '#' header lines; the text is the second tab-separated field.
        text = ' '.join([line.split('\t')[1] for line in f if line[0] != '#'])
    lowd = text.lower()
    if text == lowd:
        print(path.split('/')[-1])

grc-x-bible-wescotthortVAR1.txt
grc-x-bible-textusreceptusVAR2.txt
grc-x-bible-accented.txt
grc-x-bible-byzantine.txt
grc-x-bible-textusreceptusVAR1.txt
grc-x-bible-wescotthortVAR2.txt
grc-x-bible-unaccented.txt
grc-x-bible-tischendorf.txt
