In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import compression_entropy as ce

In [None]:
# Relative path of the JSON file that holds the previously computed entropies.
FILE_PATH = 'output/KoplenigEtAl/WordPasting/entropies_backup.json'

In [None]:
# Load the stored entropies. JSON is UTF-8 by specification, so the encoding
# is pinned instead of depending on the platform's locale default.
with open(FILE_PATH, 'r', encoding='utf-8') as f:
    entropies = json.load(f)

In [None]:
# Flatten the nested entropies mapping into one flat record per
# (filename, book_id, iteration) so it can be loaded into a DataFrame.
row_list = []
for filename, book_entropies in entropies.items():
    for book_id, version_entropies in book_entropies.items():
        # NOTE(review): the per-book value is indexed by book_id a second time,
        # i.e. the file is assumed to nest book_id twice - confirm against the writer.
        for n_iter, level_entropies in version_entropies[book_id].items():
            # Build a fresh dict rather than adding keys to level_entropies,
            # so the loaded `entropies` structure is not modified.
            row_list.append({
                **level_entropies,
                'filename': filename,
                'book_id': book_id,
                'iter_id': n_iter,
            })

In [None]:
# One DataFrame row per record collected in row_list.
df = pd.DataFrame(row_list)

In [None]:
# Derive metadata from the file name (the last '/'-separated path component):
#   language    -> text before the first '-'
#   description -> '-'-separated fields from the fourth onwards, taken from the
#                  part of the name before its first '.'
df['language'] = df['filename'].map(
    lambda path: path.rsplit('/', 1)[-1].partition('-')[0]
)
df['description'] = df['filename'].map(
    lambda path: '-'.join(path.rsplit('/', 1)[-1].partition('.')[0].split('-')[3:])
)

In [None]:
# Differences between the manipulated-text entropies and the original-text entropy.
# Column arithmetic is vectorized, so no row-wise apply is needed.
df['D_structure'] = df['masked'] - df['orig']
df['D_order'] = df['shuffled'] - df['orig']

In [None]:
# Human-readable names for the book ids (ids are strings, matching df['book_id']).
book_id_name = {
    '40': 'Matthew',
    '41': 'Mark',
    '42': 'Luke',
    '43': 'John',
    '44': 'Acts',
    '66': 'Revelation',
}
# Ids missing from the mapping become NaN in the new column.
df['book'] = df['book_id'].map(book_id_name)

In [None]:
# Bible identifier: the file's base name with its last '.'-separated piece removed.
df['bible_id'] = df['filename'].map(
    lambda path: '.'.join(path.rsplit('/', 1)[-1].split('.')[:-1])
)

In [None]:
# One scatter plot per (bible, book): D_order on the x axis, D_structure on the
# y axis, with every point annotated by its iteration id.
for bible_id in df['bible_id'].unique():
    bible_df = df[df['bible_id'] == bible_id]
    for book_name in bible_df['book'].unique():
        book_df = bible_df[bible_df['book'] == book_name]
        # Every iteration id must appear exactly once for this bible and book.
        assert len(book_df) == book_df['iter_id'].nunique()
        x = book_df['D_order'].tolist()
        y = book_df['D_structure'].tolist()
        labels = book_df['iter_id'].tolist()
        fig, ax = plt.subplots()
        ax.scatter(x, y)
        # Label through the Axes object rather than the implicit pyplot state.
        ax.set_xlabel('Word order information')
        ax.set_ylabel('Word structure information')
        ax.set_title(f'{book_name}_{bible_id.split("-")[0]}')
        for x_i, y_i, label in zip(x, y, labels):
            ax.annotate(label, (x_i, y_i), rotation=45)
        # Render and release each figure right away so open figures do not
        # pile up while looping over all bibles and books.
        plt.show()
        plt.close(fig)

In [None]:
# Rows for language 'eng', book 'Luke', iteration id '0' (iter_id is compared as a string).
df.loc[df['language'].eq('eng') & df['book'].eq('Luke') & df['iter_id'].eq('0')]

There are two important findings:

1. there is very little variation at this level

2. the result I got is different from the one I had previously. Is it because of natural variations, or a mistake?

To answer the second question, we need to compute the same quantity several times, each time with a different shuffling.

In [None]:
# First run for book 42 of the English "world" bible.
# NOTE(review): the positional booleans presumably correspond to lowercase,
# remove_mismatcher_files and truncate_books - confirm against ce.run.
eng_luke_entropies = ce.run(
    '/home/pablo/Documents/GitHubRepos/paralleltext/bibles/corpus/eng-x-bible-world.txt',
    True,
    True,
    [42],
    True,
)

In [None]:
# Show the value returned by ce.run.
print(eng_luke_entropies)

Repeat the calculation:

In [None]:
# Second run with exactly the same arguments, to compare against the previous result.
eng_luke_entropies = ce.run(
    '/home/pablo/Documents/GitHubRepos/paralleltext/bibles/corpus/eng-x-bible-world.txt',
    True,
    True,
    [42],
    True,
)
print(eng_luke_entropies)

The variation is almost negligible. The result is very close to the one obtained in notebook 10, even though the result here is without truncation in the second case.

In [None]:
filename = '/home/pablo/Documents/GitHubRepos/paralleltext/bibles/corpus/eng-x-bible-world.txt'

# Run ce.run once for each of the 8 combinations of its three boolean arguments.
# The first flag varies slowest and the last one fastest.
flag_values = (False, True)
flag_combinations = [
    (first, second, third)
    for first in flag_values
    for second in flag_values
    for third in flag_values
]
for first_flag, second_flag, third_flag in flag_combinations:
    print(ce.run(filename, first_flag, second_flag, [42], third_flag))

If we use the `run_word_pasting` function with `n_iter=1`, we should get the same result.

In [None]:
# Same book and settings through run_word_pasting with a single iteration.
word_pasting_result = ce.run_word_pasting(
    filename,
    lowercase=True,
    remove_mismatcher_files=True,
    chosen_books=[42],
    truncate_books=False,
    n_iter=1,
    output_file_path='/home/pablo/ownCloud/WordOrderBibles/GitHub/output/KoplenigEtAl/WordPasting/',
)
print(word_pasting_result)

This is again the same result. So I must have made a mistake before. I will run the code again, using the Python command line.