In [None]:
import pandas as pd

In [None]:
import os

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Directory that is scanned for the entropies/stats CSV files.
entropy_files_path = 'output/gpt2/'
# os.listdir returns names in arbitrary order; sort them so that the order of the
# loaded frames (and of the rows after concatenation) is reproducible across runs.
files = sorted(os.listdir(entropy_files_path))

In [None]:
# Split the directory listing by kind of table: a name is selected when it
# contains the corresponding suffix.
entropies = [name for name in files if 'entropies.csv' in name]
stats = [name for name in files if 'stats.csv' in name]

In [None]:
# Load every selected CSV into its own DataFrame, in the same order as the name lists.
# os.path.join is used instead of string concatenation so the result does not
# depend on entropy_files_path ending with a path separator.
entropies_dfs = [pd.read_csv(os.path.join(entropy_files_path, file)) for file in entropies]
stats_dfs = [pd.read_csv(os.path.join(entropy_files_path, file)) for file in stats]

In [None]:
# Both listings are expected to contain the same number of tables.
assert len(entropies_dfs) == len(stats_dfs)

# Tag every frame with the text it came from: the part of its file name that
# precedes the first underscore.
for frame, fname in zip(entropies_dfs, entropies):
    frame['file'] = fname.split('_')[0]
for frame, fname in zip(stats_dfs, stats):
    frame['file'] = fname.split('_')[0]

In [None]:
# Stack the per-file frames row-wise into one table per kind. Row labels are
# kept as loaded (no re-indexing); the 'file' column identifies the source text.
stats_df = pd.concat(stats_dfs, axis=0)
entropies_df = pd.concat(entropies_dfs, axis=0)

# Analysis at different levels

We can run the analysis at the bible, testament, book, and chapter levels, and for each report two histograms: one for H_s, and one for H_r. At the book and chapter levels we will use only H_r.

In [None]:
# Histogram colour for each plotted column.
colors = {'H': 'green', 'H_r': 'blue', 'H_s': 'blue', 'D_r': 'red', 'D_s': 'red'}
# Levels plotted for each method suffix: the '_s' columns only at the bible and
# testament levels, the '_r' columns at all four levels.
levels = {'s': ('bible', 'testament'), 'r': ('bible', 'testament', 'book', 'chapter')}

# One figure per (method, level) pair, overlaying H, H_<method> and D_<method>.
for method in ('s', 'r'):
    for level in levels[method]:
        # Select the rows of the level being plotted. This used to be hard-coded
        # to 'bible', so every figure showed the bible-level data regardless of
        # the level named in its title.
        level_rows = entropies_df[entropies_df['level'] == level]
        for col in ('H', f'H_{method}', f'D_{method}'):
            plt.hist(level_rows[col], color=colors[col], label=col)
        plt.title(level)
        plt.legend()
        plt.show()

Now let's look at some statistics.

In [None]:
# Per-level summary table: mean and std of every column except 'level' and 'file'.
# NOTE: this rebinds `stats`, which an earlier cell used for the list of stats
# file names; re-running the loading cells after this one needs that list rebuilt.

# Average over all texts at each level
col_aggs = {col: ['mean', 'std'] for col in entropies_df.columns if col not in {'level', 'file'}}
# Add one more column that is the number of texts for that level
col_aggs[list(col_aggs.keys())[0]].append('count')
stats = entropies_df.groupby('level').agg(col_aggs)

# Keep only the reported levels, sorted by hierarchy. Selecting rows by label drops
# the verse level and orders the table in one step, and it does not raise when a
# level is absent from the data or an unexpected level is present.
hierarchy = ['bible', 'testament', 'book', 'chapter']
stats = stats.loc[[level for level in hierarchy if level in stats.index]].copy()

# No _s in book and chapter levels
no_s_levels = [level for level in ('book', 'chapter') if level in stats.index]
for col in stats.columns:
    if '_s' in col[0]:
        # Cast to object before writing the placeholder: putting the string 'N/A'
        # into a float column is deprecated upcasting in pandas >= 2.1
        # (FutureWarning, slated to become an error).
        stats[col] = stats[col].astype(object)
        stats.loc[no_s_levels, col] = 'N/A'

In [None]:
# Display the per-level summary table built in the previous cell.
stats

It's surprising that the number of tokens varies so wildly across bibles. We can look at this in further detail.

In [None]:
# Distribution of token counts over the testament-level rows.
plt.hist(entropies_df[entropies_df['level'] == 'testament']['n_tokens'], bins=30)
# Label the figure so it can be told apart from the bible-level histogram.
plt.title('testament')
plt.xlabel('n_tokens')
plt.show()

In [None]:
# Distribution of token counts over the bible-level rows.
plt.hist(entropies_df[entropies_df['level'] == 'bible']['n_tokens'], bins=30)
# Label the figure so it can be told apart from the testament-level histogram.
plt.title('bible')
plt.xlabel('n_tokens')
plt.show()

So most of this variation seems to be due to the difference between the Old and New Testaments.

# Outlier analysis

What are the bibles with the lowest entropy differences?

In [None]:
# Bible-level rows whose D_r is below 3.
entropies_df[entropies_df['level'].eq('bible') & entropies_df['D_r'].lt(3)]

Is this a particularly short or long bible? Judging from the distribution shown above, it is rather short, but not the shortest. This "diaglot" bible is a literal word-by-word translation of the Greek bible. The text is ordered in an odd manner and is quite difficult to read as English. This probably means that the entropy is higher than usual (which is consistent with the observation), and this in turn causes the small entropy difference.

In [None]:
# Bible-level rows whose D_r is below 4.
entropies_df[entropies_df['level'].eq('bible') & entropies_df['D_r'].lt(4)]

The "basic" bible does not have a high entropy rate, but it has a small unigram entropy. This is probably due to its more limited vocabulary. It is less clear to me what is going on with the Etheridge bible. The "basic" bible is still in English, so it should be kept, but the "diaglot" bible is arguably not in English. If we remove it, how does the analysis change?

In [None]:
# Per-level summary table (mean and std of every column except 'level' and
# 'file'), computed with the 'eng-x-bible-diaglot' text excluded.
# NOTE: this rebinds `stats`, which an earlier cell used for the list of stats
# file names; re-running the loading cells after this one needs that list rebuilt.

# Average over all texts at each level
col_aggs = {col: ['mean', 'std'] for col in entropies_df.columns if col not in {'level', 'file'}}
# Add one more column that is the number of texts for that level
col_aggs[list(col_aggs.keys())[0]].append('count')
stats = entropies_df[entropies_df['file'] != 'eng-x-bible-diaglot'].groupby('level').agg(col_aggs)

# Keep only the reported levels, sorted by hierarchy. Selecting rows by label drops
# the verse level and orders the table in one step, and it does not raise when a
# level is absent from the data or an unexpected level is present.
hierarchy = ['bible', 'testament', 'book', 'chapter']
stats = stats.loc[[level for level in hierarchy if level in stats.index]].copy()

# No _s in book and chapter levels
no_s_levels = [level for level in ('book', 'chapter') if level in stats.index]
for col in stats.columns:
    if '_s' in col[0]:
        # Cast to object before writing the placeholder: putting the string 'N/A'
        # into a float column is deprecated upcasting in pandas >= 2.1
        # (FutureWarning, slated to become an error).
        stats[col] = stats[col].astype(object)
        stats.loc[no_s_levels, col] = 'N/A'

In [None]:
# Display the per-level summary table built in the previous cell.
stats

Now the statistical uncertainties become closer to the values calculated on a single bible. Note that what we calculated is the cross-entropy between GPT-2 and the bible, which is expected to be higher than or equal to the entropy (Gibbs' inequality). Thus, the entropy difference should be even **larger** than reported here, unlike what is presented in Montemurro & Zanette.