In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Directory holding the per-bible entropy CSV files read by this notebook.
MZ_FILES = 'output/MontemurroZanette/'

# Three-letter code (the part of a file name before the first '-') -> language name.
LANGUAGE_MAP = {
    'deu': 'German',
    'vie': 'Vietnamese',
    'eng': 'English',
    'mya': 'Burmese',
    'esk': 'Inupiatun',
    'zho': 'Chinese',
    'grc': 'Greek',
    'tam': 'Tamil',
    'zul': 'Zulu',
    'qvw': 'Quechua',
    'chr': 'Cherokee',
    'xuo': 'Kuo',
}

In [None]:
# Every entry in the output directory, then only those whose name mentions 'entropies'.
all_files = os.listdir(MZ_FILES)
entropy_files = list(filter(lambda name: 'entropies' in name, all_files))

In [None]:
# Sanity check: each selected file must carry the exact '_entropies.csv' suffix
# that the bible_id extraction strips off later.
assert all(name.endswith('_entropies.csv') for name in entropy_files)

In [None]:
# Load every entropy CSV as a (filename, DataFrame) pair.
# os.path.join keeps the path valid whether or not MZ_FILES ends with a separator.
dataframes = [
    (filename, pd.read_csv(os.path.join(MZ_FILES, filename)))
    for filename in entropy_files
]

In [None]:
# Record on every row which file it was loaded from.
for source_name, frame in dataframes:
    frame['filename'] = source_name

In [None]:
# The file name now lives inside each frame, so keep only the frames themselves.
dataframes = [frame for _, frame in dataframes]

In [None]:
# Derive two identifiers from the file name:
#   iso      - the text before the first '-'
#   bible_id - the name without the '_entropies.csv' suffix and without its first six characters
for frame in dataframes:
    names = frame['filename']
    frame['iso'] = names.apply(lambda name: name.split('-')[0])
    frame['bible_id'] = names.apply(
        lambda name: name.replace('_entropies.csv', '')[6:]
    )

In [None]:
# The raw file name is no longer needed once iso and bible_id exist.
for frame in dataframes:
    frame.drop('filename', axis=1, inplace=True)

In [None]:
# Recompute H_r - H ('temp') and its absolute deviation from the stored D_r
# column ('temp2'). Plain column arithmetic replaces the row-by-row apply: it
# yields the same values without invoking a Python lambda for every row.
for frame in dataframes:
    frame['temp'] = frame['H_r'] - frame['H']
    frame['temp2'] = (frame['temp'] - frame['D_r']).abs()

In [None]:
# Consistency check: the stored D_r must agree with H_r - H to within 0.001 on every row.
for frame in dataframes:
    assert (frame['temp2'] < 0.001).all()

In [None]:
# Remove the scratch columns that were only needed for the consistency check.
scratch_columns = ['temp', 'temp2']
for frame in dataframes:
    frame.drop(scratch_columns, axis=1, inplace=True)

In [None]:
# Stack the per-file frames into a single table. The original row indices are
# kept (no ignore_index), so index values repeat across files.
df = pd.concat(dataframes)

In [None]:
# Translate the code in 'iso' to a language name; codes that are not keys of
# LANGUAGE_MAP end up as NaN in the 'language' column.
df['language'] = df['iso'].map(LANGUAGE_MAP)

In [None]:
def plot_entropies(the_level: str, the_unigram: str) -> None:
    """Plot per-language mean entropies as grouped bars with error bars.

    Reads the module-level ``df``, keeps the rows whose ``level`` column equals
    ``the_level``, groups them by ``language`` and shows, for every language,
    three side-by-side bars: ``H_<the_unigram>``, ``H`` and ``D_<the_unigram>``.
    Bar height is the group mean; the error bar is the group standard
    deviation, with NaN (e.g. a group holding a single row) replaced by 0.

    Args:
        the_level: Value of the ``level`` column to select (this notebook
            uses 'bible' and 'book').
        the_unigram: Suffix picking the ``H_*`` / ``D_*`` columns (this
            notebook uses 'r' and 's').
    """
    H_u = f'H_{the_unigram}'
    D_u = f'D_{the_unigram}'
    bar_width = 0.3
    # (column, horizontal offset from the language tick, bar colour)
    series = (
        (H_u, -bar_width, 'blue'),
        ('H', 0.0, 'green'),
        (D_u, bar_width, 'red'),
    )

    level_df = df[df['level'] == the_level]
    aggregators = {column: ['mean', 'std'] for column, _, _ in series}
    results_df = level_df.groupby('language').agg(aggregators).reset_index().fillna(0)

    X = results_df['language'].tolist()
    X_axis = np.arange(len(X))

    plt.figure(figsize=(16, 6), dpi=80)

    for column, offset, colour in series:
        plt.bar(
            X_axis + offset,
            results_df[(column, 'mean')].tolist(),
            bar_width,
            color=colour,
            yerr=results_df[(column, 'std')].tolist(),
            capsize=5,
            label=column,  # without labels the three colours cannot be told apart
        )

    plt.xticks(X_axis, X)
    plt.legend()

    plt.ylabel("entropy [bits/word]")
    plt.title(f"Level: {the_level}. Unigram: {the_unigram}")
    plt.show()

In [None]:
# One figure per (level, unigram) combination: bible-level first, then book-level.
plot_requests = [
    (level, unigram)
    for level in ('bible', 'book')
    for unigram in ('r', 's')
]
for level, unigram in plot_requests:
    plot_entropies(level, unigram)

Observations: 

* the variance in the book-level analysis is too large, so we need to stick to bible-level analyses
* the error bars then represent the spread (standard deviation) among bibles in the same language, which is expected to be small
* this is markedly not the case for Greek. There were some lowercased bibles there, and this might cause the difference
* the results for 'r' and 's' are very similar and we can just stick to one of them

In [None]:
# Cross-language summary for one level/unigram choice: aggregate per language
# first, then take the mean and spread of the per-language means.
the_level = 'bible'
the_unigram = 'r'
level_df = df[df['level'] == the_level].reset_index()
H_u = f'H_{the_unigram}'
D_u = f'D_{the_unigram}'
aggregators = {col: ['mean', 'std'] for col in ('H', H_u, D_u)}
# fillna(0) turns the NaN standard deviation of single-row groups into 0.
results_df = level_df.groupby('language').agg(aggregators).reset_index().fillna(0)

X = results_df['language'].tolist()
X_axis = np.arange(len(X))
H_mean = results_df[('H', 'mean')].tolist()
H_u_mean = results_df[(H_u, 'mean')].tolist()
D_u_mean = results_df[(D_u, 'mean')].tolist()
H_std = results_df[('H', 'std')].tolist()
H_u_std = results_df[(H_u, 'std')].tolist()
D_u_std = results_df[(D_u, 'std')].tolist()

# NOTE(review): np.std defaults to the population formula (ddof=0) while the
# per-language 'std' aggregation above is the sample formula (ddof=1) —
# confirm which one the comparison with published values calls for.
# The labels are built from H_u / D_u so they stay correct if the_unigram changes.
print(f'Mean H: {np.mean(H_mean):.2f}; stdev(H_mean): {np.std(H_mean):.2f}')
print(f'mean({H_u}_mean): {np.mean(H_u_mean):.2f}. stdev({H_u}_mean): {np.std(H_u_mean):.2f}')
print(f'mean({D_u}_mean): {np.mean(D_u_mean):.2f}. stdev({D_u}_mean): {np.std(D_u_mean):.2f}')

Ignoring the standard deviations for each language, we can see that the variance in the mean values of the entropies (original and shuffled) is larger than the variance in the difference between the two entropies. This is exactly as reported in Montemurro & Zanette. However, the mean value of the mean difference between the original and shuffled entropies is 4.6 bits per word, over 1 bit per word higher than that reported by Montemurro & Zanette. This seems to suggest that, in my study, there is MORE information contained in word order than in the study of Montemurro & Zanette.

Note that Montemurro & Zanette also obtained basically the same result as Bentz et al. (2017), who in Figure 1 show a narrow distribution of entropy differences with a mean value of 3.17 and a standard deviation of 0.36. My mean value lies about 4 of their standard deviations away from theirs: (4.6 - 3.17) / 0.36 ≈ 4.

Looking at the top plot in Figure 1 of Bentz et al, it looks like it is the mean UNIGRAM entropy that is significantly different from their result. We could try to use exactly the same method that they used, to see if we can reproduce that.