In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Directory that holds the per-bible '*_entropies.csv' outputs (the trailing slash is part of the value).
GPT_FILES = 'output/gpt2_all_languages/'

# Three-letter language code (presumably ISO 639-3 -- TODO confirm) -> display name.
# Only the languages listed here are kept for the `mz_df` subset further down.
LANGUAGE_MAP = {
    'deu': 'German',
    'vie': 'Vietnamese',
    'eng': 'English',
    'mya': 'Burmese',
    'esk': 'Inupiatun',
    'zho': 'Chinese',
    'grc': 'Greek',
    'tam': 'Tamil',
    'zul': 'Zulu',
    'qvw': 'Quechua',
    'chr': 'Cherokee',
    'xuo': 'Kuo',
}

In [None]:
# os.listdir returns names in arbitrary, platform-dependent order; sorting makes the
# order in which the files are later loaded and concatenated reproducible across runs.
all_files = sorted(os.listdir(GPT_FILES))
# Keep only the entropy tables; anything else in the directory is ignored.
entropy_files = [name for name in all_files if name.endswith('_entropies.csv')]

In [None]:
# Load every entropy table as a (filename, DataFrame) pair.
# os.path.join is used instead of string concatenation so the path stays valid
# whether or not GPT_FILES ends with a path separator.
dataframes = [
    (filename, pd.read_csv(os.path.join(GPT_FILES, filename)))
    for filename in entropy_files
]

In [None]:
# Record on every row which file it was read from; the frames are modified in place.
for source_name, frame in dataframes:
    frame['filename'] = source_name

In [None]:
# Drop the filename half of each (filename, DataFrame) pair and keep only the frames.
dataframes = [frame for _, frame in dataframes]

In [None]:
# Stack the per-file tables into one frame; every row still carries its source 'filename'.
full_df = pd.concat(dataframes)

# 'iso' is the text before the first '-' in the file name.
full_df['iso'] = full_df['filename'].str.split('-').str[0]
# 'bible_id' is the file name without the '_entropies.csv' suffix and without its first
# six characters -- assumes names look like '<3-letter iso>-x-<bible id>_entropies.csv'; TODO confirm.
full_df['bible_id'] = (
    full_df['filename']
    .str.replace('_entropies.csv', '', regex=False)
    .str[6:]
)
full_df = full_df.drop(columns=['filename'])

# Sanity check: the stored D_r must equal H_r - H up to rounding. Computed column-wise
# (no row-by-row apply, no scratch columns); NaN values fail the check, as before.
d_r_mismatch = (full_df['H_r'] - full_df['H'] - full_df['D_r']).abs()
assert (d_r_mismatch < 0.001).all(), 'D_r differs from H_r - H by 0.001 or more in at least one row'

In [None]:
# Restrict to the languages listed in LANGUAGE_MAP; reset_index() keeps the old index as a column.
in_language_map = full_df['iso'].isin(list(LANGUAGE_MAP))
mz_df = full_df[in_language_map].reset_index()

In [None]:
# Attach the human-readable language name looked up from the language code.
mz_df = mz_df.assign(language=mz_df['iso'].map(LANGUAGE_MAP))

In [None]:
def plot_entropies(the_level: str, the_unigram: str, dataframe: pd.DataFrame) -> None:
    """Draw a grouped bar chart of per-language entropy statistics.

    Rows of ``dataframe`` whose ``'level'`` equals ``the_level`` are grouped by
    ``'language'``. For each language the mean of ``H_<the_unigram>``, ``H`` and
    ``D_<the_unigram>`` is drawn as a bar, with the within-language standard
    deviation as the error bar.

    Args:
        the_level: Value of the ``'level'`` column to plot (e.g. ``'bible'``).
        the_unigram: Suffix selecting the ``H_<suffix>`` / ``D_<suffix>`` columns.
        dataframe: Table with columns ``'level'``, ``'language'``, ``'H'``,
            ``H_<the_unigram>`` and ``D_<the_unigram>``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    H_u = f'H_{the_unigram}'
    D_u = f'D_{the_unigram}'

    level_df = dataframe[dataframe['level'] == the_level]
    aggregators = {col: ['mean', 'std'] for col in ('H', H_u, D_u)}
    # std is NaN for a language with a single row; show that as a zero-length error bar.
    stats = level_df.groupby('language').agg(aggregators).fillna(0)

    languages = stats.index.tolist()
    positions = np.arange(len(languages))
    bar_width = 0.3

    fig, ax = plt.subplots(figsize=(16, 6), dpi=80)

    # (column, horizontal offset, colour) for the three bars drawn side by side per language.
    series = (
        (H_u, -bar_width, 'blue'),
        ('H', 0.0, 'green'),
        (D_u, bar_width, 'red'),
    )
    for column, offset, colour in series:
        ax.bar(
            positions + offset,
            stats[(column, 'mean')].to_numpy(),
            bar_width,
            color=colour,
            yerr=stats[(column, 'std')].to_numpy(),
            capsize=5,
            label=column,  # labelled so the legend tells the three series apart
        )

    ax.set_xticks(positions)
    ax.set_xticklabels(languages)
    ax.set_ylabel("entropy [bits/word]")
    ax.set_title(f"Level: {the_level}. Unigram: {the_unigram}")
    ax.legend()
    plt.show()

In [None]:
# One figure per (level, unigram) combination: bible/r, bible/s, book/r, book/s.
level_unigram_pairs = [(lvl, uni) for lvl in ('bible', 'book') for uni in ('r', 's')]
for level, unigram in level_unigram_pairs:
    plot_entropies(level, unigram, mz_df)

In [None]:
def print_stats(df: pd.DataFrame, the_level: str = 'bible', the_unigram: str = 'r') -> None:
    """Print the mean and spread, across languages, of the per-language mean entropies.

    Rows of ``df`` whose ``'level'`` equals ``the_level`` are grouped by
    ``'language'`` and averaged. The mean and the population standard deviation
    (``np.std``, ddof=0) of those per-language means are then printed for ``H``,
    ``H_<the_unigram>`` and ``D_<the_unigram>``.

    Args:
        df: Table with columns ``'level'``, ``'language'``, ``'H'``,
            ``H_<the_unigram>`` and ``D_<the_unigram>``.
        the_level: Value of the ``'level'`` column to summarise. Defaults to ``'bible'``.
        the_unigram: Suffix selecting the ``H_<suffix>`` / ``D_<suffix>`` columns.
            Defaults to ``'r'``.

    Returns:
        None. Three lines are written to standard output.
    """
    H_u = f'H_{the_unigram}'
    D_u = f'D_{the_unigram}'

    level_df = df[df['level'] == the_level]
    # One row per language. A language whose values are all NaN gets a mean of 0,
    # matching the previous fillna(0) behaviour.
    language_means = level_df.groupby('language')[['H', H_u, D_u]].mean().fillna(0)

    H_mean = language_means['H'].to_numpy()
    H_u_mean = language_means[H_u].to_numpy()
    D_u_mean = language_means[D_u].to_numpy()

    print(f'Mean H: {np.mean(H_mean):.2f}; stdev(H_mean): {np.std(H_mean):.2f}')
    print(f'mean({H_u}_mean): {np.mean(H_u_mean):.2f}. stdev({H_u}_mean): {np.std(H_u_mean):.2f}')
    print(f'mean({D_u}_mean): {np.mean(D_u_mean):.2f}. stdev({D_u}_mean): {np.std(D_u_mean):.2f}')

In [None]:
# Summary statistics for the subset of languages listed in LANGUAGE_MAP.
print_stats(df=mz_df)

In [None]:
# For the full data set, group by the raw language code: print_stats groups on 'language'.
full_df = full_df.assign(language=full_df['iso'])
print_stats(df=full_df)

This differs markedly from the result of Montemurro and Zanette: here the variance across languages of the difference $D_r = H_r - H$ is LARGER than the variances of $H$ and $H_r$ themselves.