In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import json

In [None]:
# Entropy tables for the aso-x-bible files, one per experiment.
# NOTE(review): the word-pasting table is read from ../WordOrderBibles_HPC,
# the word-splitting table from the local output directory.
wp_csv_path = "../WordOrderBibles_HPC/output/KoplenigEtAl/WordPasting/entropies_aso-x-bible.txt.csv"
ws_csv_path = "output/KoplenigEtAl/WordSplitting/entropies_aso-x-bible.txt.csv"
df_wp, df_ws = pd.read_csv(wp_csv_path), pd.read_csv(ws_csv_path)

In [None]:
# Word-pasting rows for book_id 44 only (Acts, going by the variable name).
acts_wp = df_wp.loc[df_wp['book_id'] == 44]

In [None]:
# Word-splitting rows for book_id 44 only (Acts, going by the variable name).
acts_ws = df_ws.loc[df_ws['book_id'] == 44]

In [None]:
# Number of book-44 rows in each experiment (pasting first, splitting second).
print(acts_wp.shape[0], acts_ws.shape[0])

In [None]:
# Word-pasting, book 44: show the row(s) with iter_id == 0.
acts_wp.loc[acts_wp['iter_id'] == 0]

In [None]:
# Word-splitting, book 44: show the row(s) with iter_id == 0.
acts_ws.loc[acts_ws['iter_id'] == 0]

# Check if the WhitespaceSplit pre-tokenizer makes a difference

In [None]:
# Word-splitting entropies from the "_whitespacesplit" output file.
new_filename = 'output/KoplenigEtAl/WordSplitting/entropies_aso-x-bible_whitespacesplit.txt.csv'
df_ws_whitespacesplit = pd.read_csv(new_filename)

# Narrow down to book_id 44, then display its iter_id == 0 row(s).
acts_ws_whitespacesplit = df_ws_whitespacesplit.loc[df_ws_whitespacesplit['book_id'].eq(44)]
acts_ws_whitespacesplit.loc[acts_ws_whitespacesplit['iter_id'].eq(0)]

D_order seems to be "fixed" now, although D_structure still looks suspiciously different. In any case, let's look at the plot I showed Damián once again, with these new values.

In [None]:
# Tag every row with the experiment it came from so both tables can be
# concatenated and still told apart. The frames are modified in place.
for frame, tag in ((df_wp, 'pasting'), (df_ws_whitespacesplit, 'splitting')):
    frame['experiment'] = tag

In [None]:
# Stack both experiments into one long table (original row indices are kept).
frames_current = [df_wp, df_ws_whitespacesplit]
df = pd.concat(frames_current)

In [None]:
# One figure per book: D_order (x) against D_structure (y) for both
# experiments, with every point annotated by its iter_id.
for lbl, grp in df.groupby('book'):
    # Split the book's rows by experiment once instead of re-filtering per column.
    splitting = grp[grp['experiment'] == 'splitting']
    pasting = grp[grp['experiment'] == 'pasting']

    xs, ys = splitting['D_order'].tolist(), splitting['D_structure'].tolist()
    xp, yp = pasting['D_order'].tolist(), pasting['D_structure'].tolist()
    labelss = splitting['iter_id'].tolist()
    labelsp = pasting['iter_id'].tolist()

    fig, ax = plt.subplots()
    # Splitting is drawn first and pasting second, so each gets its own default colour.
    ax.scatter(xs, ys)
    ax.scatter(xp, yp)
    ax.set_xlabel('Word order information')
    ax.set_ylabel('Word structure information')
    ax.set_title(f'{lbl}')

    for txt, x, y in zip(labelss, xs, ys):
        ax.annotate(txt, (x, y), rotation=45)
    for txt, x, y in zip(labelsp, xp, yp):
        ax.annotate(txt, (x, y), rotation=45)

Now, the qualitative results make sense, but there is a suspicious vertical shift between the two experiments and, more importantly, between their iteration-0 points (the points labelled 0). First, let's try to reproduce this word-pasting experiment.

# Re-run word-pasting on this bible

In [None]:
# Word-pasting entropies from the "_repro" output file.
repro_csv_path = "output/KoplenigEtAl/WordPasting/entropies_aso-x-bible_repro.txt.csv"
df_wp_repro = pd.read_csv(repro_csv_path)

# Book 44 only, then display its iter_id == 0 row(s).
acts_wp_repro = df_wp_repro.loc[df_wp_repro['book_id'].eq(44)]
acts_wp_repro.loc[acts_wp_repro['iter_id'].eq(0)]

In [None]:
# For comparison: iter_id == 0 row(s) of the first word-pasting table, book 44.
acts_wp[acts_wp['iter_id'].eq(0)]

In [None]:
# For comparison: iter_id == 0 row(s) of the whitespacesplit word-splitting table, book 44.
acts_ws_whitespacesplit[acts_ws_whitespacesplit['iter_id'].eq(0)]

So, there was some minor change either in the code or in the resources after the previous file was created. Possible checks:

1. What changed since the file was created in January 2023

2. What is different between this branch and the word-pasting branch (i.e. first run it on that branch)

3. Directly re-run all bibles, possibly with an updated parallel bible corpus

## Re-run on the old branch

In [None]:
# Word-pasting entropies from the "_oldbranch" output file.
oldbranch_csv_path = "output/KoplenigEtAl/WordPasting/entropies_aso-x-bible_oldbranch.txt.csv"
df_wp_oldbranch = pd.read_csv(oldbranch_csv_path)

# Book 44 only, then display its iter_id == 0 row(s).
acts_wp_oldbranch = df_wp_oldbranch.loc[df_wp_oldbranch['book_id'].eq(44)]
acts_wp_oldbranch.loc[acts_wp_oldbranch['iter_id'].eq(0)]

So, it looks like something changed the "masked" entropy, only for the word-pasting case, in the new branch. Let's make sure this is not an artifact of the JSON conversion by inspecting the raw JSON files directly.

In [None]:
# Raw JSON counterpart of the first word-pasting table (read from ../WordOrderBibles_HPC).
with open('../WordOrderBibles_HPC/output/KoplenigEtAl/WordPasting/entropies_aso-x-bible.txt.json') as fh:
    json_current = json.load(fh)

In [None]:
# Raw JSON counterpart of the "_repro" word-pasting table.
with open('output/KoplenigEtAl/WordPasting/entropies_aso-x-bible_repro.txt.json') as fh:
    json_repro = json.load(fh)

In [None]:
# Raw JSON counterpart of the "_oldbranch" word-pasting table.
with open('output/KoplenigEtAl/WordPasting/entropies_aso-x-bible_oldbranch.txt.json') as fh:
    json_oldbranch = json.load(fh)

In [None]:
# Entry under key '44', then '0', in the first word-pasting JSON
# (presumably book 44 / iteration 0, mirroring the CSV filters).
book44_current = json_current['44']
book44_current['0']

In [None]:
# Same '44' -> '0' entry, taken from the "_repro" JSON.
book44_repro = json_repro['44']
book44_repro['0']

In [None]:
# Same '44' -> '0' entry, taken from the "_oldbranch" JSON.
book44_oldbranch = json_oldbranch['44']
book44_oldbranch['0']

So, if I use the old branch, I get the "right" value. If I re-run the current code, I also get the "right" value. This means the switch to the new branch did not cause the discrepancy, which was already present in the old branch.

In [None]:
# Tag the two word-pasting re-runs and pair each with the same word-splitting
# table, mirroring how `df` was built from the first word-pasting table.
df_wp_repro['experiment'] = 'pasting'
df_wp_oldbranch['experiment'] = 'pasting'
df_repro = pd.concat([df_wp_repro, df_ws_whitespacesplit])
df_oldbranch = pd.concat([df_wp_oldbranch, df_ws_whitespacesplit])

# Single source of truth for the book: `lbl` drives BOTH the row filter and the
# figure title, so the title can never describe a different book than the data.
lbl = 'Luke'
for name, dataf in {'current': df, 'repro': df_repro, 'oldbranch': df_oldbranch}.items():
    grp = dataf[dataf['book'] == lbl]
    # Split the book's rows by experiment once instead of re-filtering per column.
    splitting = grp[grp['experiment'] == 'splitting']
    pasting = grp[grp['experiment'] == 'pasting']

    xs, ys = splitting['D_order'].tolist(), splitting['D_structure'].tolist()
    xp, yp = pasting['D_order'].tolist(), pasting['D_structure'].tolist()
    labelss = splitting['iter_id'].tolist()
    labelsp = pasting['iter_id'].tolist()

    fig, ax = plt.subplots()
    # Splitting is drawn first and pasting second, so each gets its own default colour.
    ax.scatter(xs, ys)
    ax.scatter(xp, yp)
    ax.set_xlabel('Word order information')
    ax.set_ylabel('Word structure information')
    ax.set_title(f'{lbl} ({name})')

    # Annotate every point with its iter_id.
    for txt, x, y in zip(labelss, xs, ys):
        ax.annotate(txt, (x, y), rotation=45)
    for txt, x, y in zip(labelsp, xp, yp):
        ax.annotate(txt, (x, y), rotation=45)

Clearly, there was something we changed after having generated the plot. What were the last changes in the old branch? It looks like it was something in the method compression_entropy.py::create_random_word. This would be consistent with it changing something in the "masked" entropy. Now I'm re-running with the last commit of 2022.

In [None]:
# Raw JSON from the "_oldcommit" word-pasting output file.
with open('output/KoplenigEtAl/WordPasting/entropies_aso-x-bible_oldcommit.txt.json') as fh:
    json_oldcommit = json.load(fh)

In [None]:
# Reference value again: the '44' -> '0' entry of the first word-pasting JSON.
book44_current = json_current['44']
book44_current['0']

In [None]:
# Same '44' -> '0' entry, taken from the "_oldcommit" JSON.
book44_oldcommit = json_oldcommit['44']
book44_oldcommit['0']

This is it. It's one of the commits from December 2022 and the head of the word-pasting branch.

# Final check

Now let's pick another bible, at random, and run word-pasting and word-splitting using the current code, and check that the transition is correct for all 6 books.

In [None]:
# Entropy tables from the "_check" output files for deu-x-bible-greber, one per experiment.
wp_check_csv_path = "output/KoplenigEtAl/WordPasting/entropies_deu-x-bible-greber_check.txt.csv"
ws_check_csv_path = "output/KoplenigEtAl/WordSplitting/entropies_deu-x-bible-greber_check.txt.csv"
df_wp_check, df_ws_check = pd.read_csv(wp_check_csv_path), pd.read_csv(ws_check_csv_path)

In [None]:
# Tag each check table with its experiment and stack them into one long table.
df_wp_check['experiment'] = 'pasting'
df_ws_check['experiment'] = 'splitting'
dataf = pd.concat([df_wp_check, df_ws_check])

# Book to inspect; change this one value to look at another book.
# The row filter below uses `lbl` as well. It used to hard-code 'Luke' while
# the title said 'John', so the figure was labelled with the wrong book.
lbl = 'John'
name = 'check'
grp = dataf[dataf['book'] == lbl]

# Split the book's rows by experiment once instead of re-filtering per column.
splitting = grp[grp['experiment'] == 'splitting']
pasting = grp[grp['experiment'] == 'pasting']

xs, ys = splitting['D_order'].tolist(), splitting['D_structure'].tolist()
xp, yp = pasting['D_order'].tolist(), pasting['D_structure'].tolist()
labelss = splitting['iter_id'].tolist()
labelsp = pasting['iter_id'].tolist()

fig, ax = plt.subplots()
# Splitting is drawn first and pasting second, so each gets its own default colour.
ax.scatter(xs, ys)
ax.scatter(xp, yp)
ax.set_xlabel('Word order information')
ax.set_ylabel('Word structure information')
ax.set_title(f'{lbl} ({name})')

# Annotate every point with its iter_id.
for txt, x, y in zip(labelss, xs, ys):
    ax.annotate(txt, (x, y), rotation=45)
for txt, x, y in zip(labelsp, xp, yp):
    ax.annotate(txt, (x, y), rotation=45)

This makes sense, and is exactly as expected.