In [None]:
import numpy as np
from sklearn.feature_selection import r_regression
import pandas as pd
import os
import matplotlib.pyplot as plt
from util import make_book_plot, rel_error
from scipy.stats import spearmanr
from collections import defaultdict

In [None]:
# Path of the CSV table that this notebook loads into `df`.
FILENAME = 'output/KoplenigEtAl/merged.csv'
# Directory that receives every figure saved by this notebook.
PLOTS_DIR = 'output/KoplenigEtAl/Plots/Paper'
# savefig() does not create missing directories, so make sure the target exists
# before any plotting cell runs (e.g. on a fresh checkout).
os.makedirs(PLOTS_DIR, exist_ok=True)

In [None]:
# Load the results table. Later cells rely on its 'bible', 'book', 'experiment',
# 'iter_id', 'D_order', 'D_structure', 'orig', 'shuffled' and 'masked' columns.
df = pd.read_csv(FILENAME)

# Sample correlation plot

- Book: Matthew
- Bible: xuo

In [None]:
# Keep only the rows of the xuo Bible, then plot the book 'Matthew' for it.
# make_book_plot comes from util; the last argument is presumably a display
# label for the Bible -- confirm against util.make_book_plot.
xuo_rows = df.loc[df['bible'] == 'xuo-x-bible.txt']
fig_corr, ax_corr = make_book_plot(xuo_rows, 'Matthew', 'xuo')

In [None]:
# Write the sample correlation figure into PLOTS_DIR as a PNG file.
fig_corr.savefig(os.path.join(PLOTS_DIR, 'xuo_matthew_correlation.png'))

# Spearman correlation coefficients

We have no reason to expect the relationships to be linear, so we use Spearman's rank correlation coefficient rather than Pearson's. We will compute the correlation between `D_order` and `D_structure`, and between each of those and `n_splits` (the value of `iter_id`, counted as positive for splitting and negative for pasting).

In [None]:
# Peek at a few random rows. The fixed seed keeps the displayed sample identical
# across "Restart Kernel -> Run All", so the rendered output is reproducible.
df.sample(5, random_state=0)

In [None]:
# Every row must belong to one of the two known experiments; otherwise the sign
# convention below would silently treat any other value as 'pasting'.
unknown_experiments = set(df.loc[~df['experiment'].isin(['pasting', 'splitting']), 'experiment'])
assert not unknown_experiments, f'Unexpected experiment values: {unknown_experiments}'
# Signed step count: +iter_id for 'splitting' rows, -iter_id for all other
# (i.e. 'pasting') rows. Vectorised instead of a row-wise df.apply, which
# builds a Series per row and is far slower on large tables.
df['n_splits'] = df['iter_id'] * np.where(df['experiment'] == 'splitting', 1, -1)

In [None]:
def get_spearman(grp: pd.DataFrame) -> tuple:
    """Compute the three pairwise Spearman rank correlations for one group.

    Parameters
    ----------
    grp : pd.DataFrame
        Rows to correlate; must contain the columns 'D_order', 'D_structure'
        and 'n_splits'.

    Returns
    -------
    tuple
        ``(order_splits, structure_splits, structure_order)``, i.e. the
        Spearman coefficient of n_splits vs. D_order, of n_splits vs.
        D_structure, and of D_order vs. D_structure, in that order.

    Notes
    -----
    Exceptions raised by ``scipy.stats.spearmanr`` are not handled here and
    propagate to the caller.
    """
    D_order = grp['D_order'].tolist()
    D_structure = grp['D_structure'].tolist()
    n_splits = grp['n_splits'].tolist()
    # Only the coefficient is kept; the p-value of each test is discarded.
    order_splits = spearmanr(n_splits, D_order).correlation
    structure_splits = spearmanr(n_splits, D_structure).correlation
    structure_order = spearmanr(D_order, D_structure).correlation
    return order_splits, structure_splits, structure_order

In [None]:
def get_transition_errors(grp: pd.DataFrame) -> dict:
    """Validate the n_splits == 0 transition of one group and quantify its mismatch.

    The group must contain exactly two rows with ``n_splits == 0`` (presumably
    one from each experiment -- confirm against the data generation) and one
    more row than it has distinct ``n_splits`` values.

    Parameters
    ----------
    grp : pd.DataFrame
        Rows of a single group; must contain the columns 'n_splits', 'orig',
        'shuffled' and 'masked'.

    Returns
    -------
    dict
        Maps each of 'orig', 'shuffled' and 'masked' to the result of
        ``rel_error`` (from util) applied to the list of that column's two
        values at ``n_splits == 0``.

    Raises
    ------
    AssertionError
        If the group does not have the layout described above.
    """
    # Check whether the transition point at n_splits = 0 makes sense
    assert len(grp) == grp['n_splits'].nunique() + 1
    assert 0 in grp['n_splits'].unique()
    # Select the transition rows once instead of re-filtering for every column.
    zero_rows = grp[grp['n_splits'] == 0]
    assert len(zero_rows) == 2
    return {col: rel_error(zero_rows[col].tolist()) for col in ('orig', 'shuffled', 'masked')}

In [None]:
# Per (bible, book) results collected by the loop below:
spearmans = []  # (bible, book, (order_splits, structure_splits, structure_order))
# NOTE: `warnings` shares its name with the stdlib module; the name is kept
# because a later cell asserts on it.
warnings = []  # (bible, book, exception) where get_spearman raised ValueError
empties = []  # (bible, book) pairs that have no rows
bible_book_transitions = {}  # bible -> book -> result of get_transition_errors
for bible_name, bible_grp in df.groupby('bible'):
    bible_book_transitions[bible_name] = {}
    for book_name, book_grp in bible_grp.groupby('book'):
        # The emptiness check has to come before get_transition_errors: that
        # function asserts on the group's layout and would abort the whole loop
        # on an empty group, leaving this guard unreachable.
        if len(book_grp) == 0:
            empties.append((bible_name, book_name))
            continue
        bible_book_transitions[bible_name][book_name] = get_transition_errors(book_grp)
        try:
            spearmans.append((bible_name, book_name, get_spearman(book_grp)))
        except ValueError as e:
            # Record the failure instead of stopping; a later cell asserts that
            # this list stays empty.
            warnings.append((bible_name, book_name, e))

# Check errors

Check that there are no empties or warnings, and evaluate the transition errors

In [None]:
# No (bible, book) pair may be empty; on failure, report the offending pairs.
assert not empties, f'Bible/book pairs without any rows: {empties}'

In [None]:
# The Spearman computation must have succeeded for every pair; on failure,
# report the recorded (bible, book, exception) entries.
assert not warnings, f'Spearman correlation failed for: {warnings}'

In [None]:
# Flatten the nested bible -> book -> {column: error} mapping into one record
# per (bible, book) pair, then load the records into a DataFrame.
transition_errors = [
    {'bible': bible, 'book': book, **errors}
    for bible, books in bible_book_transitions.items()
    for book, errors in books.items()
]
transition_df = pd.DataFrame(transition_errors)

In [None]:
# For each measure, print its largest transition error together with the
# bible/book pair(s) at which that maximum occurs.
for col in ('orig', 'shuffled', 'masked'):
    is_max = transition_df[col] == transition_df[col].max()
    print(f'{col}:', transition_df.loc[is_max, [col, 'bible', 'book']].values)

In [None]:
# Plot Revelation for the ita-vita1997 Bible -- presumably the pair with the
# largest 'orig'/'masked' transition error printed above (hard-coded here;
# re-check if the data changes).
vita1997_rows = df.loc[df['bible'] == 'ita-x-bible-vita1997.txt']
fig_ax_max_orig_masked = make_book_plot(vita1997_rows, 'Revelation', 'ita-vita1997')

In [None]:
# Plot John for the etu Bible -- presumably the pair with the largest
# 'shuffled' transition error printed above (hard-coded here; re-check if the
# data changes).
etu_rows = df.loc[df['bible'] == 'etu-x-bible.txt']
fig_ax_max_shuffled = make_book_plot(etu_rows, 'John', 'etu')

These plots look odd, but the trends in them are completely consistent with the observations made before.

# Correlation coefficient histograms

In [None]:
# order_splits
# Each entry of `spearmans` is (bible, book, coefficients), where coefficients
# is (order_splits, structure_splits, structure_order); index 0 is plotted here.
order_splits_rhos = [coeffs[0] for _bible, _book, coeffs in spearmans]
plt.hist(order_splits_rhos)
plt.xlabel('Spearman(n_splits, D_order)')
plt.ylabel('Number of book-translation pairs')
plt.yscale('log')
plt.savefig(os.path.join(PLOTS_DIR, 'splits_order.png'))
plt.show()

In [None]:
# structure_splits
# Each entry of `spearmans` is (bible, book, coefficients), where coefficients
# is (order_splits, structure_splits, structure_order); index 1 is plotted here.
structure_splits_rhos = [coeffs[1] for _bible, _book, coeffs in spearmans]
plt.hist(structure_splits_rhos)
plt.xlabel('Spearman(n_splits, D_structure)')
plt.ylabel('Number of book-translation pairs')
plt.yscale('log')
plt.savefig(os.path.join(PLOTS_DIR, 'splits_structure.png'))
plt.show()

In [None]:
# structure_order
# Each entry of `spearmans` is (bible, book, coefficients), where coefficients
# is (order_splits, structure_splits, structure_order); index 2 is plotted here.
structure_order_rhos = [coeffs[2] for _bible, _book, coeffs in spearmans]
plt.hist(structure_order_rhos)
plt.xlabel('Spearman(D_order, D_structure)')
plt.ylabel('Number of book-translation pairs')
plt.yscale('log')
plt.savefig(os.path.join(PLOTS_DIR, 'order_structure.png'))
plt.show()