In [None]:
import json
from collections import defaultdict
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import linregress, spearmanr
import numpy as np
from data import build_dataframe

In [None]:
def parse_bible_name(filename: str) -> str:
    """Build a short bible identifier from a corpus file name.

    The file name is expected to follow the pattern
    ``<language>-x-bible[-<description>].txt``.

    Args:
        filename: Name of the bible file. Surrounding whitespace (for example a
            trailing newline) and a trailing ``.txt`` extension are ignored.

    Returns:
        ``'<language>-<description>'`` when a description is present,
        otherwise just ``'<language>'``.

    Raises:
        IndexError: If the name has fewer than three ``-``-separated fields.
        AssertionError: If the second field is not ``'x'`` or the third is not
            ``'bible'``.
    """
    stem = filename.strip()
    # Drop the extension whether or not the name came with a trailing newline,
    # so 'eng-x-bible.txt' and 'eng-x-bible.txt\n' are treated alike.
    if stem.endswith('.txt'):
        stem = stem[:-len('.txt')]
    parts = stem.split('-')
    language = parts[0]
    assert parts[1] == 'x', (filename, parts[1])
    assert parts[2] == 'bible', (filename, parts[2])
    description = '-'.join(parts[3:])
    return f'{language}-{description}' if description else language

In [None]:
# Read the two entropy result files and stack them into one frame.
# NOTE(review): build_dataframe is project-local; it presumably returns a
# DataFrame with one row per (bible file, book) -- confirm against data.py.
entropies_most, entropies_zho = (
    build_dataframe('output/KoplenigEtAl/entropies_backup.json'),
    build_dataframe('output/KoplenigEtAl/entropies_zho.json'),
)
df = pd.concat((entropies_most, entropies_zho))

In [None]:
# Derive metadata from the file name: the text before the first '-' is the
# language code; everything after the third '-' (with whatever follows the
# first '.' removed) is the free-form description.
df['language'] = df['filename'].map(lambda name: name.split('-')[0])
df['description'] = df['filename'].map(
    lambda name: '-'.join(name.split('.')[0].split('-')[3:])
)

In [None]:
# Koplenig et al label this language Mandarin Chinese ('cmn'), so relabel the
# generic Chinese code 'zho' to keep the language codes comparable.
df['language'] = df['language'].replace({'zho': 'cmn'})

In [None]:
# Exclude Burmese, as Koplenig et al don't explain how they inserted spaces.
# drop=True discards the old (duplicated, post-concat) index instead of storing
# it in a stray 'index' column; this also keeps the cell safe to re-run.
df = df[df['language'] != 'mya'].reset_index(drop=True)

In [None]:
# Entropy differences relative to the original text, computed column-wise
# (vectorised) rather than row by row:
#   D_structure = masked   - orig   (plotted as "word structure information")
#   D_order     = shuffled - orig   (plotted as "word order information")
df['D_structure'] = df['masked'] - df['orig']
df['D_order'] = df['shuffled'] - df['orig']

In [None]:
# Human-readable names for the book ids used in this analysis.
# Ids absent from this mapping end up as NaN in the 'book' column.
book_id_name = {
    '40': 'Matthew',
    '41': 'Mark',
    '42': 'Luke',
    '43': 'John',
    '44': 'Acts',
    '66': 'Revelation',
}
df['book'] = df['book_id'].map(book_id_name)

In [None]:
# One identifier per bible translation, derived from its file name.
df['bible_id'] = df['filename'].map(parse_bible_name)

In [None]:
# Quick look at three randomly chosen rows to sanity-check the derived columns.
df.sample(n=3)

In [None]:
# One scatter plot per book, with one labelled point per bible translation.
for book_name in df['book'].unique():
    book_df = df.loc[df['book'] == book_name]
    # Every bible may contribute at most one row per book.
    assert book_df['bible_id'].nunique() == len(book_df), book_name
    x, y, labels = (
        book_df[col].tolist() for col in ('D_order', 'D_structure', 'bible_id')
    )
    fig, ax = plt.subplots()
    ax.scatter(x, y)
    ax.set(
        xlabel='Word order information',
        ylabel='Word structure information',
        title=book_name,
    )
    for i, txt in enumerate(labels):
        ax.annotate(txt, xy=(x[i], y[i]), rotation=45)

In [None]:
# Restrict to a single book and average the two quantities over all bibles of
# the same language.
book_name = 'John'
book_df = df.loc[df['book'] == book_name]
# Every bible may contribute at most one row per book.
assert book_df['bible_id'].nunique() == len(book_df), book_name

book_df = (
    book_df.groupby('language')[['D_order', 'D_structure']]
    .mean()
    .reset_index()
)

In [None]:
# Scatter the per-language means of the selected book, labelling each point
# with its language code.
x, y, labels = (
    book_df[col].tolist() for col in ('D_order', 'D_structure', 'language')
)
fig, ax = plt.subplots()
ax.scatter(x, y)
ax.set_xlabel('Word order information')
ax.set_ylabel('Word structure information')
ax.set_title(book_name)
for i, txt in enumerate(labels):
    ax.annotate(txt, xy=(x[i], y[i]), rotation=45)

Verify:

* trade-off cualitativo
* valor del fit
* rank correlation coefficient entre los órdenes de los idiomas

# Trade-off cualitativo

Si grafico estos idiomas para los 6 libros, ¿me da un patrón similar al que se ve en Koplenig et al.?

In [None]:
# One scatter plot per book (alphabetical order), with one labelled point per
# language: values are averaged over all bibles of that language.
for book_name in sorted(df['book'].unique()):
    book_df = df.loc[df['book'] == book_name]
    # Every bible may contribute at most one row per book.
    assert book_df['bible_id'].nunique() == len(book_df), book_name
    book_df = (
        book_df.groupby('language')[['D_order', 'D_structure']]
        .mean()
        .reset_index()
    )
    x, y, labels = (
        book_df[col].tolist() for col in ('D_order', 'D_structure', 'language')
    )
    fig, ax = plt.subplots()
    ax.scatter(x, y)
    ax.set(
        xlabel='Word order information',
        ylabel='Word structure information',
        title=book_name,
    )
    for i, txt in enumerate(labels):
        ax.annotate(txt, xy=(x[i], y[i]), rotation=45)

Qualitatively, this is the same pattern as in Koplenig et al.

# Valor del fit

Podemos hacer un fit como hicieron en Koplenig et al, y ver si da parecido. Caveat: ellos usaron todos los idiomas, y yo estoy usando solo los que resaltaron.

In [None]:
def shifted_inverse(xi, A, B):
    """Evaluate the shifted inverse model ``A + B / xi``.

    Args:
        xi: Point(s) at which to evaluate the model (scalar or NumPy array);
            must be non-zero.
        A: Constant offset (the "shift").
        B: Scale of the inverse term.

    Returns:
        ``A + B / xi``, with the same shape as ``xi``.
    """
    # The previous form, A * B / xi, had no shift and made A and B
    # indistinguishable (only their product mattered).
    return A + B / xi

# For every book, fit D_structure = intercept + slope / D_order by ordinary
# least squares on the transformed predictor 1 / D_order, then overlay the
# fitted curve on the per-language scatter plot.
for book_name in sorted(df['book'].unique()):
    book_df = df[df['book'] == book_name]
    # Every bible may contribute at most one row per book.
    assert len(book_df) == book_df['bible_id'].nunique(), book_name
    # Average over bibles of the same language; sort by D_order so the fitted
    # curve is drawn from left to right.
    book_df = (
        book_df[['language', 'D_order', 'D_structure']]
        .groupby('language')
        .mean()
        .reset_index()
        .sort_values('D_order', ascending=True)
    )
    x = book_df['D_order'].tolist()
    y = book_df['D_structure'].tolist()
    labels = book_df['language'].tolist()

    fig, ax = plt.subplots()
    ax.scatter(x, y)
    ax.set_xlabel('Word order information')
    ax.set_ylabel('Word structure information')
    ax.set_title(book_name)
    for i, txt in enumerate(labels):
        ax.annotate(txt, (x[i], y[i]), rotation=45)

    # The regression does not depend on the order of the points, so the
    # inputs are used as-is (no reversing needed).
    inv_x = 1 / np.array(x, dtype=float)
    res = linregress(inv_x, y)
    pred_y = res.intercept + res.slope * inv_x
    # R-squared refers to the linear fit in 1 / D_order space.
    print(f"{book_name}: R-squared: {res.rvalue**2:.2f}, intercept: {res.intercept:.2f}, slope: {res.slope:.2f}")
    ax.plot(x, pred_y, 'r', label='fitted line')
    ax.legend()  # make the 'fitted line' label visible
    plt.show()
    plt.close(fig)  # release the figure; several are created in this loop

Qualitatively, these are similar to the results found in Koplenig et al. But they also computed the Spearman correlation coefficients for each book, and I didn't.

In [None]:
# Spearman rank correlation between word order and word structure information,
# per book, using per-language means.
# (shifted_inverse is already defined in an earlier cell and is not needed
# here, so it is not redefined.)
for book_name in sorted(df['book'].unique()):
    book_df = df[df['book'] == book_name]
    # Every bible may contribute at most one row per book.
    assert len(book_df) == book_df['bible_id'].nunique(), book_name
    book_df = book_df[['language', 'D_order', 'D_structure']].groupby('language').mean().reset_index()
    # No sorting needed: Spearman's coefficient only depends on how the
    # (x, y) pairs are matched, not on the order they are listed in.
    x = book_df['D_order'].tolist()
    y = book_df['D_structure'].tolist()
    print(f"{book_name}: r_s: {spearmanr(x, y).correlation:.2f}")

These are all stronger than those found in Koplenig et al, but I'm using fewer bibles.

# Rank correlation coefficient entre los órdenes de los idiomas

Ahora quiero ver si el rank que encuentro yo se correlaciona con el que encuentran ellos.

In [None]:
# Language rankings transcribed from Koplenig et al, one pair per book.
# *_structure lists languages from highest to lowest (top to bottom);
# *_order lists them from lowest to highest (left to right).
acts_structure = 'esk qvw tam zul chr grc deu eng vie mya xuo cmn'
acts_order = 'esk chr qvw zul deu tam grc eng vie xuo cmn mya'
john_structure = 'qvw esk tam zul chr grc deu eng vie mya xuo cmn'
john_order = 'esk qvw chr zul tam deu eng grc vie mya xuo cmn'
luke_structure = 'qvw esk tam zul chr grc deu eng vie mya xuo cmn'
luke_order = 'esk qvw chr zul deu tam grc eng vie mya xuo cmn'
revelation_structure = 'qvw esk tam zul chr grc deu eng vie mya cmn xuo'
revelation_order = 'esk qvw chr tam zul deu grc eng vie mya cmn xuo'

# book -> quantity ('structure' | 'order') -> ranked list of language codes
book_rank = {
    'acts': {
        'structure': acts_structure.split(' '),
        'order': acts_order.split(' '),
    },
    'john': {
        'structure': john_structure.split(' '),
        'order': john_order.split(' '),
    },
    'luke': {
        'structure': luke_structure.split(' '),
        'order': luke_order.split(' '),
    },
    'revelation': {
        'structure': revelation_structure.split(' '),
        'order': revelation_order.split(' '),
    },
}

# The twelve language codes every ranking must contain (despite the name,
# these are languages, not books).
koplenig_et_al_books = {
    'chr', 'cmn', 'deu', 'eng', 'esk', 'grc',
    'mya', 'tam', 'qvw', 'vie', 'xuo', 'zul',
}

# Guard against transcription typos: each ranking must be a permutation of the
# expected language set (same members, no duplicates).
for book, quantities in book_rank.items():
    for quantity, ranking in quantities.items():
        assert set(ranking) == koplenig_et_al_books
        assert len(ranking) == len(koplenig_et_al_books)

In [None]:
# Remove mya from the Koplenig et al results (Burmese is excluded from this
# analysis). The membership check makes the cell safe to re-run: a bare
# list.remove would raise ValueError once 'mya' is already gone.
for book, rest in book_rank.items():
    for q, v in rest.items():
        if 'mya' in v:
            v.remove('mya')

In [None]:
# Compare my per-language values with the rankings reported by Koplenig et al,
# book by book, using Spearman's rank correlation.
for book_name in sorted(df['book'].unique()):
    book_key = book_name.lower()
    # Only some books have a reference ranking to compare against.
    if book_key not in book_rank:
        continue
    book_df = df[df['book'] == book_name]
    # Every bible may contribute at most one row per book.
    assert len(book_df) == book_df['bible_id'].nunique(), book_name
    book_df = book_df[['language', 'D_order', 'D_structure']].groupby('language').mean().reset_index()
    x_pm = book_df['D_order'].tolist()
    y_pm = book_df['D_structure'].tolist()
    labels = book_df['language'].tolist()

    order_ranking = book_rank[book_key]['order']
    structure_ranking = book_rank[book_key]['structure']
    # The reference order ranking is listed low-to-high, so the list position
    # already increases with the quantity.
    x_k = [order_ranking.index(el) for el in labels]
    # The reference structure ranking is listed high-to-low; flip the position
    # so it also increases with the quantity. This way a positive coefficient
    # means agreement for both quantities.
    y_k = [len(structure_ranking) - 1 - structure_ranking.index(el) for el in labels]

    print(f"{book_name}: order r_s: {spearmanr(x_pm, x_k).correlation:.2f}")
    print(f"{book_name}: structure r_s: {spearmanr(y_pm, y_k).correlation:.2f}")

So my findings correlate very strongly with those of Koplenig et al, as expected.