In [36]:
import pandas as pd
from collections import defaultdict
from fiject.visuals.tables import Table, ColumnStyle
from fiject import CacheMode
import numpy as np

from formatting import make_it, make_bold

# WALS feature table: one row per language, looked up downstream via its
# "ISO" column and one column per WALS feature name.
WALS_DF = pd.read_csv("data/wals_dedup.csv")
# Per-language results; downstream cells read its "task", "model",
# "wals_iso" and "score" columns.
X_DF = pd.read_csv("data/xtreme-r_results.csv")

In [37]:
# Names of the WALS_DF columns holding the two features that get a table each.
WORD_ORDER = "Order of Subject, Object and Verb"
INFLECTION = "Prefixing vs. Suffixing in Inflectional Morphology"

In [38]:
def get_wals_feature(feature_col):
    """Map every language's ISO code to its value for one WALS feature.

    Args:
        feature_col: Name of a column of ``WALS_DF``.

    Returns:
        dict: ``{ISO code: feature value}``. Values are returned exactly as
        stored in ``WALS_DF`` (missing cells are not filtered out). If an ISO
        code occurs in several rows, the last row wins.

    Raises:
        KeyError: If ``"ISO"`` or ``feature_col`` is not a column of ``WALS_DF``.
    """
    # Pair the two columns directly; iterrows() would build a Series for every
    # row just to read two cells from it.
    return dict(zip(WALS_DF["ISO"], WALS_DF[feature_col]))


def get_feature_order(feature):
    """Return the column order used for the values of a WALS feature.

    Args:
        feature: A WALS feature name (``WORD_ORDER`` or ``INFLECTION``).

    Returns:
        list[str] | None: The feature's values in display order, ending with
        ``"nan"`` for languages without a value; ``None`` when no order is
        defined for ``feature``.
    """
    word_order_values = [
        "SVO",
        "SOV",
        "VSO",
        "OSV",
        "OVS",
        "VOS",
        "No dominant order",
        "nan",
    ]
    inflection_values = [
        "Equal prefixing and suffixing",
        "Little affixation",
        "Strong prefixing",
        "Strongly suffixing",
        "Weakly prefixing",
        "Weakly suffixing",
        "nan",
    ]
    orders = {WORD_ORDER: word_order_values, INFLECTION: inflection_values}
    return orders.get(feature)

In [39]:
# Row label per task: the bold task name followed by a backslash and the
# metric name (presumably \acc, \f and \map are macros defined in the target
# LaTeX document -- confirm there).
TASK_LABEL_MAP = {
    task: f"{make_bold(task)}\\{metric}"
    for task, metric in (
        ("XNLI", "acc"),
        ("XCOPA", "acc"),
        ("UD-POS", "f"),
        ("WikiANN-NER", "f"),
        ("XQuAD", "f"),
        ("MLQA", "f"),
        ("TyDiQA", "f"),
        ("Tatoeba", "acc"),
        ("Mewsli-X", "map"),
        ("LAReQA", "map"),
    )
}

# Column header per WALS feature value. Raw strings keep the LaTeX backslashes
# literal; the previous '\m' and '\&' were invalid escape sequences that only
# worked by accident and trigger a SyntaxWarning on recent Python versions.
FEATURE_LABEL_MAP = {
    "Equal prefixing and suffixing": r"\makecell{Equal\\ Pre \& Suf}",
    "Little affixation": r"\makecell{Little\\ Aff}",
    "Strong prefixing": r"\makecell{Strong\\ Pre}",
    "Strongly suffixing": r"\makecell{Strong\\ Suf}",
    "Weakly prefixing": r"\makecell{Weak\\ Pre}",
    "Weakly suffixing": r"\makecell{Weak\\ Suf}",
    # str(NaN): languages without a value for the feature.
    "nan": "NA",
    "SVO": "SVO",
    "SOV": "SOV",
    "VSO": "VSO",
    "OSV": "OSV",
    "OVS": "OVS",
    "VOS": "VOS",
    "No dominant order": "NDO",
}

In [24]:
# Add one boolean column per feature to X_DF: True when the row's language
# has a string value for that feature in WALS (missing values are not strings).
for label, feature_name in (("word_order", WORD_ORDER), ("inflection", INFLECTION)):
    features_by_lang = get_wals_feature(feature_name)
    X_DF[f"has_{label}_coverage"] = X_DF["wals_iso"].apply(
        lambda iso_code: isinstance(features_by_lang[iso_code], str)
    )

In [40]:
# Show the distinct task names present in the results, sorted.
sorted(X_DF["task"].unique())

['LAReQA',
 'MLQA',
 'Mewsli-X',
 'Tatoeba',
 'TyDiQA',
 'UD-POS',
 'WikiANN-NER',
 'XCOPA',
 'XNLI',
 'XQuAD']

In [35]:
# Per task, count the result rows whose language has / lacks a word-order
# value (uses the has_word_order_coverage column added to X_DF earlier).
X_DF.groupby(['task']).value_counts(['has_word_order_coverage'])

task         has_word_order_coverage
LAReQA       True                       22
MLQA         True                       21
Mewsli-X     True                       22
Tatoeba      True                       76
             False                       6
TyDiQA       True                       27
UD-POS       True                       72
             False                       4
WikiANN-NER  True                       86
             False                      10
XCOPA        True                       27
             False                       6
XNLI         True                       45
XQuAD        True                       33
Name: count, dtype: int64

In [9]:
def _mean_ignoring_nan(values):
    """Return the mean of the non-NaN ``values``, or ``None`` if there are none."""
    valid = [value for value in values if not np.isnan(value)]
    return sum(valid) / len(valid) if valid else None


def _format_mean(mean):
    """Format a mean with two decimals; ``None`` (no valid score) becomes ``-``."""
    return "-" if mean is None else f"{mean:.2f}"


def create_table(feature, table_name):
    """Build and commit a table of scores broken down by a WALS feature.

    One row is written per (task, model) pair found in ``X_DF``, with these
    columns, in this order:

    * Overall: mean score over the model's languages, followed by the number
      of distinct languages of the task.
    * By F: unweighted mean of the per-feature-value means, followed by the
      number of languages that have a value for the feature.
    * Delta: "By F" minus "Overall".
    * One column per feature value (order from ``get_feature_order``): mean
      score and number of languages. The value with the most languages is
      coloured with ``high-color``, the one with the fewest with ``low-color``.

    NaN scores are left out of every mean but the language still counts
    towards the language totals. A cell without any valid score shows ``-``.

    Args:
        feature: Name of the WALS feature column to group languages by.
        table_name: Name passed to ``Table``.

    Raises:
        KeyError: If ``feature`` is not a column of ``WALS_DF``, a task has no
            entry in ``TASK_LABEL_MAP``, or a language in ``X_DF`` is missing
            from WALS.
        ValueError: If no column order is defined for ``feature``.
    """
    table = Table(table_name, caching=CacheMode.NONE)
    feature_vals = get_wals_feature(feature)
    wals_l = get_feature_order(feature)
    if wals_l is None:
        # Fail here with a clear message instead of a TypeError after all the
        # per-task work has been done.
        raise ValueError(f"No column order defined for WALS feature {feature!r}")

    # Loop-invariant pieces.
    overall_label = make_bold("Overall")
    by_feat_label = make_bold("By F")
    models = sorted(X_DF["model"].unique())

    for task in X_DF["task"].unique():
        task_label = TASK_LABEL_MAP[task]
        task_rows = X_DF[X_DF["task"] == task]
        # Language count of the task as a whole, not of a single model.
        n_langs = len(task_rows["wals_iso"].unique())
        lang_label = f"{make_it(n_langs)}"

        for model in models:
            scores = task_rows.loc[task_rows["model"] == model]
            if len(scores) == 0:
                continue

            # Score per language (a later duplicate language overwrites an earlier one).
            scores_dict = dict(zip(scores["wals_iso"], map(float, scores["score"])))
            default_row = [task_label, model]

            # Average overall
            avg_all = _mean_ignoring_nan(scores_dict.values())
            table.set(
                f"{_format_mean(avg_all)} ({lang_label})", default_row, [overall_label]
            )

            # Group the scores by the language's feature value; languages
            # without a value end up under the key "nan" (str(NaN)).
            scores_by_value = defaultdict(list)
            for lang, score in scores_dict.items():
                scores_by_value[str(feature_vals[lang])].append(score)

            # feature value -> (mean of valid scores or None, number of languages)
            stats = {
                feat_value: (_mean_ignoring_nan(value_scores), len(value_scores))
                for feat_value, value_scores in scores_by_value.items()
            }

            # NOTE(review): this macro-average includes the "nan" bucket while
            # the language count next to it excludes it. Behaviour is kept as
            # it was; confirm which of the two is intended.
            feat_avgs = [mean for mean, _ in stats.values() if mean is not None]
            f_avg = sum(feat_avgs) / len(feat_avgs) if feat_avgs else None
            f_langs = make_it(
                sum(count for value, (_, count) in stats.items() if value != "nan")
            )
            table.set(f"{_format_mean(f_avg)} ({f_langs})", default_row, [by_feat_label])

            if f_avg is None or avg_all is None:
                delta = "-"
            else:
                delta = f"{f_avg - avg_all:+.2f}"
            table.set(delta, default_row, [r"$\Delta$"])

            counts = [count for _, count in stats.values()]
            max_count = max(counts, default=None)
            min_count = min(counts, default=None)

            for w_l in wals_l:
                avg, count = stats.get(w_l, (None, 0))

                if count == max_count:
                    pre = r"\cellcolor{high-color!40} "
                elif count == min_count:
                    pre = r"\cellcolor{low-color!40} "
                else:
                    pre = ""

                lang_l = f"({make_it(count)})"
                # `is not None` rather than truthiness: an average of exactly
                # 0.0 is a real result and must not be rendered as "-".
                if avg is not None:
                    val = f"{pre} {avg:.2f} {lang_l}"
                else:
                    val = f"- {lang_l}"

                table.set(val, default_row, [FEATURE_LABEL_MAP[w_l]])

    table.commit(
        default_column_style=ColumnStyle("l"),
        do_hhline_syntax=False,
    )

# Note
The generated `.tex` tables still need a few manual tweaks:
- give the first two columns a header (they are emitted without a column name)
- replace the double `\hline` with a double `\midrule`
- add a `\midrule` between tasks
- add a vertical line where the feature columns start
- align the $\Delta$ column with `r` instead of `l`, which looks nicer
- add `\\` at the end of the bottom row
- add `\toprule` and `\bottomrule`

In [6]:
# Write one table per WALS feature.
create_table(feature=WORD_ORDER, table_name="word_order")
create_table(feature=INFLECTION, table_name="inflection")

Writing .tex word_order ...
\begin{tabular}{ll||lllllllllll}
	                                        & & \textbf{Overall} & \textbf{By F} & $\Delta$ & SVO & SOV & VSO & OSV & OVS & VOS & NDO & NA \\\hline\hline
	\multirow{2}{*}{\textbf{LAReQA}\map} & XLM-R-L & 40.75 (\textit{11}) & 39.31 (\textit{11}) & -1.44 & \cellcolor{high-color!40}  42.10 (\textit{6}) &  39.75 (\textit{2}) & \cellcolor{low-color!40}  34.60 (\textit{1}) & - (\textit{0}) & - (\textit{0}) & - (\textit{0}) &  40.80 (\textit{2}) & - (\textit{0}) \\
	 & mBERT & 21.58 (\textit{11}) & 19.75 (\textit{11}) & -1.83 & \cellcolor{high-color!40}  24.10 (\textit{6}) &  15.10 (\textit{2}) & \cellcolor{low-color!40}  17.00 (\textit{1}) & - (\textit{0}) & - (\textit{0}) & - (\textit{0}) &  22.80 (\textit{2}) & - (\textit{0}) \\
	\multirow{3}{*}{\textbf{MLQA}\f} & XLM-R-L & 72.71 (\textit{7}) & 70.83 (\textit{7}) & -1.88 & \cellcolor{high-color!40}  75.22 (\textit{4}) & \cellcolor{low-color!40}  70.80 (\textit{1}) & \cellcolor{low-