In [1]:
import pandas as pd

from formatting import TITLE_MAP, make_bold

# Load the annotation sheet and keep only the rows that are flagged as making
# a claim (`has_claim == 1`), as introducing a dataset
# (`introduces_dataset == 1`), and that have a non-null `isos_wals` entry.
df = pd.read_csv("data/annotations-enhanced.csv")
df = df[
    (df["has_claim"] == 1)
    & (df["introduces_dataset"] == 1)
    & (df["isos_wals"].notnull())
]

In [2]:
# Show the column labels available on `df` (used by the table cell below).
df.columns

Index(['id', 'title', 'year', 'web_url', 'venue', 'track', 'abstract',
       'has_claim', 'has_claim_e', 'has_claim_w', 'introduces_dataset',
       'introduces_dataset_e', 'introduces_dataset_w', 'claim_related_to_e',
       'claim_related_to_w', 'has_typ_justification_e',
       'has_typ_justification_w', 'justification_e', 'justification_w',
       'iso_codes_e', 'iso_codes_w', 'comment_e', 'comment_w', 'iso_codes',
       'isos_wals', 'n_langs', 'mpd', 'mpd_missing', 'mpd_low_cov',
       'gb_complete_cov', 'gb_incomplete_cov', 'gb_coverage', 'gb_missing'],
      dtype='object')

In [3]:
from fiject.visuals.tables import Table, ColumnStyle
from fiject import CacheMode

def _highlight_prefix(value, high, low):
    """Return the LaTeX cell-colour prefix for a column's extreme values.

    Args:
        value: The cell value being rendered.
        high: The column maximum; cells equal to it get the `high-color` shade.
        low: The column minimum; cells equal to it get the `low-color` shade.

    Returns:
        The colour command followed by a space, or an empty string when
        `value` is neither extreme (this includes NaN/None, which never
        compare equal to anything).
    """
    # The backslash is escaped explicitly: "\c" is not a valid escape sequence
    # and triggers a SyntaxWarning on recent Python versions.
    if value == high:
        return "\\cellcolor{high-color!40} "
    if value == low:
        return "\\cellcolor{low-color!40} "
    return ""


table = Table("datasets", CacheMode.NONE)

SORT_KEY = "n_langs"

# Largest value first; the down-arrow in the column header below marks this.
df = df.sort_values(SORT_KEY, ascending=False)

# Column extremes, used to shade the best/worst cell of each metric.
max_mpd = df["mpd"].max()
min_mpd = df["mpd"].min()

max_cov = df["gb_coverage"].max()
min_cov = df["gb_coverage"].min()

for _, row in df.iterrows():
    title = row["title"]
    cite, name, task = TITLE_MAP[title]
    name = name or ""  # some entries have no dataset name
    n_langs = str(int(row["n_langs"]))

    mpd_pre = _highlight_prefix(row["mpd"], max_mpd, min_mpd)
    mpd = f"{mpd_pre} {row['mpd']:.2f}"

    # Append the footnote-style markers when the corresponding counter is set.
    langs = f"{n_langs}\\nolang" if row["mpd_missing"] > 0 else n_langs
    mpd = f"{mpd}\\lowcov" if row["mpd_low_cov"] > 0 else mpd

    g_cov = row.get("gb_coverage", None)
    cov_pre = _highlight_prefix(g_cov, max_cov, min_cov)

    # A missing coverage value arrives as NaN (not None) when the column
    # exists, so test with pd.notna; otherwise the cell would render as "nan".
    if pd.notna(g_cov):
        g_cov = f"{cov_pre} {g_cov:.2f}"
    else:
        g_cov = f"{cov_pre} --"
    g_cov = f"{g_cov}\\missing" if row.get("gb_missing", 0) > 0 else g_cov

    table.set(name, [cite], [make_bold("Dataset name")])
    table.set(
        langs, [cite], [make_bold("$|$L$|$ ($\\downarrow$)")]
    )  # change if SORT_KEY is changed
    table.set(mpd, [cite], [make_bold("MPD")])
    table.set(g_cov, [cite], [make_bold("GB")])
    table.set(task, [cite], [make_bold("Task(s) or Topic")])

table.commit(
    default_column_style=ColumnStyle("l"),
    do_hhline_syntax=False,
)

Writing .tex datasets ...
\begin{tabular}{l||lllll}
	                                               & \textbf{Dataset name} & \textbf{$|$L$|$ ($\downarrow$)} & \textbf{MPD} & \textbf{GB} & \textbf{Task(s) or Topic} \\\hline\hline
	\citet{vylomova-etal-2020-sigmorphon} &  & 90 &  0.76\lowcov &  0.92\missing & Morphological inflection \\
	\citet{henrichsen-uneson-2012-smallworlds} & SMALLWorlds & 53 &  0.65\lowcov & \cellcolor{high-color!40}  0.94 & ASR \\
	\citet{fitzgerald-etal-2023-massive} & MASSIVE & 51 &  0.64\lowcov &  0.92 & Slot-filling, Intent classification \\
	\citet{ruder-etal-2021-xtreme} & XTREME-R & 50 &  0.66\lowcov &  0.92\missing & Classification, Parsing, IR, QA \\
	\citet{kodner-etal-2022-sigmorphon} &  & 32\nolang &  0.68\lowcov &  0.88\missing & Morphological inflection \\
	\citet{goldman-etal-2023-sigmorphon} &  & 26 &  0.64 &  0.89 & Morphological inflection \\
	\citet{longpre-etal-2021-mkqa} & MKQA & 25 &  0.61 &  0.89 & QA \\
	\citet{jiang-etal-2020-x} & X-FACT

# Note
The final table requires some tweaks:
- remove hlines and vertical lines
- alignment of L is nicer with r instead of l
- remove the nan
- put \\ at the bottom row
- add toprule and bottomrule
- add midrule for column heading