In [35]:
import os
from pathlib import Path

import altair as alt
import pandas as pd

from constants import ALTAIR_SETTINGS, URIEL_UNKNOWN, GB_FEATURE_MISSING, COLOR, COLOR_2
from distances import get_lang2vec_low_cov, get_syn_dict, get_gb_feature_cov

# Directory that receives every exported figure; created on first run.
IMG_PATH = Path("img")
IMG_PATH.mkdir(exist_ok=True)

# Grambank language/feature value table. The CSV carries a leftover
# "Unnamed: 0" column that is dropped right after loading.
gb = pd.read_csv("data/gb_lang_feat_vals.csv").drop(columns=["Unnamed: 0"])

# Languoid table; used further down to map ISO 639-3 codes to languoid ids.
gltc_df = pd.read_csv("data/languoid.csv")

# Load the precomputed pairwise distance tables.
# The required distance files can be found here: http://www.cs.cmu.edu/~aanastas/files/
# Download 'distances.zip', unzip it, and point the DISTANCES_PATH environment
# variable at the extracted folder. When the variable is not set, the original
# author's local checkout is used as a fallback so existing setups keep working.
distances_path = Path(
    os.environ.get(
        "DISTANCES_PATH",
        "/home/wessel/Documents/repos/typ-div-survey/sigtyp/pairwise-plot/distances",
    )
)
# Syntactic distances; these feed the per-paper MPD computation below.
dist_df = pd.read_csv(distances_path / "SYNTACTIC.csv")

# Paper annotations. Only rows that make a claim (`has_claim == 1`) and that
# list at least one language code (`isos_wals` not null) are kept.
df = pd.read_csv("data/annotations.csv", parse_dates=["year"])
has_claim = df["has_claim"] == 1
has_codes = df["isos_wals"].notna()
# Take an explicit copy: later cells add columns to `df` one cell at a time,
# and writing into a boolean-filtered slice of the loaded frame can trigger
# pandas' SettingWithCopyWarning.
df = df.loc[has_claim & has_codes].copy()

In [36]:
# Distinct language codes across all papers (the column holds comma-separated codes).
code_lists = df["isos_wals"].str.split(",")
all_codes = set(code_lists.explode().unique())

# Codes left after removing the ones listed in URIEL_UNKNOWN.
uriel_final = list(all_codes - URIEL_UNKNOWN)

# Languages reported as low-coverage by the lang2vec helper.
low_cov_langs = set(get_lang2vec_low_cov(uriel_final))

In [37]:
# Reference coverage per Grambank feature. The result is indexed by feature
# name below and compared against the set of values observed for a paper's
# languages; presumably it holds, per feature, the set of non-missing values
# attested anywhere in Grambank -- TODO confirm against `get_gb_feature_cov`.
gb_max_coverage = get_gb_feature_cov(gb, GB_FEATURE_MISSING)

In [38]:
# Lookup from the languoid table's `iso639P3code` column to its `id` column.
# Built column-wise instead of via `iterrows()` (which materialises a Series
# per row); rows without an ISO code are dropped first so that no NaN keys
# end up in the mapping. Lookups by code string are unaffected.
languoids_with_iso = gltc_df.dropna(subset=["iso639P3code"])
gltc_mapping = dict(zip(languoids_with_iso["iso639P3code"], languoids_with_iso["id"]))

# Grambank feature columns: every column whose name starts with "GB".
gb_feats = [col for col in gb.columns if col.startswith("GB")]

In [39]:
# Per-paper metrics, written back onto `df` as new columns:
#   n_langs            number of distinct language codes of the paper
#   mpd                value returned by get_syn_dict for the usable codes
#   mpd_missing        codes of the paper that are in URIEL_UNKNOWN
#   mpd_low_cov        codes of the paper that are in low_cov_langs
#   gb_complete_cov    features whose observed value set equals gb_max_coverage
#   gb_incomplete_cov  all remaining features
#   gb_coverage        gb_complete_cov / number of GB features
#   gb_missing         codes without an entry in gltc_mapping
n_gb_feats = len(gb_feats)  # loop-invariant

for idx, row in df.iterrows():
    # --- Syntactic MPD ---
    codes_t = set(row["isos_wals"].split(","))
    n_langs = len(codes_t)

    codes_filtered_low_cov = codes_t - low_cov_langs
    codes_filtered_missing = codes_t - URIEL_UNKNOWN
    # Only codes that are neither low-coverage nor unknown go into the distance.
    codes_filtered = list(codes_filtered_low_cov - URIEL_UNKNOWN)

    mean_pairwise_distance = get_syn_dict(dist_df, codes_filtered, norm=False)

    # --- GB feature coverage ---
    # Languoid ids for the codes that have a (truthy) entry in the mapping.
    relevant_gltc = [i for c in codes_t if (i := gltc_mapping.get(c))]
    gb_ann = gb[gb["Lang_ID"].isin(relevant_gltc)]

    compl_cov, incompl_cov = [], []
    for f_name in gb_feats:
        # Values observed for this feature among the paper's languages,
        # ignoring the markers listed in GB_FEATURE_MISSING.
        f_vals = {val for val in gb_ann[f_name] if val not in GB_FEATURE_MISSING}
        if gb_max_coverage[f_name] == f_vals:
            compl_cov.append(f_name)
        else:
            incompl_cov.append(f_name)

    # NOTE(review): gb_missing counts codes without a gltc_mapping entry, not
    # codes without a row in `gb`; confirm this is the intended definition.
    metrics = {
        "n_langs": n_langs,
        "mpd": mean_pairwise_distance,
        "mpd_missing": n_langs - len(codes_filtered_missing),
        "mpd_low_cov": n_langs - len(codes_filtered_low_cov),
        "gb_complete_cov": len(compl_cov),
        "gb_incomplete_cov": len(incompl_cov),
        "gb_coverage": len(compl_cov) / n_gb_feats,
        "gb_missing": n_langs - len(relevant_gltc),
    }
    # Same write order as before, so the new columns appear in the same order.
    for column, value in metrics.items():
        df.at[idx, column] = value

  return np.nanmean(matrix)


In [40]:
# Scatter plot: Grambank feature coverage against the number of languages per paper.
coverage_x = alt.X("n_langs", title="Number of languages")
coverage_y = alt.Y("gb_coverage", title="Feature coverage")

coverage_plot = alt.Chart(df).mark_point(color=COLOR).encode(x=coverage_x, y=coverage_y)
coverage_plot

In [41]:
# Scatter plot: MPD against the number of languages per paper.
# The y axis is pinned to [0, 1].
mpd_x = alt.X("n_langs", title="Number of languages")
mpd_y = alt.Y("mpd", title="MPD", scale=alt.Scale(domain=[0, 1]))

mpd_plot = alt.Chart(df).mark_point(color=COLOR_2).encode(x=mpd_x, y=mpd_y)
mpd_plot

In [42]:
# Export both scatter plots as PDF files into IMG_PATH, one file per label.
plots = [
    ("coverage-by-lang-plot", coverage_plot),
    ("mpd-by-lang-plot", mpd_plot),
]
for label, plot in plots:
    pdf_path = IMG_PATH / f"{label}.pdf"
    plot.save(str(pdf_path))

In [43]:
# Distance types to process in the next cell ('Phonological' is left out).
to_include = [
    "Genetic",
    "Geographic",
    "Syntactic",
]

# One list of language codes per paper, in `df` row order.
claim_codes = df["isos_wals"].str.split(",").tolist()

In [44]:
# Mean pairwise distance per paper for every distance type in `to_include`.
# The code filtering mirrors the per-paper metrics loop above.
#
# The distance files are visited in sorted order: `Path.glob` yields entries
# in filesystem order, and the layout cell further down picks charts by their
# position in `all_dists`, so an unsorted walk made the figure layout depend
# on the machine it ran on.
all_dists = []
for feature_path in sorted(distances_path.glob("*.csv")):
    feature_name = feature_path.stem.title()
    if feature_name not in to_include:
        continue
    print(f'Working on {feature_name} ...')
    # Deliberately a separate name: the global `dist_df` holds the syntactic
    # table used by the metrics cell and must not be overwritten here.
    feature_dist_df = pd.read_csv(feature_path)

    papers_missing = 0
    papers_low_cov = 0

    records = []

    for paper_codes in claim_codes:
        codes_t = set(paper_codes)

        codes_filtered_low_cov = codes_t - low_cov_langs
        codes_filtered_missing = codes_t - URIEL_UNKNOWN
        codes_filtered = list(codes_filtered_low_cov - URIEL_UNKNOWN)

        # Count papers that lose at least one language to either filter.
        if len(codes_t) != len(codes_filtered_missing):
            papers_missing += 1

        if len(codes_t) != len(codes_filtered_low_cov):
            papers_low_cov += 1

        mean_pairwise_distance = get_syn_dict(feature_dist_df, codes_filtered)
        records.append({"paper_codes": paper_codes, "distances": mean_pairwise_distance})

    all_dists.append(
        {
            "feature_name": feature_name,
            "records": records,
            "papers_with_missing": papers_missing,
            "papers_with_low_cov": papers_low_cov,
        }
    )

Working on Geographic ...


  return np.nanmean(matrix)


Working on Genetic ...


  return np.nanmean(matrix)


Working on Syntactic ...


  return np.nanmean(matrix)


In [103]:
# One tick ("barcode") chart per distance type, x axis fixed to [0, 1].
# The syntactic chart uses COLOR_2, every other chart uses COLOR.
plots = []
for feature in all_dists:
    name = feature["feature_name"]
    per_paper_distances = [record["distances"] for record in feature["records"]]
    temp_df = pd.DataFrame({"distances": per_paper_distances})

    tick_color = COLOR_2 if name == 'Syntactic' else COLOR
    x_axis = alt.X(
        "distances",
        title=f"Mean pairwise {name.lower()} distance",
        scale=alt.Scale(domain=(0.0, 1.0)),
    )

    temp_plot = alt.Chart(temp_df).encode(x=x_axis).mark_tick(
        **dict(ALTAIR_SETTINGS, color=tick_color)
    )
    plots.append(temp_plot)

In [104]:
# Tick ("barcode") chart of the per-paper Grambank coverage, x axis fixed to [0, 1].
gb_x_axis = alt.X(
    "gb_coverage",
    title="Grambank feature coverage",
    scale=alt.Scale(domain=(0.0, 1.0)),
)
gb_barcode = alt.Chart(df).encode(x=gb_x_axis).mark_tick(
    **dict(ALTAIR_SETTINGS, color=COLOR_2)
)
# Appended after the distance charts, so it is the last entry of `plots`.
plots.append(gb_barcode)

In [105]:
# Combine the four barcode charts into one figure. Charts are picked by their
# position in `plots`: entries 0 and 2 form one group, entries 1 and 3 the
# other, and the two groups are then stacked vertically.
first_group = (plots[0], plots[2])
second_group = (plots[1], plots[3])

left = alt.concat(*first_group)
right = alt.concat(*second_group)

dist_plot = alt.vconcat(left, right)
dist_plot

In [48]:
# Export the combined barcode figure as a PDF into IMG_PATH.
barcode_pdf = IMG_PATH / "barcode-plots.pdf"
dist_plot.save(str(barcode_pdf))

In [49]:
# Persist `df` (the filtered annotations plus the columns added in this
# notebook) without the index; the datasets notebook uses this file.
df.to_csv('data/annotations-enhanced.csv', index=False)

In [72]:
# Summary statistics for the MPD and Grambank coverage columns of `df`.
#
# The titles of the extreme papers are looked up with idxmax/idxmin, which
# return the label of the first matching row. The previous
# `df[df[col] == df[col].max()]['title'].item()` raised a ValueError as soon
# as two papers tied for the highest or lowest value.
highest_mpd_title = df.loc[df['mpd'].idxmax(), 'title']
lowest_mpd_title = df.loc[df['mpd'].idxmin(), 'title']
highest_cov_title = df.loc[df['gb_coverage'].idxmax(), 'title']
lowest_cov_title = df.loc[df['gb_coverage'].idxmin(), 'title']

print(f"""

-- MPD --
Languages with <5% coverage:                    {len(low_cov_langs)}
Papers with at least one lang in MPD missing:   {df[df['mpd_missing'] > 0].shape[0]}    
Papers with at least one lang in MPD low cov:   {df[df['mpd_low_cov'] > 0].shape[0]}
Average MPD:                                    {df['mpd'].min():.2f} -- {df['mpd'].max():.2f}, avg: {df['mpd'].mean():.2f}
Paper with highest MPD:                         {highest_mpd_title}
Paper with lowest MPD:                          {lowest_mpd_title}

-- GB --
Papers with at least one lang in GB missing:    {df[df['gb_missing'] > 0].shape[0]}    
Average GB coverage:                            {df['gb_coverage'].min():.2f} -- {df['gb_coverage'].max():.2f}, avg: {df['gb_coverage'].mean():.2f}
Paper with highest coverage:                    {highest_cov_title}
Paper with lowest coverage:                     {lowest_cov_title}
""")




-- MPD --
Languages with <5% coverage:                    70
Papers with at least one lang in MPD missing:   5    
Papers with at least one lang in MPD low cov:   31
Average MPD:                                    0.42 -- 0.86, avg: 0.64
Paper with highest MPD:                         A systematic comparison of methods for low-resource dependency parsing on genuinely low-resource languages
Paper with lowest MPD:                          An Unsupervised, Geometric and Syntax-aware Quantification of Polysemy

-- GB --
Papers with at least one lang in GB missing:    12    
Average GB coverage:                            0.00 -- 0.95, avg: 0.73
Paper with highest coverage:                    From characters to words: the turning point of BPE merges
Paper with lowest coverage:                     Making More of Little Data: Improving Low-Resource Automatic Speech Recognition Using Data Augmentation

