# Annotation stats

In [60]:
import pandas as pd
from pathlib import Path
from sklearn.metrics import cohen_kappa_score

# Load the annotation table; the "year" column is parsed as a date.
df = pd.read_csv(
    "data/annotations.csv",
    parse_dates=["year"],
)

In [61]:
# Number of rows flagged both as having a claim and as introducing a dataset.
df.loc[df["has_claim"].eq(1) & df["introduces_dataset"].eq(1)].shape[0]

38

In [62]:
# Cohen's kappa between the "has_claim_e" and "has_claim_w" columns
# (presumably one column per annotator -- confirm against the data dictionary).
cohen_kappa_score(y1=df["has_claim_e"], y2=df["has_claim_w"])

0.7772066061467789

In [63]:
# Keep only the rows with has_claim == 1. The explicit .copy() makes the result an
# independent frame, so the column assignments in the following cells do not write
# into a slice of the original table (the pattern behind SettingWithCopyWarning).
df = df.loc[df["has_claim"] == 1].copy()

In [64]:
# Missing introduces_dataset values are replaced by 0 in both columns so that every
# row contributes to the agreement score. assign() returns a new frame rather than
# setting columns one by one on `df`, which at this point is a filtered subset.
df = df.assign(
    introduces_dataset_e=df["introduces_dataset_e"].fillna(0),
    introduces_dataset_w=df["introduces_dataset_w"].fillna(0),
)

# Cohen's kappa between the two introduces_dataset columns.
cohen_kappa_score(df["introduces_dataset_e"], df["introduces_dataset_w"])


0.7136481963555226

In [65]:
def normalize_iso_codes(raw: str) -> str:
    """Return a canonical form of a comma-separated list of codes.

    The input is split on commas, each piece is stripped of surrounding
    whitespace, the pieces are sorted, and the result is re-joined with ","
    so that lists differing only in order or spacing become identical strings.
    """
    return ",".join(sorted(code.strip() for code in raw.split(",")))


# Missing code lists become the placeholder "NONE"; both columns then go through the
# same normalisation so they can be compared as plain strings. assign() returns a new
# frame instead of setting columns on `df`, which at this point is a filtered subset.
df = df.assign(
    iso_codes_e=df["iso_codes_e"].fillna("NONE").map(normalize_iso_codes),
    iso_codes_w=df["iso_codes_w"].fillna("NONE").map(normalize_iso_codes),
)

In [66]:
# Cohen's kappa between the "iso_codes_e" and "iso_codes_w" columns; each distinct
# string value in those columns is treated as its own label.
cohen_kappa_score(y1=df["iso_codes_e"], y2=df["iso_codes_w"])

0.43915078569040455

# Results stats

In [81]:
# NOTE(review): `unique_langs` is not assigned in any cell shown in this notebook, so this
# cell presumably relies on state from a cell that is no longer present -- confirm it
# survives Restart & Run All. The value displayed below (316) also differs from the
# "Unique langs" figure (315) printed by the summary cell that follows.
unique_langs

316

In [101]:
# Reload the full annotation table: `df` was narrowed to has_claim == 1 rows and had
# columns rewritten by the agreement cells above, so the summary starts from the file.
df = pd.read_csv("data/annotations.csv", parse_dates=["year"])

# Rows that have a value in "isos_wals".
with_langs = df["isos_wals"].notnull()

# Split every comma-separated code list exactly once and reuse it below. Each code is
# stripped of surrounding whitespace (as is done for the iso_codes_e / iso_codes_w
# columns) so that "eng, deu" and "eng,deu" yield the same codes.
code_lists = (
    df.loc[with_langs, "isos_wals"]
    .str.split(",")
    .apply(lambda parts: [part.strip() for part in parts])
)

# Per-row number of codes and the code list itself; rows without "isos_wals" stay NaN.
df.loc[with_langs, "nlangs"] = code_lists.str.len()
df.loc[with_langs, "codes"] = code_lists

# One row per distinct code with the number of rows that list it, most frequent first.
lang_df = (
    code_lists
    .explode()
    .value_counts()
    .rename_axis("lang")
    .reset_index(name="count")
)

print(
    f"""
Total papers:       {df.shape[0]}
Papers with claim:  {df[df['has_claim'] == 1].shape[0]}
Introduce dataset:  {df[df['introduces_dataset'] == 1].shape[0]}
N langs per paper:  {df['nlangs'].min()} -- {df['nlangs'].max()}, median: {df['nlangs'].median()}
Unique langs:       {lang_df.shape[0]}
Langs used once:    {lang_df[lang_df['count'] == 1].shape[0]}
Top 5 lang:         {lang_df.head()}
"""
)


Total papers:       194
Papers with claim:  110
Introduce dataset:  38
N langs per paper:  2.0 -- 90.0, median: 11.0
Unique langs:       315
Langs used once:    160
Top 5 lang:           lang  count
0  eng     63
1  deu     60
2  rus     58
3  fin     57
4  spa     55

