In [1]:
import pandas as pd

# Load the per-line metadata table of each split (paths are relative to this notebook).
train_metadata = pd.read_csv("../../data/samisk_ocr_dataset/train/_metadata.csv")

# Keep only the training rows where both `page_30` and `gt_pix` equal False.
# NOTE(review): what these two flags stand for is not visible here — confirm upstream.
keep_row = train_metadata.page_30.eq(False) & train_metadata.gt_pix.eq(False)
train_split = train_metadata[keep_row]

val_split = pd.read_csv("../../data/samisk_ocr_dataset/val/_metadata.csv")
test_split = pd.read_csv("../../data/new_testset_with_newspapers/metadata.csv")

In [2]:
# Each metadata row is counted as one line, so the row count is the line count of a split.
train_lines, val_lines, test_lines = (
    len(split_frame) for split_frame in (train_split, val_split, test_split)
)

(train_lines, val_lines, test_lines)

(6141, 2035, 871)

In [3]:
# Iterating a DataFrame yields its column labels, so this prints "urn" and
# "page" rather than any row values.
# NOTE(review): if the (urn, page) pairs themselves were meant to be inspected,
# iterate over rows instead — confirm intent.
for column_label in train_split[["urn", "page"]].columns:
    print(column_label)

urn
page


In [23]:
from ast import literal_eval
from collections import defaultdict


def get_language_overview(df: pd.DataFrame):
    """Count documents, pages and lines per language code.

    Rows are grouped by the string stored in the ``langcodes`` column, which is
    parsed with ``literal_eval`` into a sequence of language codes. For every
    group:

    * documents = number of distinct ``urn`` values,
    * pages     = number of distinct ``(urn, page)`` pairs,
    * lines     = number of rows.

    A group tagged with a single language is credited in full (integer
    counts). A group tagged with several languages has each of its counts
    divided evenly among those languages, so totals may be fractional floats.

    Documents and pages are counted per ``langcodes`` group: a document whose
    rows fall into different groups contributes once to each of them.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with ``langcodes``, ``urn`` and ``page`` columns.

    Returns
    -------
    tuple of defaultdict
        ``(lang_docs, lang_pages, lang_lines)``, each mapping a language code
        to its count. Codes that never occur read as ``0``.

    Raises
    ------
    IndexError
        If a ``langcodes`` value parses to an empty list.
    """
    lang_docs = defaultdict(int)
    lang_pages = defaultdict(int)
    lang_lines = defaultdict(int)

    for raw_langcodes, group in df.groupby("langcodes"):
        langcodes = literal_eval(raw_langcodes)
        num_langs = len(langcodes)

        num_docs = len(group.urn.unique())
        num_pages = len(set(zip(group.urn, group.page)))
        num_lines = len(group)

        if num_langs > 1:
            # Share the group's counts evenly between all of its languages,
            # however many there are (previously only exactly two were accepted).
            for lang in langcodes:
                lang_docs[lang] += num_docs / num_langs
                lang_pages[lang] += num_pages / num_langs
                lang_lines[lang] += num_lines / num_langs
        else:
            # Single-language group: add the integer counts untouched so the
            # totals stay integers when no sharing is involved.
            lang = langcodes[0]
            lang_docs[lang] += num_docs
            lang_pages[lang] += num_pages
            lang_lines[lang] += num_lines

    return (lang_docs, lang_pages, lang_lines)


# Row labels of the overview table, keyed by the language codes that occur in
# the `langcodes` column.
# NOTE(review): the embedded `\\` is presumably a LaTeX line break inside the cell — confirm.
langcode_map = dict(
    smn=r"Inari\\Sámi",
    smj=r"Lule\\Sámi",
    sme=r"North\\Sámi",
    sma=r"South\\Sámi",
)

# One row per language and one (initially empty-string) column per split.
empty_column = [""] * len(langcode_map)
data_df = pd.DataFrame(
    {
        "language": list(langcode_map.values()),
        "train split": list(empty_column),
        "val split": list(empty_column),
        "test split": list(empty_column),
    }
).set_index("language")


def _format_count(count) -> str:
    """Render a count as text, dropping the ``.0`` of whole-number floats.

    Counts shared between several languages come back from
    ``get_language_overview`` as floats. Whole values such as ``21.0`` are
    shown as ``21`` so they match the integer cells, while genuinely
    fractional values such as ``2.5`` are kept unchanged.
    """
    return str(int(count)) if float(count).is_integer() else str(count)


# Fill one table cell per (language, split) with its document, page and line
# counts, joined by `\\`.
for split, df in (
    ("train split", train_split),
    ("val split", val_split),
    ("test split", test_split),
):
    doc_counts, page_counts, line_counts = get_language_overview(df)

    for langcode, row_label in langcode_map.items():
        # The counters are defaultdicts, so a language that is absent from a
        # split is reported as 0 instead of raising KeyError.
        data_df.at[row_label, split] = r"\\".join(
            (
                f"{_format_count(doc_counts[langcode])} docs",
                f"{_format_count(page_counts[langcode])} pages",
                f"{_format_count(line_counts[langcode])} lines",
            )
        )

data_df

Unnamed: 0_level_0,train split,val split,test split
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Inari\\Sámi,2.5 docs\\21.0 pages\\280.0 lines,3 docs\\3 pages\\109 lines,5 docs\\6 pages\\163 lines
Lule\\Sámi,2 docs\\4 pages\\81 lines,2 docs\\2 pages\\36 lines,4 docs\\4 pages\\137 lines
North\\Sámi,2.5 docs\\37.0 pages\\5572.0 lines,8 docs\\18 pages\\1837 lines,7 docs\\7 pages\\376 lines
South\\Sámi,5 docs\\9 pages\\208 lines,2 docs\\2 pages\\53 lines,4 docs\\4 pages\\195 lines


In [26]:
# Print the LaTeX source of the overview table so it can be copied as text.
latex_table = data_df.to_latex()
print(latex_table)

\begin{tabular}{llll}
\toprule
 & train split & val split & test split \\
language &  &  &  \\
\midrule
Inari\\Sámi & 2.5 docs\\21.0 pages\\280.0 lines & 3 docs\\3 pages\\109 lines & 5 docs\\6 pages\\163 lines \\
Lule\\Sámi & 2 docs\\4 pages\\81 lines & 2 docs\\2 pages\\36 lines & 4 docs\\4 pages\\137 lines \\
North\\Sámi & 2.5 docs\\37.0 pages\\5572.0 lines & 8 docs\\18 pages\\1837 lines & 7 docs\\7 pages\\376 lines \\
South\\Sámi & 5 docs\\9 pages\\208 lines & 2 docs\\2 pages\\53 lines & 4 docs\\4 pages\\195 lines \\
\bottomrule
\end{tabular}

