# LIA subsets

LIA was divided into train, dev and test in [UD](https://github.com/UniversalDependencies/UD_Norwegian-NynorskLIA/tree/8a4ea1a6e0e1fbb4ef5ba34c2d408563e9c8cf9a).

The [LIA](https://github.com/textlab/spoken_norwegian_resources/tree/master/treebanks/Norwegian-NynorskLIA) conllu-treebank is divided into 18 files, 1 per speaker/conversation.

Each sentence has a `sent_id` that is unique across all partitions in UD, whereas in LIA each sentence only has a file-internal `id` that starts at 1 and is incremented within each file.

Here we use the speaker/file id and the sentence text from UD to find the corresponding LIA sentence for each UD `sent_id`, so that the partitions can be recreated from the LIA files.

# Load data

In [None]:
from pathlib import Path

import conllu
from conllu import parse

# Directory holding the UD release of the treebank; passed to load_partition below.
UD_path = Path("../data/UD_Norwegian-NynorskLIA")
# Directories holding the LIA treebank files and their "_old" sibling.
# NOTE(review): load_lia_sentences builds the same two paths from its own
# dir_path argument and does not read these module-level names.
LIA_path = Path("../spoken_norwegian_resources/treebanks/Norwegian-NynorskLIA")
LIA_old_path = Path("../spoken_norwegian_resources/treebanks/Norwegian-NynorskLIA_old")


def load_partition(filepath: Path, partition: str = "train") -> list:
    """Load one of the UD dataset partitions train, dev, or test.

    Args:
        filepath: Directory that is searched for the partition file.
        partition: Partition name; the first file in ``filepath`` whose name
            ends with ``{partition}.conllu`` is loaded.

    Returns:
        The sentences returned by ``conllu.parse`` for that file.

    Raises:
        FileNotFoundError: If no file in ``filepath`` matches the partition.
    """

    def parse_pairs(key, value):
        # Comment lines other than sent_id/text are read as whitespace-separated
        # "name: value" tokens taken from the key part of the line; ``value``
        # is not used. Pairing with zip means a trailing token without a
        # partner is ignored instead of raising IndexError.
        tokens = key.split()
        return [
            [name.rstrip(":"), content]
            for name, content in zip(tokens[::2], tokens[1::2])
        ]

    partition_file = next(filepath.glob(f"*{partition}.conllu"), None)
    if partition_file is None:
        # Report the missing file explicitly rather than leaking StopIteration.
        raise FileNotFoundError(
            f"No file matching '*{partition}.conllu' in {filepath}"
        )

    # CoNLL-U files are UTF-8; do not depend on the platform default encoding.
    data = partition_file.read_text(encoding="utf-8")
    sentences = parse(
        data,
        metadata_parsers={
            "sent_id": lambda key, value: (key, value),
            "text": lambda key, value: (key, value),
            "__fallback__": parse_pairs,
        },
    )
    return sentences


def load_lia_sentences(
    filestem: str, dir_path: Path = Path("../spoken_norwegian_resources/treebanks/")
) -> list:
    """Load and parse the LIA file that belongs to one speaker/conversation.

    The file ``{filestem}.conll`` is looked up first in
    ``dir_path / "Norwegian-NynorskLIA"`` and then in
    ``dir_path / "Norwegian-NynorskLIA_old"``.

    Args:
        filestem: File name without the ``.conll`` extension.
        dir_path: Directory that contains the two treebank directories.

    Returns:
        The sentences returned by ``conllu.parse``. When the file exists in
        neither directory a message is printed and the result of parsing an
        empty string is returned.
    """
    filename = filestem + ".conll"

    lia_data = ""
    for treebank_dir in ("Norwegian-NynorskLIA", "Norwegian-NynorskLIA_old"):
        lia_file = dir_path / treebank_dir / filename
        try:
            # CoNLL-U files are UTF-8; do not depend on the platform default.
            lia_data = lia_file.read_text(encoding="utf-8")
        except FileNotFoundError:
            continue
        break
    else:
        print(f"Couldn't load {filestem}")

    # Parsing happens after the lookup, not in a ``finally`` clause, so any
    # other error from reading (permissions, decoding, ...) propagates as
    # itself instead of being masked by an unbound ``lia_data``.
    return parse(lia_data)

In [None]:
# Name of the UD partition to process; also used in the output file names below.
partition = "test"
# The parenthesised walrus binds UD_sentences and is also the cell's last
# expression, so a notebook front end can display the loaded value.
(UD_sentences := load_partition(UD_path, partition))

# Mapping 
Iterate over the UD sentences and map each one to the LIA sentence from the same speaker file that has the same text (a trailing " ." in the UD text is ignored if needed).

In [None]:
def map_ud_partition_to_lia_sentences(sentences: conllu.models.TokenList) -> dict:
    """Pair every UD sentence with the LIA sentence that carries the same text.

    For each UD sentence the LIA file named by its ``speakerid`` metadata is
    loaded and scanned. A LIA sentence counts as a match when its ``text``
    equals the UD ``text``, either verbatim or after trailing spaces and
    periods have been stripped from the UD text. When several LIA sentences
    match, the last one scanned is kept.

    Args:
        sentences: The UD sentences to look up, iterated one at a time.

    Returns:
        A dict with two entries: ``"match"`` maps each UD ``sent_id`` to the
        matching LIA sentence, and ``"no_match"`` maps each UD ``sent_id``
        without a match to the UD sentence itself.
    """
    found = {}
    missing = {}

    for ud_sentence in sentences:
        ud_meta = ud_sentence.metadata
        ud_id = ud_meta["sent_id"]
        ud_text = ud_meta["text"]
        candidates = load_lia_sentences(ud_meta["speakerid"])

        for candidate in candidates:
            lia_text = candidate.metadata["text"]
            # Skip candidates that match neither the raw nor the stripped text.
            if lia_text != ud_text and lia_text != ud_text.rstrip(" ."):
                continue
            found[ud_id] = candidate

        if ud_id in found:
            continue
        missing[ud_id] = ud_sentence

    return {"match": found, "no_match": missing}

In [None]:
# Result is a dict with "match" and "no_match" entries, both keyed by UD sent_id.
mapping = map_ud_partition_to_lia_sentences(UD_sentences)

# Annotate the partition with correct sent_ids and save

In [None]:
# Stamp every matched LIA sentence with the UD sent_id it was matched to, so
# the recreated partition carries the same identifiers as UD.
lia_partition = []
for sent_id, sentence in mapping["match"].items():
    sentence.metadata["sent_id"] = sent_id
    lia_partition.append(sentence)

# Save to disk
# The encoding is explicit so the output does not depend on the platform's
# default encoding.
with open(f"../data/lia_{partition}.conllu", "w", encoding="utf-8") as f:
    # NOTE(review): serialize() may already end each sentence with a blank
    # line; confirm whether the extra "\n" added here is wanted.
    f.writelines(sentence.serialize() + "\n" for sentence in lia_partition)

## Handle mismatches

In [None]:
# UD sentences for which no LIA sentence with the same text was found.
no_match = mapping["no_match"]
print(f"No LIA sentence was found for {len(no_match)} UD {partition} sent_ids.")

fname = f"no_match_{partition}.txt"
with open(fname, "w", encoding="utf-8") as f:
    # One sent_id per line. The payload is a single string, so write() is the
    # right call; writelines() would iterate it character by character.
    f.write("\n".join(no_match) + "\n")

print(f'They have been saved to "{fname}" for later processing')