In [1]:
import convert_morph as conv 
import parse_conllu as parser
from pathlib import Path

import pandas as pd
import re 


## Konverter morfologiske trekk

In [2]:
# Select the data partition: language variety and split
lang, part = "nb", "train"
# Path of the NDT CoNLL-U file for the selected partition
ndt_file = f"data/ndt_{lang}_{part}.conllu"


In [None]:
# Load the data: parse the CoNLL-U file selected by `ndt_file`
data = parser.parse_conll_file(Path(ndt_file))

In [None]:
# Inspect the result visually before it is written to disk with parser.write_df_to_conll()

# Convert UPOS and FEATS to the UD standard
morphdata = conv.convert_morphology(data)

# Load the converted data into pandas; the bare name on the last line
# makes the notebook render the frame for inspection
df = parser.load_conll_to_df(morphdata)
df


In [None]:
# Konverter morfologiske trekk for alle splittene av retokenisert ndt data

for part in ["train", "test", "dev"]: 
    for lang in ["nb", "nn"]: 
        fpath = Path(f"data/ndt_{lang}_{part}.conllu")
        data = parser.parse_conll_file(fpath)  # Last inn data 
        morphdata = conv.convert_morphology(data)  # Konverter UPOS og FEATS til UD-standard
        output = fpath.parent / f"{fpath.stem}_udmorph{fpath.suffix}"
        parser.write_conll(morphdata, output, add_comments=True)  # Skriv CONLL-data til disk 
        # Legg til SpaceAfter=No i MISC-feltet
        !udapy -s ud.SetSpaceAfterFromText < $output > out.conllu && mv out.conllu $output  

In [None]:
UDFILE=ndt_file

# Fix punctuation and add spaceafter 
!cat $UDFILE | udapy -s ud.FixPunct > out.conllu

!cat $UDFILE | udapy -TM \
    util.Mark node='node.lemma == "som"' > som.txt

!udapy -HMAC \
    read.Conllu zone=old files=$UDFILE \
    read.Conllu zone=new files=out.conllu \
    util.MarkDiff gold_zone=old attributes='form,lemma,upos,xpos,deprel,feats,misc' add=True > diff.html

---

## Partisjoner data

In [None]:
# Partition the data into the dev, train and test splits that UD is divided into

# Split name -> text file with the sentence ids that belong to that split
partitions = dict(
    test="data/test_ids.txt",
    dev="data/dev_ids.txt",
    train="data/train_ids.txt",
)

def partition_file(filepath):
    """Split one CoNLL-U file into the splits listed in ``partitions``.

    The file at ``filepath`` is loaded into a DataFrame. For every
    ``(part, idfile)`` entry in ``partitions``, the rows whose ``sent_id`` is
    among the ids read from ``idfile`` are written to
    ``<stem>_<part><suffix>`` in the same directory as the input file.

    Args:
        filepath: Location (str or Path) of the CoNLL-U file to split.
    """
    fpath = Path(filepath)
    # Load the file that was actually passed in. The function used to filter
    # the notebook-global `df`, so every call wrote partitions of whatever
    # frame happened to be in `df` instead of the contents of `filepath`.
    file_df = parser.load_conll_to_df(parser.parse_conll_file(fpath))
    for part, idfile in partitions.items():
        outputfile = fpath.parent / f"{fpath.stem}_{part}{fpath.suffix}"
        # NOTE(review): assumes filereadlines returns the ids in the same form
        # as the values of the sent_id column (no trailing newlines) - confirm
        part_ids = parser.filereadlines(idfile)
        part_df = file_df[file_df.sent_id.isin(part_ids)]
        parser.write_df_to_conll(part_df, outputfile, add_comments=True)

# Split each gold-corpus file into its test/dev/train parts
for gold_file in (
    "data/gullkorpus/2023_gullkorpus_ud.conllu",
    "data/gullkorpus/2019_gullkorpus_ud_før_annotasjon.conllu",
    "data/gullkorpus/2019_gullkorpus_ndt.conllu",
):
    partition_file(gold_file)

## Hent eksempelsetninger

In [None]:
# Extract the sentences that trigger a specific validation error message

fpath = Path("validation-report_ndt2ud.txt")
etype = "rel-upos-punct"
save_errorlines = False

rows = fpath.read_text(encoding="utf-8").splitlines()

# Pattern for one report line, in the shape implied by the groups below:
#   [Line <n> Sent <id> Node <n>]: [L<error type>] <message>[<a>, <b>]<message2>
# Changes from the earlier pattern:
#   * the error type stops at its own closing bracket (`[^\]]*` instead of a
#     greedy `.*`), so it cannot swallow part of the message;
#   * the message is matched lazily and the node list + trailing text form one
#     optional group. The greedy message used to consume the whole rest of the
#     line, which left `relevant_nodes` empty for every row.
# `relevant_nodes` and `message2` are both None when a line has no node list.
error_info_regx = re.compile(
    r"^\[Line (\d+)(?: Sent )?(\d+)?(?: Node )?(\d+)?\]\: "
    r"\[(L[^\]]*)\] "
    r"(.*?)(?:(\[[0-9]*, [0-9]*\])(.*))?$",
    flags=re.DOTALL,
)
# Lines that do not look like an error message are skipped
errors = [
    m.groups()
    for m in map(error_info_regx.fullmatch, rows)
    if m is not None
]

df = pd.DataFrame(errors, columns=["line", "sent","node", "errortype", "message", "relevant_nodes", "message2"])

# Number of reported errors per error type
type_counts = df.errortype.value_counts()
print(type_counts)

if save_errorlines:
    # Save the report rows whose error type contains `etype`
    errorlines = df[df.errortype.str.contains(etype)]
    errorlines.to_csv(f"error_{etype}.csv", index=False)

    # Fetch the sentences those rows refer to from the treebank file.
    # NOTE: this rebinds the notebook-level `ndt_file` and `data`.
    ndt_file = "data/ndt_nb_train.conllu"
    data = parser.load_conll_to_df(
        parser.parse_conll_file(Path(ndt_file))  # Path, as in the other parse_conll_file calls
    )
    error_sents = data[data.sent_id.isin(errorlines["sent"])]
    parser.write_df_to_conll(
        error_sents, f"error_{etype}_sents.conllu", add_comments=True)
    # A bare expression inside an `if` block is not displayed by the notebook,
    # so print the number of distinct sentences explicitly
    print(error_sents.sent_id.nunique())


In [1]:
import json

In [11]:
filename = "match_dev.json"

# Read the JSON file as UTF-8 explicitly instead of relying on the platform's
# default encoding, matching how the validation report is read elsewhere in
# this notebook
with open(filename, encoding="utf-8") as fp:
    matches = json.load(fp)

# Number of loaded matches
len(matches)

99

In [12]:
# Distinct sentence ids among the loaded matches, and how many there are
sents = set(match["sent_id"] for match in matches)
len(sents)

99

In [13]:
# Display the set of matched sentence ids
sents

{'015728',
 '015731',
 '015732',
 '015752',
 '015836',
 '015840',
 '015842',
 '015847',
 '015848',
 '015849',
 '015859',
 '015924',
 '015935',
 '015974',
 '015990',
 '015994',
 '016046',
 '016047',
 '016048',
 '016084',
 '016099',
 '016118',
 '016128',
 '016136',
 '016160',
 '016167',
 '016174',
 '016257',
 '016298',
 '016301',
 '016302',
 '016313',
 '016314',
 '016331',
 '016443',
 '016585',
 '016717',
 '016855',
 '016888',
 '016889',
 '016925',
 '016933',
 '016952',
 '016953',
 '016993',
 '016999',
 '017020',
 '017024',
 '017027',
 '017046',
 '017059',
 '017063',
 '017065',
 '017068',
 '017086',
 '017115',
 '017136',
 '017144',
 '017146',
 '017170',
 '017178',
 '017265',
 '017270',
 '017387',
 '017417',
 '017419',
 '017427',
 '017438',
 '017442',
 '017443',
 '017455',
 '017460',
 '017513',
 '017558',
 '017577',
 '017582',
 '017586',
 '017614',
 '017616',
 '017653',
 '017658',
 '017659',
 '017661',
 '017687',
 '017750',
 '017754',
 '017764',
 '017780',
 '017783',
 '017794',
 '017800',