In [1]:
import convert_morph as conv 
import parse_conllu as parser
from pathlib import Path

import pandas as pd
import re 


## Konverter morfologiske trekk

In [2]:
# Select the data partition to work on
lang, part = "nb", "train"
ndt_file = f"data/ndt_{lang}_{part}.conllu"


In [None]:
# Load the data from the configured file (path built from `ndt_file`)
data = parser.parse_conll_file(Path(ndt_file))

In [None]:
# Inspect the result visually before it is written to disk with parser.write_df_to_conll()

# Convert UPOS and FEATS to the UD standard
morphdata = conv.convert_morphology(data)
# Load the converted data into pandas; the bare name on the last line renders it
df = parser.load_conll_to_df(morphdata)
df


In [None]:
# Konverter morfologiske trekk for alle splittene av retokenisert ndt data

for part in ["train", "test", "dev"]: 
    for lang in ["nb", "nn"]: 
        fpath = Path(f"data/ndt_{lang}_{part}.conllu")
        data = parser.parse_conll_file(fpath)  # Last inn data 
        morphdata = conv.convert_morphology(data)  # Konverter UPOS og FEATS til UD-standard
        output = fpath.parent / f"{fpath.stem}_udmorph{fpath.suffix}"
        parser.write_conll(morphdata, output, add_comments=True)  # Skriv CONLL-data til disk 
        # Legg til SpaceAfter=No i MISC-feltet
        !udapy -s ud.SetSpaceAfterFromText < $output > out.conllu && mv out.conllu $output  

In [None]:
# Input file for the udapy commands below: the file configured in `ndt_file`
UDFILE=ndt_file

# Run the udapy block ud.FixPunct on the input and save the result as out.conllu.
# NOTE(review): the earlier comment also mentioned adding SpaceAfter, but no
# SpaceAfter block is part of this command — confirm whether one is missing.
!cat $UDFILE | udapy -s ud.FixPunct > out.conllu

# Mark every node whose lemma is "som" (util.Mark) and redirect udapy's output to som.txt
!cat $UDFILE | udapy -TM \
    util.Mark node='node.lemma == "som"' > som.txt

# Read the input file into zone "old" and out.conllu into zone "new", run
# util.MarkDiff with "old" as gold zone over the listed attributes, and
# redirect the output to diff.html
!udapy -HMAC \
    read.Conllu zone=old files=$UDFILE \
    read.Conllu zone=new files=out.conllu \
    util.MarkDiff gold_zone=old attributes='form,lemma,upos,xpos,deprel,feats,misc' add=True > diff.html

---

## Partisjoner data

In [None]:
# Partition the data into the dev, train and test splits that UD is divided into

# Split name -> file listing the sentence ids that belong to that split
partitions = dict(
    test="data/test_ids.txt",
    dev="data/dev_ids.txt",
    train="data/train_ids.txt",
)

def partition_file(filepath):
    """Split one CoNLL-U file into the partitions listed in ``partitions``.

    For each ``part -> idfile`` entry of the module-level ``partitions``
    mapping, the sentences of ``filepath`` whose ``sent_id`` occurs in the id
    file are written to ``<stem>_<part><suffix>`` next to the input file.

    Args:
        filepath: Location (str or Path) of the CoNLL-U file to split.
    """
    fpath = Path(filepath)
    # Load the file that was actually passed in. Previously the function
    # filtered whatever global `df` happened to exist in the kernel, so every
    # call wrote the same sentences regardless of `filepath`.
    file_df = parser.load_conll_to_df(parser.parse_conll_file(fpath))
    for part, idfile in partitions.items():
        outputfile = fpath.parent / f"{fpath.stem}_{part}{fpath.suffix}"
        # NOTE(review): assumes parser.filereadlines returns the ids without
        # trailing newlines, otherwise isin() matches nothing — confirm.
        part_ids = parser.filereadlines(idfile)
        part_df = file_df[file_df.sent_id.isin(part_ids)]
        parser.write_df_to_conll(part_df, outputfile, add_comments=True)

# Split each of the gold corpus files into its dev/train/test parts
for gold_file in (
    "data/gullkorpus/2023_gullkorpus_ud.conllu",
    "data/gullkorpus/2019_gullkorpus_ud_før_annotasjon.conllu",
    "data/gullkorpus/2019_gullkorpus_ndt.conllu",
):
    partition_file(gold_file)

## Hent eksempelsetninger

In [None]:
# Extract the sentences that trigger specific validation error messages

etype = "rel-upos-punct"
save_errorlines = False
fpath = Path("validation-report_ndt2ud.txt")

# fpath is already a Path, so the report can be read from it directly
rows = fpath.read_text(encoding="utf-8").splitlines()

# Pattern for one line of the validation report. Captured groups, in order:
# line, sent, node, errortype, message, relevant_nodes, message2.
# relevant_nodes and message2 are None when the message has no "[n, m]" part.
error_info_regx = re.compile(
    # Location prefix; the "Sent" and "Node" parts are optional
    r"^\[Line (\d+)(?: Sent )?(\d+)?(?: Node )?(\d+)?\]: "
    # Error type: stop at the first "]" so a later "] " inside the message
    # cannot be absorbed into the type
    r"\[(L[^\]]*)\] "
    # Message: lazy, so that a following "[n, m]" node list is captured by its
    # own group instead of being swallowed by the message
    r"(.*?)"
    r"(?:(\[[0-9]*, [0-9]*\])(.*))?$",
    flags=re.DOTALL,
)
# Keep the captured groups of every report line that matches the pattern
errors = [
    hit.groups()
    for hit in map(error_info_regx.fullmatch, rows)
    if hit is not None
]

df = pd.DataFrame(
    errors,
    columns=["line", "sent", "node", "errortype", "message", "relevant_nodes", "message2"],
)

# Number of reported errors per error type
type_counts = df["errortype"].value_counts()
print(type_counts)

if save_errorlines:
    # Report rows whose error type contains `etype`. Literal substring match,
    # so characters such as "." or "+" in `etype` are not read as regex syntax.
    errorlines = df[df.errortype.str.contains(etype, regex=False)]
    errorlines.to_csv(f"error_{etype}.csv", index=False)

    # Load the NDT training file under dedicated names instead of rebinding
    # the notebook-wide `ndt_file` and `data` variables; the path is passed as
    # a Path like in the other parse_conll_file calls.
    error_source = Path("data/ndt_nb_train.conllu")
    treebank_df = parser.load_conll_to_df(
        parser.parse_conll_file(error_source)
    )
    # Sentences whose id occurs in the "sent" column of the selected report rows
    error_sents = treebank_df[treebank_df.sent_id.isin(errorlines["sent"])]
    parser.write_df_to_conll(
        error_sents, f"error_{etype}_sents.conllu", add_comments=True)
    # A bare expression inside an `if` body is never displayed, so print the
    # number of distinct sentences explicitly.
    print(error_sents.sent_id.nunique())


In [2]:
import json

In [3]:
filename = "match_pattern.json"

# Read the JSON file explicitly as UTF-8 instead of relying on the platform's
# default encoding, so sentence text with non-ASCII characters loads the same
# way on every machine.
with open(filename, encoding="utf-8") as fp:
    matches = json.load(fp)

# Number of loaded match entries
len(matches)

1171

In [4]:
# Distinct sentence ids among the matches (one sentence can match several times)
sents = set(entry["sent_id"] for entry in matches)
len(sents)

1118

In [39]:
# Display the full set of matched sentence ids
sents

{'000026',
 '000433',
 '000904',
 '000975',
 '001014',
 '001337',
 '001353',
 '001439',
 '001641',
 '001752',
 '001840',
 '002023',
 '002315',
 '002414',
 '002776',
 '002839',
 '002966',
 '003230',
 '003251',
 '003411',
 '003560',
 '003705',
 '003747',
 '003912',
 '003985',
 '004143',
 '004183',
 '004378',
 '004784',
 '005143',
 '005547',
 '005610',
 '005630',
 '005763',
 '005767',
 '005894',
 '005967',
 '006121',
 '006174',
 '006293',
 '006387',
 '006392',
 '006534',
 '006656',
 '007036',
 '007294',
 '007432',
 '007500',
 '007632',
 '008018',
 '008089',
 '008148',
 '008255',
 '008497',
 '008700',
 '008980',
 '009084',
 '009428',
 '009451',
 '009727',
 '009744',
 '009856',
 '010036',
 '010039',
 '010277',
 '010355',
 '010780',
 '010795',
 '011057',
 '011120',
 '011246',
 '012088',
 '012425',
 '012669',
 '012861',
 '012887',
 '013297',
 '013323',
 '013616',
 '013795',
 '013903',
 '013978',
 '014010',
 '014104',
 '014284',
 '014390',
 '014508',
 '014546',
 '014599',
 '014705',
 '014721',

In [1]:
from convert_morph import get_dependents, get_labels, parse_conll_file, convert_pos


from pathlib import Path

In [2]:
# Parse the Nynorsk training split; `ndt` holds the parsed data
ndt = parse_conll_file(Path("data/ndt_nn_train.conllu"))