In [1]:
import utils.convert_morph as conv 
import utils.parse_conllu as parser
from pathlib import Path

import pandas as pd
import re 


## Konverter morfologiske trekk

In [2]:
# Select the data partition: language variety and split
lang, part = "nb", "train"
ndt_file = f"data/ndt_{lang}_{part}.conllu"


In [3]:
# Parse the selected CoNLL-U file
ndt_path = Path(ndt_file)
data = parser.parse_conll_file(ndt_path)

In [4]:
# Inspect the result visually before it is written to disk with parser.write_df_to_conll()

# Convert UPOS and FEATS to the UD standard
morphdata = conv.convert_morphology(data)
# Load the converted data into a pandas DataFrame and display it
df = parser.load_conll_to_df(morphdata)
df


Unnamed: 0,ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL,DEPS,MISC,sent_id,text,idx,newpar
0,1,Lam,lam,NOUN,subst,Definite=Ind|Gender=Neut|Number=Sing,0,FRAG,_,_,000001,Lam og piggvar på bryllupsmenyen,0,False
1,2,og,og,CCONJ,konj,_,3,KONJ,_,_,000001,Lam og piggvar på bryllupsmenyen,0,False
2,3,piggvar,piggvar,NOUN,subst,Definite=Ind|Gender=Masc|Number=Sing,1,KOORD,_,_,000001,Lam og piggvar på bryllupsmenyen,0,False
3,4,på,på,ADP,prep,_,1,SPRED,_,_,000001,Lam og piggvar på bryllupsmenyen,0,False
4,5,bryllupsmenyen,bryllupsmeny,NOUN,subst,Definite=Def|Gender=Masc|Number=Sing,4,PUTFYLL,_,_,000001,Lam og piggvar på bryllupsmenyen,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243881,2,bodde,bo,VERB,verb,Mood=Ind|Tense=Past|VerbForm=Fin,0,FINV,_,_,015696,Vi bodde på Hotel Føroyar.,15695,False
243882,3,på,på,ADP,prep,_,2,ADV,_,_,015696,Vi bodde på Hotel Føroyar.,15695,False
243883,4,Hotel,Hotel,PROPN,subst,_,3,PUTFYLL,_,_,015696,Vi bodde på Hotel Føroyar.,15695,False
243884,5,Føroyar,Føroyar,PROPN,subst,_,4,FLAT,_,_,015696,Vi bodde på Hotel Føroyar.,15695,False


In [None]:
# Konverter morfologiske trekk for alle splittene av retokenisert ndt data

for part in ["train", "test", "dev"]: 
    for lang in ["nb", "nn"]: 
        fpath = Path(f"data/ndt_{lang}_{part}.conllu")
        data = parser.parse_conll_file(fpath)  # Last inn data 
        morphdata = conv.convert_morphology(data)  # Konverter UPOS og FEATS til UD-standard
        output = fpath.parent / f"{fpath.stem}_udmorph{fpath.suffix}"
        parser.write_conll(morphdata, output, add_comments=True)  # Skriv CONLL-data til disk 
        # Legg til SpaceAfter=No i MISC-feltet
        !udapy -s ud.SetSpaceAfterFromText < $output > out.conllu && mv out.conllu $output  

In [None]:
UDFILE=ndt_file

# Fix punctuation and add spaceafter 
!cat $UDFILE | udapy -s ud.FixPunct > out.conllu

!cat $UDFILE | udapy -TM \
    util.Mark node='node.lemma == "som"' > som.txt

!udapy -HMAC \
    read.Conllu zone=old files=$UDFILE \
    read.Conllu zone=new files=out.conllu \
    util.MarkDiff gold_zone=old attributes='form,lemma,upos,xpos,deprel,feats,misc' add=True > diff.html

---

## Partisjoner data

In [None]:
# Split the data into the dev, train and test partitions that UD is divided into

# Partition name -> text file holding the sentence ids of that partition
partitions = dict(
    test="data/test_ids.txt",
    dev="data/dev_ids.txt",
    train="data/train_ids.txt",
)

def partition_file(filepath):
    """Split one CoNLL-U file into one file per entry in ``partitions``.

    The file at ``filepath`` is parsed and loaded into a DataFrame. For each
    partition name, the rows whose ``sent_id`` is among the ids read from that
    partition's id file are written to ``<stem>_<partition><suffix>`` in the
    same directory as the input file.

    Args:
        filepath: Path (str or Path) of the CoNLL-U file to split.
    """
    fpath = Path(filepath)
    # Load the file that was actually asked for. Previously the notebook-level
    # `df` was filtered instead, so every call wrote rows from whatever `df`
    # happened to hold at the time, regardless of `filepath`.
    file_df = parser.load_conll_to_df(parser.parse_conll_file(fpath))
    for part, idfile in partitions.items():
        outputfile = fpath.parent / f"{fpath.stem}_{part}{fpath.suffix}"
        # NOTE(review): assumes filereadlines returns the ids without trailing
        # newlines and in the same form as the sent_id column -- confirm.
        part_ids = parser.filereadlines(idfile)
        part_df = file_df[file_df.sent_id.isin(part_ids)]
        parser.write_df_to_conll(part_df, outputfile, add_comments=True)

# Partition each of the gold corpus files
for corpus_file in (
    "data/gullkorpus/2023_gullkorpus_ud.conllu",
    "data/gullkorpus/2019_gullkorpus_ud_før_annotasjon.conllu",
    "data/gullkorpus/2019_gullkorpus_ndt.conllu",
):
    partition_file(corpus_file)

## Hent eksempelsetninger

In [5]:
from utils.parse_conllu import extract_partition


def fetch_conllu_sents(conllu_file, outputfile, sent_ids):
    """Extract selected sentences from a CoNLL-U file and save them.

    Args:
        conllu_file: CoNLL-U file to read.
        outputfile: Destination file for the selected sentences.
        sent_ids: Values matched against the ``sent_id`` column.

    Returns:
        The DataFrame rows whose ``sent_id`` is in ``sent_ids``.
    """
    parsed = parser.parse_conll_file(conllu_file)
    table = parser.load_conll_to_df(parsed)
    keep = table.sent_id.isin(sent_ids)
    selected = table[keep]
    parser.write_df_to_conll(selected, outputfile, add_comments=True)
    return selected

In [None]:
# Extract the sentences that trigger a specific validation error message

fpath = Path("validation-report_ndt2ud.txt")  # validation report to parse
etype = "rel-upos-punct"  # error type to extract
save_errorlines = False  # set to True to write the selected errors and their sentences to disk

# fpath is already a Path, so it can be read directly
report_text = fpath.read_text(encoding="utf-8")
rows = report_text.splitlines()

# Pattern for one report line of the form
#   "[Line N Sent S Node K]: [L...] message"
# where the " Sent S" and " Node K" parts are optional. It captures seven
# groups: line, sent, node, errortype, message, relevant_nodes, message2.
#
# Two fixes compared with a fully greedy pattern:
# * errortype is `L[^\]]*`, so it stops at the first closing bracket instead of
#   running on to the last "] " in the line when the message contains one.
# * message is lazy and is followed by "node list or end of line", so a
#   "[i, j]" list inside the message really ends up in relevant_nodes with the
#   remaining text in message2. Without a list, message holds the whole text,
#   relevant_nodes is None and message2 is "".
error_info_regx = re.compile(
    r"^\[Line (\d+)(?: Sent (\d+))?(?: Node (\d+))?\]: "  # line, sent, node
    r"\[(L[^\]]*)\] "  # errortype
    r"(.*?)"  # message
    r"(?:(\[[0-9]*, [0-9]*\])|$)"  # relevant_nodes (or end of line)
    r"(.*)$",  # message2
    flags=re.DOTALL,
)
# Keep the captured fields of every report line that matches the pattern
errors = [
    hit.groups()
    for hit in map(error_info_regx.fullmatch, rows)
    if hit is not None
]

error_columns = ["line", "sent", "node", "errortype", "message", "relevant_nodes", "message2"]
df = pd.DataFrame(errors, columns=error_columns)

# Number of reported errors per error type
type_counts = df["errortype"].value_counts()
print(type_counts)

if save_errorlines:
    # Save the report lines of the selected error type ...
    is_selected = df["errortype"].str.contains(etype)
    errorlines = df[is_selected]
    errorlines.to_csv(f"error_{etype}.csv", index=False)

    # ... and the sentences they refer to
    ndt_file = "data/ndt_nb_train.conllu"
    outfile = f"error_{etype}_sents.conllu"
    error_sents = fetch_conllu_sents(ndt_file, outfile, sent_ids=errorlines["sent"])
    error_sents.sent_id.nunique()


In [7]:
import json

In [None]:
filename = "pattern_matches.json"

# Read the file as UTF-8 explicitly: without `encoding`, open() falls back to
# the locale's default encoding, which can mis-decode non-ASCII text on some
# platforms. This also matches how the validation report is read elsewhere
# in this notebook.
with open(filename, encoding="utf-8") as fp:
    matches = json.load(fp)

# Number of loaded entries
len(matches)

In [None]:
# Unique sentence ids among the loaded matches
sents = set(entry["sent_id"] for entry in matches)
len(sents)

In [None]:
# Display the collected sentence ids
sents

In [None]:
# Extract the matched sentences from the nb dev file and save them
sentences = fetch_conllu_sents(
    conllu_file="data/ndt_nb_dev.conllu",
    outputfile="data/sentences/matched_sentences.conllu",
    sent_ids=sents,
)