In [1]:
import pandas as pd
import re
import subprocess
from pathlib import Path

from udapi.block.ud.setspaceafterfromtext import SetSpaceAfterFromText
from udapi.block.ud.fixpunct import FixPunct
from udapi.core.document import Document

import utils.extract_errorlines as err
import utils.convert_morph as conv
import utils.parse_conllu as parser

In [2]:
# CONFIGURE PARAMETERS HERE

# Language and partition
partition = "dev"  # alternatives: "test", "train"
lang = "nb"  # alternative: "nn"
# Every value other than "nb" selects "nynorsk"
language = {"nb": "bokmaal"}.get(lang, "nynorsk")

# File paths
ndt_file = f"data/ndt_aligned_with_ud/ndt_{lang}_{partition}.conllu"
ud_output_file = f"data/converted/no_{language}-ud-{partition}.conllu"
ud_official_file = f"data/UD_official/no_{language}-ud-{partition}.conllu"
report_file = f"validation-report_ndt2ud_{lang}_{partition}.txt"

# Directory for intermediate files; created when it does not exist yet
tmpdir = Path("tmp")
tmpdir.mkdir(exist_ok=True)

## 1. Morfologiske trekk
NDT har sine egne ordklasser (POS-tags) og morfologiske trekk (feats), som må konverteres til UDs tags og feats før vi kan bruke `Grew`. 

* Modulen `convert_morph.py` konverterer NDTs morfologiske annotasjoner til UD-annotasjoner.
* `udapi` har funksjoner for å annotere `SpaceAfter=No` i `MISC`-feltet (`SetSpaceAfterFromText`), som sikrer rett detokenisering når `# text`-kommentaren ikke er tilgjengelig, og for å fikse tegnsetting (`FixPunct`).


In [3]:
# Convert the morphological features of the retokenised NDT data for the
# configured language and partition.
# To convert every split instead, loop over partition in ["train", "test", "dev"]
# and lang in ["nb", "nn"] with
# input_file = Path(f"data/ndt_aligned_with_ud/ndt_{lang}_{partition}.conllu").
input_file = ndt_file
tmpfile1 = str(tmpdir / "01_convert_morph_output.conllu")

# Load the data, then convert NDT's UPOS and FEATS to the UD standard
data = parser.parse_conll_file(Path(input_file))
morphdata = conv.convert_morphology(data)

# Write the converted CoNLL data to disk
parser.write_conll(morphdata, tmpfile1, add_comments=True)

In [4]:
# Add SpaceAfter=No to the MISC field of the morphology-converted file
tmpfile2 = str(tmpdir / "02_udapy_spaceafter.conllu")

doc = Document(tmpfile1)
processor = SetSpaceAfterFromText()
processor.process_document(doc)

# Store the annotated document as the next intermediate file
doc.store_conllu(tmpfile2)

## 2. Dependensrelasjoner 

NDT har andre retningslinjer enn UD, slik at vi må oversette og flytte eller reversere noen dependensrelasjoner.

Vi bruker [`Grew`](https://grew.fr/) for å konvertere fra NDT til UD. 


In [5]:
# Output path for the Grew transformation of the dependency relations
tmpfile3 = str(tmpdir / "03_grew_transform_deprels.conllu")

# Run Grew on tmpfile2 with the rules in rules/NDT_to_UD.grs.
# IPython substitutes $tmpfile2, $tmpfile3 and $lang with the Python variables of
# the same name, so the strategy passed to -strat is "main_" followed by lang.
!grew transform \
    -i  $tmpfile2 \
    -o  $tmpfile3 \
    -grs  "rules/NDT_to_UD.grs" \
    -strat "main_"$lang \
    -safe_commands


100.00%                                                                                             


## 3. Fiks feil som introduseres underveis

In [33]:
# Repair punctuation attachment in the Grew output with udapi's FixPunct block
tmpfile4 = str(tmpdir / "04_udapy_fixpunct.conllu")

doc = Document(tmpfile3)
processor = FixPunct()
processor.process_document(doc)

# Store the result as the next intermediate file
doc.store_conllu(tmpfile4)

In [36]:
# Output path for the Grew post-processing pass (kept as a Path object, not a str)
tmpfile5 = tmpdir / "05_grew_transform_postfix.conllu"

# Apply the "postprocess" strategy from rules/NDT_to_UD.grs to tmpfile4;
# IPython substitutes $tmpfile4 and $tmpfile5 with the Python variables.
!grew transform \
    -i $tmpfile4 \
    -o $tmpfile5 \
    -grs rules/NDT_to_UD.grs \
    -strat "postprocess" \
    -safe_commands



In [51]:
# Remove the comment line with column names and replace invalid newpar lines.
# CoNLL-U files are UTF-8, so read with an explicit encoding instead of relying
# on the platform's locale default. Path() also accepts tmpfile5 as a plain str.
text = Path(tmpfile5).read_text(encoding="utf-8")

text = re.sub(r"\#  = \# newpar", "# newpar", text)
text = re.sub(r"\# global.columns = ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC", "", text)

# Make sure the output directory exists before udapi writes the final file
Path(ud_output_file).parent.mkdir(parents=True, exist_ok=True)

doc = Document()
doc.from_conllu_string(text)
doc.store_conllu(ud_output_file)

## 4. Validering

In [12]:
# UD's own `tools` repository ships a Python script that has to be run
# "from the command line"
command = [
    "python",
    "tools/validate.py",
    "--max-err",
    "0",
    "--lang",
    "no",
    ud_output_file,
]

# Run the command and capture the output
result = subprocess.run(command, capture_output=True, text=True)

# The report is what the validator wrote to stderr. Write it explicitly as UTF-8
# so the file does not depend on the platform's locale encoding.
# NOTE(review): assumes err.report_errors reads the report as UTF-8 — confirm.
Path(report_file).write_text(result.stderr, encoding="utf-8")


# Summary of the validation report
err.report_errors(report_file)

## 5. Visualisering

In [None]:
# Remove comments from the output file and the official UD file so MaltEval can read them
tmpfile7 = tmpdir / "07_remove_comments.conllu"

parser.write_conll(
    parser.parse_conll_file(Path(ud_output_file)),
    Path(tmpfile7),
    add_comments=False
)


maltgold = tmpdir / "malt_ud_official.conllu"

parser.write_conll(
    parser.parse_conll_file(Path(ud_official_file)),
    Path(maltgold),
    add_comments=False
)

# Kjør MaltEval for å sammenligne UD og den konverterte filen visuelt
!java -jar dist-20141005/lib/MaltEval.jar -g $maltgold -s $tmpfile7 -v 1


----

# KODEDUMP

In [None]:
# Input for the commands below: the file that ndt_file currently points to.
# IPython substitutes $UDFILE in the shell lines with this variable.
UDFILE=ndt_file

# Fix punctuation
# Pipe the file through udapy's ud.FixPunct block and redirect the result to out.conllu
!cat $UDFILE | udapy -s ud.FixPunct > out.conllu

# Mark the nodes whose lemma is "som" and redirect udapy's output to som.txt
!cat $UDFILE | udapy -TM \
    util.Mark node='node.lemma == "som"' > som.txt

# Read the input file into zone "old" and out.conllu into zone "new", run
# util.MarkDiff over the listed attributes and redirect the output to diff.html
!udapy -HMAC \
    read.Conllu zone=old files=$UDFILE \
    read.Conllu zone=new files=out.conllu \
    util.MarkDiff gold_zone=old attributes='form,lemma,upos,xpos,deprel,feats,misc' add=True > diff.html

---

## Partisjoner data

In [None]:
# Partition the data into the dev, train and test splits that UD is divided into.
# Maps each split name to the text file with that split's sentence ids.
partitions = {
    part: f"data/{part}_ids.txt"
    for part in ("test", "dev", "train")
}

def partition_file(filepath):
    """Split one CoNLL-U file into one file per entry in `partitions`.

    For every (part, idfile) pair in the module-level `partitions` dict, the
    sentences whose `sent_id` is listed in `idfile` are written next to the
    input file as `<stem>_<part><suffix>`.

    Args:
        filepath: Path (str or Path) of the CoNLL-U file to split.
    """
    fpath = Path(filepath)
    # Load the file being partitioned. Previously the function filtered a global
    # `df` that was never loaded from `filepath`.
    df = parser.load_conll_to_df(parser.parse_conll_file(fpath))
    for part, idfile in partitions.items():
        outputfile = fpath.parent / f"{fpath.stem}_{part}{fpath.suffix}"
        # NOTE(review): assumes filereadlines returns the ids in the same form as
        # the sent_id column (no trailing newlines) — confirm against the helper.
        part_ids = parser.filereadlines(idfile)
        part_df = df[df.sent_id.isin(part_ids)]
        parser.write_df_to_conll(part_df, outputfile, add_comments=True)

# Split each gold-corpus file into the partitions
for gold_file in (
    "data/gullkorpus/2023_gullkorpus_ud.conllu",
    "data/gullkorpus/2019_gullkorpus_ud_før_annotasjon.conllu",
    "data/gullkorpus/2019_gullkorpus_ndt.conllu",
):
    partition_file(gold_file)

## Hent eksempelsetninger

In [5]:
from utils.parse_conllu import extract_partition


def fetch_conllu_sents(conllu_file, outputfile, sent_ids):
    """Copy the sentences with the given ids from a CoNLL-U file to a new file.

    Args:
        conllu_file: Path (str or Path) of the CoNLL-U file to read.
        outputfile: Path (str or Path) of the file to write; its parent
            directory is created when it does not exist.
        sent_ids: Collection of sentence ids to keep, matched against the
            `sent_id` column with `isin`.

    Returns:
        The rows of the loaded frame whose `sent_id` is in `sent_ids`.
    """
    # Normalise to Path objects, as the other call sites of the parser helpers do
    source_path = Path(conllu_file)
    target_path = Path(outputfile)

    data = parser.load_conll_to_df(parser.parse_conll_file(source_path))
    sents = data[data.sent_id.isin(sent_ids)]

    # Create the target directory (e.g. data/sentences) before writing
    target_path.parent.mkdir(parents=True, exist_ok=True)
    parser.write_df_to_conll(sents, target_path, add_comments=True)
    return sents

In [None]:
# Extract the sentences that trigger a specific validation error

fpath = Path("validation-report_ndt2ud.txt")
etype = "rel-upos-punct"
save_errorlines = False

rows = fpath.read_text(encoding="utf-8").splitlines()

# One report line looks like "[Line N Sent S Node K]: [L<level> <class> <type>] <message>".
# - The error type stops at the first "]" so it cannot run into the message.
# - The message is matched lazily so that a trailing "[n, m]" node list and the
#   text after it land in their own groups. With greedy quantifiers those two
#   groups could never capture anything. Both are None when no node list is present.
error_info_regx = re.compile(
    r"^\[Line (\d+)(?: Sent )?(\d+)?(?: Node )?(\d+)?\]\: "
    r"\[(L[^\]]*)\] "
    r"(.*?)(?:(\[[0-9]*, [0-9]*\])(.*))?$",
    flags=re.DOTALL,
)
errors = []
for row in rows:
    m = error_info_regx.fullmatch(row)
    if m is not None:
        errors.append(m.groups())

df = pd.DataFrame(errors, columns=["line", "sent","node", "errortype", "message", "relevant_nodes", "message2"])

type_counts = df.errortype.value_counts()
print(type_counts)

if save_errorlines:
    errorlines = df[df.errortype.str.contains(etype)]
    errorlines.to_csv(f"error_{etype}.csv", index=False)

    # Use a dedicated name: rebinding `ndt_file` here would silently change the
    # input of the conversion pipeline if its cells are re-run afterwards.
    error_source_file = "data/ndt_nb_train.conllu"
    outfile = f"error_{etype}_sents.conllu"
    error_sents = fetch_conllu_sents(error_source_file, outfile, sent_ids=errorlines["sent"])
    # A bare expression inside an if-block is not displayed, so print the count
    print(error_sents.sent_id.nunique())


In [7]:
import json

In [None]:
filename = "pattern_matches.json"

# JSON text is UTF-8; open it with an explicit encoding so that non-ASCII
# characters are decoded the same way on every platform.
with open(filename, encoding="utf-8") as fp:
    matches = json.load(fp)

len(matches)

In [None]:
# Unique sentence ids among the pattern matches
sents = set(match["sent_id"] for match in matches)
len(sents)

In [None]:
# Display the collected sentence ids for inspection
sents

In [None]:
# Write the matched sentences from the nb dev file to their own CoNLL-U file
sentences = fetch_conllu_sents(
    conllu_file="data/ndt_nb_dev.conllu",
    outputfile="data/sentences/matched_sentences.conllu",
    sent_ids=sents,
)