In [2]:
import convert_morph as conv 
import partition_data as partd
from pathlib import Path
import pandas as pd
from collections import defaultdict


In [3]:
# Input files for the comparison: one UD-annotated and one NDT-annotated
# CoNLL-U file (per the file names, the same 2019 gold corpus).
# Disabled alternative UD input (the 2023 version of the gold corpus):
#ud_file = Path("data/gullkorpus/2023_gullkorpus_ud.conllu")
ud_file = Path("data/gullkorpus/2019_gullkorpus_ud_før_annotasjon.conllu")
ndt_file = Path("data/gullkorpus/2019_gullkorpus_ndt.conllu")


In [4]:
# Load the data.
# The parsed results are used below as mappings with a "sentences" entry.

ndt = partd.parse_conll_file(ndt_file)
ud = partd.parse_conll_file(ud_file)


In [10]:
def load_conll_to_df(sentences):
    """Flatten parsed CoNLL data into a single token-level DataFrame.

    Args:
        sentences: Either a mapping whose ``"sentences"`` entry holds the
            sentence records, or the list of sentence records itself. Each
            record must offer ``.get("tokens")`` (a list of per-token dicts)
            and may offer ``"sent_id"`` and ``"text"``.

    Returns:
        A DataFrame with one row per token. On top of the token fields it has
        the columns ``sent_id`` and ``text`` (copied from the sentence record)
        and ``idx`` (the sentence's position in the input). An empty DataFrame
        is returned when there are no sentences.
    """
    # Accept both the whole parsed mapping and its bare sentence list, so the
    # function cannot fail with AttributeError depending on the call site.
    if hasattr(sentences, "get"):
        sentence_list = sentences.get("sentences")
    else:
        sentence_list = sentences

    frames = []
    for idx, sentence in enumerate(sentence_list or []):
        sent_df = pd.DataFrame(sentence.get("tokens"))
        sent_df["sent_id"] = sentence.get("sent_id")
        sent_df["text"] = sentence.get("text")
        sent_df["idx"] = idx
        frames.append(sent_df)

    # pd.concat raises ValueError on an empty list; return an empty frame instead.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [10]:
# Build one token-level frame per treebank. load_conll_to_df looks up the
# "sentences" entry itself, so it must be given the whole parsed mapping;
# passing the bare sentence list would fail on its .get("sentences") call.
sentences = ndt.get("sentences")  # kept for interactive inspection
df = load_conll_to_df(ndt)
ud_df = load_conll_to_df(ud)

In [11]:
# Display the UD token frame (the rendered output below is truncated).
ud_df

Unnamed: 0,ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL,DEPS,MISC,sent_id,text,idx
0,1,Ville,ville,AUX,_,Mood=Ind|Tense=Past|VerbForm=Fin,5,aux,_,_,005326,Ville historien min vært annerledes da?,0
1,2,historien,historie,NOUN,_,Definite=Def|Gender=Masc|Number=Sing,5,nsubj,_,_,005326,Ville historien min vært annerledes da?,0
2,3,min,min,PRON,_,Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs,2,nmod,_,_,005326,Ville historien min vært annerledes da?,0
3,4,vært,være,AUX,_,VerbForm=Part,5,cop,_,_,005326,Ville historien min vært annerledes da?,0
4,5,annerledes,annerledes,ADJ,_,Definite=Ind|Degree=Pos|Number=Sing,0,root,_,_,005326,Ville historien min vært annerledes da?,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3102,20,og,og,CCONJ,_,_,21,cc,_,_,010386,For i en bransje som stadig er jakt etter morg...,201
3103,21,skynder,skynde,VERB,_,Mood=Ind|Tense=Pres|VerbForm=Fin,13,conj,_,_,010386,For i en bransje som stadig er jakt etter morg...,201
3104,22,seg,seg,PRON,_,Case=Acc|PronType=Prs|Reflex=Yes,21,obj,_,_,010386,For i en bransje som stadig er jakt etter morg...,201
3105,23,langsomt,langsom,ADJ,_,Definite=Ind|Degree=Pos|Gender=Neut|Number=Sing,21,advmod,_,SpaceAfter=No,010386,For i en bransje som stadig er jakt etter morg...,201


In [7]:
# Put the UD annotation next to the NDT columns for side-by-side comparison.
# The assignment aligns on the row index, so this presumes both frames list
# the same tokens in the same order -- TODO confirm for the two input files.
for ud_column in ("UPOS", "FEATS", "DEPREL"):
    df["UD_" + ud_column] = ud_df[ud_column]


In [8]:

# UD feature strings collected for later use; nothing else in the visible
# notebook reads this list.
add_to_feats = [
    "Foreign=Yes",
    "Typo=Yes",
    "VerbType=Mod",
    "VerbType=Cop",
    "VerbType=Aux",
    "Voice=Act",
]

In [9]:
# Konverter POS-taggene

def get_dependents(sentence, token):
    """Collect the tokens of ``sentence`` whose HEAD equals ``token``'s ID.

    The result keeps sentence order and is empty when nothing depends on
    ``token``.
    """
    dependents = []
    for candidate in sentence:
        if token.get("ID") == candidate.get("HEAD"):
            dependents.append(candidate)
    return dependents

def get_labels(tokens):
    """Return the DEPREL value of each token, in order (None where absent)."""
    labels = []
    for tok in tokens:
        labels.append(tok.get("DEPREL"))
    return labels

def convert_pos_new(token, sentence):
    """Map a token's NDT part-of-speech tag to a UD UPOS tag.

    Args:
        token: Token mapping; reads "UPOS" (the NDT tag), "LEMMA", "FEATS"
            (pipe-separated NDT feature tags) and, for prepositions, "DEPREL".
        sentence: The tokens of the sentence ``token`` belongs to. It is only
            scanned for the token's dependents when the tag is 'verb'.

    Returns:
        The UD tag, or the input tag unchanged when it has no mapping.
    """
    pos = token.get("UPOS")
    lemma = token.get("LEMMA")
    # A missing FEATS field is treated like the empty placeholder.
    feats = (token.get("FEATS") or "_").split("|")

    def convert_verb_pos():
        """Return 'AUX' or 'VERB', decided by the labels of the token's dependents."""
        auxlemmas = ["bli", "burde", "få", "ha", "kunne", "måtte", "skulle", "tørre", "ville", "være"]
        labels = get_labels(get_dependents(sentence, token))
        # An auxiliary lemma with an INFV dependent counts as an auxiliary verb.
        governs_infv = "INFV" in labels and lemma in auxlemmas
        # "være" with an SPRED dependent counts as a copular verb.
        is_copula = "SPRED" in labels and lemma == "være"
        return "AUX" if governs_infv or is_copula else "VERB"

    def convert_det_pos():
        """Return 'PRON', 'NUM' or 'DET' for an NDT determiner."""
        quantifiers = [
            "all", "alt", "alle", "en", "et", "ei", "enhver", "ethvert",
            "hver", "hvert", "ingen", "noe", "noen", "samtlige", "begge"
        ]
        if "poss" in feats:
            return "PRON"
        if "romertall" in feats:
            return "NUM"
        if "kvant" in feats:
            return "DET" if lemma in quantifiers else "NUM"
        return "DET"

    def convert_prep_pos():
        """Return 'PRON', 'ADV' or 'ADP' for an NDT preposition."""
        if lemma == "der" and token.get("DEPREL") in ["FSUBJ", "FOBJ"]:
            return "PRON"
        if lemma in ['her', 'der', 'herfra', 'derfra', 'hit', 'dit']:
            return "ADV"
        return "ADP"

    # Tags that translate one-to-one.
    direct_mapping = {
        'adj': 'ADJ',
        'clb': "PUNCT",
        'pron': 'PRON',
        '<komma>': 'PUNCT',
        'konj': 'CCONJ',
        'inf-merke': 'PART',
        '<anf>': 'PUNCT',
        'sbu': 'SCONJ',
        '<strek>': 'PUNCT',
        'ukjent': 'X', # feat: Foreign=Yes
        '<parentes-beg>': 'PUNCT',
        '<parentes-slutt>': 'PUNCT',
        'interj': 'INTJ'
    }
    if pos in direct_mapping:
        return direct_mapping[pos]

    # Tags whose translation depends on lemma, features or syntax. They are
    # wrapped in callables so only the relevant rule runs; in particular the
    # dependent scan of the verb rule is no longer executed for every token.
    special_cases = {
        "subst": lambda: "PROPN" if "prop" in feats else "NOUN",
        "symb": lambda: "PUNCT" if lemma in ["$/", "*"] else "SYM",
        'verb': convert_verb_pos,  # 'VERB' or 'AUX'
        'det': convert_det_pos,  # 'DET', 'PRON' or 'NUM'
        'adv': lambda: "PART" if lemma in ["ikke", "ei"] else "ADV",
        'prep': convert_prep_pos,  # 'ADP', 'PRON' or 'ADV'
    }
    rule = special_cases.get(pos)
    return pos if rule is None else rule()



In [10]:
def overlaps(actual_values, qualifying_values):
    """Tell whether at least one of ``actual_values`` is in ``qualifying_values``."""
    for value in actual_values:
        if value in qualifying_values:
            return True
    return False

def is_neg(token):
    """Tell whether the token is treated as a negation.

    True for the lemma "ikke", and for the lemma "ingen" when its DEPREL is
    "DET".
    """
    lemma = token.get("LEMMA")
    if lemma == "ikke":
        return True
    return lemma == "ingen" and token.get("DEPREL") == "DET"

def field_is_empty(field):
    """Tell whether a CoNLL field carries no value.

    A list is empty when it has no items or consists of the single
    placeholder "_"; a string is empty when it equals "_". Any other type
    yields None.
    """
    if isinstance(field, list):
        return not field or (len(field) == 1 and field[0] == "_")
    if isinstance(field, str):
        return field == "_"
    return None

def replace_placeholder(feats: list, addendum: list):
    """Combine existing feature tags with additional ones.

    Args:
        feats: The token's current feature tags, possibly the placeholder
            list ``["_"]`` or an empty list.
        addendum: Feature tags to add.

    Returns:
        A new list. When ``feats`` is empty, the addendum (or ``["_"]`` if
        that is empty as well); otherwise the tags of ``feats`` followed by
        the new tags of ``addendum``, without duplicates.
    """
    if field_is_empty(feats):
        return list(addendum) if not field_is_empty(addendum) else ["_"]
    # Build a fresh list instead of extending the caller's list in place, and
    # de-duplicate with dict.fromkeys so the order is deterministic
    # (first occurrence wins) rather than set-dependent.
    return list(dict.fromkeys([*feats, *addendum]))



In [11]:
def add_feats(token):
    """Return the token's NDT feature tags, supplemented with inferred tags.

    The checks compare "UPOS" against UD tags (NUM, PRON, DET, VERB, AUX), so
    the token's POS must already have been converted when this is called.

    Added tags:
        * "card" for numerals,
        * "neg" for negations (see ``is_neg``),
        * a pronoun type for PRON/DET, looked up by lemma and defaulting to
          "pers",
        * "pres" for VERB/AUX tokens that carry no verb-form tag.

    Args:
        token: Token mapping; reads "LEMMA", "UPOS" and the pipe-separated
            "FEATS".

    Returns:
        A list of NDT feature tags. The original tags are returned untouched
        when a PRON/DET token (with a lemma outside the lookup table) already
        has a pronoun-type tag, or a VERB/AUX token already has a verb-form
        tag.
    """
    lemma = token.get("LEMMA")
    pos = token.get("UPOS")
    feats = token.get("FEATS").split("|")  # turn feats into a list
    new_feats = []

    possessivepronouns = ["min", "din", "sin", "hans","hennes", "dens", "dets", "vår", "deres"]
    # Lemma -> pronoun-type tag(s). Several tags are joined with "|".
    pron_det_lemma_feats_map = {
            "en": "art",
            "seg": "pers",
            "noen": "ind",
            "noe": "ind",
            "endel": "ind",
            "ingen": "negpron|neg",
            "ingenting": "negpron|neg",
            "alle": "tot",
            "all": "tot",
            "hver": "tot",
            "enhver": "tot",
            "begge": "tot",
            "samtlige": "tot",
            "selv": "pers",
            "selve": "pers",
            "sjølv": "pers",
            "egen": "pers",
            "som": "rel",
        }
    pron_det_lemma_feats_map.update(
        {posspron: "pers" for posspron in possessivepronouns})
    pron_feats = ['pers', 'dem', 'sp', 'res','art', 'ind', 'negpron', 'tot', 'rel']
    verb_feats = ['pres', 'pret', 'perf-part', 'imp', 'inf', 'pres-part']

    if pos == "NUM":
        new_feats.append("card")
    if is_neg(token):
        new_feats.append("neg")
    if pos == "PRON" or pos == "DET":
        if lemma not in pron_det_lemma_feats_map and overlaps(feats, pron_feats):
            return feats
        # Split bundled entries such as "negpron|neg" into separate tags.
        # Appended as one string, the bundle matches no key in feats_map and
        # both tags would be dropped silently during mapping.
        new_feats.extend(pron_det_lemma_feats_map.get(lemma, "pers").split("|"))
    if pos == "VERB" or pos == "AUX":
        if overlaps(feats, verb_feats):
            return feats
        new_feats.append("pres")
    return replace_placeholder(feats,new_feats)


In [12]:
# feats from NDT and Oslo-Bergen Tagger: 
# https://tekstlab.uio.no/obt-ny/english/morphosyn.html 
# Lookup from NDT feature tag to UD features.
# A dict value lists the UD feature/value pairs the tag contributes; the
# string '_' marks tags without a UD feature (map_feats skips non-dict values).
# NOTE(review): add_feats checks for a 'pres-part' tag, but only the bracketed
# '<pres-part>' has an entry here -- confirm whether that is intended.
feats_map = {
    'pret': {'Mood': 'Ind', 'Tense': 'Past', 'VerbForm': 'Fin'},
    'appell': '_', # Common noun, POS=NOUN
    'mask': {'Gender': 'Masc'},
    'be': {'Definite': 'Def'},
    'ent': {'Number': 'Sing'},
    'poss': {'Poss': 'Yes'},
    'perf-part': {'VerbForm': 'Part'},
    'ub': {'Definite': 'Ind'},
    'm/f': {'Gender':'Fem,Masc'}, # New 2.12
    'pos': {'Degree': 'Pos'},
    '<spm>': '_', # Question mark, POS=PUNCT
    'prop': '_', # Proper noun, POS = PROPN
    'fem': {'Gender': 'Fem'},
    'pres': {'Mood': 'Ind', 'Tense': 'Pres', 'VerbForm': 'Fin'},
    'pers': {'PronType': 'Prs'},
    'hum': {'Animacy': 'Hum'},
    '3': {'Person': '3'},
    'nom': {'Case': 'Nom'},
    '<punkt>': '_', # Punctuation, POS=PUNCT
    'nøyt': {'Gender': 'Neut'},
    'inf': {'VerbForm': 'Inf'},
    'kvant': '_', # Quantifier (POS=DET)
    'samset': '_', #? DEPREL=compound? flat? fixed?
    '1': {'Person': '1'},
    'clb': '_', # Clause boundary
    'dem': {'PronType': 'Dem'},
    '<adj>': '_', # POS = ADJ
    'fl': {'Number': 'Plur'},
    '<ikke-clb>': '_', # Not clause boundary
    'ufl': '_',  # incomplete
    '<pres-part>': {'VerbForm': 'Part'},
    'komp': {'Degree': 'Cmp'},
    'akk': {'Case': 'Acc'},
    '<kolon>': '_', # Colon, POS=PUNCT
    'ubøy': '_', # Uninflected
    '<adv>': '_', # POS = ADV
    'gen': {'Case': 'Gen'},
    'refl': {'Reflex': 'Yes'},
    '<perf-part>': {'VerbForm': 'Part'},
    'pass': {'Voice': 'Pass'},
    'sp': {'PronType': 'Int'},
    '<s-verb>': '_', # S-verb, e.g. finnes, synes
    'sup': {'Degree': 'Sup'},
    'fork': {'Abbr': 'Yes'},
    '<ordenstall>': {'NumType': 'Ord'}, # new 2.12
    'unorm': '_', # feat Typo?
    '2': {'Person': '2'},
    '<utrop>': '_', # Exclamation mark, POS=PUNCT
    'forst': '_', # e.g. egen, selv
    'imp': {'Mood': 'Imp', 'VerbForm': 'Fin'},
    'res': {'PronType': 'Rcp'},
    'art': {'PronType': 'Art'},
    'ind': {'PronType': 'Ind'},
    'negpron': {'PronType': 'Neg'},
    'neg': {'Polarity': 'Neg'},
    'tot': {'PronType': 'Tot'},
    'rel': {'PronType': 'Rel'},
    'card': {'NumType': 'Card'}}


In [13]:

def convert_feats_new(token):
    """Build the UD FEATS string for a token.

    The token's NDT tags (plus inferred ones) are mapped to UD features,
    ordered case-insensitively and joined with "|". "_" is returned when no
    feature remains.
    """
    ud_feats = list(map_feats(add_feats(token)))
    ud_feats.sort(key=str.lower)
    return "|".join(ud_feats) or "_"

def map_feats(feats):
    """Translate NDT feature tags into formatted UD ``Feature=Value`` strings.

    Tags that are unknown to ``feats_map`` or mapped to a non-dict value
    contribute nothing. Values of the same UD feature are gathered and
    formatted together by ``format_ud_feat``.
    """
    values_by_type = defaultdict(list)
    for ndt_feat in feats:
        ud_feat = feats_map.get(ndt_feat, "_")
        if isinstance(ud_feat, dict):
            for feat_type, value in ud_feat.items():
                values_by_type[feat_type].append(value)
    return [
        format_ud_feat(feat_type, values)
        for feat_type, values in values_by_type.items()
    ]

def format_ud_feat(feat_type, feat_val):
    """Format one UD feature as ``Feature=Value1,Value2``.

    Args:
        feat_type: The UD feature name, e.g. "Gender".
        feat_val: Iterable of value strings. A value may itself bundle several
            comma-separated values (e.g. "Fem,Masc").

    Returns:
        The feature string with its values de-duplicated and sorted
        case-insensitively.
    """
    # Different NDT tags can contribute the same UD value, and bundled values
    # can overlap with single ones; without de-duplication the result would
    # contain repeats such as "VerbForm=Fin,Fin".
    unique_values = {part for val in feat_val for part in val.split(",")}
    # The exact string is the tie-breaker so the order never depends on set
    # iteration order.
    value = ",".join(sorted(unique_values, key=lambda v: (v.lower(), v)))
    return f"{feat_type}={value}"


In [14]:
# Konverter POS og  morfologiske trekk fra NDT til UD

def convert_morphological_analysis(sentence):
    """Yield the tokens of ``sentence`` with UPOS and FEATS converted to UD.

    Each token mapping is updated in place, POS first and features second,
    and then yielded. A token whose conversion raises ValueError is reported
    on stdout and left out of the output.
    """
    for tok in sentence:
        try:
            tok["UPOS"] = convert_pos_new(tok, sentence)
            tok["FEATS"] = convert_feats_new(tok)
            yield tok
        except ValueError:
            print("Skipping token that raises value error:", tok)

def new_process(filepath):
    """Parse a CoNLL file and convert the morphology of every sentence.

    Returns a list with one entry per sentence, each being the list of that
    sentence's converted tokens.
    """
    parsed = partd.parse_conll_file(filepath)
    converted = []
    for sentence in parsed.get("sentences"):
        tokens = sentence.get("tokens")
        converted.append(list(convert_morphological_analysis(tokens)))
    return converted


In [15]:

# Convert POS tags and features of the NDT gold file; the result holds one
# list of converted tokens per sentence.
new_result = new_process(ndt_file)

In [12]:
# Partition the gold corpus by the sentence IDs found in the UD partitions
# (dev, test, train).

from csv import QUOTE_NONE

partitions = {
    "test" :  Path("data/ndt_nb_test_2023.conllu"),
    "dev" : Path("data/ndt_nb_dev_2023.conllu"),
    "train" : Path("data/ndt_nb_train_2023.conllu"),
}


goldstandard = Path("data/gullkorpus/2023_gullkorpus_ud.conllu")
# Choose the partition by name. The file is looked up in `partitions` (the
# previously referenced `ndt_devset` is not defined anywhere in the notebook)
# and the same name is used as the label written to the gold frame.
partition_name = "dev"
partition = partitions[partition_name]

gold = partd.parse_conll_file(goldstandard)
part = partd.parse_conll_file(partition)

gold_df = load_conll_to_df(gold)

# Build a token frame for the partition, copying every sentence-level field
# (everything except the token list) onto the sentence's rows.
part_data = []
for sentence in part.get("sentences"):
    sent_df = pd.DataFrame(sentence.get("tokens"))
    for key, value in sentence.items():
        if key == "tokens":
            continue
        sent_df[key] = value
    part_data.append(sent_df)

part_df = pd.concat(part_data)
# Use the "ud_id" column when the partition has one, otherwise "sent_id".
# An explicit column check replaces a bare `except:` that hid every error.
id_column = "ud_id" if "ud_id" in part_df.columns else "sent_id"
part_ids = part_df[id_column]

gold_part = gold_df[gold_df.sent_id.isin(part_ids)]
gold_df.loc[gold_df.sent_id.isin(part_ids), "partition"] = partition_name

def write_df_to_conll(df, path=None):
    """Serialize a token DataFrame as tab-separated CoNLL rows.

    Rows are written without header, index or quoting; missing values become
    "_".

    Args:
        df: The token frame to serialize.
        path: Optional output file. When omitted, nothing is written to disk.

    Returns:
        The serialized text when ``path`` is None; otherwise None (the text
        goes to ``path``).
    """
    # Hand back the result of to_csv: it was discarded before, so the function
    # always returned None and produced no file either.
    return df.to_csv(
        path,
        sep='\t', header=False, index=False,  quoting=QUOTE_NONE,
        quotechar="", escapechar="\\", na_rep="_")




In [14]:
# Rows of the gold corpus whose sentence ID occurs in the selected partition
# (repeats the selection made in the partitioning cell).
gold_part = gold_df[gold_df.sent_id.isin(part_ids)]

In [16]:
# Label those rows with the partition name (repeats the assignment made in
# the partitioning cell).
gold_df.loc[gold_df.sent_id.isin(part_ids), "partition"] = "dev"

In [18]:
# Inspect the gold-corpus rows labelled as "dev".
gold_df[gold_df.partition == "dev"]

Unnamed: 0,ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL,DEPS,MISC,sent_id,text,idx,partition
86,1,Kultur,kultur,NOUN,_,Definite=Ind|Gender=Masc|Number=Sing,16,nsubj,_,_,017876,"Kultur og kunst, medier, frivillige organisasj...",7,dev
87,2,og,og,CCONJ,_,_,3,cc,_,_,017876,"Kultur og kunst, medier, frivillige organisasj...",7,dev
88,3,kunst,kunst,NOUN,_,Definite=Ind|Gender=Masc|Number=Sing,1,conj,_,SpaceAfter=No,017876,"Kultur og kunst, medier, frivillige organisasj...",7,dev
89,4,",","$,",PUNCT,_,_,5,punct,_,_,017876,"Kultur og kunst, medier, frivillige organisasj...",7,dev
90,5,medier,medium,NOUN,_,Definite=Ind|Gender=Neut|Number=Plur,1,conj,_,SpaceAfter=No,017876,"Kultur og kunst, medier, frivillige organisasj...",7,dev
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2982,6,en,en,DET,_,Gender=Masc|Number=Sing|PronType=Art,7,det,_,_,017678,Tenk om vi hadde fått en helårsvei til Grotli.,196,dev
2983,7,helårsvei,helårsvei,NOUN,_,Definite=Ind|Gender=Masc|Number=Sing,5,obj,_,_,017678,Tenk om vi hadde fått en helårsvei til Grotli.,196,dev
2984,8,til,til,ADP,_,_,9,case,_,_,017678,Tenk om vi hadde fått en helårsvei til Grotli.,196,dev
2985,9,Grotli,Grotli,PROPN,_,_,7,nmod,_,SpaceAfter=No,017678,Tenk om vi hadde fått en helårsvei til Grotli.,196,dev


In [None]:
#Skriv CONLLU-filer med og uten kommentarlinjer


def iterate_conll_data_dict(data, add_comments=False):
    """Yield the lines of a CoNLL-U file for parsed CoNLL data.

    Args:
        data: Either a mapping whose ``"sentences"`` entry holds the sentence
            records, or the list of sentence records itself. Each record
            provides ``"tokens"`` (a list of per-token mappings) and
            optionally ``"sent_id"`` and ``"text"``.
        add_comments: When true, each sentence is preceded by ``# sent_id``
            and ``# text`` comment lines for the values that are present.

    Yields:
        Newline-terminated lines: optional comment lines, one tab-separated
        line per token (the token's values in their stored order), and a
        blank line after every sentence.
    """
    sentences = data.get("sentences") if hasattr(data, "get") else data
    for sentence in sentences:
        if add_comments:
            # Only these two comment types are written. A missing value is
            # skipped so the file never contains "# text = None".
            for key in ("sent_id", "text"):
                value = sentence.get(key)
                if value is not None:
                    yield f"# {key} = {value}\n"
        for token in sentence.get("tokens"):
            yield "\t".join(map(str, token.values())) + "\n"
        yield "\n"


def write_conll(data, path:Path, add_comments=False):
    """Write parsed CoNLL data to ``path`` as UTF-8 text.

    Prints the target file name, then writes the lines produced by
    ``iterate_conll_data_dict``; ``add_comments`` is forwarded to it.
    """
    print(f"Write conll data to {path.name}")
    lines = iterate_conll_data_dict(data, add_comments=add_comments)
    with open(path, mode="w+", encoding="utf-8") as outfile:
        for line in lines:
            outfile.write(line)



def remove_comments(conll_file):
    """Write a copy of a CoNLL file without its comment lines.

    The copy is placed next to the input, with "_uten_hash" inserted before
    the file extension, so the input file is left untouched.

    Args:
        conll_file: Path (or path string) of the CoNLL file to read.

    Returns:
        The path of the file that was written.
    """
    fpath = Path(conll_file)
    # The parser lives in the partition_data module (imported as `partd`);
    # a bare `parse_conll_file` is not defined in this notebook.
    conll_data = partd.parse_conll_file(fpath)
    # write_conll has no `suffix` parameter, so the output name is built here.
    outpath = fpath.with_name(f"{fpath.stem}_uten_hash{fpath.suffix}")
    write_conll(conll_data, outpath, add_comments=False)
    return outpath

In [None]:
# Skriv CONLL-filer med kommentarlinjer

def get_conll(totaldf, funclist = [], reduced = False, path= None):
    """Apply functions in funclist (df to df) to totaldf and produce a conllu string.

    Args:
        totaldf: Token-level frame with a "sent_id" column and the columns
            "token_order", "form", "lemma" (plus "pos", "feats", "head",
            "deprel" for the full output). Three sentence-level values are
            read by position from each sentence's first row: column 9 is
            written as ud_id, column 10 as text, and the second-to-last
            column decides whether "# newpar" is emitted.
            NOTE(review): these positions depend on the caller's column
            layout -- confirm against the frame that is passed in.
        funclist: Functions called as ``func(df, sent_id)`` on each sentence
            frame, in order; each must return a DataFrame. The default list
            is only iterated, never modified.
        reduced: When true, only token order, form and lemma are written
            (for comparing tokenization between treebank versions).
        path: Optional output file; the text is written there as UTF-8.

    Returns:
        The complete CoNLL-U text.
    """
    reduced_columns = ["token_order", "form", "lemma"]
    # "pos" is listed twice on purpose: it fills two consecutive columns.
    full_columns = ["token_order", "form", "lemma", "pos", "pos", "feats", "head", "deprel", "misc_1", "misc_2"]
    columns = reduced_columns if reduced else full_columns

    gb = totaldf.groupby("sent_id")
    # Collect the pieces and join them once instead of growing a string.
    chunks = []
    for sent_id in gb.groups.keys():
        df = gb.get_group(sent_id).copy()
        for func in funclist:
            df = df.pipe(func, sent_id).copy()
        # Positional lookups happen before the misc columns are appended.
        if df.iloc[0, -2]:
            chunks.append("# newpar\n")
        chunks.append(f"# ndt_id = {sent_id}\n")
        chunks.append(f"# ud_id = {df.iloc[0, 9]}\n")
        chunks.append(f"# text = {df.iloc[0, 10]}\n")
        df.loc[:, "misc_1"] = "_"
        df.loc[:, "misc_2"] = "_"
        chunks.append(df.to_csv(columns=columns, sep='\t', header=False, index=False,  quoting=QUOTE_NONE, quotechar="",  escapechar="\\", na_rep="_"))
        chunks.append("\n")
    mystring = "".join(chunks)
    if path is not None:
        # Explicit UTF-8: the platform default encoding may not be able to
        # represent the Norwegian text.
        with Path(path).open(mode="w", encoding="utf-8") as outfile:
            outfile.write(mystring)
    return mystring

