### Some scripts for reviewing UD Swedish LinES data and changing values

In this notebook I develop scripts that detects violations of current UD guidelines and, if possible, perform automatic conversions to the recommended analysis.

In [8]:
import re, os
from collections import defaultdict, Counter

Files can be obtained from here -- change these to your own environment

In [12]:
udpath = '/home/norrman/GitHub/UD_Swedish-Talbanken/not-to-release/output/'
talbfiles = ['sv_talbanken-ud-dev.conllu', 'sv_talbanken-ud-test.conllu', 'sv_talbanken-ud-train.conllu']

for tbf in talbfiles:    
    if os.path.exists(udpath+tbf):
        print(f'Found file: {tbf}')
    else:
        print(f'ERROR - File not found: {tbf}')

Found file: sv_talbanken-ud-dev.conllu
Found file: sv_talbanken-ud-test.conllu
Found file: sv_talbanken-ud-train.conllu


### Extracting and counting tokens with simple conditions

In [26]:
# 0: ID, 1: FORM, 2: LEMMA, 3: UPOS, 4: XPOS, 5: FEATS, 6: HEAD, 7: DEPREL, 8: ENHANCED-DEPS, 9: MISC

def getMweStatsInTreebank (treebankpath, treebanks, deprel):
    ''' returns frequency for a given deprel in a set of treebanks '''
    nmboftokens = 0
    nmbofdeprels = 0
    for tb in treebanks:
        print(tb)
        with open(treebankpath + tb, "r") as t:
            for line in t:
                if re.match(r'# sent_id', line):
                    headsfound = []
                elif re.match(r'\d', line):
                    nmboftokens += 1
                    info = line.strip().split()
                    if info[7] == deprel:
                        nmbofdeprels += 1
                        headsfound.append(info[6])
                elif (len(line)<2) and (len(headsfound)>0): # empty line
                    headset = set(headsfound)
                    nmbofdeprels += len(headset)
                                                        
    return nmboftokens, nmbofdeprels


An example

In [27]:
tokennmbs, fixednmbs = getMweStatsInTreebank (udpath, talbfiles, 'fixed')

sv_talbanken-ud-dev.conllu
sv_talbanken-ud-test.conllu
sv_talbanken-ud-train.conllu


In [28]:
print('Included tokens:', fixednmbs, 'Total tokens:', tokennmbs)
print('Fixed MWE share:', round((fixednmbs / tokennmbs), 4), '\n')


Included tokens: 3018 Total tokens: 96859
Fixed MWE share: 0.0312 



In [44]:
# 0: ID, 1: FORM, 2: LEMMA, 3: UPOS, 4: XPOS, 5: FEATS, 6: HEAD, 7: DEPREL, 8: ENHANCED-DEPS, 9: MISC

def getTokensByInfo (udfile, poscat, feats):
    ''' returns a list of tokens matching a feature description exactly '''
    retlist = []
    with open (udfile, "r") as u:
        for line in u:
            if re.match(r'# sent_id', line):
                sentid = line.split('=')[-1].strip()
            elif re.match(r'\d', line):
                info = line.split('\t')
                # print(f"{info[3]=}, {info[5]=}")
                if (info[3] == poscat) and (info[5] == feats): # a match
                    match = [sentid, info[1], info[0]]
                    retlist.append(match)
    return retlist


In [46]:
tokens = getTokensByInfo(udpath+talbfiles[0], 'NOUN', 'Case=Nom|Definite=Ind|Gender=Com|Number=Plur')

tokens[:3]

[['sv-ud-dev-9', 'sköterskor', '14'],
 ['sv-ud-dev-10', 'föräldrar', '11'],
 ['sv-ud-dev-12', 'försummelser', '13']]

In [None]:
# compareLemmas (udpath + svlinesfiles[0], udpath + talbfiles[0], 'PRON')

### Finding all descriptions for a list of tokens

In [48]:
# 0: ID, 1: FORM, 2: LEMMA, 3: UPOS, 4: XPOS, 5: FEATS, 6: HEAD, 7: DEPREL, 8: ENHANCED-DEPS, 9: MISC

def findDescriptions (path, corpus, tokenlist):
    ''' returns a dictionary of alternative feature descriptions for a list of tokens '''
    result = defaultdict()
    for corp in corpus:
        udfile = path + corp
        with open(udfile, "r") as u:
            for line in u:    
                if re.match(r'\d', line):
                    info = line.split('\t')
                    # compute the result as lemma_upos_feats
                    entry = info[1].lower()
                    if entry in tokenlist:
                        descr = info[1] + ', Lemma=' + info[2] + ', UPOS=' + info[3] + ', FEATS=' + info[5]
                        try:
                            result[entry][descr] += 1
                        except:
                            try:
                                result[entry][descr] = 1
                            except:
                                result[entry] = Counter()
                                result[entry][descr] = 1
    return result


In [21]:
# Example
tottokens = ['all', 'alla', 'allt', 'båda', 'bägge', 'halv', 'halva', 'halvt', 'hel', 'hela', 'helt', 'varje']
#totdict = findDescriptions(udpath, svlinesfiles, tottokens)


### Extracting and changing lemmas and features for the words *de, den, det*.

We can check the current features by using *findDescriptions*

In [52]:
descrs = findDescriptions (udpath, talbfiles, ['de', 'den', 'det'])

descrs
for wrd in descrs:
    print()
    for desc in descrs[wrd].most_common():
        print(desc[0], desc[1])


de, Lemma=en, UPOS=DET, FEATS=Definite=Def|Number=Plur|PronType=Art 505
de, Lemma=de, UPOS=PRON, FEATS=Case=Nom|Definite=Def|Number=Plur|PronType=Prs 278
De, Lemma=en, UPOS=DET, FEATS=Definite=Def|Number=Plur|PronType=Art 116
De, Lemma=de, UPOS=PRON, FEATS=Case=Nom|Definite=Def|Number=Plur|PronType=Prs 91
de, Lemma=en, UPOS=PRON, FEATS=Case=Nom|Definite=Def|Number=Plur|PronType=Prs 12
de, Lemma=de, UPOS=DET, FEATS=Definite=Def|Number=Plur|PronType=Prs 7
De, Lemma=de, UPOS=DET, FEATS=Definite=Def|Number=Plur|PronType=Prs 2
de, Lemma=de, UPOS=PROPN, FEATS=Case=Nom 2
de, Lemma=de, UPOS=PRON, FEATS=Case=Nom|Definite=Def|Number=Plur|PronType=Ind 1
de, Lemma=de, UPOS=PRON, FEATS=Case=Nom|Definite=Def|Number=Plur|PronType=Rel 1

den, Lemma=en, UPOS=DET, FEATS=Definite=Def|Gender=Com|Number=Sing|PronType=Art 662
den, Lemma=den, UPOS=PRON, FEATS=Definite=Def|Gender=Com|Number=Sing|PronType=Prs 163
Den, Lemma=en, UPOS=DET, FEATS=Definite=Def|Gender=Com|Number=Sing|PronType=Art 152
Den, Lemma=de

In the future these words should carry the following lemmas and features:
 - den, Den with UPOS DET; Lemma: den; Feats: Definite=Def|Gender=Com|Number=Sing|PronType=Art
 - den, Den with UPOS PRON; Lemma: den; Feats: Definite=Def|Gender=Com|Number=Sing|PronType=Prs
 
 - det, Det with UPOS DET; Lemma: den; Feats: Definite=Def|Gender=Neut|Number=Sing|PronType=Art
 - det, Det with UPOS PRON; Lemma: den; Feats: Definite=Def|Gender=Neut|Number=Sing|PronType=Prs
 
 - de, De with UPOS DET; Lemma: de; Feats: Definite=Def|Number=Plur|PronType=Art
 - dom, Dom with UPOS DET; Lemma: de; Feats: Definite=Def|Number=Plur|PronType=Art
 - de, De with UPOS PRON; Lemma: de; Feats: Case=Nom|Definite=Def|Number=Plur|PronType=Prs
 
 - dem, Dem with UPOS PRON; Lemma: de; Feats: Case=Acc|Definite=Def|Number=Plur|PronType=Prs
 - dom, Dom with UPOS PRON; Lemma: de; Feats: Definite=Def|Number=Plur|PronType=Prs

In [25]:
def setNewAnnotation (udfile, outfile, tokenlist, upos, lemmadict, featsdict):
    ''' writes a changed annotation from <udfile> to <outfile> using a <UPOS> and dictionaries with new values '''
    
    changes = Counter()
    for tok in tokenlist:
        changes[tok] = 0
        
    with open (outfile, "w") as w:
        with open (udfile, "r") as u:
            for line in u:
                if re.match(r'\d', line):
                    info = line.strip().split('\t')
                    if (info[1] in tokenlist) and (info[3]==upos):
                        if (info[2] != lemmadict[info[1]]) or (info[5] != featsdict[info[1]]):
                            info[2] = lemmadict[info[1]]
                            info[5] = featsdict[info[1]]
                            newline = '\t'.join(info) + '\n'
                            changes[info[1]] += 1
                            w.write(newline)
                        else:
                            w.write(line)                    
                    else:
                        w.write(line)
                else:
                    w.write(line)
    w.close()
    return changes

In [26]:
detlemmadict = {'den': 'den', 'Den':'den', 'det':'den', 'Det':'den', 'de':'de', 'De':'de', 'dom':'de', 'Dom':'de'}
pronlemmadict = {'den': 'den', 'Den':'den', 'det':'den', 'Det':'den', 'de':'de', 'De':'de', 'dom':'de', 'Dom':'de', 'dem':'de'}
detfeatsdict = {'den': 'Definite=Def|Gender=Com|Number=Sing|PronType=Art', 
                'Den':'Definite=Def|Gender=Com|Number=Sing|PronType=Art',
                'det':'Definite=Def|Gender=Neut|Number=Sing|PronType=Art', 
                'Det':'Definite=Def|Gender=Neut|Number=Sing|PronType=Art', 
                'de':'Definite=Def|Number=Plur|PronType=Art', 
                'De':'Definite=Def|Number=Plur|PronType=Art', 
                'dom':'Definite=Def|Number=Plur|PronType=Art', 
                'Dom':'Definite=Def|Number=Plur|PronType=Art'}
pronfeatsdict = {'den': 'Definite=Def|Gender=Com|Number=Sing|PronType=Prs', 
                'Den':'Definite=Def|Gender=Com|Number=Sing|PronType=Prs',
                'det':'Definite=Def|Gender=Neut|Number=Sing|PronType=Prs', 
                'Det':'Definite=Def|Gender=Neut|Number=Sing|PronType=Prs', 
                'de':'Case=Nom|Definite=Def|Number=Plur|PronType=Prs', 
                'De':'Case=Nom|Definite=Def|Number=Plur|PronType=Prs', 
                'dom':'Definite=Def|Number=Plur|PronType=Prs', 
                'Dom':'Definite=Def|Number=Plur|PronType=Prs', 
                'dem':'Case=Acc|Definite=Def|Number=Plur|PronType=Prs'}
    

We test before doing it permanently...

In [27]:
trialfiles = ['trial_240105-dev.conllu', 'trial_240105-test.conllu', 'trial_240105-train.conllu']
devresult = setNewAnnotation(trialfiles[0], 'trial-dev.conllu', pronlemmadict.keys(), 'PRON', pronlemmadict, pronfeatsdict)

In [28]:
testresult = setNewAnnotation(trialfiles[1], 'trial-test.conllu', pronlemmadict.keys(), 'PRON', pronlemmadict, pronfeatsdict)
trainresult = setNewAnnotation(trialfiles[2], 'trial-train.conllu', pronlemmadict.keys(), 'PRON', pronlemmadict, pronfeatsdict)

In [29]:
print(devresult, '\n', testresult, '\n', trainresult)

Counter({'dom': 17, 'den': 3, 'Dom': 3, 'de': 1, 'Den': 0, 'det': 0, 'Det': 0, 'De': 0, 'dem': 0}) 
 Counter({'det': 8, 'dom': 7, 'den': 2, 'Den': 1, 'de': 1, 'Dom': 1, 'Det': 0, 'De': 0, 'dem': 0}) 
 Counter({'den': 125, 'Den': 24, 'dom': 13, 'Dom': 4, 'det': 3, 'de': 2, 'Det': 0, 'De': 0, 'dem': 0})


We can then check the result using *findDescriptions*

### Setting new features for adjectives

We have arrived at the following principles for annotation of ADJ features:

    - Base forms such as STOR: Case=Nom|Definite=Ind|Degree=Pos|Gender=Com|Number=Sing
    - Singular, neuter forms: STOR-t: Case=Nom|Definite=Ind|Degree=Pos|Gender=Neut|Number=Sing
As a rule there are two alternatives for forms ending in *-a* or *-e*:

    - if definite: Case=Nom|Definite=Def|Degree=Pos 
    - if indefinite, plural: Case=Nom|Definite=Ind|Degree=Pos|Number=Plur
Om adjektivet har samma form i utrum och neutrum, som SVART faller **Gender** bort. Om det har samma form i bestämd och obestämd form, som GRÅ, faller **Definite** bort, och vid samma form i singularis och pluralis, som BRA, faller dessutom **Number** bort.

Om adjektivet är komparerat: STÖRRE: **Case=Nom|Degree=Cmp**

Om adjektivet står i superlativ finns två former, STÖRST: **Case=Nom|Definite=Ind|Degree=Sup** och STÖRSTA: **Case=Nom|Definite=Def|Degree=Sup**

Om adjektivet står i genitiv gäller för alla fall att Case=Nom ersätts med: **Case=Gen**

Ordningstal som TREDJE eller ELFTE anses inte kunna kompareras och annoteras: **Case=Nom|Number=Sing|NumType=Ord**
    
Presens participformer så som *bedårande*: **Case=Nom|Degree=Pos|Tense=Pres|VerbForm=Part**

Perfekt participformer som *fruktad*, *berömd*:

    - sing. obestämd, FRUKTA-d: Case=Nom|Definite=Ind|Degree=Pos|Gender=Com|Number=Sing|Tense=Past|VerbForm=Part
    - sing, obestämd, FRUKTA-t: Case=Nom|Definite=Ind|Degree=Pos|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part
    - plur obestämd, FRUKTA-de: Case=Nom|Definite=Ind|Degree=Pos|Number=Plur|Tense=Past|VerbForm=Part, eller
    - best., den/det/de FRUKTA-de: Case=Nom|Definite=Def|Degree=Pos|Tense=Past|VerbForm=Part


In [31]:
def extractProps(proplist):
    ''' this extracts info from a complete token description '''
    retlist = [proplist[1], proplist[2], proplist[3]]
    if proplist[3] == 'DET':
        matchObj1 = re.search(r'Definite=(\w\w\w)', proplist[5])
        matchObj2 = re.search(r'Number=(\w\w\w\w?)', proplist[5])
        matchObj3 = re.search(r'Gender=(\w\w\w\w?)', proplist[5])
        if matchObj1:
            retlist.append(matchObj1.group(1))
        if matchObj2:
            retlist.append(matchObj2.group(1))
            if matchObj2.group(1) == 'Sing' and matchObj3:
                retlist.append(matchObj3.group(1))
    elif re.search(r'Poss=Yes', proplist[4]):
        retlist.append('Poss')
        matchObj1 = re.search(r'Number=(\w\w\w\w?)', proplist[5])
        matchObj2 = re.search(r'Gender=(\w\w\w\w?)', proplist[5])
        if matchObj1:
            retlist.append(matchObj1.group(1))
            if matchObj1.group(1) == 'Sing' and matchObj2:
                retlist.append(matchObj2.group(1))
    elif proplist[3] == 'NOUN':
        matchObj1 = re.search(r'Number=(\w\w\w\w?)', proplist[5])
        matchObj2 = re.search(r'Gender=(\w\w\w\w?)', proplist[5])
        if matchObj1:
            retlist.append(matchObj1.group(1))
            if matchObj1.group(1) == 'Sing' and matchObj2:
                retlist.append(matchObj2.group(1))
    elif proplist[4] == 'ORD':
        retlist.append('Ord')
    return retlist

def createFeats (feats, myprops, sentprops):
    ''' returns new features for adjectives, not complete and will miss several tokens that are now annotated wrongly '''
    
    if (myprops[0][-1] in ['f', 'g', 'l', 'm', 'n', 'p', 'r', 's', 'v']) and (myprops[0] == myprops[1]+'t'):
        nfeats = 'Case=Nom|Definite=Ind|Degree=Pos|Gender=Neut|Number=Sing'
    elif myprops[0][-2:] == 'ad' and myprops[1].endswith('a'):
        nfeats = 'Case=Nom|Definite=Ind|Degree=Pos|Gender=Com|Number=Sing|Tense=Past|VerbForm=Part'
    elif (myprops[0][-3:] == 'ade') and ((myprops[1].endswith('a')) or (myprops[1].endswith('d'))):
        if re.search(r'Plur', feats):
            nfeats = 'Case=Nom|Definite=Ind|Degree=Pos|Number=Plur|Tense=Past|VerbForm=Part'
        else: 
            nfeats = 'Case=Nom|Definite=Def|Degree=Pos|Tense=Past|VerbForm=Part'        
    elif myprops[0][-2:] == 'at' and ((myprops[1].endswith('a')) or (myprops[1].endswith('d'))):
        nfeats = 'Case=Nom|Definite=Ind|Degree=Pos|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part'
    elif myprops[0][-3:] == 'sta' and re.search(r'Sup', feats):
        nfeats = 'Case=Nom|Definite=Def|Degree=Sup'
    elif myprops[0][-2:] == 'st' and re.search(r'Sup', feats):
        nfeats = 'Case=Nom|Definite=Ind|Degree=Sup'
    elif myprops[0][-2:] == 're' and re.search(r'Cmp', feats):
        nfeats = 'Case=Nom|Degree=Cmp'
    elif myprops[0][-3:] != 'sta' and myprops[0] == myprops[1] + 'a' and re.search(r'=Def', feats):
        nfeats = 'Case=Nom|Definite=Def|Degree=Pos'
    elif myprops[0][-3:] != 'sta' and myprops[0] == myprops[1] + 'a' and re.search(r'Plur', feats):
        nfeats = 'Case=Nom|Definite=Ind|Degree=Pos|Number=Plur'
    elif myprops[-1] == 'Ord':
        nfeats = 'Case=Nom|Number=Sing|NumType=Ord'
    elif myprops[0][-3:] == 'nde':
        nfeats = 'Case=Nom|Degree=Pos|Tense=Pres|VerbForm=Part'
    else:
        nfeats = feats
    return nfeats

                
def changeAdjFeats (svudfile, outfile, newfeatsdict, stopix):
    ''' Writes updated features for ADJs to <OUTFILE> given feats collected from the function createFeats '''
    ''' The last parameter is used for breaking reading the input file '''
    oldfeats = {}
    oldinfo = {}
    newfeats = {}
    changes = 0
    adjix = 0
    six = 0
    
    with open (outfile, "w") as w:
        with open (svudfile, "r") as f:
            props = defaultdict()
            for line in f:
                line = line.rstrip()
                if len(line) == 0:
                    for ix in props:
                        if props[ix][2] == 'ADJ':
                            adjix += 1
                            oldfeats[adjix] = oldfeats[ix]
                            newfeats[adjix] = createFeats(oldfeats[ix], props[ix], props)
                            
                            if newfeats[adjix] != oldfeats[adjix]:
                                changes += 1
                            newinfo = oldinfo[ix]
                            newinfo[5] = newfeats[adjix]
                            w.write('\t'.join(newinfo))
                            w.write('\n')
                        else:
                            w.write('\t'.join(oldinfo[ix]))
                            w.write('\n')
                    w.write('\n')        
                    props = defaultdict()
                elif re.match(r'\d', line):
                    six += 1
                    #if six == stopix:
                        #break
                    info = line.split('\t')
                    oldfeats[info[0]] = info[5]
                    oldinfo[info[0]] = info
                    props[info[0]] = extractProps(info)
                else:
                    w.write(line+'\n')
        f.close()
    w.close()
    return oldfeats, newfeats, adjix, changes


An example:

In [32]:
oldstuff, newstuff, adjs, changed = changeAdjFeats ('trial_240106-dev.conllu', 'trial-dev.conllu', {}, 1000)
print('\nHave read', adjs, 'adjectives, and changed', changed)


Have read 1257 adjectives, and changed 322
