# Import

In [1]:
import udapi

# Classes

In [2]:
class FilterList(list):
    def filter(self, **kwargs):
        matched_tokens = []
        for tb, node in self:
            found = True
            for k, v in kwargs.items():
                if isinstance(v, str):
                    if k in node.feats.keys():
                        if node.feats[k] != v:
                            found = False
                            break
                    elif hasattr(node, k):
                        if getattr(node, k) != v:
                            found = False
                            break
                    else:
                        if v is not None:
                            found = False
                            break
                elif callable(v):
                    if not v(node):
                        found = False
                        break
                else:
                    found = False    
            if found:
                matched_tokens.append((tb, node))
        return matched_tokens
    
    def content(self, address=False, form=True, lemma=False, upos=False, deprel=False, feats=False, text=False, treebank=False, sorting_fn=lambda node: node[1].form.lower()):
        output = list()
        for tb, node in sorted(self, key=sorting_fn):
            line = ''
            line += f"id='{node.address()}'" + '\t' if address else ''
            line += f"treebank='{tb}'" + '\t' if treebank else ''
            line += f"form='{node.form}'" + '\t' if form else ''
            line += f"lemma='{node.lemma}'" + '\t' if lemma else ''
            line += f"upos='{node.upos}'" + '\t' if upos else ''
            line += f"deprel='{node.deprel}'" + '\t' if deprel else ''
            line += f"feats='{node.feats.__str__()}'" + '\t' if feats else ''
            line += f"text='{node.root.compute_text()}'" + '\t' if text else ''
            
            if line not in output:
                output.append(line)
        
        return output
        

class Treebanks:
    def __init__(self, treebanks) -> None:
        self.treebanks = treebanks
        for key, tb in self.treebanks.items():
            setattr(self, key, tb)

    
    @property
    def nodes(self):
        for key, tb in self.treebanks.items():
            for node in tb.nodes:
                yield key, node

    def filter(self, **kwargs):
        matched_tokens = FilterList()
        for tb, node in self.nodes:
            found = True
            for k, v in kwargs.items():
                if isinstance(v, str):
                    if k in node.feats.keys():
                        if node.feats[k] != v:
                            found = False
                            break
                    elif hasattr(node, k):
                        if getattr(node, k) != v:
                            found = False
                            break
                    else:
                        if v is not None:
                            found = False
                            break
                elif callable(v):
                    if not v(node):
                        found = False
                        break
                else:
                    found = False    
            if found:
                matched_tokens.append((tb, node))
        return matched_tokens
    
    
    

# Loading Treebanks

In [3]:
# talbanken_path = '/home/norrman/GitHub/UD_Swedish-Talbanken/not-to-release/output/temp/sv5.conllu'
# pud_path = '/home/norrman/GitHub/UD_Swedish-Talbanken/not-to-release/pud/sv_pud-ud-test.conllu'
# lines_path_train = '/home/norrman/GitHub/UD_Swedish-Talbanken/not-to-release/lines/sv_lines-ud-train.conllu'
# lines_path_dev = '/home/norrman/GitHub/UD_Swedish-Talbanken/not-to-release/lines/sv_lines-ud-dev.conllu'
# lines_path_test = '/home/norrman/GitHub/UD_Swedish-Talbanken/not-to-release/lines/sv_lines-ud-test.conllu'


talbanken_path = '/home/norrman/GitHub/UD_Swedish-Talbanken/not-to-release/output/temp/test1.conllu'
pud_path = '/home/norrman/GitHub/UD_Swedish-Talbanken/not-to-release/pud/test1.conllu'
lines_path_train = '/home/norrman/GitHub/UD_Swedish-Talbanken/not-to-release/lines/test1_train.conllu'
lines_path_dev = '/home/norrman/GitHub/UD_Swedish-Talbanken/not-to-release/lines/test1_dev.conllu'
lines_path_test = '/home/norrman/GitHub/UD_Swedish-Talbanken/not-to-release/lines/test1_test.conllu'

talbanken_data = udapi.Document(talbanken_path)
pud_data = udapi.Document(pud_path)
lines_data_train = udapi.Document(lines_path_train)
lines_data_dev = udapi.Document(lines_path_dev)
lines_data_test = udapi.Document(lines_path_test)

treebanks = Treebanks({'talbanken': talbanken_data, 'pud': pud_data, 'lines_dev': lines_data_dev, 'lines_test': lines_data_test, 'lines_test': lines_data_train})

# Testing Loads

In [4]:
for node in treebanks.nodes:
    print(node)
    break

('talbanken', Node<sv-ud-train-1#1, Individuell>)


In [5]:
for node in treebanks.lines_dev.nodes:
    print(node)
    break

<sv_lines-ud-dev-doc1-3177#1, När>


In [6]:
matches = treebanks.filter(upos='ADJ',
                           VerbForm='Part',
                           func1=lambda node: node.parent.lemma == 'bli' or any(child.lemma == 'bli' for child in node.children))
print(*matches.content(lemma=True,
                       treebank=True,
                       sorting_fn=lambda node: node[1].lemma[::-1]), sep='\n')

treebank='talbanken'	form='avtrubbade'	lemma='avtrubbad'	
treebank='pud'	form='avskedad'	lemma='avskedad'	
treebank='pud'	form='dödade'	lemma='dödad'	
treebank='pud'	form='förbluffad'	lemma='förbluffad'	
treebank='pud'	form='tvingade'	lemma='tvingad'	
treebank='pud'	form='förälskad'	lemma='förälskad'	
treebank='lines_test'	form='förälskad'	lemma='förälskad'	
treebank='pud'	form='delat'	lemma='delad'	
treebank='lines_test'	form='glad'	lemma='glad'	
treebank='lines_test'	form='jätteglad'	lemma='jätteglad'	
treebank='pud'	form='filmad'	lemma='filmad'	
treebank='talbanken'	form='komplicerat'	lemma='komplicerad'	
treebank='pud'	form='komplicerad'	lemma='komplicerad'	
treebank='talbanken'	form='deprimerad'	lemma='deprimerad'	
treebank='lines_dev'	form='generad'	lemma='generad'	
treebank='lines_test'	form='generad'	lemma='generad'	
treebank='talbanken'	form='intresserad'	lemma='intresserad'	
treebank='talbanken'	form='irriterad'	lemma='irriterad'	
treebank='pud'	form='irriterad'	lemma='irrite

In [7]:
treebanks.filter()

[('talbanken', Node<sv-ud-train-1#1, Individuell>),
 ('talbanken', Node<sv-ud-train-1#2, beskattning>),
 ('talbanken', Node<sv-ud-train-1#3, av>),
 ('talbanken', Node<sv-ud-train-1#4, arbetsinkomster>),
 ('talbanken', Node<sv-ud-train-2#1, Genom>),
 ('talbanken', Node<sv-ud-train-2#2, skattereformen>),
 ('talbanken', Node<sv-ud-train-2#3, införs>),
 ('talbanken', Node<sv-ud-train-2#4, individuell>),
 ('talbanken', Node<sv-ud-train-2#5, beskattning>),
 ('talbanken', Node<sv-ud-train-2#6, (>),
 ('talbanken', Node<sv-ud-train-2#7, särbeskattning>),
 ('talbanken', Node<sv-ud-train-2#8, )>),
 ('talbanken', Node<sv-ud-train-2#9, av>),
 ('talbanken', Node<sv-ud-train-2#10, arbetsinkomster>),
 ('talbanken', Node<sv-ud-train-2#11, .>),
 ('talbanken', Node<sv-ud-train-3#1, Det>),
 ('talbanken', Node<sv-ud-train-3#2, innebär>),
 ('talbanken', Node<sv-ud-train-3#3, bl.a.>),
 ('talbanken', Node<sv-ud-train-3#4, att>),
 ('talbanken', Node<sv-ud-train-3#5, endast>),
 ('talbanken', Node<sv-ud-train-3#

# Lemmas

In [8]:
matches = treebanks.filter(upos='ADJ',
                 func1=lambda node: ((node.lemma.endswith('a') or
                                     node.lemma.endswith('t') or
                                     node.lemma.endswith('as') or
                                     node.lemma.endswith('t') or
                                     node.lemma.endswith('t')) and 
                                     not (node.lemma.endswith('t') and 
                                          node.lemma[-2] in ['k', 's', 'p', 'f']) and
                                     not node.lemma.endswith(':e') and
                                     not node.lemma.endswith(':a')),
                 func2=lambda node: node.feats['VerbForm'] != 'Part').content(lemma=True)

print(len(matches))
print(*matches, sep='\n')
print(len(matches))



258
form='absolut'	lemma='absolut'	
form='absoluta'	lemma='absolut'	
form='akut'	lemma='akut'	
form='akuta'	lemma='akut'	
form='alerta'	lemma='alert'	
form='andra'	lemma='andra'	
form='Andra'	lemma='andra'	
form='annorlunda'	lemma='annorlunda'	
form='arrogant'	lemma='arrogant'	
form='bastant'	lemma='bastant'	
form='bekant'	lemma='bekant'	
form='bekanta'	lemma='bekant'	
form='betalt'	lemma='betala'	
form='bevänt'	lemma='bevänt'	
form='bjärta'	lemma='bjärt'	
form='Blotta'	lemma='blott'	
form='blotta'	lemma='blott'	
form='blöt'	lemma='blöt'	
form='bosatta'	lemma='bosatt'	
form='bra'	lemma='bra'	
form='Bra'	lemma='bra'	
form='brant'	lemma='brant'	
form='brunrosa'	lemma='brunrosa'	
form='bäst'	lemma='bra'	
form='bästa'	lemma='bra'	
form='Bästa'	lemma='bra'	
form='bäste'	lemma='bra'	
form='Bättre'	lemma='bra'	
form='bättre'	lemma='bra'	
form='båda'	lemma='båda'	
form='Båda'	lemma='båda'	
form='desamma'	lemma='desamma'	
form='desperat'	lemma='desperat'	
form='desperata'	lemma='desperat'	
form

In [9]:
matches = treebanks.filter(upos='ADJ',
                           VerbForm='Part',
                           func1=lambda node: node.lemma.endswith('a')).content(lemma=True,
                                                                                feats=True)

print(len(matches))
print(*matches, sep='\n')
print(len(matches))

0

0


In [10]:
treebanks.filter(upos='ADJ',
                 func1=lambda tok: '_' in tok.lemma).content(feats=True,
                                                                        lemma=True)

["form='s k'\tlemma='så_kallad'\tfeats='Abbr=Yes'\t",
 "form='s.k.'\tlemma='så_kallad'\tfeats='Abbr=Yes'\t"]

In [11]:
treebanks.filter(form='mellersta').content(text=True, lemma=True)

["form='mellersta'\tlemma='mellersta'\ttext='De är koncentrerade till södra och mellersta Sveriges industribygder.'\t"]