In [1]:
from pymorphy2 import MorphAnalyzer

In [2]:
from rich import print, inspect

In [3]:
# Create the pymorphy2 analyzer that the following cells use to parse a word.
morph = MorphAnalyzer()

In [4]:
# Analyze the word "всё"; the result is indexed in the next cell.
x = morph.parse("всё")

In [5]:
# Keep the first parse result for inspection.
form0 = x[0]

In [6]:
# Display the grammemes attached to the first parse's tag
# (the recorded output is the single grammeme 'PRCL').
form0.tag.grammemes

frozenset({'PRCL'})

In [7]:
import spacy_udpipe

In [8]:
# Load the Russian SynTagRus UDPipe model from a local file as a spaCy-style
# pipeline; `model(text)` is called on raw text further down.
model = spacy_udpipe.load_from_path(
    lang="ru",
    path="./data/models/russian-syntagrus-ud-2.5-191206.udpipe",
    # The description previously said 'hr' (left over from a Croatian
    # example); it now matches the language actually loaded.
    meta={"description": "Custom 'ru' model"}
)

In [10]:
# Run the loaded pipeline on a short sample sentence.
doc = model("Всё решено!")

In [11]:
# Materialize the document's tokens as a list so they can be indexed below.
tags = list(doc)

In [30]:
# NOTE(review): the recorded output is the tuple ('', 'I', 'O', 'B'), which
# looks like the list of possible IOB labels rather than this token's own
# tag — confirm whether a per-token attribute was intended here.
tags[1].iob_strings()

('', 'I', 'O', 'B')

In [12]:
# Morphological features of the second token as a {feature name: value} dict.
tags[1].morph.to_dict()

{'Aspect': 'Perf',
 'Gender': 'Neut',
 'Number': 'Sing',
 'Tense': 'Past',
 'Variant': 'Short',
 'VerbForm': 'Part',
 'Voice': 'Pass'}

In [31]:
from yargy.morph import Case, Gender, Number, Grams, Form
from yargy.token import Token
from yargy.span import Span

In [14]:
class UdCase(Case):
    """Case information for one token, read from Universal Dependencies values.

    Attributes:
        mask: list of seven booleans, one per case, in the order
            nominative, genitive, dative, accusative, instrumental,
            locative, vocative. An entry is True when the matching UD
            value is contained in ``grams``.
        fixed: always False; no UD value is mapped to it here.

    Args:
        grams: any container supporting ``in`` that holds UD feature
            values (for example a ``UdGrams`` instance).
    """

    def __init__(self, grams):
        # Universal Dependencies marks the Russian instrumental case as
        # 'Ins'. The previous value 'Abl' (ablative) is never produced for
        # Russian, so the fifth mask slot could never become True.
        # NOTE(review): the slot order presumably mirrors the base Case
        # class (nomn, gent, datv, accs, ablt, loct, voct) — confirm.
        self.mask = [
            (_ in grams)
            for _ in ['Nom', 'Gen', 'Dat', 'Acc', 'Ins', 'Loc', 'Voc']
        ]
        self.fixed = False

In [15]:
class UdGender(Gender):
    """Gender flags for one token, read from Universal Dependencies values.

    Args:
        grams: container supporting ``in`` that holds UD feature values.
    """

    def __init__(self, grams):
        # The three ordinary genders are plain membership tests.
        self.male, self.female, self.neutral = [
            tag in grams for tag in ('Masc', 'Fem', 'Neut')
        ]
        # https://github.com/OpenCorpora/opencorpora/issues/795
        self.bi = False
        self.general = 'Com' in grams

In [16]:
class UdNumber(Number):
    """Number flags for one token, read from Universal Dependencies values.

    Args:
        grams: container supporting ``in`` that holds UD feature values.
    """

    def __init__(self, grams):
        def has(tag):
            # Membership test against the supplied feature values.
            return tag in grams

        self.single = has('Sing')
        self.plural = has('Plur')
        # 'Stan' does not actually exist in Universal Dependencies.
        self.only_single = has('Stan')
        self.only_plural = has('Ptan')

In [17]:
class UdGrams(Grams):
    """Collection of Universal Dependencies feature values for one token.

    Wraps ``values`` and exposes gender, number and case views built by
    ``UdGender``, ``UdNumber`` and ``UdCase``.

    Args:
        values: container of feature-value strings; it must support ``in``
            and be sortable for ``repr``.
    """

    def __init__(self, values):
        self.values = values

    def __contains__(self, value):
        # Membership is delegated to the wrapped container.
        return value in self.values

    @property
    def case(self):
        """Case view over these values."""
        return UdCase(self)

    @property
    def gender(self):
        """Gender view over these values."""
        return UdGender(self)

    @property
    def number(self):
        """Number view over these values."""
        return UdNumber(self)

    def __repr__(self):
        # Sort first so the representation does not depend on set ordering.
        joined = ','.join(sorted(self.values))
        return 'Grams({values})'.format(values=joined)

    def _repr_pretty_(self, printer, cycle):
        # IPython pretty-printer hook: emit the same text as repr().
        rendered = repr(self)
        printer.text(rendered)

In [56]:
class UdForm(Form):
    """One morphological reading of a token: a normalized string plus grams.

    Args:
        normalized: normalized text of the token.
        grams: grammeme container for the token (e.g. ``UdGrams``).
    """

    def __init__(self, normalized, grams):
        self.grams = grams
        self.normalized = normalized

    def __repr__(self):
        template = 'Form({self.normalized!r}, {self.grams!r})'
        return template.format(self=self)

    def _repr_pretty_(self, printer, cycle):
        # IPython pretty-printer hook: emit the same text as repr().
        rendered = repr(self)
        printer.text(rendered)

In [37]:
import re

In [45]:
# Check whether the first token's text starts with newline characters
# (the same pattern get_token_type uses to detect "EOL").
re.match(r'[\n\r]+', tags[0].text) is not None

False

In [55]:
# Part-of-speech tag of the first token (recorded output: 'PRON').
tags[0].pos_

'PRON'

In [54]:
# Collect only the feature values of the first token (feature names are
# dropped) — the same kind of data get_morph passes to UdGrams.
frozenset(tags[0].morph.to_dict().values())

frozenset({'Inan', 'Neut', 'Nom', 'Sing'})

In [58]:
class UdMorphTokenizer():
    """Tokenizer that converts tokens from a spaCy-style pipeline to yargy tokens.

    Args:
        model: callable taking a text string and returning an iterable of
            tokens that expose ``text``, ``idx``, ``pos_``, ``norm_``,
            ``morph``, ``is_ascii``, ``is_digit`` and ``is_punct`` (for
            example the pipeline loaded with ``spacy_udpipe``).
    """

    def __init__(self, model):
        self.model = model

    def get_token_type(self, token):
        """Classify a token as "EOL", "INT", "PUNCT", "RU", "LATIN" or "OTHER".

        The specific checks run before the generic ASCII check. Newlines,
        ASCII digits and ASCII punctuation are all ASCII themselves, so
        testing ``is_ascii`` first would label every one of them "LATIN"
        and make the other branches unreachable for such input.
        """
        text = token.text
        if re.match(r'[\n\r]+', text) is not None:
            return "EOL"
        if token.is_digit:
            return "INT"
        if token.is_punct:
            return "PUNCT"
        if re.match(r'[а-яёЁА-Я]+', text) is not None:
            return "RU"
        if token.is_ascii:
            return "LATIN"
        return "OTHER"

    def tokenize(self, doc):
        """Yield ``(source_token, yargy_token)`` pairs for every token in ``doc``.

        The yargy token carries the source text, its character span in the
        original text, and the type chosen by ``get_token_type``.
        """
        for token in doc:
            span = Span(token.idx, token.idx + len(token.text))
            token_type = self.get_token_type(token)
            yield token, Token(token.text, span, token_type)

    def get_morph(self, token):
        """Build a ``UdForm`` from a source token's tag and morphology.

        The grammeme set is the token's morphological feature values plus
        its part-of-speech tag.
        """
        pos = token.pos_
        # NOTE(review): norm_ is the token's normalized text; if the
        # dictionary form is wanted here, lemma_ may be the right
        # attribute — confirm before changing.
        norm = token.norm_
        features = set(token.morph.to_dict().values())
        features.add(pos)
        grams = UdGrams(features)
        return UdForm(norm, grams)

    def __call__(self, text):
        """Tokenize ``text`` and yield yargy tokens.

        Tokens of type "RU" are yielded with a single morphological form
        attached; all other tokens are yielded unchanged.
        """
        tokens = self.tokenize(self.model(text))
        for spacy_token, yargy_token in tokens:
            if yargy_token.type == "RU":
                form = self.get_morph(spacy_token)
                # Previously referenced an undefined name `token` here,
                # which raised NameError (or used a stale notebook global).
                yield yargy_token.morphed([form])
            else:
                yield yargy_token