In [1]:
import gzip

from dataclasses import dataclass 
from typing import Iterator, Optional

import pandas as pd

from yargy import Parser, rule, and_, or_
from yargy.interpretation import fact
from yargy.pipelines import morph_pipeline
from yargy.predicates import gram, is_capitalized, gte, lte, dictionary
from yargy.relations import gnc_relation

In [2]:
@dataclass
class Text:
    """One record of the corpus file: three tab-separated string columns."""
    label: str  # first column of the line
    title: str  # second column of the line
    text: str   # third column of the line (everything after the second tab)


def read_texts(fn: str) -> Iterator[Text]:
    """Lazily yield a ``Text`` for every non-blank line of a gzipped TSV file.

    Args:
        fn: Path to a gzip-compressed, UTF-8 encoded file with one record per
            line in the form ``label<TAB>title<TAB>text``.

    Yields:
        ``Text`` instances in file order.

    Raises:
        TypeError: If a non-blank line has fewer than three tab-separated
            fields (``Text`` requires all three).
    """
    with gzip.open(fn, 'rt', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line (e.g. a trailing newline at the end of the file)
                # carries no record; skip it instead of failing on unpacking.
                continue
            # maxsplit=2: tab characters inside the body stay part of `text`
            # rather than producing extra positional arguments for Text().
            yield Text(*line.split('\t', 2))

In [3]:
# Single agreement relation shared by every name part, so that the tokens of
# one name have to agree with each other (yargy's gnc_relation).
gnc = gnc_relation()

# Top-level fact produced by the parser.
Person_id = fact('Person_id', ['name', 'birth_date', 'birth_place'])

# Structured personal name; each attribute is filled by one of the rules below.
Name = fact('Name', ['surname', 'name', 'patronym'])


def _name_part(grammeme, attribute):
    """Build a rule for one capitalized token tagged with `grammeme`.

    The matched token is stored in `attribute` in inflected form and is bound
    to the shared `gnc` relation.
    """
    token = and_(gram(grammeme), is_capitalized())
    return rule(token).interpretation(attribute.inflected()).match(gnc)


SURNAME = _name_part('Surn', Name.surname)
FIRST_NAME = _name_part('Name', Name.name)
PATRONYM = _name_part('Patr', Name.patronym)

# Accepted orderings of name parts, from the most complete to a bare surname.
NAME = or_(
    rule(SURNAME, FIRST_NAME, PATRONYM),
    rule(FIRST_NAME, PATRONYM, SURNAME),
    rule(FIRST_NAME, SURNAME),
    rule(SURNAME, FIRST_NAME),
    rule(FIRST_NAME, PATRONYM),
    rule(SURNAME),
).interpretation(Name)

# Phrases that introduce birth details after a person's name.
# Each phrase must be its own list item: without the separating comma Python
# joins adjacent string literals into one, which silently merged the last two
# phrases into a single string that can never occur in text.
BIRTH_VERB = morph_pipeline([
    'родился',
    'был рождён',
    'уроженец',
])

# Calendar date; any of the three attributes may stay unset, depending on
# which DATE alternative matched.
Date = fact('Date', ['year', 'month', 'day'])

# Month word -> month number, used both as the matching dictionary and as the
# lookup table that converts the matched word to an int.
# NOTE(review): 'мая' is the only key that is not in nominative form — confirm
# it matches the normal form the morphological analyzer returns for May.
MONTHS = {
    'январь': 1,
    'февраль': 2,
    'март': 3,
    'апрель': 4,
    'мая': 5,
    'июнь': 6,
    'июль': 7,
    'август': 8,
    'сентябрь': 9,
    'октябрь': 10,
    'ноябрь': 11,
    'декабрь': 12
}

# Numeric components, each limited to its range and stored as int.
DAY = and_(
    gte(1),
    lte(31)
).interpretation(
    Date.day.custom(int)
)

MONTH = and_(
    gte(1),
    lte(12)
).interpretation(
    Date.month.custom(int)
)

YEAR = and_(
    gte(1000),
    lte(2024)
).interpretation(
    Date.year.custom(int)
)

# Month given as a word: normalize it, then map it to its number via MONTHS.
MONTH_NAME = dictionary(MONTHS).interpretation(
    Date.month.normalized().custom(MONTHS.__getitem__)
)

# Optional word "год" (in any morphological form) that may follow the year.
YEAR_WORDS = morph_pipeline(['год'])

DATE = or_(
    # day.month.year with literal dots
    rule(DAY, '.', MONTH, '.', YEAR),
    # day, month word, year
    rule(DAY, MONTH_NAME, YEAR, YEAR_WORDS.optional()),
    # month word, year
    rule(MONTH_NAME, YEAR, YEAR_WORDS.optional()),
    # year only
    rule(YEAR, YEAR_WORDS.optional()),
).interpretation(Date)

# Place of birth: an optional settlement type plus the place name itself.
Place = fact('Place', ['type', 'name'])

# Settlement-type words, matched in any morphological form.
PLACE_TYPE = morph_pipeline([
    'город',
    'посёлок',
    'село',
    'деревня',
    'слобода',
    'станица',
    'хутор',
    'разъезд',
    'станция'
])

# One or more capitalized nouns/adjectives make up the place name.
_PLACE_NAME = and_(
    or_(gram('NOUN'), gram('ADJF')),
    is_capitalized()
).repeatable(min=1)

PLACE = rule(
    PLACE_TYPE.optional().interpretation(Place.type.inflected()),
    _PLACE_NAME.interpretation(Place.name.inflected())
).interpretation(Place)

# Prepositions that may precede a date or a place.
PREPOSITION = morph_pipeline(['в', 'na'.replace('na', 'на')])

# The four shapes the birth details can take after the birth verb.
# Each alternative builds its own DATE/PLACE interpretation, as before.
_DATE_THEN_PLACE = rule(
    PREPOSITION.optional(),
    DATE.interpretation(Person_id.birth_date),
    PREPOSITION,
    PLACE.interpretation(Person_id.birth_place)
)

_PLACE_THEN_DATE = rule(
    PREPOSITION,
    PLACE.interpretation(Person_id.birth_place),
    PREPOSITION.optional(),
    DATE.interpretation(Person_id.birth_date)
)

_DATE_ONLY = rule(
    PREPOSITION.optional(),
    DATE.interpretation(Person_id.birth_date)
)

_PLACE_ONLY = rule(
    PREPOSITION,
    PLACE.interpretation(Person_id.birth_place)
)

# Birth verb followed by one of the alternatives above (tried in this order).
_BIRTH_DETAILS = rule(
    BIRTH_VERB,
    or_(
        _DATE_THEN_PLACE,
        _PLACE_THEN_DATE,
        _DATE_ONLY,
        _PLACE_ONLY
    )
)

# A name is mandatory; the birth details are optional, so a bare name also
# yields a Person_id fact with only `name` set.
PERSON_ID_RULE = rule(
    NAME.interpretation(Person_id.name),
    _BIRTH_DETAILS.optional()
).interpretation(Person_id)

parser = Parser(PERSON_ID_RULE)

In [4]:
@dataclass
class Entry:
    """Flat, printable view of one parsed person fact.

    The dataclass decorator supplies ``__repr__``/``__eq__`` and the field
    list; the hand-written ``__init__`` below is kept and converts a parsed
    fact into plain strings.
    """
    name: str                   # surname, first name, patronym; title-cased
    birth_date: Optional[str]   # "DD.MM.YYYY", "MM.YYYY" or "YYYY"; None if absent
    birth_place: Optional[str]  # "<type> <Name>" or "<Name>"; None if absent

    def __init__(self, person_id):
        """Build the entry from a parsed fact.

        Args:
            person_id: Object with attributes ``name`` (having ``surname``,
                ``name``, ``patronym``), ``birth_date`` (having ``day``,
                ``month``, ``year``) and ``birth_place`` (having ``type``,
                ``name``). ``birth_date`` and ``birth_place`` may be falsy;
                individual name/date parts and the place type may be None.
        """
        person_name = person_id.name
        name_parts = [person_name.surname, person_name.name, person_name.patronym]
        self.name = ' '.join(filter(None, name_parts)).title()

        date = person_id.birth_date
        if date:
            # Day and month are both zero-padded to two digits so that every
            # full date has the same DD.MM.YYYY shape (previously only the
            # month was padded, giving e.g. "5.03.1989"). Missing components
            # are dropped, which yields "MM.YYYY" or "YYYY".
            date_parts = [
                str(date.day).zfill(2) if date.day else None,
                str(date.month).zfill(2) if date.month else None,
                str(date.year) if date.year else None,
            ]
            self.birth_date = '.'.join(filter(None, date_parts))
        else:
            self.birth_date = None

        place = person_id.birth_place
        if place:
            # The type stays as parsed; only the place name is title-cased.
            place_parts = [place.type, place.name.title()]
            self.birth_place = ' '.join(filter(None, place_parts))
        else:
            self.birth_place = None

In [5]:
# Hand-written sample sentences used to sanity-check the grammar.
texts = [
    'Липатов Карп Иванович родился в 1885 году в деревне Покровка',
    'Андрей Курносенко родился в Севастополе',
    'Юлия Курносенкова родилась в июле 2000 года'
]

# Lazily turn every match in every sample into an Entry, in order, and print
# each one as soon as it is produced.
sample_entries = (
    Entry(found.fact)
    for sample in texts
    for found in parser.findall(sample)
)
for entry in sample_entries:
    print(entry)

Entry(name='Липатов Карп Иванович', birth_date='1885', birth_place='деревня Покровка')
Entry(name='Курносенко Андрей', birth_date=None, birth_place='Севастополь')
Entry(name='Курносенкова Юлия', birth_date='07.2000', birth_place=None)


In [6]:
# Run the parser over the whole corpus and collect one Entry per match.
entries = []
# (text index, exception) for every text that could not be processed, so that
# failures stay visible instead of being silently discarded.
parse_failures = []

for index, news in enumerate(read_texts('data/news.txt.gz')):
    try:
        for match in parser.findall(news.text):
            entries.append(Entry(match.fact))
    except Exception as error:
        # Best effort: one problematic text must not abort the whole run.
        # Only Exception is caught, so KeyboardInterrupt still stops the cell.
        parse_failures.append((index, error))

if parse_failures:
    print(f'{len(parse_failures)} texts raised an error and were skipped '
          f'(first: {parse_failures[0][1]!r})')

result = pd.DataFrame(entries)

In [7]:
# Total number of extracted entries (rows of the frame) ...
print(result.shape[0])
# ... followed by the first 15 rows in which every column is filled in.
print(result.dropna().iloc[:15])

32062
                      name  birth_date        birth_place
5881   Яковлев Яковлевюрий        1928             Москва
7003               Смирнов  18.12.1937  город Калининград
8031    Караченцов Николай  27.10.1944             Москва
12081        Доценко Игорь        1953        Хмельницкая
14776      Лиснянская Инна        1928               Баку
16885            Гамильтон        1922             Лондон
20889              Рахлина        1938          Ленинград
20933    Высоцкий Владимир        1938             Москва
21156                Монро        1969          Ленинград
22226               Ельчин  11.03.1989          Ленинград
27837    Гришковец Евгений        1967           Кемерово
28563  Серебрякова Зинаида  28.06.1913       Царское Село
29324          Кваша Игорь        1933             Москва
29579     Жеребцова Полина        1985            Грозный
31600      Ремизов Алексей        1877             Москва
