In [12]:
import gzip
import shutil
from dataclasses import dataclass
from typing import Optional

from yargy import Parser, rule, and_, not_, or_
from yargy.predicates import gram, dictionary, gte, lte
from yargy.interpretation import fact
from yargy.relations import gnc_relation
from yargy.pipelines import morph_pipeline


In [13]:
# Input archive and the plain-text file it is unpacked into (both relative
# to the notebook's working directory).
gz_path, extracted_path = 'news.txt.gz', 'news.txt'

In [14]:
# Decompress the archive to disk. The data is streamed in chunks through
# shutil.copyfileobj rather than read into memory with a single read() call,
# so the size of the corpus does not drive peak memory use.
with gzip.open(gz_path, 'rb') as f_in, open(extracted_path, 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

In [15]:
# Collect the text part of every record in the extracted file.
# Each line is split on its first tab: the left part is bound to `category`
# (not used further in this cell), the right part is kept in `content`.
content = []

with open(extracted_path, 'r', encoding='utf-8') as file:
    for line in file:
        category, tab, title_content = line.strip().partition('\t')
        if not tab:
            # Blank or tab-less line: there is nothing to unpack, and the
            # two-way split would raise ValueError, so skip the line.
            continue
        content.append(title_content)

In [16]:
# Fact schema: one attribute per fragment of a birth mention that the
# grammar rules attach an interpretation to.
Person = fact(
    'Person',
    [
        'first_name',
        'last_name',
        'birth_verb',
        'prep_birth',
        'birth_date',
        'birth_place',
    ],
)

# One relation object, shared by every rule that calls .match(gnc).
# Presumably gender/number/case agreement, going by the name - confirm
# against the yargy documentation.
gnc = gnc_relation()

def _name_part(grammeme, attribute):
    """Build the predicate for one part of a person's name.

    The token must carry `grammeme` and must not carry 'Abbr'. Its value is
    stored in `attribute`, and the token is tied to the shared `gnc` relation.
    """
    not_abbreviation = not_(gram('Abbr'))
    predicate = and_(gram(grammeme), not_abbreviation)
    return predicate.interpretation(attribute).match(gnc)


# NOTE(review): values are stored without normalisation, e.g. the printed
# results include 'Сергея Безрукова'. Normalising names would change the
# output and is left as a follow-up.
LAST = _name_part('Surn', Person.last_name)
FIRST = _name_part('Name', Person.first_name)

In [17]:
# Month names as dictionary forms (lemmas). Every entry must be a lemma for
# the lookup to hit inflected tokens such as "октября"; the previous 'мая'
# entry was an inflected form and is replaced by the lemma 'май'.
# NOTE(review): relies on `dictionary` comparing a token's normal forms -
# confirm against the yargy predicate reference.
MONTHS = dictionary({
    'январь', 'февраль', 'март', 'апрель', 'май', 'июнь',
    'июль', 'август', 'сентябрь', 'октябрь', 'ноябрь', 'декабрь'
})

# Optional "year" word that may follow the number ("1928 году").
DATE_WORD = morph_pipeline([
    'году', 'года'
])

# Accepted range for the year number. The lower bound stops a bare day
# number (e.g. the "27" of "27 октября") from being captured as a year when
# the optional day and month parts are skipped.
MIN_BIRTH_YEAR = 1000
MAX_BIRTH_YEAR = 2024

# [day] [month] year [year-word]; only the year is mandatory.
DATE = rule(
    and_(gte(1), lte(31)).optional(),
    MONTHS.optional(),
    and_(gte(MIN_BIRTH_YEAR), lte(MAX_BIRTH_YEAR)),
    DATE_WORD.optional()
).interpretation(
    Person.birth_date
)

# Verb that anchors a birth mention; stored in Person.birth_verb.
_birth_verbs = morph_pipeline(['родился', 'родилась', 'родились'])
BIRTH_VERB = _birth_verbs.interpretation(Person.birth_verb)

# Preposition that may precede the date or the place; stored in
# Person.prep_birth.
_birth_preposition = morph_pipeline(['в'])
PREP_BIRTH = _birth_preposition.interpretation(Person.prep_birth)

# Optional word that may precede the place name ("графстве Суррей").
CITY_WORD = morph_pipeline(['городе', 'области', 'графстве'])

# [place word] followed by a run of at most six 'Geox' tokens.
BIRTH_PLACE = rule(
    CITY_WORD.optional(),
    gram('Geox').repeatable(max=6),
).interpretation(Person.birth_place)

In [18]:
# Both details present, date first: [в] DATE [в] PLACE
DATE_PLACE = rule(PREP_BIRTH.optional(), DATE, PREP_BIRTH.optional(), BIRTH_PLACE)

# Both details present, place first: [в] PLACE [в] DATE
PLACE_DATE = rule(PREP_BIRTH.optional(), BIRTH_PLACE, PREP_BIRTH.optional(), DATE)

DATE_AND_PLACE = or_(DATE_PLACE, PLACE_DATE)

# Only one detail present: [в] DATE or [в] PLACE
DATE_OR_PLACE = or_(
    rule(PREP_BIRTH.optional(), DATE),
    rule(PREP_BIRTH.optional(), BIRTH_PLACE),
)

# Everything that may follow the birth verb; the two-detail forms are listed
# before the single-detail ones.
BIRTH = or_(DATE_AND_PLACE, DATE_OR_PLACE)

In [19]:
# Top-level rule: first name, last name, birth verb, then optional details.
# NOTE(review): this rebinds `Person` from the fact class to the rule, so
# re-running this cell without first re-running the cell that defines the
# fact passes a rule to .interpretation(). The name is kept because the
# parser cell builds Parser(Person).
Person = rule(FIRST, LAST, BIRTH_VERB, BIRTH.optional()).interpretation(Person)

In [20]:
@dataclass
class Entry:
    """Flat record of one extracted birth mention.

    `name` is required. `birth_date` and `birth_place` default to None so an
    entry can be built from the name alone when the text gave no details.
    """

    # Full name as a single string.
    name: str
    # Date fragment as matched in the text, or None when absent.
    birth_date: Optional[str] = None
    # Place fragment as matched in the text, or None when absent.
    birth_place: Optional[str] = None

In [21]:
def _entry_from(person):
    """Convert one parsed fact into an Entry, joining first and last name."""
    return Entry(
        name=f"{person.first_name} {person.last_name}",
        birth_date=person.birth_date,
        birth_place=person.birth_place,
    )


parser = Parser(Person)
entries = []

# Run the grammar over every loaded text and keep one Entry per match.
for item in content:
    for match in parser.findall(item):
        person = match.fact
        entry = _entry_from(person)
        entries.append(entry)

# Print every extracted record, one per line.
for entry in entries:
    print(entry)

Entry(name='Андрей Курносенко', birth_date=None, birth_place='Севастополе')
Entry(name='Иосиф Кобзон', birth_date=None, birth_place=None)
Entry(name='Сергея Безрукова', birth_date=None, birth_place=None)
Entry(name='Анны Матисон', birth_date=None, birth_place=None)
Entry(name='ЯковлевЮрий Яковлев', birth_date='1928 году', birth_place='Москве')
Entry(name='Николай Караченцов', birth_date='27 октября 1944 года', birth_place='Москве')
Entry(name='Сергей Довлатов', birth_date='1941 году', birth_place=None)
Entry(name='Аня Титова', birth_date='1984 году', birth_place=None)
Entry(name='Игорь Доценко', birth_date='1953 году', birth_place=None)
Entry(name='Андрей Сердюков', birth_date='1962 году', birth_place=None)
Entry(name='Анатолий Сморгонский', birth_date='1979 году', birth_place=None)
Entry(name='Инна Лиснянская', birth_date='1928 году', birth_place='Баку')
Entry(name='Колин Дэвис', birth_date='1927 году', birth_place='графстве Суррей')
Entry(name='Эрик Хобсбаум', birth_date='1917 году',