In [3]:
import pandas as pd
import numpy as np
import re
import typing

In [24]:
classes = [
    "B-Country", "I-Country",
    "B-Republic", "I-Republic",
    "B-Krai", "I-Krai",
    "B-Oblast", "I-Oblast",
    "B-AO", "I-AO",
    "B-District", "I-District",
    "B-City", "I-City",
    "B-Selo", "I-Selo",
    "B-Village", "I-Village",
    "B-Poselok", "I-Poselok",
    "B-Municipality", "I-Municipality",
    "B-Street", "I-Street",
    "B-Avenue", "I-Avenue",
    "B-Boulevard", "I-Boulevard",
    "B-Lane", "I-Lane",
    "B-Highway", "I-Highway",
    "B-Embankment", "I-Embankment",
    "B-Microdistrict", "I-Microdistrict",
    "B-Quarter", "I-Quarter",
    "B-House", "I-House",
    "B-Building", "I-Building",
    "B-Structure", "I-Structure",
    "B-Flat", "I-Flat",
    "O" # Others
]

In [31]:
def calculate_class_ratio(classes : list, generator_of_sentence : typing.Generator[str]) -> dict[str, float]:
    """
    :param classes: list of classes
    :param generator_of_sentence: generator of sentences, where each sentence : "someWord someClass"
    :return:
    """
    result = dict.fromkeys(classes, 0)
    for sentence in generator_of_sentence:
        word, word_class = sentence.split()
        result[word_class] += 1
    for word_class in result:
        result[word_class] /= sum(result.values())
    return result


In [32]:
def pandas_proxy_for_class_ratio(classes : list, df : pd.DataFrame, annot_column : str = 'annotations') -> dict[str, float]:
    """
    :param classes:
    :param df:
    :param annot_column:
    :return:
    """
    df_split = df.assign(
        split_column=df[annot_column].str.split('\n')
    ).explode('split_column')['split_column']
    return calculate_class_ratio(classes, df_split)


In [33]:
df = pd.read_csv('dataset.csv')
df.head(5)

Unnamed: 0,id,text,annotations
0,1,Я живу в городе Москва.,Я O\nживу O\nв O\nгороде B-City\nМосква I-City...
1,2,Из деревни Ольгино я поехал в Санкт-Петербург.,Из O\nдеревни B-Village\nОльгино I-Village\nя ...
2,3,Моя квартира находится в квартире 101 в микрор...,Моя O\nквартира O\nнаходится O\nв O\nквартире ...
3,4,Автономный округ включает Чукотский автономный...,Автономный O\nокруг O\nвключает O\nЧукотский B...
4,5,Я гулял по проспекту Ленина в Хабаровском крае.,Я O\nгулял O\nпо O\nпроспекту B-Avenue\nЛенина...


In [34]:
print(pandas_proxy_for_class_ratio(classes, df, 'annotations'))

{'B-Country': 0.003838771593090211, 'I-Country': 0.0, 'B-Republic': 0.007707072089230439, 'I-Republic': 0.0, 'B-Krai': 0.007766816166125508, 'I-Krai': 0.007827492818578726, 'B-Oblast': 0.007889124039386908, 'I-Oblast': 0.007951732518014575, 'B-AO': 0.0060115062512913035, 'I-AO': 0.012095579449872206, 'B-District': 0.014283933513701233, 'I-District': 0.014490492385498594, 'B-City': 0.02310488180058412, 'I-City': 0.0021500153082903606, 'B-Selo': 0.010773189263510934, 'I-Selo': 0.01089025929987844, 'B-Village': 0.01100989880286718, 'I-Village': 0.011132193239736375, 'B-Poselok': 0.01125723190855884, 'I-Poselok': 0.011385108155184231, 'B-Municipality': 0.006909551763066643, 'I-Municipality': 0.016234200142824274, 'B-Street': 0.011786762129990073, 'I-Street': 0.011927011579798743, 'B-Avenue': 0.014484761714543536, 'I-Avenue': 0.014697132242859658, 'B-Boulevard': 0.004971938321467432, 'I-Boulevard': 0.004996719944958491, 'B-Lane': 0.0025108747657314272, 'I-Lane': 0.002517179217974195, 'B-Hig