In [1]:
from IPython.display import display, HTML
# Inject a CSS rule so the notebook's `.container` element uses 95% of the page width.
display(HTML("<style>.container { width:95% !important; }</style>"))

In [47]:
import pandas as pd
from num2words import num2words
import re
from ast import literal_eval
from pandarallel import pandarallel
# Initialise pandarallel (needed for the `.parallel_apply` calls below) with a progress
# bar and a fixed pool of 16 workers.
# NOTE(review): the worker count is hard-coded; adjust it to the machine running the notebook.
pandarallel.initialize(progress_bar=True, nb_workers=16)

from sklearn.model_selection import train_test_split

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [10]:
# Absolute paths of the three augmented NER datasets that are concatenated below.
# NOTE(review): `common_voice` actually points at the Wikiann CSV, so the name is misleading.
# NOTE(review): the paths are machine-specific; the notebook will not run elsewhere unchanged.
common_voice = '/home/sergey/Python_projects/RU_NER/Project_Data/Data/Wikiann_ru/Wikiann_augmented.csv'
wikineural = '/home/sergey/Python_projects/RU_NER/Project_Data/Data/Babelscape_Wikineural/Wikineural_augmented.csv'
multinerd = '/home/sergey/Python_projects/RU_NER/Project_Data/Data/MultiNerd/MultiNERD_augmented.csv'

In [11]:
# Mapping from BIO tag name to integer id (each I-* id is the matching B-* id plus one).
# It is not referenced by any other cell visible in this notebook.
markup = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}

In [12]:
# Load each augmented dataset into its own DataFrame.
df_cv = pd.read_csv(common_voice)  # Wikiann file, despite the variable name
df_wiki = pd.read_csv(wikineural)
df_multinerd = pd.read_csv(multinerd)

In [13]:
# Stack the three datasets into one frame; ignore_index=True renumbers rows 0..n-1,
# which the hard-coded row label used further down relies on.
df = pd.concat([df_wiki, df_cv, df_multinerd], ignore_index=True)

In [14]:
# Parse both columns with ast.literal_eval, in parallel; later cells treat the results
# as Python lists (indexing, `del`, `len`).
# NOTE(review): not idempotent - presumably the cells hold string literals read from CSV,
# so re-running on already-parsed values would fail; confirm.
df['augmented_ner_tags'] = df['augmented_ner_tags'].parallel_apply(literal_eval)
df['augmented_tokens'] = df['augmented_tokens'].parallel_apply(literal_eval)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=14891), Label(value='0 / 14891')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=14891), Label(value='0 / 14891')))…

In [15]:
# Remove the four listed columns, keeping the rest of the frame.
# The drop is in place, so re-running this cell raises KeyError because the columns are gone.
df.drop(['tokens', 'ner_tags', 'len_check', 'augmented_length_check'], axis=1, inplace=True)

In [17]:
# Value of each Roman numeral symbol. Keys are lowercase only, so lookups with
# uppercase symbols raise KeyError.
roman_map = {'i': 1,'v': 5,'x': 10,'l': 50,'c': 100,'d': 500,'m': 1000}

In [35]:
def length_check(tokens, ner_tags):
    """Report whether a token list and its tag list have the same length.

    Args:
        tokens: Sized collection of tokens.
        ner_tags: Sized collection of tags for those tokens.

    Returns:
        The string 'ok' when both lengths are equal, otherwise 'WRONG'.
    """
    if len(tokens) != len(ner_tags):
        return 'WRONG'
    return 'ok'

In [19]:
def is_roman_numeral(token):
    """Return True if ``token`` is a well-formed Roman numeral made of ASCII letters.

    Matching is case-insensitive. The whole token must be a numeral: an empty
    string, a trailing newline, or any extra character makes the result False.

    Args:
        token: String to test.

    Returns:
        bool: True for strings such as 'xiv' or 'MCMXC', False otherwise.
    """
    # The look-ahead rejects the empty string, which every other part of the
    # pattern would happily match.
    roman_numeral_pattern = r"(?=[mdclxvi])m*(c[md]|d?c{0,3})(x[cl]|l?x{0,3})(i[xv]|v?i{0,3})"
    # re.ASCII is essential: with plain IGNORECASE on a str pattern, 'i' also
    # matches the dotless 'ı' (U+0131) and 'İ' (U+0130), so a token like 'lı'
    # was accepted as a numeral. fullmatch (instead of match + '$') also rejects
    # a token that ends with a newline.
    return bool(re.fullmatch(roman_numeral_pattern, token, re.IGNORECASE | re.ASCII))

In [20]:
def roman_to_arabic(tokens):
    """Replace every Roman-numeral token in ``tokens`` with its decimal string.

    The list is modified in place and also returned. Tokens for which
    ``is_roman_numeral`` is False are left untouched, and so are tokens that
    contain a character with no entry in ``roman_map``.

    Args:
        tokens: Mutable list of string tokens.

    Returns:
        The same list object, with numerals rewritten (e.g. 'xiv' -> '14').
    """
    for index, token in enumerate(tokens):
        if not is_roman_numeral(token):
            continue
        # is_roman_numeral matches case-insensitively while roman_map only has
        # lowercase keys, so normalise each symbol before the lookup.
        symbol_values = [roman_map.get(numeral.lower()) for numeral in token]
        if None in symbol_values:
            # A character without a known value: keep the token as is instead
            # of raising KeyError.
            continue
        total = 0
        prev_value = 0
        # Scan right to left: a symbol smaller than its right neighbour is
        # subtractive (the 'i' in 'iv'), otherwise it is added.
        for current_value in reversed(symbol_values):
            if current_value >= prev_value:
                total += current_value
            else:
                total -= current_value
            prev_value = current_value
        tokens[index] = str(total)
    return tokens

In [21]:
# Look at the token at position 11 of row 27729; the cell output shows it is 'lı'
# ('l' followed by a dotless 'ı').
trash = df['augmented_tokens'][27729][11]
trash

'lı'

In [22]:
# Delete that token and its tag together so both lists keep the same length.
# NOTE(review): presumably removed because the case-insensitive Roman-numeral regex
# accepts 'ı' while roman_map has no key for it - confirm.
# NOTE(review): row label and position are hard-coded and the cell is not idempotent;
# re-running it deletes whichever token is at position 11 by then.
del df['augmented_tokens'][27729][11]
del df['augmented_ner_tags'][27729][11]

In [23]:
# Convert Roman-numeral tokens to decimal strings, one token list per row, in parallel.
df['augmented_tokens'] = df['augmented_tokens'].parallel_apply(roman_to_arabic)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=14891), Label(value='0 / 14891')))…

In [28]:
def replace_numerical_with_words(tokens, ner_tags):
    """Spell out every decimal-digit token as Russian words, keeping tags aligned.

    Each token made only of decimal digits is replaced by the words returned by
    ``num2words(..., lang='ru')``; the token's tag is repeated once per inserted
    word. Both lists are modified in place and also returned.

    Args:
        tokens: Mutable list of string tokens.
        ner_tags: Mutable list of tags, positionally aligned with ``tokens``.

    Returns:
        tuple: ``(tokens, ner_tags)`` - the same two list objects after expansion.

    Raises:
        IndexError: If a digit token has no tag at the same position.
    """
    # NOTE(review): the original tag is copied verbatim to every inserted word, so a
    # "begin" tag ends up repeated across a multi-word number - confirm this is intended.
    position = 0
    while position < len(tokens):
        token = tokens[position]
        # isdecimal(), not isnumeric(): the latter is also true for characters
        # such as '²', '½' or 'Ⅷ', which int() cannot parse (ValueError).
        if token.isdecimal():
            words = num2words(int(token), lang='ru').split()
            tag_to_insert = ner_tags[position]
            # Slice assignment swaps the single token/tag for the expansion.
            tokens[position:position + 1] = words
            ner_tags[position:position + 1] = [tag_to_insert] * len(words)
            # Jump past the words just inserted.
            position += len(words)
        else:
            position += 1

    return tokens, ner_tags

In [31]:
# Expand digit tokens row by row (axis=1). The function returns a (tokens, tags) tuple;
# wrapping it in pd.Series lets the two elements be assigned back to the two columns.
df[['augmented_tokens', 'augmented_ner_tags']] = df.parallel_apply(lambda row: pd.Series(replace_numerical_with_words(row['augmented_tokens'], row['augmented_ner_tags'])), axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=14891), Label(value='0 / 14891')))…

In [36]:
# Sanity check after the expansions: label each row 'ok'/'WRONG' depending on whether
# tokens and tags have equal length, then show how many rows fall in each group.
df['augmented_length_check'] = df.apply(lambda row: length_check(row['augmented_tokens'], row['augmented_ner_tags']), axis=1)
df['augmented_length_check'].value_counts()

ok    238244
Name: augmented_length_check, dtype: int64

In [39]:
def convert_to_sentence(lst_str):
    """Join the strings in ``lst_str`` into one sentence separated by single spaces.

    Args:
        lst_str: Iterable of strings (despite the name, not a string to be parsed).

    Returns:
        str: The joined sentence; an empty iterable gives ''.
    """
    separator = ' '
    return separator.join(lst_str)

In [40]:
# Build the plain-text sentence for every row from its token list, in parallel.
df['sentence'] = df['augmented_tokens'].parallel_apply(convert_to_sentence)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=14891), Label(value='0 / 14891')))…

In [43]:
# Remove the temporary sanity-check column again (in place, so a re-run raises KeyError).
df.drop('augmented_length_check', axis=1, inplace=True)

In [51]:
# Keep only the three columns that go into the exported files, in this order.
df = df[['augmented_tokens', 'sentence', 'augmented_ner_tags']]

In [44]:
def print_tags_frequency(tags):
    """Print, for each distinct tag, the share of rows in which it occurs.

    A tag is counted once per row no matter how often it repeats inside that
    row. Despite the '%' in the printed label, the number shown is a fraction
    (rows containing the tag / total rows) with four decimals.

    Args:
        tags: Sized iterable of rows, each row an iterable of hashable, sortable tags.

    Returns:
        None. One line per tag is printed, in ascending tag order.
    """
    rows_with_tag = {}
    for row_tags in tags:
        # set() collapses repeats so each row contributes at most 1 per tag.
        for tag in set(row_tags):
            rows_with_tag[tag] = rows_with_tag.get(tag, 0) + 1
    for tag in sorted(rows_with_tag):
        share = rows_with_tag[tag] / len(tags)
        print(f'% of rows token {tag} appears in is {share:.4f}')

In [45]:
# Per-tag row share for the full dataset, as a baseline for the split below.
print_tags_frequency(df['augmented_ner_tags'])

% of rows token 0 appears in is 0.9316
% of rows token 1 appears in is 0.2917
% of rows token 2 appears in is 0.1954
% of rows token 3 appears in is 0.1708
% of rows token 4 appears in is 0.0997
% of rows token 5 appears in is 0.4122
% of rows token 6 appears in is 0.0833
% of rows token 7 appears in is 0.1870
% of rows token 8 appears in is 0.0898


In [53]:
# Labels for the split; this column also remains inside `df`.
y = df['augmented_ner_tags']

In [54]:
# 75/25 split with a fixed seed. The whole frame is passed as the features, so X_train
# and X_test still contain the 'augmented_ner_tags' column; y_train/y_test duplicate it.
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.25, random_state=12)

In [56]:
# Per-tag row share in the training part, to compare against the full dataset.
print_tags_frequency(X_train['augmented_ner_tags'])

% of rows token 0 appears in is 0.9324
% of rows token 1 appears in is 0.2912
% of rows token 2 appears in is 0.1948
% of rows token 3 appears in is 0.1704
% of rows token 4 appears in is 0.0996
% of rows token 5 appears in is 0.4125
% of rows token 6 appears in is 0.0834
% of rows token 7 appears in is 0.1874
% of rows token 8 appears in is 0.0902


In [57]:
# Per-tag row share in the test part, to compare against the full dataset.
print_tags_frequency(X_test['augmented_ner_tags'])

% of rows token 0 appears in is 0.9293
% of rows token 1 appears in is 0.2930
% of rows token 2 appears in is 0.1971
% of rows token 3 appears in is 0.1723
% of rows token 4 appears in is 0.0998
% of rows token 5 appears in is 0.4112
% of rows token 6 appears in is 0.0832
% of rows token 7 appears in is 0.1860
% of rows token 8 appears in is 0.0886


In [58]:
# Write both splits as CSV into the current working directory (the index is written too,
# since index=False is not passed).
# NOTE(review): the list-valued columns are stored as their text representation, so a
# reader has to parse them again (e.g. with ast.literal_eval).
X_train.to_csv('train_data.csv')
X_test.to_csv('test_data.csv')