In [1]:
from IPython.display import display, HTML
# Inject a CSS rule so the notebook's `.container` element spans 95% of the page width.
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np
import string
import re
from num2words import num2words

from pandarallel import pandarallel
# Start pandarallel with a progress bar and 16 worker processes; the
# `parallel_apply` calls further down rely on this initialisation.
pandarallel.initialize(progress_bar=True, nb_workers=16)

from functools import lru_cache

import pymorphy3
import random

# Shared morphological analyzer; `simulate_mistakes` uses it to look up lemmas.
morph = pymorphy3.MorphAnalyzer()
# The standard library's string of ASCII punctuation characters.
punctuation = string.punctuation

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
# Load the three dataset splits (train / test / validation) from local parquet files.
df_train = pd.read_parquet('wikiann-train.parquet')
df_test = pd.read_parquet('wikiann-test.parquet')
df_val = pd.read_parquet('wikiann-validation.parquet')

In [4]:
# Integer ids of the BIO tags: 0 is "outside"; each entity type (PER, ORG, LOC)
# has an odd B- (begin) id immediately followed by its even I- (inside) id.
markup = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}

In [5]:
# Preview the training split as loaded from parquet.
df_train

Unnamed: 0,tokens,ner_tags,langs,spans
0,"[Илизаров, ,, Гавриил, Абрамович]","[1, 2, 2, 2]","[ru, ru, ru, ru]","[PER: Илизаров , Гавриил Абрамович]"
1,"[', '', Рыбницкий, район, '', ']","[0, 0, 5, 6, 0, 0]","[ru, ru, ru, ru, ru, ru]",[LOC: Рыбницкий район]
2,"[За, образцовое, выполнение, заданий, командов...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, ...","[ru, ru, ru, ru, ru, ru, ru, ru, ru, ru, ru, r...",[LOC: Демблин]
3,"[Стадион, имени, С., Дарюса, и, С., Гиренаса]","[5, 6, 6, 6, 6, 6, 6]","[ru, ru, ru, ru, ru, ru, ru]",[LOC: Стадион имени С. Дарюса и С. Гиренаса]
4,"[Майкл, Томас, (, 1987—1991, )]","[1, 2, 0, 0, 0]","[ru, ru, ru, ru, ru]",[PER: Майкл Томас]
...,...,...,...,...
19995,"[перенаправление, Каманьяб, (, деревня, )]","[0, 5, 6, 6, 6]","[ru, ru, ru, ru, ru]",[LOC: Каманьяб ( деревня )]
19996,"[Центральное, телевидение, Гостелерадио, СССР]","[3, 4, 4, 4]","[ru, ru, ru, ru]",[ORG: Центральное телевидение Гостелерадио СССР]
19997,"[перенаправление, Ясвина, ,, Лариса, Владимиро...","[0, 1, 2, 2, 2]","[ru, ru, ru, ru, ru]","[PER: Ясвина , Лариса Владимировна]"
19998,"[Согласно, переписи, населения, 1897, года, в,...","[0, 3, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[ru, ru, ru, ru, ru, ru, ru, ru, ru, ru, ru, r...",[ORG: переписи населения 1897 года]


In [6]:
# Stack train, test and validation (in that order) into one frame with a fresh 0..N-1 index.
df_main = pd.concat([df_train, df_test, df_val], ignore_index=True)

In [7]:
# Preview the combined frame.
df_main

Unnamed: 0,tokens,ner_tags,langs,spans
0,"[Илизаров, ,, Гавриил, Абрамович]","[1, 2, 2, 2]","[ru, ru, ru, ru]","[PER: Илизаров , Гавриил Абрамович]"
1,"[', '', Рыбницкий, район, '', ']","[0, 0, 5, 6, 0, 0]","[ru, ru, ru, ru, ru, ru]",[LOC: Рыбницкий район]
2,"[За, образцовое, выполнение, заданий, командов...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, ...","[ru, ru, ru, ru, ru, ru, ru, ru, ru, ru, ru, r...",[LOC: Демблин]
3,"[Стадион, имени, С., Дарюса, и, С., Гиренаса]","[5, 6, 6, 6, 6, 6, 6]","[ru, ru, ru, ru, ru, ru, ru]",[LOC: Стадион имени С. Дарюса и С. Гиренаса]
4,"[Майкл, Томас, (, 1987—1991, )]","[1, 2, 0, 0, 0]","[ru, ru, ru, ru, ru]",[PER: Майкл Томас]
...,...,...,...,...
39995,"[**, Клаус, Рот, (, Великобритания, ), .]","[0, 1, 2, 0, 5, 0, 0]","[ru, ru, ru, ru, ru, ru, ru]","[PER: Клаус Рот, LOC: Великобритания]"
39996,"[Оутон, (, ), ,, Ширнесс, .]","[0, 0, 0, 0, 5, 0]","[ru, ru, ru, ru, ru, ru]",[LOC: Ширнесс]
39997,"[Карл, II, (, герцог, Орлеанский, )]","[1, 2, 2, 2, 2, 2]","[ru, ru, ru, ru, ru, ru]",[PER: Карл II ( герцог Орлеанский )]
39998,"[Крайне, высокая, скорость, IOPS, (, операций,...","[0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0]","[ru, ru, ru, ru, ru, ru, ru, ru, ru, ru, ru]",[ORG: IOPS]


In [8]:
def length_check(tokens, ner_tags):
    """Report whether a token list and its tag list have the same length.

    Returns:
        'ok' when ``len(tokens) == len(ner_tags)``, otherwise 'WRONG'.
    """
    if len(tokens) != len(ner_tags):
        return 'WRONG'
    return 'ok'

In [9]:
# Each cell read from parquet is converted to a plain Python list via `.tolist()`.
df_main['tokens'] = df_main['tokens'].apply(lambda x: x.tolist())
df_main['ner_tags'] = df_main['ner_tags'].apply(lambda x: x.tolist())
df_main['len_check'] = df_main.apply(lambda row: length_check(row['tokens'], row['ner_tags']), axis=1)
# Give the augmented columns their OWN list objects. Plain column assignment
# would make both columns point at the same lists, so any in-place edit of an
# augmented row would silently change the original row as well.
df_main['augmented_tokens'] = df_main['tokens'].apply(lambda x: list(x))
df_main['augmented_ner_tags'] = df_main['ner_tags'].apply(lambda x: list(x))
# `errors='ignore'` keeps this line from raising if the columns are already gone.
df_main.drop(columns=['langs', 'spans'], inplace=True, errors='ignore')

In [10]:
# Preview the frame with the length check and the augmented columns added.
df_main

Unnamed: 0,tokens,ner_tags,len_check,augmented_tokens,augmented_ner_tags
0,"[Илизаров, ,, Гавриил, Абрамович]","[1, 2, 2, 2]",ok,"[Илизаров, ,, Гавриил, Абрамович]","[1, 2, 2, 2]"
1,"[', '', Рыбницкий, район, '', ']","[0, 0, 5, 6, 0, 0]",ok,"[', '', Рыбницкий, район, '', ']","[0, 0, 5, 6, 0, 0]"
2,"[За, образцовое, выполнение, заданий, командов...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, ...",ok,"[За, образцовое, выполнение, заданий, командов...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, ..."
3,"[Стадион, имени, С., Дарюса, и, С., Гиренаса]","[5, 6, 6, 6, 6, 6, 6]",ok,"[Стадион, имени, С., Дарюса, и, С., Гиренаса]","[5, 6, 6, 6, 6, 6, 6]"
4,"[Майкл, Томас, (, 1987—1991, )]","[1, 2, 0, 0, 0]",ok,"[Майкл, Томас, (, 1987—1991, )]","[1, 2, 0, 0, 0]"
...,...,...,...,...,...
39995,"[**, Клаус, Рот, (, Великобритания, ), .]","[0, 1, 2, 0, 5, 0, 0]",ok,"[**, Клаус, Рот, (, Великобритания, ), .]","[0, 1, 2, 0, 5, 0, 0]"
39996,"[Оутон, (, ), ,, Ширнесс, .]","[0, 0, 0, 0, 5, 0]",ok,"[Оутон, (, ), ,, Ширнесс, .]","[0, 0, 0, 0, 5, 0]"
39997,"[Карл, II, (, герцог, Орлеанский, )]","[1, 2, 2, 2, 2, 2]",ok,"[Карл, II, (, герцог, Орлеанский, )]","[1, 2, 2, 2, 2, 2]"
39998,"[Крайне, высокая, скорость, IOPS, (, операций,...","[0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0]",ok,"[Крайне, высокая, скорость, IOPS, (, операций,...","[0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0]"


In [11]:
# Count rows per check result; any 'WRONG' means tokens and tags differ in length.
df_main['len_check'].value_counts()

ok    40000
Name: len_check, dtype: int64

In [12]:
def simulate_mistakes(tokens):
    """Return a copy of ``tokens`` with one or two words replaced by their lemma.

    This imitates the kind of mistake a speech-transcription system makes.
    A token is a candidate for replacement only when all of these hold:

    - it is at least 7 characters long (short words are assumed to be
      recognised more reliably);
    - it does not start with an uppercase letter (to avoid changing proper
      names);
    - its lemma is different from the token, but by no more than 5
      characters. The difference is the number of mismatching positions over
      the common length plus the difference in length.

    Number of replacements:

    - sentences of up to 10 tokens: one randomly chosen candidate;
    - longer sentences: two randomly chosen candidates, but only when more
      than three candidates exist; otherwise nothing is replaced.
      NOTE(review): the "more than three" threshold is kept from the original
      code; confirm it is intended.

    Args:
        tokens: list of token strings.

    Returns:
        A new list of tokens. The input list is never modified, so rows that
        share list objects with another column are safe.
    """
    min_word_length = 7
    max_difference = 5

    # Maps the position of every suitable token to its lemma, so the chosen
    # tokens do not have to be parsed a second time.
    candidates = {}
    for position, word in enumerate(tokens):
        if len(word) < min_word_length or word[0].isupper():
            continue
        lemma = morph.parse(word)[0].normal_form
        if lemma == word:
            continue
        difference = abs(len(word) - len(lemma))
        difference += sum(1 for a, b in zip(word, lemma) if a != b)
        if difference <= max_difference:
            candidates[position] = lemma

    augmented = list(tokens)
    if not candidates:
        return augmented

    if len(tokens) <= 10:
        how_many = 1
    elif len(candidates) > 3:
        how_many = 2
    else:
        return augmented

    # Candidate positions are sampled in ascending order of insertion.
    for position in random.sample(list(candidates), how_many):
        augmented[position] = candidates[position]
    return augmented

In [13]:
# Run the lemma-substitution augmentation on every row's token list, in parallel.
df_main['augmented_tokens'] = df_main['augmented_tokens'].parallel_apply(lambda x: simulate_mistakes(x))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2500), Label(value='0 / 2500'))), …

In [14]:
def split_hyphen(tokens, ner_tags):
    """Split tokens on hyphens and em dashes, keeping the tag list aligned.

    Every token containing ``-`` or ``—`` is replaced by its non-empty parts.
    The first part keeps the token's tag. Later parts receive the matching
    "inside" tag: with the tag ids used in this notebook (odd id = B-<type>,
    the following even id = I-<type>, 0 = O) that is ``tag + 1`` for an odd
    tag and the unchanged tag otherwise, so a split entity remains a single
    entity instead of becoming several.

    A token that consists only of dashes (or is empty) is kept untouched.

    Args:
        tokens: list of token strings.
        ner_tags: list of integer tag ids, one per token.

    Returns:
        ``(new_tokens, new_tags)`` as two new lists of equal length; the
        input lists are not modified.
    """
    new_tokens, new_tags = [], []
    for token, tag in zip(tokens, ner_tags):
        # Split on both separators at once; splitting twice in sequence and
        # keeping only the second result would lose the hyphen split.
        parts = [part for part in re.split('[-—]', token) if part]
        if not parts:
            parts = [token]
        inside_tag = tag + 1 if tag % 2 == 1 else tag
        new_tokens.extend(parts)
        new_tags.extend(tag if k == 0 else inside_tag for k in range(len(parts)))
    return new_tokens, new_tags

In [15]:
# Continue from the augmented columns. Reading the untouched `tokens` /
# `ner_tags` columns here would throw away the result of the previous
# augmentation step.
df_main[['augmented_tokens', 'augmented_ner_tags']] = df_main.parallel_apply(lambda row: pd.Series(split_hyphen(row['augmented_tokens'], row['augmented_ner_tags'])), axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2500), Label(value='0 / 2500'))), …

In [16]:
# Re-validate that every augmented row still has one tag per token.
df_main['augmented_length_check'] = [
    length_check(row_tokens, row_tags)
    for row_tokens, row_tags in zip(df_main['augmented_tokens'], df_main['augmented_ner_tags'])
]
df_main['augmented_length_check'].value_counts()

ok    40000
Name: augmented_length_check, dtype: int64

In [17]:
# Preview the frame after the dash-splitting step.
df_main

Unnamed: 0,tokens,ner_tags,len_check,augmented_tokens,augmented_ner_tags,augmented_length_check
0,"[Илизаров, ,, Гавриил, Абрамович]","[1, 2, 2, 2]",ok,"[Илизаров, ,, Гавриил, Абрамович]","[1, 2, 2, 2]",ok
1,"[', '', Рыбницкий, район, '', ']","[0, 0, 5, 6, 0, 0]",ok,"[', '', Рыбницкий, район, '', ']","[0, 0, 5, 6, 0, 0]",ok
2,"[За, образцовое, выполнение, заданий, командов...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, ...",ok,"[За, образцовое, выполнение, заданий, командов...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, ...",ok
3,"[Стадион, имени, С., Дарюса, и, С., Гиренаса]","[5, 6, 6, 6, 6, 6, 6]",ok,"[Стадион, имени, С., Дарюса, и, С., Гиренаса]","[5, 6, 6, 6, 6, 6, 6]",ok
4,"[Майкл, Томас, (, 1987—1991, )]","[1, 2, 0, 0, 0]",ok,"[Майкл, Томас, (, 1987, 1991, )]","[1, 2, 0, 0, 0, 0]",ok
...,...,...,...,...,...,...
39995,"[**, Клаус, Рот, (, Великобритания, ), .]","[0, 1, 2, 0, 5, 0, 0]",ok,"[**, Клаус, Рот, (, Великобритания, ), .]","[0, 1, 2, 0, 5, 0, 0]",ok
39996,"[Оутон, (, ), ,, Ширнесс, .]","[0, 0, 0, 0, 5, 0]",ok,"[Оутон, (, ), ,, Ширнесс, .]","[0, 0, 0, 0, 5, 0]",ok
39997,"[Карл, II, (, герцог, Орлеанский, )]","[1, 2, 2, 2, 2, 2]",ok,"[Карл, II, (, герцог, Орлеанский, )]","[1, 2, 2, 2, 2, 2]",ok
39998,"[Крайне, высокая, скорость, IOPS, (, операций,...","[0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0]",ok,"[Крайне, высокая, скорость, IOPS, (, операций,...","[0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0]",ok


In [18]:
def cubes_squares_punct_etc(tokens, ner_tags):
    """Strip punctuation and spell out abbreviations, fractions and power signs.

    For every token:

    - all characters that are neither word characters nor whitespace are
      removed (a pure punctuation token becomes ``''``);
    - ``½`` / ``⅓`` / ``тыс`` / ``млн`` / ``млрд`` / ``км`` / ``м`` are
      replaced by their spoken form;
    - ``²`` / ``³`` become ``квадратный`` / ``кубический``. When the token
      right before is ``километр`` or ``метр`` the two tokens (and their
      tags) swap places, giving the spoken order "квадратный километр".

    The number of tokens never changes. The replacements for ``½`` and ``⅓``
    are single tokens that contain a space.

    Args:
        tokens: list of token strings.
        ner_tags: list of tag ids, one per token.

    Returns:
        ``(new_tokens, new_tags)`` as new lists; the inputs are not modified.
    """
    spoken_forms = {
        '½': 'с половиной',
        '⅓': 'одна треть',
        'тыс': 'тысяч',
        'млн': 'миллионов',
        'млрд': 'миллиардов',
        'км': 'километр',
        'м': 'метр',
    }
    power_words = {'²': 'квадратный', '³': 'кубический'}
    length_units = ('километр', 'метр')

    new_tokens = list(tokens)
    new_tags = list(ner_tags)

    for i in range(len(new_tokens)):
        cleaned = re.sub(r'[^\w\s]+', '', new_tokens[i])
        if cleaned in power_words:
            new_tokens[i] = power_words[cleaned]
            # `i > 0` matters: index -1 would silently address the LAST token.
            if i > 0 and new_tokens[i - 1] in length_units:
                new_tokens[i - 1], new_tokens[i] = new_tokens[i], new_tokens[i - 1]
                new_tags[i - 1], new_tags[i] = new_tags[i], new_tags[i - 1]
        else:
            new_tokens[i] = spoken_forms.get(cleaned, cleaned)

    return new_tokens, new_tags

In [19]:
# Strip punctuation and spell out abbreviations / power signs in the augmented columns, in parallel.
df_main[['augmented_tokens', 'augmented_ner_tags']] = df_main.parallel_apply(lambda row: pd.Series(cubes_squares_punct_etc(row['augmented_tokens'], row['augmented_ner_tags'])), axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2500), Label(value='0 / 2500'))), …

In [20]:
# Re-validate token/tag alignment after the punctuation and abbreviation step.
df_main['augmented_length_check'] = [
    length_check(row_tokens, row_tags)
    for row_tokens, row_tags in zip(df_main['augmented_tokens'], df_main['augmented_ner_tags'])
]
df_main['augmented_length_check'].value_counts()

ok    40000
Name: augmented_length_check, dtype: int64

In [21]:
# Preview the frame after punctuation stripping; removed punctuation shows up as empty tokens.
df_main

Unnamed: 0,tokens,ner_tags,len_check,augmented_tokens,augmented_ner_tags,augmented_length_check
0,"[Илизаров, ,, Гавриил, Абрамович]","[1, 2, 2, 2]",ok,"[Илизаров, , Гавриил, Абрамович]","[1, 2, 2, 2]",ok
1,"[', '', Рыбницкий, район, '', ']","[0, 0, 5, 6, 0, 0]",ok,"[, , Рыбницкий, район, , ]","[0, 0, 5, 6, 0, 0]",ok
2,"[За, образцовое, выполнение, заданий, командов...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, ...",ok,"[За, образцовое, выполнение, заданий, командов...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, ...",ok
3,"[Стадион, имени, С., Дарюса, и, С., Гиренаса]","[5, 6, 6, 6, 6, 6, 6]",ok,"[Стадион, имени, С, Дарюса, и, С, Гиренаса]","[5, 6, 6, 6, 6, 6, 6]",ok
4,"[Майкл, Томас, (, 1987—1991, )]","[1, 2, 0, 0, 0]",ok,"[Майкл, Томас, , 1987, 1991, ]","[1, 2, 0, 0, 0, 0]",ok
...,...,...,...,...,...,...
39995,"[**, Клаус, Рот, (, Великобритания, ), .]","[0, 1, 2, 0, 5, 0, 0]",ok,"[, Клаус, Рот, , Великобритания, , ]","[0, 1, 2, 0, 5, 0, 0]",ok
39996,"[Оутон, (, ), ,, Ширнесс, .]","[0, 0, 0, 0, 5, 0]",ok,"[Оутон, , , , Ширнесс, ]","[0, 0, 0, 0, 5, 0]",ok
39997,"[Карл, II, (, герцог, Орлеанский, )]","[1, 2, 2, 2, 2, 2]",ok,"[Карл, II, , герцог, Орлеанский, ]","[1, 2, 2, 2, 2, 2]",ok
39998,"[Крайне, высокая, скорость, IOPS, (, операций,...","[0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0]",ok,"[Крайне, высокая, скорость, IOPS, , операций, ...","[0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0]",ok


In [22]:
# Look at the augmented tokens of the row labelled 0.
df_main.loc[0, 'augmented_tokens']

['Илизаров', '', 'Гавриил', 'Абрамович']

In [23]:
def filter_non_words(tokens, ner_tags):
    """Drop empty tokens and standalone 'г' tokens, keeping tags aligned.

    Each 'г' token is removed together with its own tag, so the tags of the
    following tokens no longer shift onto the wrong token.

    Of the remaining pairs:

    - a token is kept when it is not the empty string;
    - a tag is kept when its token is alphanumeric (``str.isalnum``).

    NOTE(review): a non-empty token that is not purely alphanumeric (for
    example one containing ``_`` or a space) therefore keeps its token but
    loses its tag, and the two returned lists differ in length. This is left
    exactly as before because the notebook's length check flags such rows and
    they are then dropped; consider filtering token and tag jointly once
    nothing downstream depends on which rows get dropped.

    Args:
        tokens: list of token strings.
        ner_tags: list of tag ids, one per token.

    Returns:
        ``(filtered_tokens, filtered_tags)`` as two new lists.
    """
    # Pair first, filter second: this is what keeps token and tag together.
    pairs = [(token, tag) for token, tag in zip(tokens, ner_tags) if token != 'г']
    filtered_tokens = [token for token, _ in pairs if token != '']
    filtered_tags = [tag for token, tag in pairs if token.isalnum()]
    return filtered_tokens, filtered_tags

In [24]:
# Remove empty and 'г' tokens from the augmented columns, in parallel.
df_main[['augmented_tokens', 'augmented_ner_tags']] = df_main.parallel_apply(lambda row: pd.Series(filter_non_words(row['augmented_tokens'], row['augmented_ner_tags'])), axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2500), Label(value='0 / 2500'))), …

In [25]:
# Preview the frame after filtering; `augmented_length_check` has not been recomputed yet at this point.
df_main

Unnamed: 0,tokens,ner_tags,len_check,augmented_tokens,augmented_ner_tags,augmented_length_check
0,"[Илизаров, ,, Гавриил, Абрамович]","[1, 2, 2, 2]",ok,"[Илизаров, Гавриил, Абрамович]","[1, 2, 2]",ok
1,"[', '', Рыбницкий, район, '', ']","[0, 0, 5, 6, 0, 0]",ok,"[Рыбницкий, район]","[5, 6]",ok
2,"[За, образцовое, выполнение, заданий, командов...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, ...",ok,"[За, образцовое, выполнение, заданий, командов...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, ...",ok
3,"[Стадион, имени, С., Дарюса, и, С., Гиренаса]","[5, 6, 6, 6, 6, 6, 6]",ok,"[Стадион, имени, С, Дарюса, и, С, Гиренаса]","[5, 6, 6, 6, 6, 6, 6]",ok
4,"[Майкл, Томас, (, 1987—1991, )]","[1, 2, 0, 0, 0]",ok,"[Майкл, Томас, 1987, 1991]","[1, 2, 0, 0]",ok
...,...,...,...,...,...,...
39995,"[**, Клаус, Рот, (, Великобритания, ), .]","[0, 1, 2, 0, 5, 0, 0]",ok,"[Клаус, Рот, Великобритания]","[1, 2, 5]",ok
39996,"[Оутон, (, ), ,, Ширнесс, .]","[0, 0, 0, 0, 5, 0]",ok,"[Оутон, Ширнесс]","[0, 5]",ok
39997,"[Карл, II, (, герцог, Орлеанский, )]","[1, 2, 2, 2, 2, 2]",ok,"[Карл, II, герцог, Орлеанский]","[1, 2, 2, 2]",ok
39998,"[Крайне, высокая, скорость, IOPS, (, операций,...","[0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0]",ok,"[Крайне, высокая, скорость, IOPS, операций, вв...","[0, 0, 0, 3, 0, 0, 0, 0]",ok


In [26]:
# Re-validate token/tag alignment after filtering.
df_main['augmented_length_check'] = list(
    map(length_check, df_main['augmented_tokens'], df_main['augmented_ner_tags'])
)
df_main['augmented_length_check'].value_counts()

ok       39998
WRONG        2
Name: augmented_length_check, dtype: int64

In [27]:
# Show the rows whose token and tag lists no longer have the same length.
df_main.loc[df_main['augmented_length_check'].eq('WRONG')]

Unnamed: 0,tokens,ner_tags,len_check,augmented_tokens,augmented_ner_tags,augmented_length_check
9484,"[По, результатам, по, крайней, мере, одного, и...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",ok,"[По, результатам, по, крайней, мере, одного, и...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",WRONG
11647,"[__NOTOC__, —, деревянная, дощечка, ,, служивш...","[0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0]",ok,"[__NOTOC__, деревянная, дощечка, служившая, в,...","[0, 0, 0, 0, 5, 0, 0, 0, 0]",WRONG


In [28]:
# Discard the misaligned rows, then renumber the remaining rows from 0.
df_main.drop(index=df_main.index[df_main['augmented_length_check'] == 'WRONG'], inplace=True)
df_main.reset_index(inplace=True, drop=True)

In [29]:
def look_for_trash(trash):
    """Print every row of ``df_main`` whose augmented tokens contain ``trash``.

    For each matching row the row label is printed first, then the row's
    full token list. ``trash`` must equal a whole token; substrings of a
    token do not match.

    Rows are visited through the Series itself, so this also works when the
    index is not a gap-free 0..N-1 range.

    Args:
        trash: the exact token to search for.
    """
    for row_label, row_tokens in df_main['augmented_tokens'].items():
        if trash in row_tokens:
            print(row_label)
            print(row_tokens)

In [30]:
# Locate rows containing the token '6½'.
trash_1 = '6½'
look_for_trash(trash_1)

22932
['Вильфрид', 'Паульсен', '6½']


In [31]:
# Remove the token (and its tag) wherever it occurs. Searching by value keeps
# this correct even if row labels or token positions change, unlike deleting
# at a hard-coded row and position.
for row_tokens, row_tags in zip(df_main['augmented_tokens'], df_main['augmented_ner_tags']):
    while trash_1 in row_tokens:
        position = row_tokens.index(trash_1)
        del row_tokens[position]
        del row_tags[position]

In [32]:
# Locate rows containing the token '¹'.
trash_2 = '¹'
look_for_trash(trash_2)

18021
['Владимир', 'Петтай', 'Петрозаводск', '¹']


In [33]:
# Remove the token (and its tag) wherever it occurs. Searching by value keeps
# this correct even if row labels or token positions change, unlike deleting
# at a hard-coded row and position.
for row_tokens, row_tags in zip(df_main['augmented_tokens'], df_main['augmented_ner_tags']):
    while trash_2 in row_tokens:
        position = row_tokens.index(trash_2)
        del row_tokens[position]
        del row_tags[position]

In [34]:
# Locate rows containing the token '9½'.
trash_3 = '9½'
look_for_trash(trash_3)

24425
['Затем', 'занимался', 'земледелием', '9½', 'десятин', 'надельной', 'земли']


In [35]:
# Remove the token (and its tag) wherever it occurs. Searching by value keeps
# this correct even if row labels or token positions change, unlike deleting
# at a hard-coded row and position.
for row_tokens, row_tags in zip(df_main['augmented_tokens'], df_main['augmented_ner_tags']):
    while trash_3 in row_tokens:
        position = row_tokens.index(trash_3)
        del row_tokens[position]
        del row_tags[position]

In [36]:
def replace_numerical_with_words(tokens, ner_tags):
    """Spell out integer tokens as Russian words, keeping tags aligned.

    A token made up only of decimal digits is replaced by the words that
    ``num2words(..., lang='ru')`` produces for it, one word per token. The
    first word keeps the token's tag. Later words receive the matching
    "inside" tag: with the tag ids used in this notebook (odd id = B-<type>,
    the following even id = I-<type>, 0 = O) that is ``tag + 1`` for an odd
    tag and the unchanged tag otherwise, so a number inside an entity does
    not start a new entity with every word.

    ``str.isdecimal`` is used on purpose: ``str.isnumeric`` is also true for
    characters such as ``½`` or ``¹``, which ``int()`` cannot parse. Tokens
    like ``'6½'`` are therefore left unchanged instead of raising.

    Args:
        tokens: list of token strings.
        ner_tags: list of integer tag ids, one per token.

    Returns:
        ``(new_tokens, new_tags)`` as two new lists of equal length; the
        input lists are not modified.
    """
    new_tokens, new_tags = [], []
    for token, tag in zip(tokens, ner_tags):
        if not token.isdecimal():
            new_tokens.append(token)
            new_tags.append(tag)
            continue

        words = num2words(int(token), lang='ru').split()
        inside_tag = tag + 1 if tag % 2 == 1 else tag
        new_tokens.extend(words)
        new_tags.extend(tag if k == 0 else inside_tag for k in range(len(words)))

    return new_tokens, new_tags

In [37]:
# Spell out numeric tokens as Russian words in the augmented columns, in parallel.
df_main[['augmented_tokens', 'augmented_ner_tags']] = df_main.parallel_apply(lambda row: pd.Series(replace_numerical_with_words(row['augmented_tokens'], row['augmented_ner_tags'])), axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2500), Label(value='0 / 2500'))), …

In [38]:
# Preview the frame after numbers were spelled out.
df_main

Unnamed: 0,tokens,ner_tags,len_check,augmented_tokens,augmented_ner_tags,augmented_length_check
0,"[Илизаров, ,, Гавриил, Абрамович]","[1, 2, 2, 2]",ok,"[Илизаров, Гавриил, Абрамович]","[1, 2, 2]",ok
1,"[', '', Рыбницкий, район, '', ']","[0, 0, 5, 6, 0, 0]",ok,"[Рыбницкий, район]","[5, 6]",ok
2,"[За, образцовое, выполнение, заданий, командов...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, ...",ok,"[За, образцовое, выполнение, заданий, командов...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, ...",ok
3,"[Стадион, имени, С., Дарюса, и, С., Гиренаса]","[5, 6, 6, 6, 6, 6, 6]",ok,"[Стадион, имени, С, Дарюса, и, С, Гиренаса]","[5, 6, 6, 6, 6, 6, 6]",ok
4,"[Майкл, Томас, (, 1987—1991, )]","[1, 2, 0, 0, 0]",ok,"[Майкл, Томас, одна, тысяча, девятьсот, восемь...","[1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",ok
...,...,...,...,...,...,...
39993,"[**, Клаус, Рот, (, Великобритания, ), .]","[0, 1, 2, 0, 5, 0, 0]",ok,"[Клаус, Рот, Великобритания]","[1, 2, 5]",ok
39994,"[Оутон, (, ), ,, Ширнесс, .]","[0, 0, 0, 0, 5, 0]",ok,"[Оутон, Ширнесс]","[0, 5]",ok
39995,"[Карл, II, (, герцог, Орлеанский, )]","[1, 2, 2, 2, 2, 2]",ok,"[Карл, II, герцог, Орлеанский]","[1, 2, 2, 2]",ok
39996,"[Крайне, высокая, скорость, IOPS, (, операций,...","[0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0]",ok,"[Крайне, высокая, скорость, IOPS, операций, вв...","[0, 0, 0, 3, 0, 0, 0, 0]",ok


In [39]:
def lowercasing(tokens):
    """Return a new list holding the lowercase form of every token."""
    lowered = []
    for token in tokens:
        lowered.append(token.lower())
    return lowered

In [40]:
# Lowercase every augmented token, in parallel.
df_main['augmented_tokens'] = df_main['augmented_tokens'].parallel_apply(lambda x: lowercasing(x))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2500), Label(value='0 / 2500'))), …

In [41]:
# Preview the frame after lowercasing.
df_main

Unnamed: 0,tokens,ner_tags,len_check,augmented_tokens,augmented_ner_tags,augmented_length_check
0,"[Илизаров, ,, Гавриил, Абрамович]","[1, 2, 2, 2]",ok,"[илизаров, гавриил, абрамович]","[1, 2, 2]",ok
1,"[', '', Рыбницкий, район, '', ']","[0, 0, 5, 6, 0, 0]",ok,"[рыбницкий, район]","[5, 6]",ok
2,"[За, образцовое, выполнение, заданий, командов...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, ...",ok,"[за, образцовое, выполнение, заданий, командов...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, ...",ok
3,"[Стадион, имени, С., Дарюса, и, С., Гиренаса]","[5, 6, 6, 6, 6, 6, 6]",ok,"[стадион, имени, с, дарюса, и, с, гиренаса]","[5, 6, 6, 6, 6, 6, 6]",ok
4,"[Майкл, Томас, (, 1987—1991, )]","[1, 2, 0, 0, 0]",ok,"[майкл, томас, одна, тысяча, девятьсот, восемь...","[1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",ok
...,...,...,...,...,...,...
39993,"[**, Клаус, Рот, (, Великобритания, ), .]","[0, 1, 2, 0, 5, 0, 0]",ok,"[клаус, рот, великобритания]","[1, 2, 5]",ok
39994,"[Оутон, (, ), ,, Ширнесс, .]","[0, 0, 0, 0, 5, 0]",ok,"[оутон, ширнесс]","[0, 5]",ok
39995,"[Карл, II, (, герцог, Орлеанский, )]","[1, 2, 2, 2, 2, 2]",ok,"[карл, ii, герцог, орлеанский]","[1, 2, 2, 2]",ok
39996,"[Крайне, высокая, скорость, IOPS, (, операций,...","[0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0]",ok,"[крайне, высокая, скорость, iops, операций, вв...","[0, 0, 0, 3, 0, 0, 0, 0]",ok


In [42]:
# Recompute the check before reporting it: the stored column was last filled
# before numbers were spelled out and tokens were lowercased, so its counts
# say nothing about the final token/tag alignment.
df_main['augmented_length_check'] = df_main.apply(lambda row: length_check(row['augmented_tokens'], row['augmented_ner_tags']), axis=1)
df_main['augmented_length_check'].value_counts()

ok    39998
Name: augmented_length_check, dtype: int64

In [46]:
# Write the frame to CSV without the index column. The list-valued columns are
# stored as their text representation, so they come back as strings when read.
df_main.to_csv('Wikiann_augmented.csv', index=False)

In [44]:
# Read back the file this notebook writes ('Wikiann_augmented.csv'). The
# previous name pointed at a file that no cell here produces, so a fresh
# run would fail or load stale data.
# NOTE(review): the list columns are read as plain strings; parse them before
# using them as lists.
df = pd.read_csv('Wikiann_augmented.csv')

In [45]:
# Preview the frame as read back from CSV.
df

Unnamed: 0,tokens,ner_tags,len_check,augmented_tokens,augmented_ner_tags,augmented_length_check
0,"['Илизаров', ',', 'Гавриил', 'Абрамович']","[1, 2, 2, 2]",ok,"['илизаров', 'гавриил', 'абрамович']","[1, 2, 2]",ok
1,"[""'"", ""''"", 'Рыбницкий', 'район', ""''"", ""'""]","[0, 0, 5, 6, 0, 0]",ok,"['рыбницкий', 'район']","[5, 6]",ok
2,"['За', 'образцовое', 'выполнение', 'заданий', ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, ...",ok,"['за', 'образцовое', 'выполнение', 'заданий', ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, ...",ok
3,"['Стадион', 'имени', 'С.', 'Дарюса', 'и', 'С.'...","[5, 6, 6, 6, 6, 6, 6]",ok,"['стадион', 'имени', 'с', 'дарюса', 'и', 'с', ...","[5, 6, 6, 6, 6, 6, 6]",ok
4,"['Майкл', 'Томас', '(', '1987—1991', ')']","[1, 2, 0, 0, 0]",ok,"['майкл', 'томас', 'одна', 'тысяча', 'девятьсо...","[1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",ok
...,...,...,...,...,...,...
39993,"['**', 'Клаус', 'Рот', '(', 'Великобритания', ...","[0, 1, 2, 0, 5, 0, 0]",ok,"['клаус', 'рот', 'великобритания']","[1, 2, 5]",ok
39994,"['Оутон', '(', ')', ',', 'Ширнесс', '.']","[0, 0, 0, 0, 5, 0]",ok,"['оутон', 'ширнесс']","[0, 5]",ok
39995,"['Карл', 'II', '(', 'герцог', 'Орлеанский', ')']","[1, 2, 2, 2, 2, 2]",ok,"['карл', 'ii', 'герцог', 'орлеанский']","[1, 2, 2, 2]",ok
39996,"['Крайне', 'высокая', 'скорость', 'IOPS', '(',...","[0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0]",ok,"['крайне', 'высокая', 'скорость', 'iops', 'опе...","[0, 0, 0, 3, 0, 0, 0, 0]",ok
