In [1]:
import os
import sys
import random
import pickle

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from tqdm import tqdm
from pandarallel import pandarallel

import pymorphy2
import nltk
from nltk.tokenize import word_tokenize, wordpunct_tokenize

from sklearn.model_selection import train_test_split

# Single seed for the run: seeds the stdlib `random` module here and is reused
# further down as `random_state` for the train/test split.
SEED = 1
random.seed(SEED)

# Show up to 255 characters per column when frames are displayed.
pd.set_option('display.max_colwidth', 255)
# Hook tqdm into pandas.
tqdm.pandas()
# Enable `parallel_apply` on pandas objects with 14 workers and progress bars.
# NOTE(review): `random.seed` above only seeds this process, while the random
# draws in `choice_abbr` / `replace_word_by_abbr` run inside `parallel_apply`
# workers -- confirm whether the generated data is reproducible across runs.
pandarallel.initialize(progress_bar=True, nb_workers=14, use_memory_fs=False)

INFO: Pandarallel will run on 14 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
# Abbreviation dictionary; later cells read its `desc_norm`, `abbr_norm`,
# `abbr_id` and `abbr_count` columns.
abbr = pd.read_csv("../data/abbr.csv")
# News corpus; later cells read its `text` column.
lenta = pd.read_csv("../data/lenta.csv")

In [3]:
class AbbrInfo:
    """Record describing one abbreviation stored at a leaf of the description trie.

    Attributes:
        abbr_id: identifier of the abbreviation.
        abbr: the abbreviation text as passed by the caller.
        abbr_count: count associated with the abbreviation (used as a sampling
            weight by ``choice_abbr``).
    """

    def __init__(self, abbr_id, abbr, abbr_count):
        self.abbr_id, self.abbr, self.abbr_count = abbr_id, abbr, abbr_count

# Reserved dictionary key under which a trie node stores the list of AbbrInfo
# records for the description that ends at that node.
ABBR_LIST_KEY = "<ABBR_LIST_KEY>"
        
def create_abbr_tree(abbr, abbr_list_key = ABBR_LIST_KEY):
    """Build a word-level trie of normalized descriptions.

    Each row's ``desc_norm`` is split on single spaces and inserted word by
    word as nested dictionaries. The node reached after the last word collects,
    under ``abbr_list_key``, one ``AbbrInfo`` per row that shares the
    description.

    Args:
        abbr: DataFrame with ``desc_norm``, ``abbr_norm``, ``abbr_id`` and
            ``abbr_count`` columns.
        abbr_list_key: dictionary key used to store the list of ``AbbrInfo``
            records on a node.

    Returns:
        The root dictionary of the trie.
    """
    root = {}
    columns = ["desc_norm", "abbr_norm", "abbr_id", "abbr_count"]

    for row in abbr[columns].values:
        norm_desc, norm_abbr, abbr_id, abbr_count = row

        # Walk down the trie, creating missing nodes on the way.
        node = root
        for word in norm_desc.split(" "):
            node = node.setdefault(word, {})

        leaf_entries = node.setdefault(abbr_list_key, [])
        leaf_entries.append(AbbrInfo(abbr_id, norm_abbr, abbr_count))

    return root

# Trie over the normalized descriptions in `abbr`; used by `get_text_labels`.
abbr_tree = create_abbr_tree(abbr)

In [4]:
# Per-token tag prefixes. Tagged tokens are written as "<PREFIX>-<abbr_id>";
# tokens outside any matched description carry the bare OUTSIDE_LABEL.
OUTSIDE_LABEL = "_"
# First token of a multi-token span.
BEGIN_LABEL = "B"
# Last token of a multi-token span.
END_LABEL = "E"
# Tokens strictly between the first and the last token of a span.
INSIDE_LABEL = "I"
# Span made of a single token.
ONE_WORD_LABEL = "W"

def choice_abbr(abbr_list: list, 
                weighted_choice: bool = True, 
                add_to_zeros: float = 0):
    """Pick one entry of ``abbr_list`` at random.

    Args:
        abbr_list: candidates; each must expose an ``abbr_count`` attribute.
        weighted_choice: when truthy, candidates are drawn proportionally to
            their ``abbr_count``; otherwise (including ``None``) the draw is
            uniform.
        add_to_zeros: weight used in place of a zero ``abbr_count``. ``None``
            is treated as ``0``.

    Returns:
        The selected element of ``abbr_list``.

    Raises:
        IndexError: if ``abbr_list`` is empty (raised by ``random.choices``).
    """
    weights = None

    if weighted_choice:
        zero_weight = 0 if add_to_zeros is None else add_to_zeros
        weights = [
            zero_weight if abbr_info.abbr_count == 0 else abbr_info.abbr_count
            for abbr_info in abbr_list
        ]
        # random.choices rejects weights whose total is not positive. When no
        # candidate carries any weight, fall back to a uniform draw instead of
        # raising.
        if not sum(weights) > 0:
            weights = None

    return random.choices(abbr_list, weights=weights, k=1)[0]

def get_text_labels(text, 
                    abbr_tree, 
                    weighted_choice: bool = None, 
                    add_to_zeros: float = None):
    """Tag every token of ``text`` that belongs to a known description.

    The text is split on single spaces and scanned left to right. At each
    position the longest description stored in ``abbr_tree`` that starts there
    is matched; one of its abbreviations is chosen with ``choice_abbr`` and the
    matched tokens are tagged ``B-<id>``, ``I-<id>`` ..., ``E-<id>`` (or
    ``W-<id>`` for a single-token match). All other tokens get
    ``OUTSIDE_LABEL``. Scanning resumes right after a match, so matches never
    overlap.

    Args:
        text: space-separated tokens.
        abbr_tree: trie of nested dicts whose nodes may hold a list of
            abbreviation records under ``ABBR_LIST_KEY``.
        weighted_choice: forwarded to ``choice_abbr`` unchanged. The default
            ``None`` is falsy there, i.e. it selects a uniform draw.
        add_to_zeros: forwarded to ``choice_abbr`` unchanged.

    Returns:
        The tags joined with single spaces, one tag per token of ``text``.
    """
    tokens = text.split(" ")
    labels = [OUTSIDE_LABEL] * len(tokens)

    word_i = 0
    while word_i < len(tokens):
        node = abbr_tree
        # Last position/node on this walk where a complete description ended.
        # Remembering it lets a walk that continues into a longer, ultimately
        # failing branch still report the shorter valid match.
        match_end = None
        match_node = None

        curr_i = word_i
        while curr_i < len(tokens):
            token = tokens[curr_i]
            # Never descend into the reserved key: its value is a list of
            # abbreviation records, not a trie node.
            if token == ABBR_LIST_KEY or token not in node:
                break
            node = node[token]
            curr_i += 1
            if ABBR_LIST_KEY in node:
                match_end = curr_i
                match_node = node

        if match_node is None:
            word_i += 1
            continue

        abbr_id = choice_abbr(match_node[ABBR_LIST_KEY], weighted_choice, add_to_zeros).abbr_id

        last_i = match_end - 1
        if last_i == word_i:
            labels[word_i] = f"{ONE_WORD_LABEL}-{abbr_id}"
        else:
            labels[word_i] = f"{BEGIN_LABEL}-{abbr_id}"
            for j in range(word_i + 1, last_i):
                labels[j] = f"{INSIDE_LABEL}-{abbr_id}"
            labels[last_i] = f"{END_LABEL}-{abbr_id}"

        # Continue right after the matched span.
        word_i = match_end

    return " ".join(labels)

def replace_word_by_abbr(text, labels, abbr, p_replace: float = 0.2):
    """Randomly swap tagged descriptions in ``text`` for their abbreviations.

    ``text`` and ``labels`` are space-separated and aligned token by token.
    For every span that starts with a ``W-<id>`` or ``B-<id>`` tag a coin with
    probability ``p_replace`` decides whether the span is replaced by the
    ``abbr_norm`` of the row of ``abbr`` whose ``abbr_id`` equals ``<id>``.
    Replaced spans are emitted with W/B/I/E tags for the abbreviation's own
    tokens; spans left untouched are copied with ``OUTSIDE_LABEL``.

    Notes:
        * A span extends over all consecutive non-outside tags with the same
          id, so adjacent spans sharing an id are handled as a single span.
        * A token tagged I/E that does not follow a span start is skipped and
          does not appear in the output.

    Args:
        text: space-separated tokens.
        labels: space-separated tags, one per token of ``text``.
        abbr: DataFrame with ``abbr_id`` and ``abbr_norm`` columns.
        p_replace: probability of replacing a span.

    Returns:
        ``pd.Series`` with ``new_text`` and ``new_labels`` (space-joined).
    """
    words = text.split(" ")
    tags = labels.split(" ")
    total = len(words)

    out_words, out_tags = [], []

    pos = 0
    while pos < total:
        tag = tags[pos]

        if tag == OUTSIDE_LABEL:
            out_words.append(words[pos])
            out_tags.append(OUTSIDE_LABEL)

        # Anything that does not open a span is stepped over one token at a time.
        if tag[0] not in [ONE_WORD_LABEL, BEGIN_LABEL]:
            pos += 1
            continue

        span_id = int(tag[2:])
        use_abbr = random.choices([False, True], weights=[(1 - p_replace), p_replace])[0]

        if use_abbr:
            short_form = abbr[abbr.abbr_id == span_id].abbr_norm.iloc[0].split(" ")

            if len(short_form) == 1:
                prefixes = [ONE_WORD_LABEL]
            else:
                middle = [INSIDE_LABEL] * (len(short_form) - 2)
                prefixes = [BEGIN_LABEL] + middle + [END_LABEL]

            for prefix, piece in zip(prefixes, short_form):
                out_words.append(piece)
                out_tags.append(f"{prefix}-{span_id}")

        # Consume the whole span; copy it through when it was not replaced.
        while pos < total and tags[pos] != OUTSIDE_LABEL and int(tags[pos][2:]) == span_id:
            if not use_abbr:
                out_words.append(words[pos])
                out_tags.append(OUTSIDE_LABEL)
            pos += 1

    return pd.Series({"new_text": " ".join(out_words), "new_labels": " ".join(out_tags)})

In [5]:
# Russian morphological analyzer restricted to the dictionary analyzer unit.
# NOTE(review): presumably this makes out-of-dictionary words return no parses,
# which `norm_tokenize` handles by keeping the token unchanged -- confirm.
morph = pymorphy2.MorphAnalyzer(lang="ru", 
                                units=[pymorphy2.units.DictionaryAnalyzer()])

def norm_tokenize(line):
    """Tokenize ``line`` with ``word_tokenize`` and lemmatize each token.

    Every token is replaced by the ``normal_form`` of its first parse from the
    module-level ``morph`` analyzer; tokens without any parse are kept as is.

    Args:
        line: text to tokenize.

    Returns:
        List with one normalized token per token produced by ``word_tokenize``.
    """
    def to_normal_form(token):
        parses = morph.parse(str(token))
        return parses[0].normal_form if parses != [] else token

    return [to_normal_form(token) for token in word_tokenize(line)]

In [6]:
# Two space-joined views of every article: the raw tokens and their normalized
# forms. Both come from `word_tokenize` on the same text, and
# `get_upd_text_and_labels` later indexes `text_tokenized` with labels computed
# on `text_norm`, so the two columns are expected to align token by token.
lenta["text_tokenized"] = lenta["text"].parallel_apply(lambda x: " ".join(word_tokenize(x)))
lenta["text_norm"] = lenta["text"].parallel_apply(lambda x: " ".join(norm_tokenize(x)))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=36019), Label(value='0 / 36019')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=36019), Label(value='0 / 36019')))…

In [7]:
# Tag the normalized text with the descriptions found in the trie.
# `weighted_choice` / `add_to_zeros` are left at their `None` defaults, which
# `choice_abbr` treats as "not weighted", i.e. a uniform draw among candidates.
lenta["labels"] = lenta["text_norm"].parallel_apply(lambda x: get_text_labels(x, abbr_tree))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=36019), Label(value='0 / 36019')))…

In [8]:
# Replace each tagged description by its abbreviation with probability 0.3;
# the result is stored as the `text_new` / `labels_new` columns.
lenta[["text_new", "labels_new"]] = (
    lenta[["text_norm", "labels"]]
        .parallel_apply(lambda x: replace_word_by_abbr(x["text_norm"], x["labels"], 
                                                       abbr, p_replace=0.3), axis=1)
)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=36019), Label(value='0 / 36019')))…

In [9]:
# Checkpoint the frame with all intermediate columns.
lenta.to_csv("../data/lenta_t5_v1.csv", index=False, header=True)
# To resume from the checkpoint instead of recomputing the cells above:
# lenta = pd.read_csv("../data/lenta_t5_v1.csv", index_col=False)
# lenta

In [11]:
def get_upd_text_and_labels(line):
    """Merge the original tokens with the abbreviated text into a training pair.

    Walks the original tokens (``text_tokenized`` / ``labels``) and the text
    after replacement (``text_new`` / ``labels_new``) in parallel:

    * where no replacement happened, the original (non-normalized) token is
      emitted with the label ``"_"``;
    * where a description was replaced, the abbreviation tokens from
      ``text_new`` are emitted and each of them is labelled with the original
      description, its tokens joined by ``"="``.

    The whole abbreviation span in ``text_new`` is consumed, not just its
    first token, so multi-word abbreviations no longer lose tokens or shift
    the alignment of the rest of the row.

    NOTE(review): for a multi-word abbreviation the expansion label is repeated
    on every abbreviation token so that ``text_upd`` and ``labels_upd`` keep
    the same number of tokens -- confirm this is what the consumer expects.

    Args:
        line: row exposing ``text_tokenized``, ``labels``, ``text_new`` and
            ``labels_new`` as space-separated strings.

    Returns:
        ``pd.Series`` with ``text_upd`` and ``labels_upd`` (space-joined).
    """
    text_tokenized = line.text_tokenized.split(" ")
    labels = line.labels.split(" ")
    text_new = line.text_new.split(" ")
    labels_new = line.labels_new.split(" ")

    def in_span(tag, abbr_id):
        # True for a non-outside tag that carries the given abbreviation id.
        return tag != "_" and tag.split("-")[1] == abbr_id

    i, j = 0, 0
    text_upd = []
    labels_upd = []
    while i < len(text_tokenized) and j < len(text_new):
        label = labels[i]
        label_new = labels_new[j]

        if label_new == "_" or label == "_":
            text_upd.append(text_tokenized[i])
            labels_upd.append("_")
            i += 1
            j += 1
            continue

        abbr_id = label_new.split("-")[1]

        # All tokens of the abbreviation that replaced the description.
        abbr_tokens = []
        while j < len(text_new) and in_span(labels_new[j], abbr_id):
            abbr_tokens.append(text_new[j])
            j += 1

        # All original tokens of the description that was replaced.
        desc_list = []
        while i < len(text_tokenized) and in_span(labels[i], abbr_id):
            desc_list.append(text_tokenized[i])
            i += 1

        description = "=".join(desc_list)
        text_upd.extend(abbr_tokens)
        labels_upd.extend([description] * len(abbr_tokens))

    return pd.Series({"text_upd": " ".join(text_upd), "labels_upd": " ".join(labels_upd)})

In [12]:
# Build the final pair per article: text containing abbreviations and, per
# token, either "_" or the original description it stands for.
lenta[["text_upd", "labels_upd"]] = (
    lenta.parallel_apply(get_upd_text_and_labels, axis=1)
)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=36019), Label(value='0 / 36019')))…

In [13]:
# Keep only the two columns that are written to the train/test files.
lenta_filt = lenta[["text_upd", "labels_upd"]]

In [14]:
# Shuffled 80/20 split, seeded with SEED.
lenta_train, lenta_test = train_test_split(lenta_filt, test_size=0.2, shuffle=True, random_state=SEED)

In [15]:
# Write both splits as CSV with a header row and without the index column.
lenta_train.to_csv("../data/lenta_train_t5.csv", index=False, header=True)
lenta_test.to_csv("../data/lenta_test_t5.csv", index=False, header=True)