In [19]:
import os
import sys
import string 

import pandas as pd
import numpy as np
import random
import itertools

from matplotlib import pyplot as plt

from tqdm import tqdm

from pandarallel import pandarallel

import pymorphy2
import nltk
import pickle

import nltk
from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from keras.preprocessing.sequence import pad_sequences

import gensim
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models.phrases import Phrases, Phraser

import time

sys.path.append("..")
from src import *

SEED = 1
def init_random_seed(value=0):
    random.seed(value)
    np.random.seed(value)
    torch.manual_seed(value)
    torch.cuda.manual_seed(value)
    torch.backends.cudnn.deterministic = True
init_random_seed(SEED)
    
pd.set_option('display.max_colwidth', 255)
tqdm.pandas()
pandarallel.initialize(progress_bar=True, nb_workers=8, use_memory_fs=False)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [7]:
abbr = pd.read_csv("../data/abbr.csv")
lenta_train = pd.read_csv("../data/lenta_train.csv")
lenta_test = pd.read_csv("../data/lenta_test.csv")

In [8]:
id2abbr = {}
abbr2id = {}
id2desc = {}
desc2id = {}
for idx, abbr_name, desc in abbr[["abbr_id", "abbr_norm", "desc_norm"]].values:
    id2abbr[idx] = abbr_name
    abbr2id[abbr_name] = idx
    id2desc[idx] = desc
    desc2id[desc] = idx

In [9]:
lenta_train["labels_new"] = (
    lenta_train["labels_new"].str.replace("W", "")
                            .str.replace("B", "")
                            .str.replace("E", "")
                            .str.replace("-", "")
)
train_texts = list(map(lambda x: x.split(" "), lenta_train["text_new"].to_list()))
train_labels = list(map(lambda x: x.split(" "), lenta_train["labels_new"].to_list()))

lenta_test["labels_new"] = (
    lenta_test["labels_new"].str.replace("W", "")
                            .str.replace("B", "")
                            .str.replace("E", "")
                            .str.replace("-", "")
)
test_texts = list(map(lambda x: x.split(" "), lenta_test["text_new"].to_list()))
test_labels = list(map(lambda x: x.split(" "), lenta_test["labels_new"].to_list()))

In [10]:
PAD_TOKEN = "<PAD>"
PAD_TOKEN_ID = 0
import nltk
from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize
PAD_LABEL = "<NOLABEL>"
PAD_LABEL_ID = 0

EMPTY_LABEL = "_"
EMPTY_LABEL_ID = 1

train_texts_global = list(itertools.chain(*train_texts))
train_labels_global = list(itertools.chain(*train_labels))
train_labels_global = list(filter(lambda x: x != EMPTY_LABEL, train_labels_global))

UNIQUE_TOKENS = [PAD_TOKEN] + list(set(train_texts_global))
UNIQUE_LABELS = [PAD_LABEL, EMPTY_LABEL] + list(set(train_labels_global))

token2id = {label: i for i, label in enumerate(UNIQUE_TOKENS)}
id2token = {i: label for label, i in token2id.items()}

label2id = {label: i for i, label in enumerate(UNIQUE_LABELS)}
id2label = {i: label for label, i in label2id.items()}

MAX_SENTENCE_LEN = lenta_train.text_new.str.split(" ").str.len().max()
train_size = len(train_texts)
test_size = len(test_texts)
TOKENS_NUM = len(UNIQUE_TOKENS)
LABELS_NUM = len(UNIQUE_LABELS)

print(MAX_SENTENCE_LEN, train_size, test_size, TOKENS_NUM, LABELS_NUM)

282 403411 100853 635711 1162


In [3]:
model_name = os.path.join("../models/w2v", "emb_64.word2vec")
model = gensim.models.Word2Vec.load(model_name)

In [5]:
def lev_dist(a, b):
    a, b = list(a), list(b)
    def recursive(i, j):
        if i == 0 or j == 0:
            return max(i, j)
        elif a[i - 1] == b[j - 1]:
            return recursive(i - 1, j - 1)
        else:
            return 1 + min(
                recursive(i, j - 1),
                recursive(i - 1, j),
                recursive(i - 1, j - 1)
            )
    return recursive(len(a), len(b))


def get_desc_find_by_dist(word, w2v_model, topn=5, dist=prefix_dist):
    desc_score_dist = []
    for desc, score in w2v_model.wv.most_similar(word, topn=topn):
        desc = desc.replace("_", " ")
        desc_score_dist.append([desc, score, dist(word, desc)])
    desc_score_dist = sorted(desc_score_dist, key=lambda x: x[2])
    return desc_score_dist[0][0]

def get_desc_first(word, w2v_model):
    first_desc = w2v_model.wv.most_similar(word, topn=1)[0][0]
    first_desc = first_desc.replace("_", " ")
    return first_desc

def get_desc_find_in_dict(word, w2v_model, desc2id):
    for desc, score in w2v_model.wv.most_similar(word, topn=10):
        desc = desc.replace("_", " ")
        if desc in desc2id.keys():
            return desc
        
    return get_desc_first(word, w2v_model)
        

def get_token2desc(tokens, w2v_model, desc2id, 
                   get_desc_f=get_desc_find_by_dist, topn=5, dist=intersection_dist):
    abbr_detection = AbbrDetection()

    token2desc = {}
    for token in tqdm(tokens):
        if abbr_detection.word_is_abbr(token):
            if token in w2v_model.wv.key_to_index:
                desc = get_desc_find_by_dist(token, w2v_model, 
                                             topn=topn, dist=intersection_dist)
                label = desc2id.get(desc, "_")
            else:
                label = "_"
        else:
            label = "_"
        token2desc[token] = label
    return token2desc

In [74]:
lenta_test.sample(1000).to_csv("../lenta_sample.csv", index=False)

In [12]:
token2desc = get_token2desc(tokens=list(token2id.keys()), 
                            w2v_model=model, 
                            desc2id=desc2id, 
                            topn=10, 
                            dist=lev_dist)

100%|██████████| 635711/635711 [03:53<00:00, 2717.40it/s]


In [73]:
with open("../desc2id.pickle", "wb") as f: 
    pickle.dump(desc2id, f)

In [None]:
with open("../token2desc.pickle", "wb") as f: 
    pickle.dump(token2desc, f)

In [20]:
text = "ЦБ прорабатывает вопрос включения финграмотности в нацпроекты"

In [21]:
def tokenize(line):
    return word_tokenize(line)

morph = pymorphy2.MorphAnalyzer(lang="ru", 
                                units=[pymorphy2.units.DictionaryAnalyzer()])
def normalize(word):
    word = str(word).lower()
    parse_list = morph.parse(word)
    if parse_list != []:
        return parse_list[0].normal_form
    else:
        return word

def get_tokenized_normal_form(line):
    line = str(line)
    new_line_arr = []
    for token in tokenize(line):
        new_line_arr.append(normalize(token))
    return " ".join(new_line_arr)

In [47]:
def get_preds(norm_text):
    preds = []
    for word in norm_text:
        if word in token2desc:
            label = token2desc[word]  
        else:
            label = "_"
        preds.append(str(label))
    return preds

In [68]:
def get_new_text(text, preds):
    new_text = []
    for i in range(len(preds)):
        label = preds[i]
        if label == "_":
            new_text.append(text[i])
        else:
            new_text.append(id2desc[int(label)])
    return new_text

In [76]:
# text = test_texts[7]
preds = get_preds(text)
new_text = get_new_text(text, preds)

res = []
for text_word, label, text_upd in zip(text, preds, new_text):
    if label != "_":
        print(text_word, label, text_upd, sep="\t")

пер	692	период
мн	928	международный
млн	1037	миллион


In [78]:
print(" ".join(text))
p

"в пер холод перебор газ украина достигать 80 миллион кубометр в сутки , передавать риа новость . в сообщение , обнародовать `` газпром '' по итог состояться в четверг совещание , указываться , что из-за это с 19 по 25 январь европа недополучить 326 миллион кубометр газ . `` в ситуация , когда сильный мороз обрушиться практически на всё европейский страна , украина , пользоваться свой положение страны-транзитера , обеспечивать себя газ за счёт потребитель в европа , незаконно превышать не только собственный лимит потребление , но и забирать дополнительный поставка российский газ в европа , - заявить в ход это совещание заместитель председатель правление компания александр ананенков . - наступить холод показать , что украина являться единственный транзитный государство , который грубый образ попирать мн норма ведение газовый бизнес . фактически это означать полный отсутствие контроль в энергетический сфера украина '' . в сообщение также отмечаться , что с 16 по 25 январь `` газпром '' д

In [None]:
" ".join(text)

In [None]:
# history = []
for dist in [lev_dist]:
    for topn in [50]:
        print(dist.__name__, topn)
        token2desc = get_token2desc(tokens=list(token2id.keys()), 
                                    w2v_model=model, 
                                    desc2id=desc2id, 
                                    topn=topn, 
                                    dist=dist)

        preds = []
        for text in tqdm(test_texts):
            labels = []
            for word in text:
                if word in token2desc:
                    label = token2desc[word]  
                else:
                    label = "_"
                labels.append(str(label))
            preds.append(labels)

        test_labels_global = list(itertools.chain(*test_labels))
        test_preds_global = list(itertools.chain(*preds))

        test_labels_global_upd = []
        for label_id in test_labels_global:
            if label_id == "_":
                test_labels_global_upd.append("_")
            else:
                test_labels_global_upd.append(id2desc.get(int(label_id), "_"))

        test_preds_global_upd = []
        for label_id in test_preds_global:
            if label_id == "_":
                test_preds_global_upd.append("_")
            else:
                test_preds_global_upd.append(id2desc.get(int(label_id), "_"))

        f1 = f1_score(test_labels_global_upd, test_preds_global_upd, average="macro")
        filtred_acc = get_filtred_accuracy_score(test_labels_global_upd, test_preds_global_upd)
        print(f1, filtred_acc)