In [1]:
import json, os
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import numpy as np
from pymorphy2 import MorphAnalyzer
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm.auto import tqdm
from rich import print
tqdm.pandas()
morph = MorphAnalyzer()
stops = set(stopwords.words('russian'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  from pandas import Panel


In [2]:
import numpy as np

In [3]:
pd.set_option('display.max_colwidth', 1000)

In [4]:
PATH_TO_DATA = './data/'

In [5]:
files = [
    os.path.join(PATH_TO_DATA, file) 
    for file in os.listdir(PATH_TO_DATA) 
    if any(x in file for x in {'ng', 'russia', 'habr'})
]

In [6]:
files

['./data/russia_today_5.jsonlines',
 './data/habrahabr_0.jsonlines',
 './data/russia_today_2.jsonlines',
 './data/russia_today_4.jsonlines',
 './data/russia_today_3.jsonlines',
 './data/russia_today_0.jsonlines',
 './data/ng_1.jsonlines',
 './data/russia_today_7.jsonlines',
 './data/habrahabr_2.jsonlines',
 './data/ng_0.jsonlines',
 './data/habrahabr_1.jsonlines',
 './data/russia_today_6.jsonlines',
 './data/habrahabr_3.jsonlines',
 './data/russia_today_1.jsonlines']

Объединим файлы в один датасет.

In [7]:
data = pd.concat([pd.read_json(file, lines=True) for file in files], axis=0, ignore_index=True)

In [8]:
data = data.loc[:, ['keywords', 'title', 'summary', 'content']]

In [9]:
data.shape

(13194, 4)

In [10]:
def evaluate(true_kws, predicted_kws):
    assert len(true_kws) == len(predicted_kws)
    
    precisions = []
    recalls = []
    f1s = []
    jaccards = []
    
    for i in range(len(true_kws)):
        
        true_kw = set(true_kws[i])
        predicted_kw = set(predicted_kws[i])
        
        tp = len(true_kw & predicted_kw)
        union = len(true_kw | predicted_kw)
        fp = len(predicted_kw - true_kw)
        fn = len(true_kw - predicted_kw)
        
        if (tp+fp) == 0:
            prec = 0
        else:
            prec = tp / (tp + fp)
        
        if (tp+fn) == 0:
            rec = 0
        else:
            rec = tp / (tp + fn)
        if (prec+rec) == 0:
            f1 = 0
        else:
            f1 = (2*(prec*rec))/(prec+rec)
            
        jac = tp / union
        
        precisions.append(prec)
        recalls.append(rec)
        f1s.append(f1)
        jaccards.append(jac)
    print('Precision - ', round(np.mean(precisions), 2))
    print('Recall - ', round(np.mean(recalls), 2))
    print('F1 - ', round(np.mean(f1s), 2))
    print('Jaccard - ', round(np.mean(jaccards), 2))
    
    
        

Проверим, что всё работает как надо.

In [11]:
evaluate(data['keywords'], data['keywords'])

## Токенизация, удаление стоп-слов и нормализация.

In [12]:
from string import punctuation
from nltk.corpus import stopwords
punct = punctuation+'«»—…“”*№–'
stops = set(stopwords.words('russian'))

In [13]:
def normalize(text):
    
    words = [word.strip(punct) for word in text.lower().split()]
    words = [morph.parse(word)[0] for word in words if word and word not in stops]
    words = [word.normal_form for word in words if word.tag.POS == 'NOUN']

    return words

In [14]:
data['content_norm'] = data['content'].progress_apply(normalize)

HBox(children=(FloatProgress(value=0.0, max=13194.0), HTML(value='')))




In [15]:
data['title_norm'] = data['title'].progress_apply(normalize)

HBox(children=(FloatProgress(value=0.0, max=13194.0), HTML(value='')))




In [16]:
data['summary_norm'] = data['summary'].progress_apply(normalize)

HBox(children=(FloatProgress(value=0.0, max=13194.0), HTML(value='')))




In [17]:
data['all_norm'] = data.loc[:, ['content_norm', 'title_norm', 'summary_norm']].progress_apply(lambda x: x[0]+x[1]+x[2], axis=1)

HBox(children=(FloatProgress(value=0.0, max=13194.0), HTML(value='')))




In [18]:
data['content_norm_str'] = data['content_norm'].apply(' '.join)

In [19]:
# можно заодно сделать нграммы
tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=2)

In [20]:
tfidf.fit(data['content_norm_str'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=2, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [21]:
id2word = tfidf.get_feature_names()

Преобразуем наши тексты в векторы, где на позиции i стоит tfidf коэффициент слова i из словаря.

In [22]:
texts_vectors = tfidf.transform(data['content_norm_str'])

Отсортируем векторы текстов по этим коэффициентам и возьмем топ-10.

In [23]:
## так как матрица в tfidf в спарс формате,  ее нельзя просто так отсортировать
## перевести ее в обычный формат для всех данных тоже не получится - не хватит памяти
## поэтому пройдем по строчкам, переведем строчку в обычный array и отсортируем ее
keywords = []

for row in tqdm(range(texts_vectors.shape[0])):
    row_data = texts_vectors.getrow(row)
    top_inds = row_data.toarray().argsort()[0,:-11:-1]
    keywords.append([id2word[w] for w in top_inds])

HBox(children=(FloatProgress(value=0.0, max=13194.0), HTML(value='')))




In [24]:
keywords[:3]

[['девочка',
  'сочи',
  'юля',
  'тутберидзе',
  'анорексия',
  'юлия',
  'карьера',
  'тренер',
  'завершение карьера',
  'урманов'],
 ['кндр',
  'пхеньян',
  'меркель',
  'германия',
  'сша',
  'корея',
  'учение',
  'гуам',
  'габриэль президент',
  'габриэль'],
 ['форум',
  'катар',
  'кубинка',
  'цаста',
  'уралвагонзавод',
  'предприятие',
  'алексей заквасин',
  'заквасин',
  'август',
  'уровень открытость']]

In [25]:
evaluate(data['keywords'], keywords)

-----------
## Baseline
- Precision -  0.11
- Recall -  0.14
- F1 -  0.11
- Jaccard -  0.06
------------

# Solution

## 1 - LogReg + FastText word ranking
Построим классификатор, который будет предсказывать подходит ли данное слово к тексту/заголовку в качестве keyword-а

Слова для ранжирования возьмем из нормальзованного текста

In [40]:
from gensim.models.fasttext import load_facebook_vectors

In [88]:
#!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.bin.gz

--2021-04-10 23:09:39--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.74.142, 104.22.75.142, 172.67.9.4, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.74.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4496459151 (4.2G) [application/octet-stream]
Saving to: ‘cc.ru.300.bin.gz’


2021-04-10 23:16:27 (10.5 MB/s) - ‘cc.ru.300.bin.gz’ saved [4496459151/4496459151]



In [89]:
#!gzip -d cc.ru.300.bin.gz

In [41]:
model = load_facebook_vectors("./cc.ru.300.bin")

In [42]:
def text2vec(tokens):
    vector = model[tokens[0]] / len(tokens)
    if len(tokens) > 1:
        for tok in tokens[1:]:
            vector += model[tok] / len(tokens)

    return vector

In [44]:
np.random.seed(42)
Xs = []
ys = []
for text, keywords in tqdm(data.loc[:, ['all_norm', 'keywords']].values):
    if len(text) > 0:
        text_vec = text2vec(text)
        keywords = set(keywords)
        for word in keywords:
            arr = np.concatenate([text_vec, text2vec(word.split())])
            y = 1
            Xs.append(arr)
            ys.append(y)

        no_keyword_list = list(filter(lambda x: x not in keywords, text))
        if len(no_keyword_list) > 0:
            for word in np.random.choice(no_keyword_list, size=len(keywords)):
                arr = np.concatenate([text_vec, text2vec([word])])
                y = 0
                Xs.append(arr)
                ys.append(y)

HBox(children=(FloatProgress(value=0.0, max=13194.0), HTML(value='')))




In [45]:
Xs = np.stack(Xs)
ys = np.array(ys)

In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import classification_report
from scipy.stats import uniform

In [47]:
clf = LogisticRegression(solver='saga', tol=1e-2, max_iter=200, random_state=0)
cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
space = {
    "C": uniform(loc=0, scale=100),
    'penalty': ['l1', 'l2']
}
search = RandomizedSearchCV(
    clf, space, cv=cv, n_jobs=-1,
    n_iter=200, scoring='f1',
    refit=True, verbose=1,
  keywordsndom_state=42
)

In [48]:
search.fit(Xs, ys)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   30.8s
[Parallel(n_jobs=-1)]: Done 160 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 410 tasks      | elapsed: 10.1min
[Parallel(n_jobs=-1)]: Done 760 tasks      | elapsed: 18.5min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 24.3min finished


RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
                   error_score=nan,
                   estimator=LogisticRegression(C=1.0, class_weight=None,
                                                dual=False, fit_intercept=True,
                                                intercept_scaling=1,
                                                l1_ratio=None, max_iter=200,
                                                multi_class='auto', n_jobs=None,
                                                penalty='l2', random_state=0,
                                                solver='saga', tol=0.01,
                                                verbose=0, warm_start=False),
                   iid='deprecated', n_iter=200, n_jobs=-1,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f3c509655e0>,
                                        'penalty': ['l1', 'l2']},
                   pre_dispat

In [50]:
yp = search.predict(Xs)

In [51]:
print(classification_report(ys, yp, target_names=['not keyword', 'keyword']))

In [52]:
def rank_keywords(model, text_source: pd.Series):
    _rank_keywords = []
    for text in tqdm(text_source):
        if len(text) == 0:
            _rank_keywords.append([])
        else:
            text_vec = text2vec(text)
            words = set(text)
            X_vectors = []
            X_words = []
            for word in words:
                arr = np.concatenate([text_vec, text2vec([word])])
                X_vectors.append(arr)
                X_words.append(word)

            X_vectors = np.stack(X_vectors)
            target = model.predict(X_vectors)
            keywords = []
            for i, w in enumerate(X_words):
                if target[i] == 1:
                    keywords.append(w)

            _rank_keywords.append(keywords)
    return _rank_keywords

- Ranking keywords from title

In [53]:
evaluate(data['keywords'], rank_keywords(search, data['title_norm']))

HBox(children=(FloatProgress(value=0.0, max=13194.0), HTML(value='')))




- Ranking keywords from summary

In [54]:
evaluate(data['keywords'], rank_keywords(search, data['summary_norm']))

HBox(children=(FloatProgress(value=0.0, max=13194.0), HTML(value='')))




- Ranking keywords from content

In [55]:
evaluate(data['keywords'], rank_keywords(search, data['content_norm']))

HBox(children=(FloatProgress(value=0.0, max=13194.0), HTML(value='')))




Лучшим вариантом является ранжирование ключевых слов из summary - мы получаем прирост Precision без значительного падения Recall (т.е больший F1 в итоге)

## 2 - RuBERT word ranking
Развиваем предыдущую идею - построим классификатор на основе RuBERT, будем подавать текст(заголовок, summary или content) в качестве первого предложения, а потенциальное ключевое слово в качестве второго

In [20]:
import os
os.environ['WANDB_DISABLED']= 'true'
import torch
import transformers as tr
import datasets as ds

In [21]:
data['all'] = data.loc[:, ['title', 'content', 'summary']].progress_apply(lambda x: x[0] + ' ' + x[1] + ' ' + x[2], axis=1)

HBox(children=(FloatProgress(value=0.0, max=13194.0), HTML(value='')))




In [21]:
# example generation
texts      = []
text_pairs = []
labels     = []
np.random.seed(42)
for text, keywords, negatives in tqdm(data.loc[:, ['all', 'keywords', 'all_norm']].values):
    for word in keywords:
        texts.append(text)
        text_pairs.append(word)
        labels.append(1)
        
    no_keyword_list = list(filter(lambda x: x not in keywords, negatives))
    if len(no_keyword_list) > 0:
        for wrong_keyword in np.random.choice(no_keyword_list, size=len(keywords)):
            texts.append(text)
            text_pairs.append(wrong_keyword)
            labels.append(0)

HBox(children=(FloatProgress(value=0.0, max=13194.0), HTML(value='')))




In [43]:
tokenizer = tr.AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
model = tr.AutoModelForSequenceClassification.from_pretrained("DeepPavlov/rubert-base-cased")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
dataset = ds.Dataset.from_dict({
    'text': texts,
    'text_pair': text_pairs,
    'label': labels
})

In [24]:
def encode(batch):
    encoding = tokenizer(batch['text'], batch['text_pair'], padding='max_length', truncation='only_first', max_length=256)
    return encoding

In [25]:
dataset = dataset.map(encode, batched=True, batch_size=1000, num_proc=20)























In [26]:
dataset = dataset.remove_columns(['text', 'text_pair'])

In [27]:
dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])

Для повторения экспериментов в данной конфигурации необходимо использовать 
видеокарту с 24Gb+ видеопамяти(я использовал rtx 3090). Если такой нет - нужно уменьшить train_batch_size и кратно увеличить gradient_accumulation_steps. Пример:
- train_batch_size = 32, gradient_accumulation_steps = 32 (~12gb памяти)
- train_batch_size = 16, gradient_accumulation_steps = 64
- train_batch_size = 8,  gradient_accumulation_steps = 128

Так же надо поставить transformers из исходников
```
pip install git+https://https://github.com/huggingface/transformers.git
```
Если этого делать не хочется, то надо убрать опцию fp16_backend и (если не установлен apex) fp16, fp16_opt_level

In [44]:
training_args = tr.TrainingArguments(
    output_dir='./rubert/results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=64,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=64,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./rubert/logs',            # directory for storing logs
    logging_steps=100,
    gradient_accumulation_steps=16,
    fp16=True,
    fp16_opt_level='O2',
    fp16_backend='amp'
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [45]:
trainer = tr.Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=dataset
)

In [46]:
trainer.train()

Step,Training Loss
100,0.3606
200,0.1872
300,0.1549
400,0.1468
500,0.1267
600,0.119


TrainOutput(global_step=639, training_loss=0.17837474267807366, metrics={'train_runtime': 4413.2218, 'train_samples_per_second': 0.145, 'total_flos': 1.792335618141143e+17, 'epoch': 3.0, 'init_mem_cpu_alloc_delta': 0, 'init_mem_gpu_alloc_delta': 715356672, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 1527808, 'train_mem_gpu_alloc_delta': 2155103744, 'train_mem_cpu_peaked_delta': 2539520, 'train_mem_gpu_peaked_delta': 14878468096})

In [20]:
def rubert_rank_keywords(model, tokenizer, text_source: pd.Series, candidate_source: pd.Series, pred_device = 'cuda:0'):
    pred_device = torch.device(pred_device)
    model = model.to(pred_device)
    _rank_keywords = []
    for text, candidates in tqdm(zip(text_source, candidate_source), total=len(text_source)):
        if len(candidates) == 0:
            _rank_keywords.append([])
        else:
            texts = []
            words = []
            for word in candidates:
                texts.append(text)
                words.append(word)
                
            examples = {k:v.to(pred_device) for k, v in tokenizer(
                text=texts,
                text_pair=words,
                padding=True,
                truncation='only_first',
                max_length=256,
                return_tensors='pt'
            ).items()}
            with torch.no_grad():
                predictions = torch.argmax(torch.softmax((model(**examples)[0]), -1), -1).cpu().numpy()
                
            keywords = []
            for i, w in enumerate(words):
                if predictions[i] == 1:
                    keywords.append(w)
            
            _rank_keywords.append(keywords)
            
    return _rank_keywords

In [None]:
keywords_title = rubert_rank_keywords(model, tokenizer, data['all'], data['title_norm'])
keywords_summary = rubert_rank_keywords(model, tokenizer, data['all'], data['summary_norm'])

HBox(children=(FloatProgress(value=0.0, max=13194.0), HTML(value='')))

- Keywords from title

In [51]:
evaluate(data['keywords'], keywords_title)

- Keywords from summary

In [52]:
evaluate(data['keywords'], keywords_summary)

В данном случае так же лучше работает извлечение keywords из summary - не проседаем по recall и улучшаем precision

# 3 - Rerank existing keywords
В примерах встречаются кейворды, которые не содержатся в соответствующих текстах вообще никак. Так же часто для keywords разных примеров используются одинаковые слова. Из этого направшивается идея собрать полное множество возможных кейвордов и для каждого примера ранжировать это самое множество с поомщью какой-либо модельки, отбирая в итоге топ5/10 самых релевантных кейвордов. Это не совсем extraction, но задачу решает.

In [104]:
def rank_all_keywords(model, text_source: pd.Series, all_keywords):
    _rank_keywords = []
    for text in tqdm(text_source):
        if len(text) == 0:
            _rank_keywords.append([])
        else:
            text_vec = text2vec(text)
            X_vectors = []
            X_words = []
            for word in all_keywords:
                try:
                    arr = np.concatenate([text_vec, text2vec([word])])
                    X_vectors.append(arr)
                    X_words.append(word)
                except Exception as e:
                    print(e)
                    print(text_vec.shape)
                    print(text2vec([word]).shape)
                    print(word)
                    return

            X_vectors = np.stack(X_vectors)
            target = model.predict(X_vectors)
            keywords = []
            for i, w in enumerate(X_words):
                if target[i] == 1:
                    keywords.append(w)
                    
            #keywords = list(reversed(sorted(keywords, key=lambda x: x[1])))
            #keywords = keywords[:top_n]

            _rank_keywords.append(keywords)
    return _rank_keywords

In [105]:
all_keywords = []
for keywords in data['keywords']:
    all_keywords += keywords

# not really all(about 15000 keywords) cuz it is 23:31 and i'm running out of time
all_keywords = [word for word, count in Counter(all_keywords).most_common(5000)]

In [106]:
all_keywords.__len__()

5000

In [108]:
ranked_keywords_top10 = rank_all_keywords(search, data['title_norm'], all_keywords)

HBox(children=(FloatProgress(value=0.0, max=13194.0), HTML(value='')))




KeyboardInterrupt: 

In [95]:
ranked_keywords_top5 = [kw[:5] for kw in ranked_keywords_top10]

In [98]:
data.keywords[0]

['в россии',
 'олимпийские игры 2014 в сочи',
 'спорт',
 'спортсмен',
 'эксклюзив rt',
 'юлия липницкая',
 'фигурное катание']

In [99]:
ranked_keywords_top10[0]

[('RC', 1.0),
 ('pd', 1.0),
 ('v8', 1.0),
 ('CNC', 1.0),
 ('5g', 1.0),
 ('HA', 1.0),
 ('arm', 1.0),
 ('vk', 1.0),
 ('ar', 1.0),
 ('3d', 1.0)]

In [96]:
evaluate(data['keywords'], ranked_keywords_top10)

In [97]:
evaluate(data['keywords'], ranked_keywords_top5)

In [109]:
#it does not work, sadly