In [1]:
import json, os
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import numpy as np
from pymorphy2 import MorphAnalyzer
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm.auto import tqdm
from rich import print
tqdm.pandas()
morph = MorphAnalyzer()
stops = set(stopwords.words('russian'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  from pandas import Panel


In [2]:
import numpy as np

In [3]:
pd.set_option('display.max_colwidth', 1000)

In [4]:
PATH_TO_DATA = './data/'

In [5]:
files = [
    os.path.join(PATH_TO_DATA, file) 
    for file in os.listdir(PATH_TO_DATA) 
    if any(x in file for x in {'ng', 'russia', 'habr'})
]

In [6]:
files

['./data/russia_today_5.jsonlines',
 './data/habrahabr_0.jsonlines',
 './data/russia_today_2.jsonlines',
 './data/russia_today_4.jsonlines',
 './data/russia_today_3.jsonlines',
 './data/russia_today_0.jsonlines',
 './data/ng_1.jsonlines',
 './data/russia_today_7.jsonlines',
 './data/habrahabr_2.jsonlines',
 './data/ng_0.jsonlines',
 './data/habrahabr_1.jsonlines',
 './data/russia_today_6.jsonlines',
 './data/habrahabr_3.jsonlines',
 './data/russia_today_1.jsonlines']

–û–±—ä–µ–¥–∏–Ω–∏–º —Ñ–∞–π–ª—ã –≤ –æ–¥–∏–Ω –¥–∞—Ç–∞—Å–µ—Ç.

In [7]:
data = pd.concat([pd.read_json(file, lines=True) for file in files], axis=0, ignore_index=True)

In [8]:
data = data.loc[:, ['keywords', 'title', 'summary', 'content']]

In [9]:
data.shape

(13194, 4)

In [10]:
def evaluate(true_kws, predicted_kws):
    assert len(true_kws) == len(predicted_kws)
    
    precisions = []
    recalls = []
    f1s = []
    jaccards = []
    
    for i in range(len(true_kws)):
        
        true_kw = set(true_kws[i])
        predicted_kw = set(predicted_kws[i])
        
        tp = len(true_kw & predicted_kw)
        union = len(true_kw | predicted_kw)
        fp = len(predicted_kw - true_kw)
        fn = len(true_kw - predicted_kw)
        
        if (tp+fp) == 0:
            prec = 0
        else:
            prec = tp / (tp + fp)
        
        if (tp+fn) == 0:
            rec = 0
        else:
            rec = tp / (tp + fn)
        if (prec+rec) == 0:
            f1 = 0
        else:
            f1 = (2*(prec*rec))/(prec+rec)
            
        jac = tp / union
        
        precisions.append(prec)
        recalls.append(rec)
        f1s.append(f1)
        jaccards.append(jac)
    print('Precision - ', round(np.mean(precisions), 2))
    print('Recall - ', round(np.mean(recalls), 2))
    print('F1 - ', round(np.mean(f1s), 2))
    print('Jaccard - ', round(np.mean(jaccards), 2))
    
    
        

–ü—Ä–æ–≤–µ—Ä–∏–º, —á—Ç–æ –≤—Å—ë —Ä–∞–±–æ—Ç–∞–µ—Ç –∫–∞–∫ –Ω–∞–¥–æ.

In [11]:
evaluate(data['keywords'], data['keywords'])

## –¢–æ–∫–µ–Ω–∏–∑–∞—Ü–∏—è, —É–¥–∞–ª–µ–Ω–∏–µ —Å—Ç–æ–ø-—Å–ª–æ–≤ –∏ –Ω–æ—Ä–º–∞–ª–∏–∑–∞—Ü–∏—è.

In [12]:
from string import punctuation
from nltk.corpus import stopwords
punct = punctuation+'¬´¬ª‚Äî‚Ä¶‚Äú‚Äù*‚Ññ‚Äì'
stops = set(stopwords.words('russian'))

In [13]:
def normalize(text):
    
    words = [word.strip(punct) for word in text.lower().split()]
    words = [morph.parse(word)[0] for word in words if word and word not in stops]
    words = [word.normal_form for word in words if word.tag.POS == 'NOUN']

    return words

In [14]:
data['content_norm'] = data['content'].progress_apply(normalize)

HBox(children=(FloatProgress(value=0.0, max=13194.0), HTML(value='')))




In [15]:
data['title_norm'] = data['title'].progress_apply(normalize)

HBox(children=(FloatProgress(value=0.0, max=13194.0), HTML(value='')))




In [16]:
data['summary_norm'] = data['summary'].progress_apply(normalize)

HBox(children=(FloatProgress(value=0.0, max=13194.0), HTML(value='')))




In [17]:
data['all_norm'] = data.loc[:, ['content_norm', 'title_norm', 'summary_norm']].progress_apply(lambda x: x[0]+x[1]+x[2], axis=1)

HBox(children=(FloatProgress(value=0.0, max=13194.0), HTML(value='')))




In [18]:
data['content_norm_str'] = data['content_norm'].apply(' '.join)

In [19]:
# –º–æ–∂–Ω–æ –∑–∞–æ–¥–Ω–æ —Å–¥–µ–ª–∞—Ç—å –Ω–≥—Ä–∞–º–º—ã
tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=2)

In [20]:
tfidf.fit(data['content_norm_str'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=2, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [21]:
id2word = tfidf.get_feature_names()

–ü—Ä–µ–æ–±—Ä–∞–∑—É–µ–º –Ω–∞—à–∏ —Ç–µ–∫—Å—Ç—ã –≤ –≤–µ–∫—Ç–æ—Ä—ã, –≥–¥–µ –Ω–∞ –ø–æ–∑–∏—Ü–∏–∏ i —Å—Ç–æ–∏—Ç tfidf –∫–æ—ç—Ñ—Ñ–∏—Ü–∏–µ–Ω—Ç —Å–ª–æ–≤–∞ i –∏–∑ —Å–ª–æ–≤–∞—Ä—è.

In [22]:
texts_vectors = tfidf.transform(data['content_norm_str'])

–û—Ç—Å–æ—Ä—Ç–∏—Ä—É–µ–º –≤–µ–∫—Ç–æ—Ä—ã —Ç–µ–∫—Å—Ç–æ–≤ –ø–æ —ç—Ç–∏–º –∫–æ—ç—Ñ—Ñ–∏—Ü–∏–µ–Ω—Ç–∞–º –∏ –≤–æ–∑—å–º–µ–º —Ç–æ–ø-10.

In [23]:
## —Ç–∞–∫ –∫–∞–∫ –º–∞—Ç—Ä–∏—Ü–∞ –≤ tfidf –≤ —Å–ø–∞—Ä—Å —Ñ–æ—Ä–º–∞—Ç–µ,  –µ–µ –Ω–µ–ª—å–∑—è –ø—Ä–æ—Å—Ç–æ —Ç–∞–∫ –æ—Ç—Å–æ—Ä—Ç–∏—Ä–æ–≤–∞—Ç—å
## –ø–µ—Ä–µ–≤–µ—Å—Ç–∏ –µ–µ –≤ –æ–±—ã—á–Ω—ã–π —Ñ–æ—Ä–º–∞—Ç –¥–ª—è –≤—Å–µ—Ö –¥–∞–Ω–Ω—ã—Ö —Ç–æ–∂–µ –Ω–µ –ø–æ–ª—É—á–∏—Ç—Å—è - –Ω–µ —Ö–≤–∞—Ç–∏—Ç –ø–∞–º—è—Ç–∏
## –ø–æ—ç—Ç–æ–º—É –ø—Ä–æ–π–¥–µ–º –ø–æ —Å—Ç—Ä–æ—á–∫–∞–º, –ø–µ—Ä–µ–≤–µ–¥–µ–º —Å—Ç—Ä–æ—á–∫—É –≤ –æ–±—ã—á–Ω—ã–π array –∏ –æ—Ç—Å–æ—Ä—Ç–∏—Ä—É–µ–º –µ–µ
keywords = []

for row in tqdm(range(texts_vectors.shape[0])):
    row_data = texts_vectors.getrow(row)
    top_inds = row_data.toarray().argsort()[0,:-11:-1]
    keywords.append([id2word[w] for w in top_inds])

HBox(children=(FloatProgress(value=0.0, max=13194.0), HTML(value='')))




In [24]:
keywords[:3]

[['–¥–µ–≤–æ—á–∫–∞',
  '—Å–æ—á–∏',
  '—é–ª—è',
  '—Ç—É—Ç–±–µ—Ä–∏–¥–∑–µ',
  '–∞–Ω–æ—Ä–µ–∫—Å–∏—è',
  '—é–ª–∏—è',
  '–∫–∞—Ä—å–µ—Ä–∞',
  '—Ç—Ä–µ–Ω–µ—Ä',
  '–∑–∞–≤–µ—Ä—à–µ–Ω–∏–µ –∫–∞—Ä—å–µ—Ä–∞',
  '—É—Ä–º–∞–Ω–æ–≤'],
 ['–∫–Ω–¥—Ä',
  '–ø—Ö–µ–Ω—å—è–Ω',
  '–º–µ—Ä–∫–µ–ª—å',
  '–≥–µ—Ä–º–∞–Ω–∏—è',
  '—Å—à–∞',
  '–∫–æ—Ä–µ—è',
  '—É—á–µ–Ω–∏–µ',
  '–≥—É–∞–º',
  '–≥–∞–±—Ä–∏—ç–ª—å –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç',
  '–≥–∞–±—Ä–∏—ç–ª—å'],
 ['—Ñ–æ—Ä—É–º',
  '–∫–∞—Ç–∞—Ä',
  '–∫—É–±–∏–Ω–∫–∞',
  '—Ü–∞—Å—Ç–∞',
  '—É—Ä–∞–ª–≤–∞–≥–æ–Ω–∑–∞–≤–æ–¥',
  '–ø—Ä–µ–¥–ø—Ä–∏—è—Ç–∏–µ',
  '–∞–ª–µ–∫—Å–µ–π –∑–∞–∫–≤–∞—Å–∏–Ω',
  '–∑–∞–∫–≤–∞—Å–∏–Ω',
  '–∞–≤–≥—É—Å—Ç',
  '—É—Ä–æ–≤–µ–Ω—å –æ—Ç–∫—Ä—ã—Ç–æ—Å—Ç—å']]

In [25]:
evaluate(data['keywords'], keywords)

-----------
## Baseline
- Precision -  0.11
- Recall -  0.14
- F1 -  0.11
- Jaccard -  0.06
------------

# Solution

## 1 - LogReg + FastText word ranking
–ü–æ—Å—Ç—Ä–æ–∏–º –∫–ª–∞—Å—Å–∏—Ñ–∏–∫–∞—Ç–æ—Ä, –∫–æ—Ç–æ—Ä—ã–π –±—É–¥–µ—Ç –ø—Ä–µ–¥—Å–∫–∞–∑—ã–≤–∞—Ç—å –ø–æ–¥—Ö–æ–¥–∏—Ç –ª–∏ –¥–∞–Ω–Ω–æ–µ —Å–ª–æ–≤–æ –∫ —Ç–µ–∫—Å—Ç—É/–∑–∞–≥–æ–ª–æ–≤–∫—É –≤ –∫–∞—á–µ—Å—Ç–≤–µ keyword-–∞

–°–ª–æ–≤–∞ –¥–ª—è —Ä–∞–Ω–∂–∏—Ä–æ–≤–∞–Ω–∏—è –≤–æ–∑—å–º–µ–º –∏–∑ –Ω–æ—Ä–º–∞–ª—å–∑–æ–≤–∞–Ω–Ω–æ–≥–æ —Ç–µ–∫—Å—Ç–∞

In [40]:
from gensim.models.fasttext import load_facebook_vectors

In [88]:
#!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.bin.gz

--2021-04-10 23:09:39--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.74.142, 104.22.75.142, 172.67.9.4, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.74.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4496459151 (4.2G) [application/octet-stream]
Saving to: ‚Äòcc.ru.300.bin.gz‚Äô


2021-04-10 23:16:27 (10.5 MB/s) - ‚Äòcc.ru.300.bin.gz‚Äô saved [4496459151/4496459151]



In [89]:
#!gzip -d cc.ru.300.bin.gz

In [41]:
model = load_facebook_vectors("./cc.ru.300.bin")

In [42]:
def text2vec(tokens):
    vector = model[tokens[0]] / len(tokens)
    if len(tokens) > 1:
        for tok in tokens[1:]:
            vector += model[tok] / len(tokens)

    return vector

In [44]:
np.random.seed(42)
Xs = []
ys = []
for text, keywords in tqdm(data.loc[:, ['all_norm', 'keywords']].values):
    if len(text) > 0:
        text_vec = text2vec(text)
        keywords = set(keywords)
        for word in keywords:
            arr = np.concatenate([text_vec, text2vec(word.split())])
            y = 1
            Xs.append(arr)
            ys.append(y)

        no_keyword_list = list(filter(lambda x: x not in keywords, text))
        if len(no_keyword_list) > 0:
            for word in np.random.choice(no_keyword_list, size=len(keywords)):
                arr = np.concatenate([text_vec, text2vec([word])])
                y = 0
                Xs.append(arr)
                ys.append(y)

HBox(children=(FloatProgress(value=0.0, max=13194.0), HTML(value='')))




In [45]:
Xs = np.stack(Xs)
ys = np.array(ys)

In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import classification_report
from scipy.stats import uniform

In [47]:
clf = LogisticRegression(solver='saga', tol=1e-2, max_iter=200, random_state=0)
cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
space = {
    "C": uniform(loc=0, scale=100),
    'penalty': ['l1', 'l2']
}
search = RandomizedSearchCV(
    clf, space, cv=cv, n_jobs=-1,
    n_iter=200, scoring='f1',
    refit=True, verbose=1,
  keywordsndom_state=42
)

In [48]:
search.fit(Xs, ys)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   30.8s
[Parallel(n_jobs=-1)]: Done 160 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 410 tasks      | elapsed: 10.1min
[Parallel(n_jobs=-1)]: Done 760 tasks      | elapsed: 18.5min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 24.3min finished


RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
                   error_score=nan,
                   estimator=LogisticRegression(C=1.0, class_weight=None,
                                                dual=False, fit_intercept=True,
                                                intercept_scaling=1,
                                                l1_ratio=None, max_iter=200,
                                                multi_class='auto', n_jobs=None,
                                                penalty='l2', random_state=0,
                                                solver='saga', tol=0.01,
                                                verbose=0, warm_start=False),
                   iid='deprecated', n_iter=200, n_jobs=-1,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f3c509655e0>,
                                        'penalty': ['l1', 'l2']},
                   pre_dispat

In [50]:
yp = search.predict(Xs)

In [51]:
print(classification_report(ys, yp, target_names=['not keyword', 'keyword']))

In [52]:
def rank_keywords(model, text_source: pd.Series):
    _rank_keywords = []
    for text in tqdm(text_source):
        if len(text) == 0:
            _rank_keywords.append([])
        else:
            text_vec = text2vec(text)
            words = set(text)
            X_vectors = []
            X_words = []
            for word in words:
                arr = np.concatenate([text_vec, text2vec([word])])
                X_vectors.append(arr)
                X_words.append(word)

            X_vectors = np.stack(X_vectors)
            target = model.predict(X_vectors)
            keywords = []
            for i, w in enumerate(X_words):
                if target[i] == 1:
                    keywords.append(w)

            _rank_keywords.append(keywords)
    return _rank_keywords

- Ranking keywords from title

In [53]:
evaluate(data['keywords'], rank_keywords(search, data['title_norm']))

HBox(children=(FloatProgress(value=0.0, max=13194.0), HTML(value='')))




- Ranking keywords from summary

In [54]:
evaluate(data['keywords'], rank_keywords(search, data['summary_norm']))

HBox(children=(FloatProgress(value=0.0, max=13194.0), HTML(value='')))




- Ranking keywords from content

In [55]:
evaluate(data['keywords'], rank_keywords(search, data['content_norm']))

HBox(children=(FloatProgress(value=0.0, max=13194.0), HTML(value='')))




–õ—É—á—à–∏–º –≤–∞—Ä–∏–∞–Ω—Ç–æ–º —è–≤–ª—è–µ—Ç—Å—è —Ä–∞–Ω–∂–∏—Ä–æ–≤–∞–Ω–∏–µ –∫–ª—é—á–µ–≤—ã—Ö —Å–ª–æ–≤ –∏–∑ summary - –º—ã –ø–æ–ª—É—á–∞–µ–º –ø—Ä–∏—Ä–æ—Å—Ç Precision –±–µ–∑ –∑–Ω–∞—á–∏—Ç–µ–ª—å–Ω–æ–≥–æ –ø–∞–¥–µ–Ω–∏—è Recall (—Ç.–µ –±–æ–ª—å—à–∏–π F1 –≤ –∏—Ç–æ–≥–µ)

## 2 - RuBERT word ranking
–†–∞–∑–≤–∏–≤–∞–µ–º –ø—Ä–µ–¥—ã–¥—É—â—É—é –∏–¥–µ—é - –ø–æ—Å—Ç—Ä–æ–∏–º –∫–ª–∞—Å—Å–∏—Ñ–∏–∫–∞—Ç–æ—Ä –Ω–∞ –æ—Å–Ω–æ–≤–µ RuBERT, –±—É–¥–µ–º –ø–æ–¥–∞–≤–∞—Ç—å —Ç–µ–∫—Å—Ç(–∑–∞–≥–æ–ª–æ–≤–æ–∫, summary –∏–ª–∏ content) –≤ –∫–∞—á–µ—Å—Ç–≤–µ –ø–µ—Ä–≤–æ–≥–æ –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏—è, –∞ –ø–æ—Ç–µ–Ω—Ü–∏–∞–ª—å–Ω–æ–µ –∫–ª—é—á–µ–≤–æ–µ —Å–ª–æ–≤–æ –≤ –∫–∞—á–µ—Å—Ç–≤–µ –≤—Ç–æ—Ä–æ–≥–æ

In [20]:
import os
os.environ['WANDB_DISABLED']= 'true'
import torch
import transformers as tr
import datasets as ds

In [21]:
data['all'] = data.loc[:, ['title', 'content', 'summary']].progress_apply(lambda x: x[0] + ' ' + x[1] + ' ' + x[2], axis=1)

HBox(children=(FloatProgress(value=0.0, max=13194.0), HTML(value='')))




In [21]:
# example generation
texts      = []
text_pairs = []
labels     = []
np.random.seed(42)
for text, keywords, negatives in tqdm(data.loc[:, ['all', 'keywords', 'all_norm']].values):
    for word in keywords:
        texts.append(text)
        text_pairs.append(word)
        labels.append(1)
        
    no_keyword_list = list(filter(lambda x: x not in keywords, negatives))
    if len(no_keyword_list) > 0:
        for wrong_keyword in np.random.choice(no_keyword_list, size=len(keywords)):
            texts.append(text)
            text_pairs.append(wrong_keyword)
            labels.append(0)

HBox(children=(FloatProgress(value=0.0, max=13194.0), HTML(value='')))




In [43]:
tokenizer = tr.AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
model = tr.AutoModelForSequenceClassification.from_pretrained("DeepPavlov/rubert-base-cased")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
dataset = ds.Dataset.from_dict({
    'text': texts,
    'text_pair': text_pairs,
    'label': labels
})

In [24]:
def encode(batch):
    encoding = tokenizer(batch['text'], batch['text_pair'], padding='max_length', truncation='only_first', max_length=256)
    return encoding

In [25]:
dataset = dataset.map(encode, batched=True, batch_size=1000, num_proc=20)























In [26]:
dataset = dataset.remove_columns(['text', 'text_pair'])

In [27]:
dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])

–î–ª—è –ø–æ–≤—Ç–æ—Ä–µ–Ω–∏—è —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç–æ–≤ –≤ –¥–∞–Ω–Ω–æ–π –∫–æ–Ω—Ñ–∏–≥—É—Ä–∞—Ü–∏–∏ –Ω–µ–æ–±—Ö–æ–¥–∏–º–æ –∏—Å–ø–æ–ª—å–∑–æ–≤–∞—Ç—å 
–≤–∏–¥–µ–æ–∫–∞—Ä—Ç—É —Å 24Gb+ –≤–∏–¥–µ–æ–ø–∞–º—è—Ç–∏(—è –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–ª rtx 3090). –ï—Å–ª–∏ —Ç–∞–∫–æ–π –Ω–µ—Ç - –Ω—É–∂–Ω–æ —É–º–µ–Ω—å—à–∏—Ç—å train_batch_size –∏ –∫—Ä–∞—Ç–Ω–æ —É–≤–µ–ª–∏—á–∏—Ç—å gradient_accumulation_steps. –ü—Ä–∏–º–µ—Ä:
- train_batch_size = 32, gradient_accumulation_steps = 32 (~12gb –ø–∞–º—è—Ç–∏)
- train_batch_size = 16, gradient_accumulation_steps = 64
- train_batch_size = 8,  gradient_accumulation_steps = 128

–¢–∞–∫ –∂–µ –Ω–∞–¥–æ –ø–æ—Å—Ç–∞–≤–∏—Ç—å transformers –∏–∑ –∏—Å—Ö–æ–¥–Ω–∏–∫–æ–≤
```
pip install git+https://https://github.com/huggingface/transformers.git
```
–ï—Å–ª–∏ —ç—Ç–æ–≥–æ –¥–µ–ª–∞—Ç—å –Ω–µ —Ö–æ—á–µ—Ç—Å—è, —Ç–æ –Ω–∞–¥–æ —É–±—Ä–∞—Ç—å –æ–ø—Ü–∏—é fp16_backend –∏ (–µ—Å–ª–∏ –Ω–µ —É—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω apex) fp16, fp16_opt_level

In [44]:
training_args = tr.TrainingArguments(
    output_dir='./rubert/results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=64,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=64,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./rubert/logs',            # directory for storing logs
    logging_steps=100,
    gradient_accumulation_steps=16,
    fp16=True,
    fp16_opt_level='O2',
    fp16_backend='amp'
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [45]:
trainer = tr.Trainer(
    model=model,                         # the instantiated ü§ó Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=dataset
)

In [46]:
trainer.train()

Step,Training Loss
100,0.3606
200,0.1872
300,0.1549
400,0.1468
500,0.1267
600,0.119


TrainOutput(global_step=639, training_loss=0.17837474267807366, metrics={'train_runtime': 4413.2218, 'train_samples_per_second': 0.145, 'total_flos': 1.792335618141143e+17, 'epoch': 3.0, 'init_mem_cpu_alloc_delta': 0, 'init_mem_gpu_alloc_delta': 715356672, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 1527808, 'train_mem_gpu_alloc_delta': 2155103744, 'train_mem_cpu_peaked_delta': 2539520, 'train_mem_gpu_peaked_delta': 14878468096})

In [20]:
def rubert_rank_keywords(model, tokenizer, text_source: pd.Series, candidate_source: pd.Series, pred_device = 'cuda:0'):
    pred_device = torch.device(pred_device)
    model = model.to(pred_device)
    _rank_keywords = []
    for text, candidates in tqdm(zip(text_source, candidate_source), total=len(text_source)):
        if len(candidates) == 0:
            _rank_keywords.append([])
        else:
            texts = []
            words = []
            for word in candidates:
                texts.append(text)
                words.append(word)
                
            examples = {k:v.to(pred_device) for k, v in tokenizer(
                text=texts,
                text_pair=words,
                padding=True,
                truncation='only_first',
                max_length=256,
                return_tensors='pt'
            ).items()}
            with torch.no_grad():
                predictions = torch.argmax(torch.softmax((model(**examples)[0]), -1), -1).cpu().numpy()
                
            keywords = []
            for i, w in enumerate(words):
                if predictions[i] == 1:
                    keywords.append(w)
            
            _rank_keywords.append(keywords)
            
    return _rank_keywords

In [None]:
keywords_title = rubert_rank_keywords(model, tokenizer, data['all'], data['title_norm'])
keywords_summary = rubert_rank_keywords(model, tokenizer, data['all'], data['summary_norm'])

HBox(children=(FloatProgress(value=0.0, max=13194.0), HTML(value='')))

- Keywords from title

In [51]:
evaluate(data['keywords'], keywords_title)

- Keywords from summary

In [52]:
evaluate(data['keywords'], keywords_summary)

–í –¥–∞–Ω–Ω–æ–º —Å–ª—É—á–∞–µ —Ç–∞–∫ –∂–µ –ª—É—á—à–µ —Ä–∞–±–æ—Ç–∞–µ—Ç –∏–∑–≤–ª–µ—á–µ–Ω–∏–µ keywords –∏–∑ summary - –Ω–µ –ø—Ä–æ—Å–µ–¥–∞–µ–º –ø–æ recall –∏ —É–ª—É—á—à–∞–µ–º precision

# 3 - Rerank existing keywords
–í –ø—Ä–∏–º–µ—Ä–∞—Ö –≤—Å—Ç—Ä–µ—á–∞—é—Ç—Å—è –∫–µ–π–≤–æ—Ä–¥—ã, –∫–æ—Ç–æ—Ä—ã–µ –Ω–µ —Å–æ–¥–µ—Ä–∂–∞—Ç—Å—è –≤ —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤—É—é—â–∏—Ö —Ç–µ–∫—Å—Ç–∞—Ö –≤–æ–æ–±—â–µ –Ω–∏–∫–∞–∫. –¢–∞–∫ –∂–µ —á–∞—Å—Ç–æ –¥–ª—è keywords —Ä–∞–∑–Ω—ã—Ö –ø—Ä–∏–º–µ—Ä–æ–≤ –∏—Å–ø–æ–ª—å–∑—É—é—Ç—Å—è –æ–¥–∏–Ω–∞–∫–æ–≤—ã–µ —Å–ª–æ–≤–∞. –ò–∑ —ç—Ç–æ–≥–æ –Ω–∞–ø—Ä–∞–≤—à–∏–≤–∞–µ—Ç—Å—è –∏–¥–µ—è —Å–æ–±—Ä–∞—Ç—å –ø–æ–ª–Ω–æ–µ –º–Ω–æ–∂–µ—Å—Ç–≤–æ –≤–æ–∑–º–æ–∂–Ω—ã—Ö –∫–µ–π–≤–æ—Ä–¥–æ–≤ –∏ –¥–ª—è –∫–∞–∂–¥–æ–≥–æ –ø—Ä–∏–º–µ—Ä–∞ —Ä–∞–Ω–∂–∏—Ä–æ–≤–∞—Ç—å —ç—Ç–æ —Å–∞–º–æ–µ –º–Ω–æ–∂–µ—Å—Ç–≤–æ —Å –ø–æ–æ–º—â—å—é –∫–∞–∫–æ–π-–ª–∏–±–æ –º–æ–¥–µ–ª—å–∫–∏, –æ—Ç–±–∏—Ä–∞—è –≤ –∏—Ç–æ–≥–µ —Ç–æ–ø5/10 —Å–∞–º—ã—Ö —Ä–µ–ª–µ–≤–∞–Ω—Ç–Ω—ã—Ö –∫–µ–π–≤–æ—Ä–¥–æ–≤. –≠—Ç–æ –Ω–µ —Å–æ–≤—Å–µ–º extraction, –Ω–æ –∑–∞–¥–∞—á—É —Ä–µ—à–∞–µ—Ç.

In [104]:
def rank_all_keywords(model, text_source: pd.Series, all_keywords):
    _rank_keywords = []
    for text in tqdm(text_source):
        if len(text) == 0:
            _rank_keywords.append([])
        else:
            text_vec = text2vec(text)
            X_vectors = []
            X_words = []
            for word in all_keywords:
                try:
                    arr = np.concatenate([text_vec, text2vec([word])])
                    X_vectors.append(arr)
                    X_words.append(word)
                except Exception as e:
                    print(e)
                    print(text_vec.shape)
                    print(text2vec([word]).shape)
                    print(word)
                    return

            X_vectors = np.stack(X_vectors)
            target = model.predict(X_vectors)
            keywords = []
            for i, w in enumerate(X_words):
                if target[i] == 1:
                    keywords.append(w)
                    
            #keywords = list(reversed(sorted(keywords, key=lambda x: x[1])))
            #keywords = keywords[:top_n]

            _rank_keywords.append(keywords)
    return _rank_keywords

In [105]:
all_keywords = []
for keywords in data['keywords']:
    all_keywords += keywords

# not really all(about 15000 keywords) cuz it is 23:31 and i'm running out of time
all_keywords = [word for word, count in Counter(all_keywords).most_common(5000)]

In [106]:
all_keywords.__len__()

5000

In [108]:
ranked_keywords_top10 = rank_all_keywords(search, data['title_norm'], all_keywords)

HBox(children=(FloatProgress(value=0.0, max=13194.0), HTML(value='')))




KeyboardInterrupt: 

In [95]:
ranked_keywords_top5 = [kw[:5] for kw in ranked_keywords_top10]

In [98]:
data.keywords[0]

['–≤ —Ä–æ—Å—Å–∏–∏',
 '–æ–ª–∏–º–ø–∏–π—Å–∫–∏–µ –∏–≥—Ä—ã 2014 –≤ —Å–æ—á–∏',
 '—Å–ø–æ—Ä—Ç',
 '—Å–ø–æ—Ä—Ç—Å–º–µ–Ω',
 '—ç–∫—Å–∫–ª—é–∑–∏–≤ rt',
 '—é–ª–∏—è –ª–∏–ø–Ω–∏—Ü–∫–∞—è',
 '—Ñ–∏–≥—É—Ä–Ω–æ–µ –∫–∞—Ç–∞–Ω–∏–µ']

In [99]:
ranked_keywords_top10[0]

[('RC', 1.0),
 ('pd', 1.0),
 ('v8', 1.0),
 ('CNC', 1.0),
 ('5g', 1.0),
 ('HA', 1.0),
 ('arm', 1.0),
 ('vk', 1.0),
 ('ar', 1.0),
 ('3d', 1.0)]

In [96]:
evaluate(data['keywords'], ranked_keywords_top10)

In [97]:
evaluate(data['keywords'], ranked_keywords_top5)

In [109]:
#it does not work, sadly