In [1]:
from pathlib import Path
import pickle

from navec import Navec
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import DataLoader
from tqdm import tqdm
import nltk
import numpy as np
import pandas as pd
import torch

from deepxml.attentionxml import AttentionXML, CorNetAttentionXML
from deepxml.data_utils import Tokenizer
from deepxml.dataset import MultiLabelDataset
from deepxml.evaluation import get_p_1, get_p_5, get_p_10, get_n_1, get_n_5, get_n_10
from deepxml.meshprobenet import MeSHProbeNet, CorNetMeSHProbeNet
from deepxml.models import Model
from deepxml.xmlcnn import CorNetXMLCNN, XMLCNN

# Fetch the NLTK resources requested by this notebook (no-op when already cached).
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /home/daniil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/daniil/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
torch.cuda.is_available()

True

In [3]:
df = pd.read_parquet("../data/habr_posts_dataset.parquet")

In [4]:
df.head()

Unnamed: 0,post_id,author,title,tags,text
0,807711,Kaspersky_Lab,Security Week 2416: уязвимость в серверных мат...,"[Блог компании «Лаборатория Касперского», Инфо...",На прошлой неделе исследователи компании Binar...
1,807709,markshevchenko,Вычислительные выражения: Подробнее про типы-о...,"[.NET, Функциональное программирование, F#]",В предыдущем посте мы познакомились с концепци...
2,807707,ru_vds,Угадай местоположение льдины с арктическим ЦОД...,"[Блог компании RUVDS.com, Хостинг, Системное а...","Как вы наверняка знаете, 12 апреля RUVDS успеш..."
3,807705,shaddyk,Запустили проект с НСИС по повышению качества ...,"[Блог компании HFLabs, Открытые данные, IT-ком...",НСИС — оператор единой автоматизированной инфо...
4,807703,VokaMut,Тестируем AI на создании прикладного приложения,"[Веб-разработка, Искусственный интеллект, Natu...","Всем привет, я Григорий Тумаков, CTO в Моризо ..."


In [5]:
# Two consecutive 80/20 splits: 20% of all posts go to test, then 20% of the
# remainder goes to validation (64% / 16% / 20% overall). Both use seed 42.
train_val_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.2,
    random_state=42,
)

In [6]:
tokenizer = Tokenizer(language="russian")

In [7]:
tokenizer.build_vocab(train_df["text"].to_list())

In [8]:
# Convert the texts of each split to token ids, in train -> val -> test order.
# The vocabulary was built from the training texts only (see build_vocab above).
train_tokens, val_tokens, test_tokens = (
    tokenizer(split_df["text"].to_list())
    for split_df in (train_df, val_df, test_df)
)

Converting token to id: 100%|██████████| 2800/2800 [00:00<00:00, 6916.21it/s]
Converting token to id: 100%|██████████| 700/700 [00:00<00:00, 5612.18it/s]
Converting token to id: 100%|██████████| 876/876 [00:00<00:00, 7435.63it/s]


In [9]:
# Fit the tag -> column mapping on the training tags only, then reuse it for
# the validation and test splits. Outputs are sparse matrices.
# NOTE(review): tags that occur only in val/test are presumably ignored by
# transform() (scikit-learn warns about unknown classes) - confirm this is
# acceptable for the reported metrics.
mlb = MultiLabelBinarizer(sparse_output=True)
train_labels = mlb.fit_transform(train_df["tags"].to_list())
val_labels, test_labels = (
    mlb.transform(split_df["tags"].to_list())
    for split_df in (val_df, test_df)
)



In [10]:
# One batch size shared by all three loaders (previously the literal 8 was
# repeated positionally in every DataLoader call).
BATCH_SIZE = 8

# Only the training loader shuffles; validation and test keep the dataset order.
train_loader = DataLoader(MultiLabelDataset(train_tokens, train_labels),
                          batch_size=BATCH_SIZE, shuffle=True)

val_loader = DataLoader(MultiLabelDataset(val_tokens, val_labels),
                        batch_size=BATCH_SIZE, shuffle=False)

test_loader = DataLoader(MultiLabelDataset(test_tokens, test_labels),
                         batch_size=BATCH_SIZE, shuffle=False)

In [11]:
navec = Navec.load("../data/navec_hudlit_v1_12B_500K_300d_100q.tar")

In [12]:
# One 300-dimensional row per vocabulary entry (300 matches the emb_size passed
# to every Model below); rows are filled from navec in the next cell.
vocab_len = len(tokenizer.vocab)
words_embeddings = np.zeros(shape=(vocab_len, 300))

In [14]:
# Copy the navec vector of every vocabulary word into its row of the embedding
# matrix; words unknown to navec fall back to the "<unk>" vector.
# The fallback is looked up once instead of once per vocabulary entry.
unk_vector = navec["<unk>"]
for word, idx in tqdm(tokenizer.vocab.items()):
    words_embeddings[idx] = navec.get(word, unk_vector)

In [15]:
# Empty comparison table; one row per trained model is appended further down.
result_columns = [
    "model_name",
    "P@1", "P@5", "P@10",
    "N@1", "N@5", "N@10",
    "time",
    "size",
]
results_df = pd.DataFrame(columns=result_columns)

In [16]:
# Keyword arguments forwarded to Model(...) via **model_config for the
# XMLCNN / CorNetXMLCNN runs below.
model_config = dict(
    dynamic_pool_length=8,
    bottleneck_dim=100,
    num_filters=64,
    dropout=0.5,
    emb_trainable=False,
)

In [19]:
# XMLCNN network with one output per binarized tag column; embeddings are
# initialised from words_embeddings.
model = Model(
    network=XMLCNN,
    labels_num=train_labels.shape[1],
    emb_size=300,
    vocab_size=len(tokenizer.vocab),
    emb_init=words_embeddings,
    **model_config,
)

In [20]:
%%time
model.train(train_loader, val_loader, nb_epoch=10)

0 800 train loss: 0.1425434 valid loss: 0.0325206 P@5: 0.08343 N@5: 0.12186 early stop: 0
0 1600 train loss: 0.0382701 valid loss: 0.0317664 P@5: 0.06943 N@5: 0.10260 early stop: 0
0 2400 train loss: 0.0362993 valid loss: 0.0305024 P@5: 0.08943 N@5: 0.12659 early stop: 0
1 400 train loss: 0.0346231 valid loss: 0.0304501 P@5: 0.08086 N@5: 0.12645 early stop: 0
1 1200 train loss: 0.0341054 valid loss: 0.0318948 P@5: 0.09229 N@5: 0.13734 early stop: 0
1 2000 train loss: 0.0341609 valid loss: 0.0304062 P@5: 0.09371 N@5: 0.13955 early stop: 0
1 2800 train loss: 0.0333776 valid loss: 0.0312565 P@5: 0.09400 N@5: 0.15221 early stop: 0
2 800 train loss: 0.0330080 valid loss: 0.0301471 P@5: 0.10343 N@5: 0.15320 early stop: 0
2 1600 train loss: 0.0326168 valid loss: 0.0294227 P@5: 0.09886 N@5: 0.15501 early stop: 0
2 2400 train loss: 0.0320364 valid loss: 0.0296667 P@5: 0.09743 N@5: 0.15105 early stop: 0
3 400 train loss: 0.0328258 valid loss: 0.0297506 P@5: 0.10600 N@5: 0.16906 early stop: 0
3 1

In [21]:
test_res = model.predict(test_loader)

                                                          

In [22]:
# Evaluate the test predictions. The order P@1, P@5, P@10, N@1, N@5, N@10
# matches the metric columns of results_df, which are filled positionally.
# NOTE(review): test_res[1] is presumably the predicted label ids - confirm
# against Model.predict.
ranking_metrics = (get_p_1, get_p_5, get_p_10, get_n_1, get_n_5, get_n_10)
metrics = [score_fn(test_res[1], test_labels) for score_fn in ranking_metrics]
metrics

[0.2465753424657534,
 0.16552511415525115,
 0.11883561643835616,
 0.2465753424657534,
 0.2595369389003099,
 0.3134625093429662]

In [28]:
def save_model(model, name, models_dir="../data/models"):
    """Pickle ``model`` to ``<models_dir>/<name>.pickle``.

    Args:
        model: Any picklable object (here: the trained ``Model`` wrapper).
        name: File stem of the pickle, e.g. ``"XMLCNN"``.
        models_dir: Target directory. Defaults to the original hard-coded
            ``../data/models`` so existing calls behave as before.

    The target directory is created when missing; previously the call failed
    with ``FileNotFoundError`` on a fresh checkout.
    """
    target = Path(models_dir) / f"{name}.pickle"
    target.parent.mkdir(parents=True, exist_ok=True)
    # NOTE: the whole object is pickled, so loading it back requires the same
    # class definitions to be importable; only unpickle files you trust.
    with target.open("wb") as f:
        pickle.dump(model, f)

In [29]:
save_model(model, "XMLCNN")

In [31]:
# Record the XMLCNN test metrics. The time and size strings are typed in by
# hand (NOTE(review): presumably the %%time reading and the saved pickle size -
# confirm they match the latest run).
xmlcnn_row = pd.DataFrame([["XMLCNN", *metrics, "3min 36s", "40 Mb"]],
                          columns=results_df.columns)
# Drop any earlier "XMLCNN" row so re-running this cell replaces it instead of
# appending a duplicate.
other_rows = results_df[results_df["model_name"] != "XMLCNN"]
# Skipping empty frames avoids pandas' FutureWarning about concatenating empty
# entries while results_df has no rows yet; ignore_index keeps the row index
# unique (0, 1, 2, ...) instead of repeating 0 for every appended row.
results_df = pd.concat([part for part in (other_rows, xmlcnn_row) if not part.empty],
                       ignore_index=True)

  results_df = pd.concat([results_df,


In [32]:
# CorNetXMLCNN network, built with the same model_config and embedding
# initialisation as the XMLCNN run.
model = Model(
    network=CorNetXMLCNN,
    labels_num=train_labels.shape[1],
    emb_size=300,
    vocab_size=len(tokenizer.vocab),
    emb_init=words_embeddings,
    **model_config,
)

In [33]:
%%time
model.train(train_loader, val_loader, nb_epoch=10)

0 800 train loss: 0.0683950 valid loss: 0.0307038 P@5: 0.07000 N@5: 0.10285 early stop: 0
0 1600 train loss: 0.0310858 valid loss: 0.0300879 P@5: 0.08714 N@5: 0.12623 early stop: 0
0 2400 train loss: 0.0302093 valid loss: 0.0298995 P@5: 0.07486 N@5: 0.11478 early stop: 0
1 400 train loss: 0.0299098 valid loss: 0.0295354 P@5: 0.09629 N@5: 0.13964 early stop: 0
1 1200 train loss: 0.0298271 valid loss: 0.0290523 P@5: 0.10086 N@5: 0.15748 early stop: 0
1 2000 train loss: 0.0292776 valid loss: 0.0285338 P@5: 0.11600 N@5: 0.17862 early stop: 0
1 2800 train loss: 0.0289089 valid loss: 0.0281247 P@5: 0.12343 N@5: 0.19264 early stop: 0
2 800 train loss: 0.0276769 valid loss: 0.0275592 P@5: 0.14429 N@5: 0.23023 early stop: 0
2 1600 train loss: 0.0279593 valid loss: 0.0269707 P@5: 0.15400 N@5: 0.24929 early stop: 0
2 2400 train loss: 0.0269973 valid loss: 0.0264142 P@5: 0.15914 N@5: 0.25806 early stop: 0
3 400 train loss: 0.0266423 valid loss: 0.0258331 P@5: 0.18343 N@5: 0.29126 early stop: 0
3 1

In [34]:
test_res = model.predict(test_loader)

                                                          

In [35]:
# Test-set scores in results_df column order: P@1, P@5, P@10, N@1, N@5, N@10.
ranking_metrics = (get_p_1, get_p_5, get_p_10, get_n_1, get_n_5, get_n_10)
metrics = [score_fn(test_res[1], test_labels) for score_fn in ranking_metrics]
metrics

[0.4463470319634703,
 0.23493150684931507,
 0.154337899543379,
 0.4463470319634703,
 0.3954260144172659,
 0.4465979302296313]

In [36]:
save_model(model, "CorNetXMLCNN")

In [38]:
# Record the CorNetXMLCNN test metrics with hand-typed time/size strings.
# NOTE(review): this cell records "3min 37s" but the rendered results table at
# the end of the notebook shows "2min 21s" for CorNetXMLCNN - one of the two is
# stale; re-run and reconcile.
cornet_xmlcnn_row = pd.DataFrame([["CorNetXMLCNN", *metrics, "3min 37s", "43 Mb"]],
                                 columns=results_df.columns)
# Replace (rather than duplicate) an existing row for this model on re-run.
other_rows = results_df[results_df["model_name"] != "CorNetXMLCNN"]
# ignore_index keeps the row index unique; empty frames are skipped to avoid
# pandas' FutureWarning about concatenating empty entries.
results_df = pd.concat([part for part in (other_rows, cornet_xmlcnn_row) if not part.empty],
                       ignore_index=True)

In [39]:
# Keyword arguments forwarded to Model(...) via **model_config for the
# MeSHProbeNet / CorNetMeSHProbeNet runs below (replaces the previous config).
model_config = dict(
    hidden_size=300,
    n_layers=2,
    n_probes=5,
    dropout=0.5,
    emb_trainable=False,
)

In [40]:
# MeSHProbeNet network with one output per binarized tag column; embeddings
# are initialised from words_embeddings.
model = Model(
    network=MeSHProbeNet,
    labels_num=train_labels.shape[1],
    emb_size=300,
    vocab_size=len(tokenizer.vocab),
    emb_init=words_embeddings,
    **model_config,
)

In [41]:
%%time
model.train(train_loader, val_loader, nb_epoch=10)

0 800 train loss: 0.0602205 valid loss: 0.0310268 P@5: 0.06886 N@5: 0.10071 early stop: 0
0 1600 train loss: 0.0311347 valid loss: 0.0307127 P@5: 0.07600 N@5: 0.11368 early stop: 0
0 2400 train loss: 0.0303034 valid loss: 0.0306265 P@5: 0.05000 N@5: 0.08104 early stop: 0
1 400 train loss: 0.0302248 valid loss: 0.0301865 P@5: 0.08314 N@5: 0.12906 early stop: 0
1 1200 train loss: 0.0302712 valid loss: 0.0294630 P@5: 0.09943 N@5: 0.14848 early stop: 0
1 2000 train loss: 0.0295120 valid loss: 0.0290530 P@5: 0.10571 N@5: 0.16264 early stop: 0
1 2800 train loss: 0.0287745 valid loss: 0.0288528 P@5: 0.11000 N@5: 0.16318 early stop: 0
2 800 train loss: 0.0282127 valid loss: 0.0283309 P@5: 0.11171 N@5: 0.18130 early stop: 0
2 1600 train loss: 0.0649474 valid loss: 0.0280779 P@5: 0.12743 N@5: 0.19698 early stop: 0
2 2400 train loss: 0.0272808 valid loss: 0.0277157 P@5: 0.12629 N@5: 0.19919 early stop: 0
3 400 train loss: 0.0279123 valid loss: 0.0275076 P@5: 0.13000 N@5: 0.19700 early stop: 0
3 1

In [42]:
test_res = model.predict(test_loader)

                                                          

In [43]:
# Test-set scores in results_df column order: P@1, P@5, P@10, N@1, N@5, N@10.
ranking_metrics = (get_p_1, get_p_5, get_p_10, get_n_1, get_n_5, get_n_10)
metrics = [score_fn(test_res[1], test_labels) for score_fn in ranking_metrics]
metrics

[0.408675799086758,
 0.22808219178082192,
 0.1545662100456621,
 0.408675799086758,
 0.3824655151875472,
 0.4403073827378376]

In [44]:
save_model(model, "MeSHProbeNet")

In [45]:
# Record the MeSHProbeNet test metrics with hand-typed time/size strings
# (NOTE(review): confirm they match the latest run).
meshprobenet_row = pd.DataFrame([["MeSHProbeNet", *metrics, "39min 53s", "313 Mb"]],
                                columns=results_df.columns)
# Replace (rather than duplicate) an existing row for this model on re-run.
other_rows = results_df[results_df["model_name"] != "MeSHProbeNet"]
# ignore_index keeps the row index unique; empty frames are skipped to avoid
# pandas' FutureWarning about concatenating empty entries.
results_df = pd.concat([part for part in (other_rows, meshprobenet_row) if not part.empty],
                       ignore_index=True)

In [47]:
# CorNetMeSHProbeNet network, built with the same model_config and embedding
# initialisation as the MeSHProbeNet run.
model = Model(
    network=CorNetMeSHProbeNet,
    labels_num=train_labels.shape[1],
    emb_size=300,
    vocab_size=len(tokenizer.vocab),
    emb_init=words_embeddings,
    **model_config,
)

In [48]:
%%time
model.train(train_loader, val_loader, nb_epoch=10)

0 800 train loss: 0.0549944 valid loss: 0.0309916 P@5: 0.06029 N@5: 0.07631 early stop: 0
0 1600 train loss: 0.0314117 valid loss: 0.0305396 P@5: 0.07514 N@5: 0.11204 early stop: 0
0 2400 train loss: 0.0308404 valid loss: 0.0303925 P@5: 0.06943 N@5: 0.09779 early stop: 0
1 400 train loss: 0.0302049 valid loss: 0.0296810 P@5: 0.09086 N@5: 0.14447 early stop: 0
1 1200 train loss: 0.0289069 valid loss: 0.0292488 P@5: 0.11114 N@5: 0.17299 early stop: 0
1 2000 train loss: 0.0302046 valid loss: 0.0291146 P@5: 0.09714 N@5: 0.14324 early stop: 0
1 2800 train loss: 0.0288426 valid loss: 0.0282682 P@5: 0.11143 N@5: 0.17320 early stop: 0
2 800 train loss: 0.0277113 valid loss: 0.0277409 P@5: 0.13086 N@5: 0.20136 early stop: 0
2 1600 train loss: 0.0275913 valid loss: 0.0269092 P@5: 0.15600 N@5: 0.24277 early stop: 0
2 2400 train loss: 0.0269427 valid loss: 0.0268199 P@5: 0.14657 N@5: 0.22285 early stop: 0
3 400 train loss: 0.0269015 valid loss: 0.0266107 P@5: 0.15400 N@5: 0.23918 early stop: 0
3 1

In [49]:
test_res = model.predict(test_loader)

                                                          

In [50]:
# Test-set scores in results_df column order: P@1, P@5, P@10, N@1, N@5, N@10.
ranking_metrics = (get_p_1, get_p_5, get_p_10, get_n_1, get_n_5, get_n_10)
metrics = [score_fn(test_res[1], test_labels) for score_fn in ranking_metrics]
metrics

[0.4520547945205479,
 0.24885844748858446,
 0.1636986301369863,
 0.4520547945205479,
 0.4144862814755454,
 0.4701097685721493]

In [51]:
save_model(model, "CorNetMeSHProbeNet")

In [52]:
# Record the CorNetMeSHProbeNet test metrics with hand-typed time/size strings
# (NOTE(review): confirm they match the latest run).
cornet_meshprobenet_row = pd.DataFrame(
    [["CorNetMeSHProbeNet", *metrics, "26min 9s", "317 Mb"]],
    columns=results_df.columns,
)
# Replace (rather than duplicate) an existing row for this model on re-run.
other_rows = results_df[results_df["model_name"] != "CorNetMeSHProbeNet"]
# ignore_index keeps the row index unique; empty frames are skipped to avoid
# pandas' FutureWarning about concatenating empty entries.
results_df = pd.concat(
    [part for part in (other_rows, cornet_meshprobenet_row) if not part.empty],
    ignore_index=True,
)

In [53]:
# Keyword arguments forwarded to Model(...) via **model_config for the
# AttentionXML / CorNetAttentionXML runs below (replaces the previous config).
model_config = dict(
    hidden_size=256,
    layers_num=1,
    linear_size=[256],
    dropout=0.5,
    emb_trainable=False,
)

In [54]:
# AttentionXML network with one output per binarized tag column; embeddings
# are initialised from words_embeddings.
model = Model(
    network=AttentionXML,
    labels_num=train_labels.shape[1],
    emb_size=300,
    vocab_size=len(tokenizer.vocab),
    emb_init=words_embeddings,
    **model_config,
)

In [55]:
%%time
model.train(train_loader, val_loader, nb_epoch=10)

0 800 train loss: 0.0571790 valid loss: 0.0304442 P@5: 0.07829 N@5: 0.11631 early stop: 0
0 1600 train loss: 0.0301680 valid loss: 0.0299156 P@5: 0.07600 N@5: 0.11359 early stop: 0
0 2400 train loss: 0.0298240 valid loss: 0.0296295 P@5: 0.08429 N@5: 0.12316 early stop: 0
1 400 train loss: 0.0291400 valid loss: 0.0297673 P@5: 0.09086 N@5: 0.13023 early stop: 0
1 1200 train loss: 0.0295991 valid loss: 0.0291480 P@5: 0.10257 N@5: 0.14387 early stop: 0
1 2000 train loss: 0.0289475 valid loss: 0.0287991 P@5: 0.11286 N@5: 0.16845 early stop: 0
1 2800 train loss: 0.0280205 valid loss: 0.0280255 P@5: 0.11914 N@5: 0.19280 early stop: 0
2 800 train loss: 0.0272995 valid loss: 0.0275432 P@5: 0.13571 N@5: 0.21224 early stop: 0
2 1600 train loss: 0.0274761 valid loss: 0.0269058 P@5: 0.15000 N@5: 0.23798 early stop: 0
2 2400 train loss: 0.0263523 valid loss: 0.0264282 P@5: 0.16143 N@5: 0.25232 early stop: 0
3 400 train loss: 0.0263287 valid loss: 0.0266501 P@5: 0.17571 N@5: 0.26852 early stop: 0
3 1

In [28]:
test_res = model.predict(test_loader)

                                                          

In [29]:
# Test-set scores in results_df column order: P@1, P@5, P@10, N@1, N@5, N@10.
# NOTE(review): this cell and the predict cell before it carry execution
# counts 28/29, lower than the AttentionXML training cell (55) and duplicating
# the counts of the save_model cells - the stored output may come from an
# earlier kernel session. Restart & Run All before trusting these numbers.
ranking_metrics = (get_p_1, get_p_5, get_p_10, get_n_1, get_n_5, get_n_10)
metrics = [score_fn(test_res[1], test_labels) for score_fn in ranking_metrics]
metrics

[0.4737442922374429,
 0.24794520547945206,
 0.1552511415525114,
 0.4737442922374429,
 0.4208846112050576,
 0.46421247403766547]

In [56]:
save_model(model, "AttentionXML")

In [57]:
# Record the AttentionXML test metrics with hand-typed time/size strings
# (NOTE(review): confirm they match the latest run).
attentionxml_row = pd.DataFrame([["AttentionXML", *metrics, "10min 47s", "84 Mb"]],
                                columns=results_df.columns)
# Replace (rather than duplicate) an existing row for this model on re-run.
other_rows = results_df[results_df["model_name"] != "AttentionXML"]
# ignore_index keeps the row index unique; empty frames are skipped to avoid
# pandas' FutureWarning about concatenating empty entries.
results_df = pd.concat([part for part in (other_rows, attentionxml_row) if not part.empty],
                       ignore_index=True)

In [59]:
# CorNetAttentionXML network, built with the same model_config and embedding
# initialisation as the AttentionXML run.
model = Model(
    network=CorNetAttentionXML,
    labels_num=train_labels.shape[1],
    emb_size=300,
    vocab_size=len(tokenizer.vocab),
    emb_init=words_embeddings,
    **model_config,
)

In [60]:
%%time
model.train(train_loader, val_loader, nb_epoch=10)

0 800 train loss: 0.0528414 valid loss: 0.0304479 P@5: 0.07829 N@5: 0.11598 early stop: 0
0 1600 train loss: 0.0301051 valid loss: 0.0297904 P@5: 0.08000 N@5: 0.11930 early stop: 0
0 2400 train loss: 0.0303168 valid loss: 0.0295530 P@5: 0.08657 N@5: 0.12231 early stop: 0
1 400 train loss: 0.0292458 valid loss: 0.0293464 P@5: 0.09743 N@5: 0.13797 early stop: 0
1 1200 train loss: 0.0288400 valid loss: 0.0288343 P@5: 0.10886 N@5: 0.16221 early stop: 0
1 2000 train loss: 0.0291567 valid loss: 0.0285651 P@5: 0.11371 N@5: 0.17429 early stop: 0
1 2800 train loss: 0.0274044 valid loss: 0.0278816 P@5: 0.13200 N@5: 0.20033 early stop: 0
2 800 train loss: 0.0272165 valid loss: 0.0274133 P@5: 0.14229 N@5: 0.22571 early stop: 0
2 1600 train loss: 0.0268046 valid loss: 0.0268896 P@5: 0.15143 N@5: 0.23941 early stop: 0
2 2400 train loss: 0.0263644 valid loss: 0.0267172 P@5: 0.15029 N@5: 0.23674 early stop: 0
3 400 train loss: 0.0263225 valid loss: 0.0260991 P@5: 0.16800 N@5: 0.26342 early stop: 0
3 1

In [61]:
test_res = model.predict(test_loader)

                                                          

In [62]:
# Test-set scores in results_df column order: P@1, P@5, P@10, N@1, N@5, N@10.
ranking_metrics = (get_p_1, get_p_5, get_p_10, get_n_1, get_n_5, get_n_10)
metrics = [score_fn(test_res[1], test_labels) for score_fn in ranking_metrics]
metrics

[0.4452054794520548,
 0.2458904109589041,
 0.15924657534246575,
 0.4452054794520548,
 0.40768301065668583,
 0.4597818721359381]

In [63]:
save_model(model, "CorNetAttentionXML")

In [64]:
# Record the CorNetAttentionXML test metrics with hand-typed time/size strings
# (NOTE(review): confirm they match the latest run).
cornet_attentionxml_row = pd.DataFrame(
    [["CorNetAttentionXML", *metrics, "10min 48s", "88 Mb"]],
    columns=results_df.columns,
)
# Replace (rather than duplicate) an existing row for this model on re-run.
other_rows = results_df[results_df["model_name"] != "CorNetAttentionXML"]
# ignore_index keeps the row index unique; empty frames are skipped to avoid
# pandas' FutureWarning about concatenating empty entries.
results_df = pd.concat(
    [part for part in (other_rows, cornet_attentionxml_row) if not part.empty],
    ignore_index=True,
)

In [65]:
results_df

Unnamed: 0,model_name,P@1,P@5,P@10,N@1,N@5,N@10,time,size
0,XMLCNN,0.246575,0.165525,0.118836,0.246575,0.259537,0.313463,3min 36s,40 Mb
1,CorNetXMLCNN,0.446347,0.234932,0.154338,0.446347,0.395426,0.446598,2min 21s,43 Mb
2,MeSHProbeNet,0.408676,0.228082,0.154566,0.408676,0.382466,0.440307,39min 53s,313 Mb
3,CorNetMeSHProbeNet,0.452055,0.248858,0.163699,0.452055,0.414486,0.47011,26min 9s,317 Mb
4,AttentionXML,0.473744,0.247945,0.155251,0.473744,0.420885,0.464212,10min 47s,84 Mb
5,CorNetAttentionXML,0.445205,0.24589,0.159247,0.445205,0.407683,0.459782,10min 48s,88 Mb
