In [1]:
from pathlib import Path
import pickle

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

import pandas as pd
import torch

from deepxml.evaluation import get_p_1, get_p_5, get_p_10, get_n_1, get_n_5, get_n_10
from deepxml.models import Model

from models.dataset import RuBERTDataset
from models.rubert import RuBERTXML, CorNetRuBERTXML

In [2]:
torch.cuda.is_available()

True

In [16]:
# df = pd.read_parquet("../data/habr_posts_dataset.parquet")
df = pd.read_parquet("habr_posts_dataset.parquet")

In [17]:
df.head()

Unnamed: 0,post_id,author,title,tags,text
0,807711,Kaspersky_Lab,Security Week 2416: уязвимость в серверных мат...,"[Блог компании «Лаборатория Касперского», Инфо...",На прошлой неделе исследователи компании Binar...
1,807709,markshevchenko,Вычислительные выражения: Подробнее про типы-о...,"[.NET, Функциональное программирование, F#]",В предыдущем посте мы познакомились с концепци...
2,807707,ru_vds,Угадай местоположение льдины с арктическим ЦОД...,"[Блог компании RUVDS.com, Хостинг, Системное а...","Как вы наверняка знаете, 12 апреля RUVDS успеш..."
3,807705,shaddyk,Запустили проект с НСИС по повышению качества ...,"[Блог компании HFLabs, Открытые данные, IT-ком...",НСИС — оператор единой автоматизированной инфо...
4,807703,VokaMut,Тестируем AI на создании прикладного приложения,"[Веб-разработка, Искусственный интеллект, Natu...","Всем привет, я Григорий Тумаков, CTO в Моризо ..."


In [18]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

In [19]:
tokenizer = AutoTokenizer.from_pretrained(
    "cointegrated/rubert-tiny2", truncation=True, do_lower_case=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [20]:
mlb = MultiLabelBinarizer(sparse_output=True)
train_labels = mlb.fit_transform(train_df["tags"].to_list())
val_labels = mlb.transform(val_df["tags"].to_list())
test_labels = mlb.transform(test_df["tags"].to_list())



In [48]:
train_dataset = RuBERTDataset(train_df["text"].to_list(), train_labels, tokenizer)
val_dataset = RuBERTDataset(val_df["text"].to_list(), val_labels, tokenizer)
test_dataset = RuBERTDataset(test_df["text"].to_list(), test_labels, tokenizer)

In [49]:
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [23]:
results_df = pd.DataFrame(columns=["model_name", "P@1", "P@5", "P@10", "N@1", "N@5", "N@10", "time", "size"])

In [32]:
model = Model(network=RuBERTXML,
              labels_num=train_labels.shape[1], hidden_layers=[400], bert_model="cointegrated/rubert-tiny2")

In [33]:
%%time
model.train(train_loader, val_loader, nb_epoch=10, optimizer=AdamW, opt_params={"lr": 1e-4},)

0 800 train loss: 0.2407662 valid loss: 0.0338888 P@5: 0.08343 N@5: 0.11952 early stop: 0
0 1600 train loss: 0.0307194 valid loss: 0.0303173 P@5: 0.08286 N@5: 0.12052 early stop: 0
0 2400 train loss: 0.0298801 valid loss: 0.0300523 P@5: 0.07829 N@5: 0.11525 early stop: 0
1 400 train loss: 0.0299025 valid loss: 0.0299339 P@5: 0.07400 N@5: 0.10944 early stop: 0
1 1200 train loss: 0.0300023 valid loss: 0.0298613 P@5: 0.07857 N@5: 0.11686 early stop: 0
1 2000 train loss: 0.0299301 valid loss: 0.0297785 P@5: 0.07629 N@5: 0.11035 early stop: 0
1 2800 train loss: 0.0295203 valid loss: 0.0296518 P@5: 0.09171 N@5: 0.12902 early stop: 0
2 800 train loss: 0.0296700 valid loss: 0.0296501 P@5: 0.08657 N@5: 0.12796 early stop: 0
2 1600 train loss: 0.0290420 valid loss: 0.0292697 P@5: 0.10086 N@5: 0.15679 early stop: 0
2 2400 train loss: 0.0293642 valid loss: 0.0287500 P@5: 0.12571 N@5: 0.20467 early stop: 0
3 400 train loss: 0.0281392 valid loss: 0.0278220 P@5: 0.14229 N@5: 0.23909 early stop: 0
3 1

In [50]:
test_res = model.predict(test_loader)



Predict:   1%|          | 1/110 [00:00<00:21,  5.15it/s][A
Predict:   2%|▏         | 2/110 [00:00<00:20,  5.25it/s][A
Predict:   3%|▎         | 3/110 [00:00<00:17,  6.25it/s][A
Predict:   4%|▎         | 4/110 [00:00<00:20,  5.21it/s][A
Predict:   5%|▍         | 5/110 [00:00<00:19,  5.52it/s][A
Predict:   5%|▌         | 6/110 [00:01<00:16,  6.19it/s][A
Predict:   6%|▋         | 7/110 [00:01<00:17,  5.91it/s][A
Predict:   7%|▋         | 8/110 [00:01<00:15,  6.56it/s][A
Predict:   8%|▊         | 9/110 [00:01<00:16,  6.00it/s][A
Predict:   9%|▉         | 10/110 [00:01<00:20,  4.93it/s][A
Predict:  10%|█         | 11/110 [00:02<00:21,  4.65it/s][A
Predict:  11%|█         | 12/110 [00:02<00:27,  3.58it/s][A
Predict:  12%|█▏        | 13/110 [00:02<00:26,  3.67it/s][A
Predict:  13%|█▎        | 14/110 [00:02<00:21,  4.38it/s][A
Predict:  14%|█▎        | 15/110 [00:03<00:29,  3.25it/s][A
Predict:  15%|█▍        | 16/110 [00:03<00:28,  3.33it/s][A
Predict:  15%|█▌        | 17/11

In [53]:
metrics = [metric(test_res[1], test_labels) for metric in [get_p_1, get_p_5, get_p_10, get_n_1, get_n_5, get_n_10]]
metrics

[0.5399543378995434,
 0.2732876712328767,
 0.17328767123287672,
 0.5399543378995434,
 0.47282464052461043,
 0.5259509535945165]

In [56]:
def save_model(model, name):
    # with open(Path(f"../data/models/{name}.pickle"), "wb") as f:
    with open(Path(f"{name}.pickle"), "wb") as f:
        pickle.dump(model, f)

In [59]:
save_model(model, "RuBERTXML")

In [61]:
results_df = pd.concat([results_df,
                        pd.DataFrame([["RuBERTXML"]+metrics+["13min 35s"]+["120 Mb"]],
                                     columns=["model_name", "P@1", "P@5", "P@10", "N@1", "N@5", "N@10", "time", "size"])])

In [69]:
model = Model(network=CorNetRuBERTXML,
              labels_num=train_labels.shape[1], hidden_layers=[400], bert_model="cointegrated/rubert-tiny2")

In [70]:
%%time
model.train(train_loader, val_loader, nb_epoch=10, optimizer=AdamW, opt_params={"lr": 1e-4},)

0 800 train loss: 0.2074429 valid loss: 0.0333869 P@5: 0.07171 N@5: 0.10719 early stop: 0
0 1600 train loss: 0.0310983 valid loss: 0.0302001 P@5: 0.08200 N@5: 0.10888 early stop: 0
0 2400 train loss: 0.0291592 valid loss: 0.0299249 P@5: 0.07600 N@5: 0.10997 early stop: 0
1 400 train loss: 0.0298579 valid loss: 0.0298116 P@5: 0.07057 N@5: 0.10779 early stop: 0
1 1200 train loss: 0.0300618 valid loss: 0.0298756 P@5: 0.08571 N@5: 0.12199 early stop: 0
1 2000 train loss: 0.0296968 valid loss: 0.0298106 P@5: 0.08343 N@5: 0.11705 early stop: 0
1 2800 train loss: 0.0295364 valid loss: 0.0293820 P@5: 0.09771 N@5: 0.14873 early stop: 0
2 800 train loss: 0.0290321 valid loss: 0.0290704 P@5: 0.10943 N@5: 0.18650 early stop: 0
2 1600 train loss: 0.0285790 valid loss: 0.0282579 P@5: 0.14000 N@5: 0.23133 early stop: 0
2 2400 train loss: 0.0282506 valid loss: 0.0273351 P@5: 0.17657 N@5: 0.28758 early stop: 0
3 400 train loss: 0.0265690 valid loss: 0.0262836 P@5: 0.19171 N@5: 0.31602 early stop: 0
3 1

In [71]:
test_res = model.predict(test_loader)


Predict:   0%|          | 0/110 [00:00<?, ?it/s][A
Predict:   1%|          | 1/110 [00:00<00:17,  6.27it/s][A
Predict:   2%|▏         | 2/110 [00:00<00:19,  5.61it/s][A
Predict:   4%|▎         | 4/110 [00:00<00:15,  6.95it/s][A
Predict:   5%|▍         | 5/110 [00:00<00:15,  6.93it/s][A
Predict:   6%|▋         | 7/110 [00:00<00:12,  8.24it/s][A
Predict:   7%|▋         | 8/110 [00:01<00:11,  8.55it/s][A
Predict:   8%|▊         | 9/110 [00:01<00:12,  7.78it/s][A
Predict:   9%|▉         | 10/110 [00:01<00:13,  7.25it/s][A
Predict:  10%|█         | 11/110 [00:01<00:13,  7.59it/s][A
Predict:  11%|█         | 12/110 [00:01<00:13,  7.38it/s][A
Predict:  12%|█▏        | 13/110 [00:01<00:12,  7.89it/s][A
Predict:  14%|█▎        | 15/110 [00:01<00:10,  8.86it/s][A
Predict:  15%|█▍        | 16/110 [00:02<00:10,  9.04it/s][A
Predict:  15%|█▌        | 17/110 [00:02<00:10,  9.06it/s][A
Predict:  16%|█▋        | 18/110 [00:02<00:10,  8.89it/s][A
Predict:  17%|█▋        | 19/110 [00:02

In [72]:
metrics = [metric(test_res[1], test_labels) for metric in [get_p_1, get_p_5, get_p_10, get_n_1, get_n_5, get_n_10]]
metrics

[0.5228310502283106,
 0.27922374429223745,
 0.17591324200913241,
 0.5228310502283106,
 0.47363629364451026,
 0.5253108609645815]

In [73]:
save_model(model, "CorNetRuBERTXML")

In [74]:
results_df = pd.concat([results_df,
                        pd.DataFrame([["CorNetRuBERTXML"]+metrics+["13min 47s"]+["130 Mb"]],
                                     columns=["model_name", "P@1", "P@5", "P@10", "N@1", "N@5", "N@10", "time", "size"])])

In [75]:
results_df

Unnamed: 0,model_name,P@1,P@5,P@10,N@1,N@5,N@10,time,size
0,RuBERTXML,0.539954,0.273288,0.173288,0.539954,0.472825,0.525951,13min 35s,120 Mb
0,CorNetRuBERTXML,0.522831,0.279224,0.175913,0.522831,0.473636,0.525311,13min 47s,130 Mb
